This commit is contained in:
ihciah 2022-04-10 01:51:04 +08:00
commit 07277145d4
No known key found for this signature in database
GPG Key ID: 97CE6E121061F3BA
47 changed files with 6364 additions and 0 deletions

2
.cargo/config Normal file
View File

@ -0,0 +1,2 @@
[build]
rustflags = ["--cfg", "unsound_local_offset"]

2
.dockerignore Normal file
View File

@ -0,0 +1,2 @@
/target/
/.git/

73
.github/workflows/ci.yaml vendored Normal file
View File

@ -0,0 +1,73 @@
name: CI
on:
push:
paths-ignore:
- '**.md'
- '**.png'
pull_request:
paths-ignore:
- '**.md'
- '**.png'
env:
RUST_TOOLCHAIN: nightly
TOOLCHAIN_PROFILE: minimal
jobs:
lints:
name: Run cargo fmt and cargo clippy
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
components: rustfmt, clippy
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo fmt
uses: actions-rs/cargo@v1
with:
command: fmt
args: --all -- --check
- name: Run cargo check with no default features
uses: actions-rs/cargo@v1
with:
command: check
args: --no-default-features
- name: Run cargo check with all features
uses: actions-rs/cargo@v1
with:
command: check
args: --all-features
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings
test:
name: Run cargo test
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo test --no-run
uses: actions-rs/cargo@v1
with:
command: test
args: --all-features --no-run
- name: Run cargo test
run: sudo bash -c "ulimit -Sl 512 && ulimit -Hl 512 && sudo -u runner RUSTUP_TOOLCHAIN=nightly /home/runner/.cargo/bin/cargo test --all-features"

43
.github/workflows/docker-build.yml vendored Normal file
View File

@ -0,0 +1,43 @@
name: docker build and push
on:
push:
tags:
- 'v*'
jobs:
build:
name: 'Build'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Extract tag
id: prep
if: "startsWith(github.ref, 'refs/tags/v')"
run: |
echo ::set-output name=tags::ghcr.io/qini7-sese/ehbot:${GITHUB_REF#refs/tags/v}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Login to GHCR
uses: docker/login-action@v1
with:
registry: ghcr.io
username: qini7-sese
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build docker image
uses: docker/build-push-action@v2
with:
push: true
tags: |
ghcr.io/qini7-sese/ehbot:amd64
${{ steps.prep.outputs.tags }}
- name: Docker manifest push
run: |
docker manifest create ghcr.io/qini7-sese/ehbot:latest ghcr.io/qini7-sese/ehbot:amd64
docker manifest push ghcr.io/qini7-sese/ehbot:latest

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/target
config.yaml

2291
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

9
Cargo.toml Normal file
View File

@ -0,0 +1,9 @@
[workspace]
members = [
"bot",
"eh2telegraph",
]
[profile.release]
lto = true
opt-level = 3

9
Dockerfile Normal file
View File

@ -0,0 +1,9 @@
FROM rust:1-bullseye as builder
WORKDIR /usr/src/eh2telegraph
COPY . .
RUN cargo build --release
FROM debian:bullseye-slim
RUN apt-get update && apt-get -y install ca-certificates && rm -rf /var/lib/apt/lists/*
COPY --from=builder /usr/src/eh2telegraph/target/release/bot /usr/local/bin/bot
CMD ["/usr/local/bin/bot"]

92
README-zh.md Normal file
View File

@ -0,0 +1,92 @@
# eh2telegraph
中文|[英文](README.md)
自动从 EH/EX/NH 下载图片集并上传至 Telegraph 的 Bot。
本代码只保证在 MacOS部分功能和 Linux 上可以正确运行。
## 部署指引
1. 安装 Docker 和 docker-compose。
2. 创建新文件夹 `ehbot`
2. 复制项目中的 `config_example.yaml``ehbot` 并重命名为 `config.yaml`,之后修改配置细节(请参考下一节)。
3. 复制 `docker-compose.yml``ehbot`
4. 开启与关闭:
1. 开启:在该路径中运行 `docker-compose up -d`
2. 关闭:在该路径中运行 `docker-compose down`
3. 查看日志:在该路径中运行 `docker-compose logs`
4. 更新镜像:在该路径中运行 `docker-compose pull`
## 配置指引
1. 基础配置:
1. Bot TokenTelegram 内找 @BotFather 申请。
2. Admin可空你的 Telegram ID随便找个相关 Bot 就可以拿到(也可以通过本 Bot `/id` 拿到)。
3. Telegraph使用浏览器通过[这个链接](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author)创建 Telegraph Token 并填写。你也可以修改作者名字和 URL。
2. 代理配置:
1. 部署本仓库中的 `worker/web_proxy.js` 至 CloudFlare Workers并配置 `KEY` 环境变量为一段随机字符串(该 KEY 目的是防止对代理的未授权请求)。
2. 填写 URL 和 Key 到配置中。
3. 该代理用于请求一些有频率限制的服务,请勿滥用。
3. IPv6 配置:
1. 可以填写一个 IPv6 段,如果你并没有拥有一个较大的(指比 `/64`IPv6 段,请留空。
2. 填写的话需要开启 `net.ipv6.ip_nonlocal_bind` 内核参数(参考后续章节说明)。
3. 配置 IPv6 可以一定程度上缓解针对单 IP 的限流。
4. 配置部分 Collector 的 Cookie
1. 目前只有 exhentai 需要。
5. KV 配置:
1. 本项目内置使用了一个缓存服务,可以避免对一个图片集的重复同步。
2. 请参考 [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) 进行部署,并填写至配置文件。
3. 如果不想使用远程缓存,也可以使用纯内存缓存(重启后会失效),需要自行改代码并重新编译。
## 开发指引
### 环境
需要 Rust 最新的 Nightly 版本。推荐使用 VSCode 或 Clion 开发。
中国大陆推荐使用 [RsProxy](https://rsproxy.cn/) 作为 crates.io 镜像与工具链安装源。
### 版本发布
`v` 开头的 Tag 即可触发 Docker 构建。你可以直接在 git 中打 tag 之后 push 上去;但更方便的是在 github 中发布 release并填写 `v` 开头的命名。
## 技术细节
虽然本项目就是一个简单的爬虫,但是还是有一些注意事项需要说明一下。
### Github Action 构建
Github Action 可以用于自动构建 Docker 镜像,本项目支持自动构建 `x86_64` 平台的版本。
但事实上也可以构建 `arm64` 的版本,由于其机制上使用了 qemu 在 x86_64 上模拟了 arm 环境,所以速度极其缓慢(单次构建需要 1h 以上),于是没有开启。
### IPv6 幽灵客户端(口胡的名字)
某些网站有针对 IP 的访问频率限制,使用多个 IP 即可缓解该限制。实践上最常用的办法是代理池,但代理池往往极不稳定,并需要维护,可能还有一定成本。
观察本项目的目标网站,很多使用了 Cloudflare而 Cloudflare 支持 IPv6 且限流粒度是 `/64`。如果我们为本机绑定一个更大的 IPv6 段并从中随机选择 IP 作为客户端出口地址,则可以稳定地进行更高频率的请求。
由于网卡只会绑定单个 IPv6 地址,所以我们需要开启 `net.ipv6.ip_nonlocal_bind`
配置 IPv6 后,对于可以走 IPv6 的目标站点,本项目会使用 IPv6 段中的随机 IP 请求。
配置(对网卡的配置可以写在 `if-up` 中便于持久化):
1. `sudo ip add add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. 在 Sysctl 中配置 `net.ipv6.ip_nonlocal_bind=1`。该步骤因发行版而异(比如常见的 `/etc/sysctl.conf` 在 Arch Linux 中不存在)。
去哪搞 IPv6he.net 提供了相关免费服务,当然自己购买一个 IPv6 IP 段也并不昂贵。
你可以通过 `curl --interface 2001:***** ifconfig.co` 测试配置是否正确。
### 强制 IPv6
前一小节提到的网站虽然用了 Cloudflare但是事实上并没有真正启用 IPv6。当你直接使用 curl 指定 ipv6 请求时会发现,它根本就没有 AAAA 记录。但是由于 CF 的基础设施是 Anycast 的,所以如果目标网站不在代码中明确地拒绝 IPv6 访客,它们还是可以通过 IPv6 访问的。
1. telegra.ph: 无 AAAA 记录,但是强制解析到 Telegram 的入口 IP 可以访问,但证书是 `*.telegram.org` 的。
~~本项目写了一个校验指定域名证书有效性的 TLS 验证器,用于在保证安全性的情况下允许其证书配置错误。~~
但是 Telegraph 以极快的速度修掉了该问题,所以该 TLS 校验器目前处于禁用状态。
2. EH/NH: 强制 IPv6 可用。
3. EX: 未使用 CF 且无 IPv6 服务。
### 代理
本项目使用 Cloudflare Workers 作为部分 API 代理,在 IPv6 不可用时缓解限流问题。参考 `src/http_proxy.rs``worker/web_proxy.js`
### 缓存
为了尽可能少地重复拉取,本项目使用了内存缓存与远程持久化缓存。远程持久化缓存使用 Cloudflare Worker 配合 Cloudflare KV 搭建。项目主代码参考 [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy)。
由于同步图片集需要一定时间,为了避免重复同步,本项目使用了 [singleflight-async](https://github.com/ihciah/singleflight-async) 减少这类浪费。

94
README.md Normal file
View File

@ -0,0 +1,94 @@
# eh2telegraph
[中文](README-zh.md)|英文
Bot that automatically downloads image sets from EH/EX/NH and uploads them to Telegraph.
This code is only guaranteed to work correctly on MacOS (partial functionality) and Linux.
## Deployment Guidelines
1. Install Docker and docker-compose.
2. Create a new folder `ehbot`.
2. Copy `config_example.yaml` from the project to `ehbot` and rename it to `config.yaml`, then change the configuration details (see the next section).
3. Copy `docker-compose.yml` to `ehbot`.
4. Start and Shutdown.
1. Start: Run `docker-compose up -d` in this folder.
2. Shutdown: Run `docker-compose down` in this folder.
3. View logs: Run `docker-compose logs` in this folder.
4. Update the image: Run `docker-compose pull` in this folder.
## Configuration Guidelines
1. Basic Configuration
1. Bot Token: Find @BotFather in Telegram to apply for one.
2. Admin (can be empty): your Telegram ID, you can get it from any relevant Bot (you can also get it from this Bot `/id`).
3. Telegraph: Use your browser to create a Telegraph Token via [this link](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author) and fill in. You can also change the author name and URL.
2. Proxy Configuration
1. Deploy `worker/web_proxy.js` of this repository to Cloudflare Workers and configure the `KEY` environment variable to be a random string (the purpose of the `KEY` is to prevent unauthorized requests to the proxy).
2. Fill in the URL and Key into the yaml.
3. The proxy is used to request some services with frequency limitation, so do not abuse it.
3. IPv6 configuration
1. You can specify an IPv6 segment, if you do not have a larger (meaning larger than `/64`) IPv6 segment, please leave it blank.
2. Configure IPv6 to somewhat alleviate the flow restriction for single IP.
4. Configure cookies for some Collectors.
1. Currently, only exhentai is required.
5. KV configuration
1. This project uses a built-in caching service to avoid repeated synchronization of an image set.
2. Please refer to [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) for deployment and fill in the yaml file.
3. If you don't want to use remote caching, you can also use pure memory caching (it will be invalid after reboot). If you want to do so, you need to modify the code and recompile it by yourself.
## Development Guidelines
### Environment
Requires the latest Nightly version of Rust. Recommended to use VSCode or Clion for development.
[RsProxy](https://rsproxy.cn/) is recommended as the crates.io source and toolchain installation source for users in China Mainland.
### Version Release
A Docker build can be triggered by pushing a tag starting with `v`. You can create the tag directly in git and push it; however, it is easier to publish a release on GitHub and give it a name starting with `v`.
## Technical Details
Although this project is a simple crawler, there are still some considerations that need to be explained.
### Github Action Builds
Github Action can be used to automatically build Docker images, and this project supports automatic builds for the `x86_64` platform.
However, it can also build `arm64` versions, but it is not enabled because it uses qemu to emulate the arm environment on x86_64, so it is extremely slow (more than 1h for a single build).
### IPv6 Ghost Client (it's not a well-known name, just made up by myself)
Some sites have IP-specific access frequency limits, which can be mitigated by using multiple IPs. The most common approach in practice is proxy pooling, but proxy pools are often extremely unstable and require maintenance and possibly some cost.
Observing the target sites of this project, many of them use Cloudflare, and Cloudflare supports IPv6 with rate limiting at `/64` granularity. If we bind a larger IPv6 segment to the local machine and randomly select IPs from it as client exit addresses, we can make more frequent requests steadily.
Since the NIC will only bind a single IPv6 address, we need to enable `net.ipv6.ip_nonlocal_bind`.
After configuring IPv6, for target sites that can use IPv6, this project will use random IP requests from the IPv6 segment.
Configuration (configuration for the NIC can be written in `if-up` for persistence).
1. `sudo ip add add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. Configure `net.ipv6.ip_nonlocal_bind=1` in Sysctl. This step varies by distribution (for example, the common `/etc/sysctl.conf` does not exist in Arch Linux).
Where to get IPv6? he.net offers a free service for this, but of course it is not expensive to buy an IPv6 IP segment yourself.
You can test the configuration with `curl --interface 2001:***** ifconfig.co` to see if it is correct.
### Forcing IPv6
The sites mentioned in the previous subsection use Cloudflare, but in fact do not really enable IPv6. When you make an IPv6 request directly using curl, you will find that they have no AAAA records at all. But because the CF infrastructure is Anycast, if the target site does not explicitly reject IPv6 visitors in its code, it can still be accessed over IPv6.
1. telegra.ph: No AAAA records, but force resolves to Telegram's entry IP for access, but the certificate is `*.telegram.org`.
~~This project writes a TLS validator that checks the validity of a given domain's certificate, to allow for misconfiguration of its certificate while maintaining security.~~
However, Telegraph fixed the problem very quickly, so the TLS verifier is currently disabled.
2. EH/NH: Forced IPv6 availability.
3. EX: CF is not used and no IPv6 service is available.
### Proxy
This project uses Cloudflare Workers as a partial API proxy to alleviate the flow limitation problem when IPv6 is not available. See `src/http_proxy.rs` and `worker/web_proxy.js`.
### Caching
To minimize duplicate pulls, this project uses in-memory caching and remote persistent caching. Remote persistent cache using Cloudflare Worker with Cloudflare KV to build. The main project code reference is [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy).
Since it takes some time to synchronize image sets, to avoid repeated synchronization, this project uses [singleflight-async](https://github.com/ihciah/singleflight-async) to reduce this kind of waste.
Translated with www.DeepL.com/Translator (free version)

24
bot/Cargo.toml Normal file
View File

@ -0,0 +1,24 @@
[package]
edition = "2021"
name = "bot"
version = "0.1.3"
[dependencies]
eh2telegraph = {path = "../eh2telegraph"}
anyhow = "1"
clap = {version = "3", features = ["derive"]}
dptree = "0.1"
once_cell = "1"
regex = "1"
reqwest = {version = "0.11", default-features = false, features = ["json", "multipart", "rustls-tls"]}
serde = {version = "1", features = ["derive"]}
singleflight-async = {version = "0.1", features = ["hardware-lock-elision"]}
teloxide = {version = "0.7", features = ["macros", "ctrlc_handler", "dispatching2", "auto-send"]}
time = {version = "0.3", features = ["local-offset", "std", "macros"]}
tokio = {version = "1", default-features = false, features = ["rt-multi-thread", "macros", "net", "sync", "time", "parking_lot"]}
tracing = "0.1"
tracing-subscriber = {version = "0.3", features = ["local-time", "parking_lot", "time"]}
[build-dependencies]
vergen = {version = "7", default_features = false, features = ["build", "cargo", "rustc"]}

6
bot/build.rs Normal file
View File

@ -0,0 +1,6 @@
use vergen::{vergen, Config};
// Build script: asks `vergen` to emit `cargo:` instructions so that the
// `VERGEN_*` environment variables (consumed via `env!` in src/version.rs)
// exist at compile time. Panics on failure, which fails the build early.
fn main() {
// Generate the default 'cargo:' instruction output
vergen(Config::default()).unwrap()
}

413
bot/src/handler.rs Normal file
View File

@ -0,0 +1,413 @@
use std::{borrow::Cow, collections::HashSet};
use eh2telegraph::{
collector::{e_hentai::EHCollector, exhentai::EXCollector, nhentai::NHCollector},
searcher::{
f_hash::FHashConvertor,
saucenao::{SaucenaoOutput, SaucenaoParsed, SaucenaoSearcher},
ImageSearcher,
},
storage::KVStorage,
sync::Synchronizer,
};
use reqwest::Url;
use teloxide::{
adaptors::DefaultParseMode,
prelude2::*,
utils::{
command::BotCommand,
markdown::{code_inline, escape, link},
},
};
use tracing::{info, trace};
use crate::{ok_or_break, util::PrettyChat};
const MIN_SIMILARITY: u8 = 70;
const MIN_SIMILARITY_PRIVATE: u8 = 50;
/// User-facing bot commands, parsed from chat messages by teloxide's
/// `BotCommand` derive (`rename = "lowercase"` maps `Help` -> `/help`, etc.).
/// The long `description` string is what `/help` shows to users.
#[derive(BotCommand, Clone)]
#[command(
    rename = "lowercase",
    description = "\
This is a gallery synchronization robot that is convenient for users to view pictures directly in Telegram.\n\
便 Telegram \n\
Join develop group or contact @ByteRabbit if you need.\n\
@ByteRabbit\n\n\
Bot supports sync with command, text url, or image(private chat search threshold is lower).\n\
() \n\n\
Bot develop group / Bot https://t.me/TGSyncBotWorkGroup\n\
And welcome to join our channel / https://t.me/sesecollection\n\n\
These commands are supported:\n\
:"
)]
pub enum Command {
    #[command(description = "Display this help. 显示这条帮助信息。")]
    Help,
    // Fixed user-visible typo: "verison" -> "version".
    #[command(description = "Show bot version. 显示机器人版本。")]
    Version,
    #[command(description = "Show your account id. 显示你的账号 ID。")]
    Id,
    #[command(
        description = "Sync a gallery(e-hentai/exhentai/nhentai are supported now). 同步一个画廊(目前支持 EH/EX/NH)"
    )]
    Sync(String),
}
/// Commands only dispatched for chats whose id is in `Handler::admins`
/// (see the admin filter branch in main.rs).
#[derive(BotCommand, Clone)]
#[command(rename = "lowercase", description = "Command for admins")]
pub enum AdminCommand {
/// `/delete <key>` — evict a cached sync result.
#[command(description = "Delete cache with given key.")]
Delete(String),
}
/// Shared state for all bot message handlers. In main.rs an instance is
/// leaked (`Box::leak`) to obtain the `&'static self` the handler methods need.
pub struct Handler<C> {
// Downloads galleries and uploads them to Telegraph, backed by cache `C`.
pub synchronizer: Synchronizer<C>,
// Reverse-image search (saucenao) used by the photo handler.
pub searcher: SaucenaoSearcher,
// Converts an e-hentai file hash into a gallery url.
pub convertor: FHashConvertor,
// Chat ids allowed to run `AdminCommand`s.
pub admins: HashSet<i64>,
// Deduplicates concurrent sync requests for the same url (see `sync_response`).
single_flight: singleflight_async::SingleFlight<String>,
}
impl<C> Handler<C>
where
C: KVStorage<String> + Send + Sync + 'static,
{
/// Builds a handler; the searcher and f-hash convertor are loaded from
/// the global config, the single-flight group starts empty.
pub fn new(synchronizer: Synchronizer<C>, admins: HashSet<i64>) -> Self {
Self {
synchronizer,
searcher: SaucenaoSearcher::new_from_config(),
convertor: FHashConvertor::new_from_config(),
admins,
single_flight: Default::default(),
}
}
/// Executed when a command comes in and parsed successfully.
pub async fn respond_cmd(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
command: Command,
) -> ControlFlow<()> {
match command {
Command::Help => {
// Replies are best-effort: send errors are deliberately ignored.
let _ = bot
.send_message(msg.chat.id, escape(&Command::descriptions()))
.reply_to_message_id(msg.id)
.await;
}
Command::Version => {
let _ = bot
.send_message(msg.chat.id, escape(crate::version::VERSION))
.reply_to_message_id(msg.id)
.await;
}
Command::Id => {
let _ = bot
.send_message(
msg.chat.id,
format!(
"Current chat id is {} \\(in private chat this is your account id\\)",
code_inline(&msg.chat.id.to_string())
),
)
.reply_to_message_id(msg.id)
.await;
}
Command::Sync(url) => {
if url.is_empty() {
let _ = bot
.send_message(msg.chat.id, escape("Usage: /sync url"))
.reply_to_message_id(msg.id)
.await;
return ControlFlow::BREAK;
}
info!(
"[cmd handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
// Send a placeholder reply first; `msg` is rebound to it so the
// spawned task below can edit it in place with the final result.
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
// The sync can take a long time, so run it off the dispatcher.
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
};
ControlFlow::BREAK
}
/// Executed for `AdminCommand`s; dispatch is already restricted to admin
/// chats by the filter in main.rs.
pub async fn respond_admin_cmd(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
command: AdminCommand,
) -> ControlFlow<()> {
match command {
AdminCommand::Delete(key) => {
// Deletion failure is ignored; the reply is sent regardless.
let _ = self.synchronizer.delete_cache(&key).await;
let _ = bot
.send_message(msg.chat.id, escape(&format!("Key {key} deleted.")))
.reply_to_message_id(msg.id)
.await;
ControlFlow::BREAK
}
}
}
/// Handles plain text messages: picks the first syncable url, preferring
/// a match in the raw text over urls hidden in `TextLink` entities.
pub async fn respond_text(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let maybe_link = {
// Urls embedded as markup links in the message entities.
let entries = msg
.entities()
.map(|es| {
es.iter().filter_map(|e| {
if let teloxide::types::MessageEntityKind::TextLink { url } = &e.kind {
Synchronizer::match_url_from_text(url.as_ref()).map(ToOwned::to_owned)
} else {
None
}
})
})
.into_iter()
.flatten();
// Plain-text match is tried first, then the entity links.
msg.text()
.and_then(|content| {
Synchronizer::match_url_from_text(content).map(ToOwned::to_owned)
})
.into_iter()
.chain(entries)
.next()
};
if let Some(url) = maybe_link {
info!(
"[text handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
// Same placeholder-then-edit pattern as in `respond_cmd`.
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
return ControlFlow::BREAK;
}
// fallback to the next branch
ControlFlow::CONTINUE
}
/// Handles messages whose caption contains a url (e.g. photos with a
/// caption). Takes the first caption entity that yields a syncable url.
pub async fn respond_caption(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let caption_entities = msg.caption_entities();
let mut final_url = None;
for entry in caption_entities.map(|x| x.iter()).into_iter().flatten() {
let url = match &entry.kind {
teloxide::types::MessageEntityKind::Url => {
let raw = msg
.caption()
.expect("Url MessageEntry found but caption is None");
// Entity offset/length are applied to the UTF-16 encoding of
// the caption, so re-encode before slicing out the url.
let encoded: Vec<_> = raw
.encode_utf16()
.into_iter()
.skip(entry.offset)
.take(entry.length)
.collect();
let content = ok_or_break!(String::from_utf16(&encoded));
Cow::from(content)
}
teloxide::types::MessageEntityKind::TextLink { url } => Cow::from(url.as_ref()),
_ => {
continue;
}
};
let url = if let Some(c) = Synchronizer::match_url_from_url(&url) {
c
} else {
continue;
};
final_url = Some(url.to_string());
break;
}
match final_url {
Some(url) => {
info!(
"[caption handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
let url = url.to_string();
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
ControlFlow::BREAK
}
None => ControlFlow::CONTINUE,
}
}
/// Handles photo messages: downloads the first photo size, reverse-searches
/// it via saucenao, and syncs the best match above the similarity threshold.
/// Private chats use the lower `MIN_SIMILARITY_PRIVATE` threshold.
pub async fn respond_photo(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let first_photo = match msg.photo().and_then(|x| x.first()) {
Some(p) => p,
None => {
return ControlFlow::CONTINUE;
}
};
let f = ok_or_break!(bot.get_file(&first_photo.file_id).await);
let mut buf: Vec<u8> = Vec::with_capacity(f.file_size as usize);
ok_or_break!(teloxide::net::Download::download_file(&bot, &f.file_path, &mut buf).await);
let search_result: SaucenaoOutput = ok_or_break!(self.searcher.search(buf).await);
let mut url_sim = None;
let threshold = if msg.chat.is_private() {
MIN_SIMILARITY_PRIVATE
} else {
MIN_SIMILARITY
};
// Take the first supported result above the threshold; e-hentai hits
// need an extra f-hash -> gallery-url conversion.
for element in search_result
.data
.into_iter()
.filter(|x| x.similarity >= threshold)
{
match element.parsed {
SaucenaoParsed::EHentai(f_hash) => {
url_sim = Some((
ok_or_break!(self.convertor.convert_to_gallery(&f_hash).await),
element.similarity,
));
break;
}
SaucenaoParsed::NHentai(nid) => {
url_sim = Some((format!("https://nhentai.net/g/{nid}/"), element.similarity));
break;
}
_ => continue,
}
}
let (url, sim) = match url_sim {
Some(u) => u,
None => {
trace!("[photo handler] image not found");
return ControlFlow::CONTINUE;
}
};
info!(
"[photo handler] receive sync request from {:?} for {url} with similarity {sim}",
PrettyChat(&msg.chat)
);
if let Ok(msg) = bot
.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
{
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
ControlFlow::BREAK
}
/// Last-resort handler: in private chats replies "Unrecognized message.";
/// in groups it stays silent.
pub async fn respond_default(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
if msg.chat.is_private() {
ok_or_break!(
bot.send_message(msg.chat.id, escape("Unrecognized message."))
.reply_to_message_id(msg.id)
.await
);
}
#[cfg(debug_assertions)]
tracing::warn!("{:?}", msg);
ControlFlow::BREAK
}
/// Runs the sync through the single-flight group keyed by url, so
/// concurrent requests for the same url share one sync and one result
/// string (either a success link or an escaped error message).
async fn sync_response(&self, url: &str) -> String {
self.single_flight
.work(url, || async {
match self.route_sync(url).await {
Ok(url) => {
format!("Sync to telegraph finished: {}", link(&url, &escape(&url)))
}
Err(e) => {
format!("Sync to telegraph failed: {}", escape(&e.to_string()))
}
}
})
.await
}
/// Dispatches the url to the collector matching its host; unknown hosts
/// return an error. Returns the resulting telegraph url on success.
async fn route_sync(&self, url: &str) -> anyhow::Result<String> {
let u = Url::parse(url).map_err(|_| anyhow::anyhow!("Invalid url"))?;
let host = u.host_str().unwrap_or_default();
let path = u.path().to_string();
// TODO: use macro to generate them
#[allow(clippy::single_match)]
match host {
"e-hentai.org" => {
info!("[registry] sync e-hentai for path {}", path);
self.synchronizer
.sync::<EHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"nhentai.to" | "nhentai.net" => {
info!("[registry] sync nhentai for path {}", path);
self.synchronizer
.sync::<NHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"exhentai.org" => {
info!("[registry] sync exhentai for path {}", path);
self.synchronizer
.sync::<EXCollector>(path)
.await
.map_err(anyhow::Error::from)
}
_ => Err(anyhow::anyhow!("no matching collector")),
}
}
}

216
bot/src/main.rs Normal file
View File

@ -0,0 +1,216 @@
#![feature(control_flow_enum)]
use eh2telegraph::{
collector::Registry,
config::{self},
http_proxy::ProxiedClient,
storage,
sync::Synchronizer,
telegraph::Telegraph,
};
use clap::Parser;
use teloxide::{
adaptors::DefaultParseMode,
dispatching::update_listeners,
error_handlers::IgnoringErrorHandler,
prelude2::*,
types::{AllowedUpdate, ChatPermissions, ParseMode, UpdateKind},
};
use handler::{Command, Handler};
use crate::{
handler::AdminCommand,
util::{wrap_endpoint, PrettyChat},
};
mod handler;
mod util;
mod version;
/// Top-level bot configuration, deserialized from the `base` section of
/// the yaml config file (see `config::parse("base")` in `main`).
#[derive(Debug, serde::Deserialize)]
pub struct BaseConfig {
// Telegram bot token from @BotFather.
pub bot_token: String,
// Telegraph publishing settings.
pub telegraph: TelegraphConfig,
// Chat ids allowed to use admin commands; defaults to empty.
#[serde(default)]
pub admins: Vec<i64>,
}
/// Telegraph publishing configuration.
#[derive(Debug, serde::Deserialize)]
pub struct TelegraphConfig {
// Telegraph account access tokens used for page creation.
pub tokens: Vec<String>,
// Optional author name/url attached to created pages (see `with_author`).
pub author_name: Option<String>,
pub author_url: Option<String>,
}
/// Command-line arguments, parsed with clap's derive API.
#[derive(Parser, Debug)]
#[clap(author, version=version::VERSION, about, long_about = "eh2telegraph sync bot")]
struct Args {
// Optional path to the yaml config; `config::init` handles the None case.
#[clap(short, long, help = "Config file path")]
config: Option<String>,
}
#[tokio::main]
async fn main() {
let args = Args::parse();
// Local-time log timestamps.
// NOTE(review): the `unsound_local_offset` cfg set in .cargo/config is
// presumably required for this `LocalTime` use of the `time` crate — confirm.
let timer = tracing_subscriber::fmt::time::LocalTime::new(time::macros::format_description!(
"[month]-[day] [hour]:[minute]:[second]"
));
tracing_subscriber::fmt().with_timer(timer).init();
tracing::info!("initializing...");
config::init(args.config);
let base_config: BaseConfig = config::parse("base")
.expect("unable to parse base config")
.expect("base config can not be empty");
let telegraph_config = base_config.telegraph;
let telegraph =
Telegraph::new(telegraph_config.tokens).with_proxy(ProxiedClient::new_from_config());
let registry = Registry::new_from_config();
// Debug builds use a plain in-memory cache; release builds require the
// Cloudflare-KV backed storage from config.
#[cfg(debug_assertions)]
let cache = storage::SimpleMemStorage::default();
#[cfg(not(debug_assertions))]
let cache =
storage::cloudflare_kv::CFStorage::new_from_config().expect("unable to build storage");
let mut synchronizer = Synchronizer::new(telegraph, registry, cache);
if telegraph_config.author_name.is_some() {
synchronizer =
synchronizer.with_author(telegraph_config.author_name, telegraph_config.author_url);
}
let admins = base_config.admins.into_iter().collect();
// Leak the handler to obtain the `&'static` reference the closures below
// (and Handler's `&'static self` methods) require; lives for the whole run.
let handler = Box::leak(Box::new(Handler::new(synchronizer, admins))) as &Handler<_>;
// === Bot related ===
let command_handler = move |bot: AutoSend<DefaultParseMode<Bot>>,
message: Message,
command: Command| async move {
handler.respond_cmd(bot, message, command).await
};
let admin_command_handler = move |bot: AutoSend<DefaultParseMode<Bot>>,
message: Message,
command: AdminCommand| async move {
handler.respond_admin_cmd(bot, message, command).await
};
let text_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_text(bot, message).await
};
let caption_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_caption(bot, message).await
};
let photo_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_photo(bot, message).await
};
let default_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_default(bot, message).await
};
let permission_filter = |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
// If the bot is blocked, we will leave chat and not respond.
let blocked = message
.chat
.permissions()
.map(|p| !p.contains(ChatPermissions::SEND_MESSAGES))
.unwrap_or_default();
if blocked {
tracing::info!(
"[permission filter] leave chat {:?}",
PrettyChat(&message.chat)
);
let _ = bot.leave_chat(message.chat.id).await;
None
} else {
Some(message)
}
};
let bot = Bot::new(base_config.bot_token)
.parse_mode(ParseMode::MarkdownV2)
.auto_send();
// Branch order matters: admin commands, then user commands, then
// text / caption / photo handlers, then the catch-all default handler.
let mut bot_dispatcher = Dispatcher::builder(
bot.clone(),
dptree::entry()
.chain(dptree::filter_map(move |update: Update| {
match update.kind {
UpdateKind::Message(x) | UpdateKind::EditedMessage(x) => Some(x),
_ => None,
}
}))
.chain(dptree::filter_map_async(permission_filter))
.branch(
dptree::entry()
.chain(dptree::filter(move |message: Message| {
handler.admins.contains(&message.chat.id)
}))
.filter_command::<AdminCommand>()
.branch(wrap_endpoint(admin_command_handler)),
)
.branch(
dptree::entry()
.filter_command::<Command>()
.branch(wrap_endpoint(command_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.text() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(text_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.caption_entities() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(caption_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.photo() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(photo_handler)),
)
.branch(wrap_endpoint(default_handler)),
)
.default_handler(Box::new(|_upd| {
#[cfg(debug_assertions)]
tracing::warn!("Unhandled update: {:?}", _upd);
Box::pin(async {})
}))
.error_handler(std::sync::Arc::new(IgnoringErrorHandler))
.build();
bot_dispatcher.setup_ctrlc_handler();
// NOTE(review): only `Message` updates are polled here, yet the dispatcher
// also matches `UpdateKind::EditedMessage` above — edited messages may never
// arrive through this listener; confirm intended.
let bot_listener = update_listeners::polling(
bot,
Some(std::time::Duration::from_secs(10)),
None,
Some(vec![AllowedUpdate::Message]),
);
tracing::info!("initializing finished, bot is running");
bot_dispatcher
.dispatch_with_listener(
bot_listener,
LoggingErrorHandler::with_custom_text("An error from the update listener"),
)
.await;
}

69
bot/src/util.rs Normal file
View File

@ -0,0 +1,69 @@
use std::{convert::Infallible, ops::ControlFlow, sync::Arc};
use dptree::{di::Injectable, from_fn, Handler};
/// Wrapper around a borrowed teloxide `Chat` providing a compact,
/// human-readable `Debug` representation for log lines.
pub struct PrettyChat<'a>(pub &'a teloxide::types::Chat);
impl<'a> std::fmt::Debug for PrettyChat<'a> {
    /// Formats the chat as `Kind field: value ...`, emitting only the
    /// metadata fields that are present on the chat.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.0.is_group() || self.0.is_supergroup() {
            write!(f, "GroupChat")?;
            // Propagate formatter errors with `?` instead of silently
            // discarding the `write!` results inside `Option::map`.
            if let Some(x) = self.0.title() {
                write!(f, " title: {}", x)?;
            }
            if let Some(x) = self.0.description() {
                write!(f, " description: {}", x)?;
            }
        } else if self.0.is_private() {
            write!(f, "PrivateChat")?;
            if let Some(x) = self.0.username() {
                write!(f, " username: @{}", x)?;
            }
            if let Some(x) = self.0.first_name() {
                write!(f, " first_name: {}", x)?;
            }
            if let Some(x) = self.0.last_name() {
                write!(f, " last_name: {}", x)?;
            }
            if let Some(x) = self.0.bio() {
                write!(f, " bio: {}", x)?;
            }
        } else if self.0.is_channel() {
            write!(f, "Channel")?;
            if let Some(x) = self.0.username() {
                write!(f, " username: @{}", x)?;
            }
            if let Some(x) = self.0.title() {
                write!(f, " title: {}", x)?;
            }
            if let Some(x) = self.0.description() {
                write!(f, ", description: {}", x)?;
            }
        }
        Ok(())
    }
}
/// Adapts a `ControlFlow`-returning handler function into a dptree endpoint:
/// `Break(out)` terminates the chain with `Ok(out)`, while `Continue(_)`
/// passes the original event on to the next dptree branch.
pub fn wrap_endpoint<'a, F, Input, Output, FnArgs>(
f: F,
) -> Handler<'a, Input, Result<Output, Infallible>, Infallible>
where
F: Injectable<Input, ControlFlow<Output>, FnArgs> + Send + Sync + 'a,
Input: Send + Sync + 'a,
Output: Send + Sync + 'a,
{
// Arc lets the (possibly multi-shot) dptree closure share the handler fn.
let f = Arc::new(f);
from_fn(move |event, _cont| {
let f = Arc::clone(&f);
async move {
// `inject` borrows `event`; the injected closure must be dropped
// before `event` is moved into the ControlFlow result below.
let f = f.inject(&event);
let cf = f().await;
drop(f);
match cf {
ControlFlow::Continue(_) => ControlFlow::Continue(event),
ControlFlow::Break(out) => ControlFlow::Break(Ok(out)),
}
}
})
}
/// Unwraps a `Result`, or returns `ControlFlow::BREAK` from the enclosing
/// function on `Err`. The error value itself is discarded, matching the
/// best-effort style of the bot handlers.
#[macro_export]
macro_rules! ok_or_break {
($e: expr) => {
match $e {
Ok(r) => r,
Err(_) => {
return ControlFlow::BREAK;
}
}
};
}

18
bot/src/version.rs Normal file
View File

@ -0,0 +1,18 @@
/// Human-readable build information shown by the `/version` command and in
/// clap's `--version` output. All `VERGEN_*` values are compile-time env
/// vars emitted by `vergen` in build.rs.
pub(crate) static VERSION: &str = concat!(
"\n",
"Build Timestamp: \t",
env!("VERGEN_BUILD_TIMESTAMP"),
"\n",
"Package Version: \t",
env!("VERGEN_BUILD_SEMVER"),
"\n",
"rustc Version: \t\t",
env!("VERGEN_RUSTC_SEMVER"),
"\n",
"cargo Profile: \t\t",
env!("VERGEN_CARGO_PROFILE"),
"\n",
"cargo Target: \t\t",
env!("VERGEN_CARGO_TARGET_TRIPLE"),
"\n",
);

27
config_example.yaml Normal file
View File

@ -0,0 +1,27 @@
base:
bot_token: xxx:xxxx
admins:
- 0
telegraph:
tokens:
- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
author_name: Test Name
author_url: https://github.com/qini7-sese/eh2telegraph
proxy:
endpoint: https://proxy.xxx.workers.dev/
authorization: xxx
http:
ipv6_prefix:
exhentai:
ipb_pass_hash: xxx
ipb_member_id: xxx
igneous: xxx
worker_kv:
endpoint: https://kv.xxx.workers.dev
token: xxx
cache_size: 10240
expire_sec: 5184000 # 60 days

14
docker-compose.yml Normal file
View File

@ -0,0 +1,14 @@
version: "3"
services:
ehbot:
image: ghcr.io/qini7-sese/ehbot:latest
container_name: ehbot
restart: always
network_mode: "host"
environment:
CONFIG_FILE: "/config.yaml"
TZ: Asia/Shanghai
volumes:
- "./config.yaml:/config.yaml:ro"
logging:
driver: journald

30
eh2telegraph/Cargo.toml Normal file
View File

@ -0,0 +1,30 @@
[package]
edition = "2021"
name = "eh2telegraph"
version = "0.1.0"
[dependencies]
again = {version = "0.1", default-features = false, features = ["rand"]}
anyhow = "1"
bytes = "1"
clap = "3"
cloudflare-kv-proxy = "0.1"
derive_more = {version = "0.99", features = ["from_str"]}
futures = "0.3"
hashlink = "0.8"
ipnet = "2"
lazy_static = "1"
once_cell = "1"
parking_lot = {version = "0.12", features = ["hardware-lock-elision"]}
rand = "0.8"
regex = "1"
reqwest = {version = "0.11", default-features = false, features = ["json", "multipart", "rustls-tls"]}
rustls = {version = "0.20", features = ["dangerous_configuration"]}
serde = {version = "1", features = ["derive"]}
serde_with = {version = "1", features = ["macros", "json"]}
serde_yaml = "0.8"
thiserror = "1"
tokio = {version = "1", default-features = false, features = ["rt-multi-thread", "macros", "net", "sync", "time", "parking_lot"]}
tracing = "0.1"
webpki = "0.22"
webpki-roots = "0.22"

View File

@ -0,0 +1,95 @@
/// ImageBuffer for upload in batch.
///
/// Accumulates items together with a running total of their payload sizes,
/// so callers can flush once either the item count or the byte budget is
/// reached.
pub struct ImageBuffer<T> {
    buf: Vec<T>,
    size: usize,
}

impl<T> Default for ImageBuffer<T> {
    #[inline]
    fn default() -> Self {
        Self {
            buf: Vec::new(),
            size: 0,
        }
    }
}

impl<T> ImageBuffer<T>
where
    T: DataSized,
{
    /// Creates an empty buffer.
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an empty buffer with room for `n` items.
    #[inline]
    pub fn with_capacity(n: usize) -> Self {
        Self {
            buf: Vec::with_capacity(n),
            size: 0,
        }
    }

    /// Appends `data`, growing the tracked byte total by its reported size.
    #[inline]
    pub fn push(&mut self, data: T) {
        self.size += data.size();
        self.buf.push(data);
    }

    /// Takes the buffered items and their total byte size, leaving the
    /// buffer empty. The replacement storage is pre-sized to twice the
    /// drained item count as a growth heuristic for the next batch.
    #[inline]
    pub fn swap(&mut self) -> (Vec<T>, usize) {
        let next = Vec::with_capacity(self.buf.len() * 2);
        let drained = std::mem::replace(&mut self.buf, next);
        let total = std::mem::take(&mut self.size);
        (drained, total)
    }

    /// Number of buffered items.
    #[inline]
    pub fn len(&self) -> usize {
        self.buf.len()
    }

    /// True when no items are buffered.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.buf.is_empty()
    }

    /// Total byte size of all buffered items.
    #[inline]
    pub fn size(&self) -> usize {
        self.size
    }

    /// Drops all buffered items and resets the size counter.
    #[inline]
    pub fn clear(&mut self) {
        self.buf.clear();
        self.size = 0;
    }
}

/// Types that can report the byte size of their payload.
pub trait DataSized {
    fn size(&self) -> usize;
}
// A `Bytes` payload is exactly its length.
impl DataSized for bytes::Bytes {
    #[inline]
    fn size(&self) -> usize {
        Self::len(self)
    }
}

// Owned byte vectors report their current length.
impl DataSized for Vec<u8> {
    #[inline]
    fn size(&self) -> usize {
        self.as_slice().len()
    }
}

// Boxed fixed-size arrays have a compile-time-known payload size.
impl<const N: usize> DataSized for Box<[u8; N]> {
    #[inline]
    fn size(&self) -> usize {
        N
    }
}

View File

@ -0,0 +1,254 @@
/// e-hentai collector.
/// Host matching: e-hentai.org
use crate::{
http_client::{GhostClient, GhostClientBuilder, UA},
stream::AsyncStream,
util::match_first_group,
util::{get_bytes, get_string},
};
use again::RetryPolicy;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::header;
use std::time::Duration;
use super::{
utils::paged::{PageFormatter, PageIndicator, Paged},
AlbumMeta, Collector, ImageData, ImageMeta,
};
lazy_static::lazy_static! {
    // Links to the per-image pages inside a gallery listing page.
    static ref PAGE_RE: Regex = Regex::new(r#"<a href="(https://e-hentai\.org/s/\w+/[\w-]+)">"#).unwrap();
    // The actual image URL on a single image page (non-greedy src capture).
    static ref IMG_RE: Regex = Regex::new(r#"<img id="img" src="(.*?)""#).unwrap();
    // Gallery title inside the <h1 id="gn"> element.
    static ref TITLE_RE: Regex = Regex::new(r#"<h1 id="gn">(.*?)</h1>"#).unwrap();
    // Up to 5 retries with a fixed 200ms backoff, jittered to avoid bursts.
    static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
        .with_max_retries(5)
        .with_jitter(true);
}
/// Collector that fetches galleries from e-hentai.org.
#[derive(Debug, Clone, Default)]
pub struct EHCollector {
    // Ghost client pinned to e-hentai.org via custom resolution; cloned per
    // fetch to force changing the source IP (see `fetch`).
    client: GhostClient,
    // Plain HTTP client handed to `EHImageStream` — presumably for direct
    // image downloads; confirm in the EHImageStream implementation.
    raw_client: reqwest::Client,
}
impl EHCollector {
    /// Default request headers shared by both constructors. The `nw=1`
    /// cookie is sent on every request (previously this construction was
    /// duplicated verbatim in `new` and `new_from_config`).
    fn default_headers() -> header::HeaderMap {
        let mut request_headers = header::HeaderMap::new();
        // `from_static` is infallible for this known-valid value, replacing
        // `from_str(..).unwrap()`.
        request_headers.insert(header::COOKIE, header::HeaderValue::from_static("nw=1"));
        request_headers
    }

    /// Builds a collector, optionally binding outgoing requests to addresses
    /// drawn from the given IPv6 prefix.
    pub fn new(prefix: Option<Ipv6Net>) -> Self {
        Self {
            client: GhostClientBuilder::default()
                .with_default_headers(Self::default_headers())
                .with_cf_resolve(&["e-hentai.org"])
                .build(prefix),
            raw_client: reqwest::Client::builder().user_agent(UA).build().unwrap(),
        }
    }

    /// Builds a collector using settings loaded from the global config.
    ///
    /// # Errors
    /// Propagates any failure from `GhostClientBuilder::build_from_config`.
    pub fn new_from_config() -> anyhow::Result<Self> {
        Ok(Self {
            client: GhostClientBuilder::default()
                .with_default_headers(Self::default_headers())
                .with_cf_resolve(&["e-hentai.org"])
                .build_from_config()?,
            raw_client: reqwest::Client::builder().user_agent(UA).build().unwrap(),
        })
    }
}
impl Collector for EHCollector {
    type FetchError = anyhow::Error;
    // Opaque future named by the async block in `fetch`
    // (type_alias_impl_trait; requires nightly).
    type FetchFuture<'a> =
        impl std::future::Future<Output = anyhow::Result<(AlbumMeta, Self::ImageStream)>>;
    type StreamError = anyhow::Error;
    type ImageStream = EHImageStream;

    #[inline]
    fn name() -> &'static str {
        "e-hentai"
    }

    /// Fetches gallery metadata and an image stream for a path of the form
    /// `/g/{album_id}/{album_token}` (as in
    /// `https://e-hentai.org/g/2127986/da1deffea5`).
    fn fetch(&self, path: String) -> Self::FetchFuture<'_> {
        async move {
            // normalize url: strip surrounding slashes, then expect exactly
            // the segments "g" / album_id / album_token.
            let mut parts = path.trim_matches(|c| c == '/').split('/');
            let g = parts.next();
            let album_id = parts.next();
            let album_token = parts.next();
            let (album_id, album_token) = match (g, album_id, album_token) {
                (Some("g"), Some(album_id), Some(album_token)) => (album_id, album_token),
                _ => {
                    return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://e-hentai.org/g/2127986/da1deffea5)"));
                }
            };

            let url = format!("https://e-hentai.org/g/{album_id}/{album_token}");
            tracing::info!("[e-hentai] process {url}");

            // clone client to force changing ip
            let client = self.client.clone();
            // Walk every page of the gallery listing, collecting raw HTML.
            let mut paged = Paged::new(0, EHPageIndicator { base: url.clone() });
            let gallery_pages = paged.pages(&client).await?;

            // Since paged returns at least one page, we can safely get it.
            let title = match_first_group(&TITLE_RE, &gallery_pages[0])
                .unwrap_or("No Title")
                .to_string();
            // Gather per-image page links across all listing pages, in order.
            let mut image_page_links = Vec::new();
            for gallery_page in gallery_pages.iter() {
                PAGE_RE.captures_iter(gallery_page).for_each(|c| {
                    let matching = c.get(1).expect("regexp is matched but no group 1 found");
                    image_page_links.push(matching.as_str().to_string());
                });
            }
            // No image links on an otherwise-fetched page: treat as deleted.
            if image_page_links.is_empty() {
                return Err(anyhow::anyhow!(
                    "invalid url, maybe resource has been deleted."
                ));
            }

            Ok((
                AlbumMeta {
                    link: url,
                    name: title,
                    class: None,
                    description: None,
                    authors: None,
                    tags: None,
                },
                EHImageStream {
                    client,
                    raw_client: self.raw_client.clone(),
                    image_page_links: image_page_links.into_iter(),
                },
            ))
        }
    }
}