ihciah 2022-04-10 01:51:04 +08:00
commit 07277145d4
No known key found for this signature in database
GPG Key ID: 97CE6E121061F3BA
47 changed files with 6364 additions and 0 deletions

2
.cargo/config Normal file

@ -0,0 +1,2 @@
[build]
rustflags = ["--cfg", "unsound_local_offset"]

2
.dockerignore Normal file

@ -0,0 +1,2 @@
/target/
/.git/

73
.github/workflows/ci.yaml vendored Normal file

@ -0,0 +1,73 @@
name: CI
on:
push:
paths-ignore:
- '**.md'
- '**.png'
pull_request:
paths-ignore:
- '**.md'
- '**.png'
env:
RUST_TOOLCHAIN: nightly
TOOLCHAIN_PROFILE: minimal
jobs:
lints:
name: Run cargo fmt and cargo clippy
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
components: rustfmt, clippy
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo fmt
uses: actions-rs/cargo@v1
with:
command: fmt
args: --all -- --check
- name: Run cargo check with no default features
uses: actions-rs/cargo@v1
with:
command: check
args: --no-default-features
- name: Run cargo check with all features
uses: actions-rs/cargo@v1
with:
command: check
args: --all-features
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings
test:
name: Run cargo test
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo test --no-run
uses: actions-rs/cargo@v1
with:
command: test
args: --all-features --no-run
- name: Run cargo test
run: sudo bash -c "ulimit -Sl 512 && ulimit -Hl 512 && sudo -u runner RUSTUP_TOOLCHAIN=nightly /home/runner/.cargo/bin/cargo test --all-features"

43
.github/workflows/docker-build.yml vendored Normal file

@ -0,0 +1,43 @@
name: docker build and push
on:
push:
tags:
- 'v*'
jobs:
build:
name: 'Build'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Extract tag
id: prep
if: "startsWith(github.ref, 'refs/tags/v')"
run: |
echo ::set-output name=tags::ghcr.io/qini7-sese/ehbot:${GITHUB_REF#refs/tags/v}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Login to GHCR
uses: docker/login-action@v1
with:
registry: ghcr.io
username: qini7-sese
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build docker image
uses: docker/build-push-action@v2
with:
push: true
tags: |
ghcr.io/qini7-sese/ehbot:amd64
${{ steps.prep.outputs.tags }}
- name: Docker manifest push
run: |
docker manifest create ghcr.io/qini7-sese/ehbot:latest ghcr.io/qini7-sese/ehbot:amd64
docker manifest push ghcr.io/qini7-sese/ehbot:latest

2
.gitignore vendored Normal file

@ -0,0 +1,2 @@
/target
config.yaml

2291
Cargo.lock generated Normal file

File diff suppressed because it is too large

9
Cargo.toml Normal file

@ -0,0 +1,9 @@
[workspace]
members = [
"bot",
"eh2telegraph",
]
[profile.release]
lto = true
opt-level = 3

9
Dockerfile Normal file

@ -0,0 +1,9 @@
FROM rust:1-bullseye as builder
WORKDIR /usr/src/eh2telegraph
COPY . .
RUN cargo build --release
FROM debian:bullseye-slim
RUN apt-get update && apt-get -y install ca-certificates && rm -rf /var/lib/apt/lists/*
COPY --from=builder /usr/src/eh2telegraph/target/release/bot /usr/local/bin/bot
CMD ["/usr/local/bin/bot"]

92
README-zh.md Normal file

@ -0,0 +1,92 @@
# eh2telegraph
Chinese | [English](README.md)
A bot that automatically downloads image sets from EH/EX/NH and uploads them to Telegraph.
This code is only guaranteed to work correctly on macOS (partial functionality) and Linux.
## Deployment Guide
1. Install Docker and docker-compose.
2. Create a new folder `ehbot`.
3. Copy `config_example.yaml` from the project into `ehbot`, rename it to `config.yaml`, then adjust the configuration details (see the next section).
4. Copy `docker-compose.yml` into `ehbot`.
5. Start and shutdown:
    1. Start: run `docker-compose up -d` in this folder.
    2. Shutdown: run `docker-compose down` in this folder.
    3. View logs: run `docker-compose logs` in this folder.
    4. Update the image: run `docker-compose pull` in this folder.
## Configuration Guide
1. Basic configuration:
    1. Bot Token: ask @BotFather on Telegram for one.
    2. Admin (can be empty): your Telegram ID; any relevant bot can tell you (this bot's `/id` command works too).
    3. Telegraph: use your browser to create a Telegraph token via [this link](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author) and fill it in. You can also change the author name and URL.
2. Proxy configuration:
    1. Deploy `worker/web_proxy.js` from this repository to Cloudflare Workers and set the `KEY` environment variable to a random string (the `KEY` prevents unauthorized requests to the proxy).
    2. Fill the URL and Key into the configuration.
    3. The proxy is used to request some rate-limited services, so do not abuse it.
3. IPv6 configuration:
    1. You can fill in an IPv6 segment; if you do not own a large (larger than `/64`) IPv6 segment, leave it blank.
    2. If you fill it in, you need to enable the `net.ipv6.ip_nonlocal_bind` kernel parameter (see the later section).
    3. Configuring IPv6 somewhat alleviates per-IP rate limiting.
4. Configure cookies for some collectors:
    1. Currently only exhentai needs them.
5. KV configuration:
    1. This project has a built-in caching service to avoid repeatedly synchronizing the same image set.
    2. Please refer to [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) for deployment and fill it into the configuration file.
    3. If you don't want a remote cache, a pure in-memory cache (invalidated on restart) also works; that requires modifying the code and recompiling.
## Development Guide
### Environment
Requires the latest nightly version of Rust. VSCode or CLion is recommended for development.
[RsProxy](https://rsproxy.cn/) is recommended as the crates.io mirror and toolchain installation source in mainland China.
### Version Release
A tag starting with `v` triggers the Docker build. You can create the tag directly in git and push it; it is easier, however, to publish a release on GitHub and name it with the `v` prefix.
## Technical Details
Although this project is just a simple crawler, a few points deserve explanation.
### GitHub Actions Builds
GitHub Actions is used to automatically build Docker images; this project builds `x86_64` images automatically.
It could also build `arm64` images, but that path uses qemu to emulate an arm environment on x86_64 and is therefore extremely slow (a single build takes more than an hour), so it is not enabled.
### IPv6 Ghost Client (a made-up name)
Some sites enforce per-IP access frequency limits, which can be mitigated by using multiple IPs. The most common approach in practice is a proxy pool, but proxy pools are often extremely unstable, require maintenance, and may cost money.
Looking at this project's target sites, many are behind Cloudflare, which supports IPv6 and rate-limits at `/64` granularity. If we bind a larger IPv6 segment to the local machine and randomly pick IPs from it as client source addresses, we can sustain a higher request rate reliably.
Since the NIC binds only a single IPv6 address, we need to enable `net.ipv6.ip_nonlocal_bind`.
After configuring IPv6, for target sites reachable over IPv6, this project sends each request from a random IP within the segment.
Configuration (the NIC configuration can be written into `if-up` for persistence):
1. `sudo ip add add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. Set `net.ipv6.ip_nonlocal_bind=1` in sysctl. This step varies by distribution (for example, the common `/etc/sysctl.conf` does not exist on Arch Linux).
Where to get IPv6? he.net offers a free tunnel service, and buying an IPv6 segment yourself is not expensive either.
You can test whether the configuration is correct with `curl --interface 2001:***** ifconfig.co`.
### Forcing IPv6
The sites mentioned in the previous subsection use Cloudflare but do not actually enable IPv6: query them directly over IPv6 with curl and you will find they have no AAAA records at all. But because Cloudflare's infrastructure is anycast, if the target site does not explicitly reject IPv6 visitors in its code, it can still be reached over IPv6.
1. telegra.ph: no AAAA records, but forcing resolution to Telegram's entry IP works; the certificate served is for `*.telegram.org`, though.
~~This project includes a TLS verifier that checks certificate validity for a designated domain, tolerating the certificate misconfiguration while maintaining security.~~
However, Telegraph fixed the problem very quickly, so the TLS verifier is currently disabled.
2. EH/NH: forcing IPv6 works.
3. EX: does not use Cloudflare and has no IPv6 service.
### Proxy
This project uses Cloudflare Workers as a proxy for some APIs, alleviating rate limits when IPv6 is not available. See `src/http_proxy.rs` and `worker/web_proxy.js`.
### Caching
To minimize duplicate pulls, this project uses an in-memory cache plus a remote persistent cache. The remote persistent cache is built on a Cloudflare Worker with Cloudflare KV; the main code is at [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy).
Since synchronizing an image set takes some time, this project uses [singleflight-async](https://github.com/ihciah/singleflight-async) to avoid duplicate concurrent synchronizations.

94
README.md Normal file

@ -0,0 +1,94 @@
# eh2telegraph
[Chinese](README-zh.md) | English
A bot that automatically downloads image sets from EH/EX/NH and uploads them to Telegraph.
This code is only guaranteed to work correctly on macOS (partial functionality) and Linux.
## Deployment Guidelines
1. Install Docker and docker-compose.
2. Create a new folder `ehbot`.
3. Copy `config_example.yaml` from the project to `ehbot`, rename it to `config.yaml`, then change the configuration details (see the next section).
4. Copy `docker-compose.yml` to `ehbot`.
5. Start and shutdown:
    1. Start: run `docker-compose up -d` in this folder.
    2. Shutdown: run `docker-compose down` in this folder.
    3. View logs: run `docker-compose logs` in this folder.
    4. Update the image: run `docker-compose pull` in this folder.
## Configuration Guidelines
1. Basic configuration:
    1. Bot Token: ask @BotFather on Telegram for one.
    2. Admin (can be empty): your Telegram ID; you can get it from any relevant bot (this bot's `/id` command works too).
    3. Telegraph: use your browser to create a Telegraph token via [this link](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author) and fill it in. You can also change the author name and URL.
2. Proxy configuration:
    1. Deploy `worker/web_proxy.js` from this repository to Cloudflare Workers and set the `KEY` environment variable to a random string (the `KEY` prevents unauthorized requests to the proxy).
    2. Fill the URL and Key into the yaml.
    3. The proxy is used to request some rate-limited services, so do not abuse it.
3. IPv6 configuration:
    1. You can specify an IPv6 segment; if you do not have a large (larger than `/64`) IPv6 segment, leave it blank.
    2. If you fill it in, enable the `net.ipv6.ip_nonlocal_bind` kernel parameter (see the section below).
    3. Configuring IPv6 somewhat alleviates per-IP rate limiting.
4. Configure cookies for some collectors:
    1. Currently only exhentai requires them.
5. KV configuration:
    1. This project has a built-in caching service to avoid repeatedly synchronizing the same image set.
    2. Please refer to [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) for deployment and fill it into the yaml file.
    3. If you don't want to use remote caching, a pure in-memory cache (invalidated on restart) also works; that requires modifying the code and recompiling.
## Development Guidelines
### Environment
Requires the latest nightly version of Rust. VSCode or CLion is recommended for development.
[RsProxy](https://rsproxy.cn/) is recommended as the crates.io mirror and toolchain installation source for users in mainland China.
### Version Release
A Docker build is triggered by pushing a tag starting with `v`. You can create the tag directly in git and push it; it is easier, however, to publish a release on GitHub and name it with the `v` prefix.
## Technical Details
Although this project is just a simple crawler, a few points deserve explanation.
### GitHub Actions Builds
GitHub Actions is used to automatically build Docker images; this project builds `x86_64` images automatically.
It could also build `arm64` images, but that path uses qemu to emulate an arm environment on x86_64 and is therefore extremely slow (a single build takes more than an hour), so it is not enabled.
### IPv6 Ghost Client (a made-up name, not a well-known term)
Some sites enforce per-IP access frequency limits, which can be mitigated by using multiple IPs. The most common approach in practice is a proxy pool, but proxy pools are often extremely unstable, require maintenance, and may cost money.
Looking at this project's target sites, many are behind Cloudflare, which supports IPv6 and rate-limits at `/64` granularity. If we bind a larger IPv6 segment to the local machine and randomly pick IPs from it as client source addresses, we can sustain a higher request rate reliably.
Since the NIC binds only a single IPv6 address, we need to enable `net.ipv6.ip_nonlocal_bind`.
After configuring IPv6, for target sites reachable over IPv6, this project sends each request from a random IP within the segment.
Configuration (the NIC configuration can be written into `if-up` for persistence):
1. `sudo ip add add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. Configure `net.ipv6.ip_nonlocal_bind=1` in Sysctl. This step varies by distribution (for example, the common `/etc/sysctl.conf` does not exist in Arch Linux).
Where to get IPv6? he.net offers a free tunnel service, and buying an IPv6 segment yourself is not expensive either.
You can test whether the configuration is correct with `curl --interface 2001:***** ifconfig.co`.
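To make this concrete, here is a minimal sketch of the idea, assuming the `reqwest`, `ipnet` and `rand` crates used elsewhere in this repo (the helper names are made up; the real implementation lives in `src/http_client.rs`):

```rust
use std::net::{IpAddr, Ipv6Addr};

use ipnet::Ipv6Net;
use rand::Rng;

/// Pick a random host address inside `prefix` (assumes a non-zero prefix length).
fn random_v6(prefix: Ipv6Net) -> Ipv6Addr {
    let host_bits = 128 - u32::from(prefix.prefix_len());
    let host = rand::thread_rng().gen::<u128>() & ((1u128 << host_bits) - 1);
    Ipv6Addr::from(u128::from(prefix.network()) | host)
}

/// With `net.ipv6.ip_nonlocal_bind=1`, binding an unassigned address from the
/// routed segment succeeds, so every client gets a fresh source IP.
fn ghost_client(prefix: Ipv6Net) -> reqwest::Result<reqwest::Client> {
    reqwest::Client::builder()
        .local_address(IpAddr::V6(random_v6(prefix)))
        .build()
}
```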
### Forcing IPv6
The sites mentioned in the previous subsection use Cloudflare but do not actually enable IPv6: query them directly over IPv6 with curl and you will find they have no AAAA records at all. But because Cloudflare's infrastructure is anycast, if the target site does not explicitly reject IPv6 visitors in its code, it can still be reached over IPv6.
1. telegra.ph: no AAAA records, but forcing resolution to Telegram's entry IP works; the certificate served is for `*.telegram.org`, though.
~~This project includes a TLS verifier that checks certificate validity for a designated domain, tolerating the certificate misconfiguration while maintaining security.~~
However, Telegraph fixed the problem very quickly, so the TLS verifier is currently disabled.
2. EH/NH: forcing IPv6 works.
3. EX: does not use Cloudflare and has no IPv6 service.
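For reference, the resolver override used for this can be sketched with reqwest (this mirrors the `with_cf_resolve` helper and the hardcoded Cloudflare address in `src/http_client.rs`):

```rust
use std::net::{IpAddr, Ipv6Addr, SocketAddr};

/// Pin a Cloudflare-fronted domain without AAAA records to a Cloudflare IPv6
/// anycast address, so the connection is forced over IPv6 anyway.
fn forced_v6_client() -> reqwest::Result<reqwest::Client> {
    let cf = SocketAddr::new(
        IpAddr::V6(Ipv6Addr::new(0x2606, 0x4700, 0x4700, 0, 0, 0, 0, 0x1111)),
        443,
    );
    reqwest::Client::builder()
        .resolve("e-hentai.org", cf)
        .build()
}
```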
### Proxy
This project uses Cloudflare Workers as a proxy for some APIs, alleviating rate limits when IPv6 is not available. See `src/http_proxy.rs` and `worker/web_proxy.js`.
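As a rough illustration of the client side (the query parameter and header names below are assumptions for this sketch, not the actual protocol; see the two files above for the real one):

```rust
/// Hypothetical sketch: ask the Worker to fetch `target` on our behalf,
/// authenticating with the shared KEY configured on the Worker.
async fn get_via_worker(endpoint: &str, key: &str, target: &str) -> reqwest::Result<String> {
    reqwest::Client::new()
        .get(endpoint)
        .query(&[("url", target)])
        .header("Authorization", key)
        .send()
        .await?
        .text()
        .await
}
```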
### Caching
To minimize duplicate pulls, this project uses an in-memory cache plus a remote persistent cache. The remote persistent cache is built on a Cloudflare Worker with Cloudflare KV; the main code is at [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy).
Since synchronizing an image set takes some time, this project uses [singleflight-async](https://github.com/ihciah/singleflight-async) to avoid duplicate concurrent synchronizations.
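The single-flight usage looks roughly like this (a trimmed sketch of the pattern in `bot/src/handler.rs`):

```rust
use singleflight_async::SingleFlight;

/// Concurrent callers passing the same key await one shared in-flight future
/// instead of each starting its own expensive synchronization.
async fn sync_once(group: &SingleFlight<String>, url: &str) -> String {
    group
        .work(url, || async {
            // The actual gallery sync would run here.
            format!("synced {url}")
        })
        .await
}
```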

24
bot/Cargo.toml Normal file

@ -0,0 +1,24 @@
[package]
edition = "2021"
name = "bot"
version = "0.1.3"
[dependencies]
eh2telegraph = {path = "../eh2telegraph"}
anyhow = "1"
clap = {version = "3", features = ["derive"]}
dptree = "0.1"
once_cell = "1"
regex = "1"
reqwest = {version = "0.11", default-features = false, features = ["json", "multipart", "rustls-tls"]}
serde = {version = "1", features = ["derive"]}
singleflight-async = {version = "0.1", features = ["hardware-lock-elision"]}
teloxide = {version = "0.7", features = ["macros", "ctrlc_handler", "dispatching2", "auto-send"]}
time = {version = "0.3", features = ["local-offset", "std", "macros"]}
tokio = {version = "1", default-features = false, features = ["rt-multi-thread", "macros", "net", "sync", "time", "parking_lot"]}
tracing = "0.1"
tracing-subscriber = {version = "0.3", features = ["local-time", "parking_lot", "time"]}
[build-dependencies]
vergen = {version = "7", default_features = false, features = ["build", "cargo", "rustc"]}

6
bot/build.rs Normal file

@ -0,0 +1,6 @@
use vergen::{vergen, Config};
fn main() {
// Generate the default 'cargo:' instruction output
vergen(Config::default()).unwrap()
}

413
bot/src/handler.rs Normal file

@ -0,0 +1,413 @@
use std::{borrow::Cow, collections::HashSet};
use eh2telegraph::{
collector::{e_hentai::EHCollector, exhentai::EXCollector, nhentai::NHCollector},
searcher::{
f_hash::FHashConvertor,
saucenao::{SaucenaoOutput, SaucenaoParsed, SaucenaoSearcher},
ImageSearcher,
},
storage::KVStorage,
sync::Synchronizer,
};
use reqwest::Url;
use teloxide::{
adaptors::DefaultParseMode,
prelude2::*,
utils::{
command::BotCommand,
markdown::{code_inline, escape, link},
},
};
use tracing::{info, trace};
use crate::{ok_or_break, util::PrettyChat};
const MIN_SIMILARITY: u8 = 70;
const MIN_SIMILARITY_PRIVATE: u8 = 50;
#[derive(BotCommand, Clone)]
#[command(
rename = "lowercase",
description = "\
This is a gallery synchronization robot that is convenient for users to view pictures directly in Telegram.\n\
便 Telegram \n\
Join develop group or contact @ByteRabbit if you need.\n\
@ByteRabbit\n\n\
Bot supports sync with command, text url, or image(private chat search thrashold is lower).\n\
() \n\n\
Bot develop group / Bot https://t.me/TGSyncBotWorkGroup\n\
And welcome to join our channel / https://t.me/sesecollection\n\n\
These commands are supported:\n\
:"
)]
pub enum Command {
#[command(description = "Display this help. 显示这条帮助信息。")]
Help,
#[command(description = "Show bot verison. 显示机器人版本。")]
Version,
#[command(description = "Show your account id. 显示你的账号 ID。")]
Id,
#[command(
description = "Sync a gallery(e-hentai/exhentai/nhentai are supported now). 同步一个画廊(目前支持 EH/EX/NH)"
)]
Sync(String),
}
#[derive(BotCommand, Clone)]
#[command(rename = "lowercase", description = "Command for admins")]
pub enum AdminCommand {
#[command(description = "Delete cache with given key.")]
Delete(String),
}
pub struct Handler<C> {
pub synchronizer: Synchronizer<C>,
pub searcher: SaucenaoSearcher,
pub convertor: FHashConvertor,
pub admins: HashSet<i64>,
single_flight: singleflight_async::SingleFlight<String>,
}
impl<C> Handler<C>
where
C: KVStorage<String> + Send + Sync + 'static,
{
pub fn new(synchronizer: Synchronizer<C>, admins: HashSet<i64>) -> Self {
Self {
synchronizer,
searcher: SaucenaoSearcher::new_from_config(),
convertor: FHashConvertor::new_from_config(),
admins,
single_flight: Default::default(),
}
}
/// Executed when a command comes in and parsed successfully.
pub async fn respond_cmd(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
command: Command,
) -> ControlFlow<()> {
match command {
Command::Help => {
let _ = bot
.send_message(msg.chat.id, escape(&Command::descriptions()))
.reply_to_message_id(msg.id)
.await;
}
Command::Version => {
let _ = bot
.send_message(msg.chat.id, escape(crate::version::VERSION))
.reply_to_message_id(msg.id)
.await;
}
Command::Id => {
let _ = bot
.send_message(
msg.chat.id,
format!(
"Current chat id is {} \\(in private chat this is your account id\\)",
code_inline(&msg.chat.id.to_string())
),
)
.reply_to_message_id(msg.id)
.await;
}
Command::Sync(url) => {
if url.is_empty() {
let _ = bot
.send_message(msg.chat.id, escape("Usage: /sync url"))
.reply_to_message_id(msg.id)
.await;
return ControlFlow::BREAK;
}
info!(
"[cmd handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
};
ControlFlow::BREAK
}
pub async fn respond_admin_cmd(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
command: AdminCommand,
) -> ControlFlow<()> {
match command {
AdminCommand::Delete(key) => {
let _ = self.synchronizer.delete_cache(&key).await;
let _ = bot
.send_message(msg.chat.id, escape(&format!("Key {key} deleted.")))
.reply_to_message_id(msg.id)
.await;
ControlFlow::BREAK
}
}
}
pub async fn respond_text(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let maybe_link = {
let entries = msg
.entities()
.map(|es| {
es.iter().filter_map(|e| {
if let teloxide::types::MessageEntityKind::TextLink { url } = &e.kind {
Synchronizer::match_url_from_text(url.as_ref()).map(ToOwned::to_owned)
} else {
None
}
})
})
.into_iter()
.flatten();
msg.text()
.and_then(|content| {
Synchronizer::match_url_from_text(content).map(ToOwned::to_owned)
})
.into_iter()
.chain(entries)
.next()
};
if let Some(url) = maybe_link {
info!(
"[text handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
return ControlFlow::BREAK;
}
// fallback to the next branch
ControlFlow::CONTINUE
}
pub async fn respond_caption(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let caption_entities = msg.caption_entities();
let mut final_url = None;
for entry in caption_entities.map(|x| x.iter()).into_iter().flatten() {
let url = match &entry.kind {
teloxide::types::MessageEntityKind::Url => {
let raw = msg
.caption()
.expect("Url MessageEntry found but caption is None");
let encoded: Vec<_> = raw
.encode_utf16()
.into_iter()
.skip(entry.offset)
.take(entry.length)
.collect();
let content = ok_or_break!(String::from_utf16(&encoded));
Cow::from(content)
}
teloxide::types::MessageEntityKind::TextLink { url } => Cow::from(url.as_ref()),
_ => {
continue;
}
};
let url = if let Some(c) = Synchronizer::match_url_from_url(&url) {
c
} else {
continue;
};
final_url = Some(url.to_string());
break;
}
match final_url {
Some(url) => {
info!(
"[caption handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
let url = url.to_string();
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
ControlFlow::BREAK
}
None => ControlFlow::CONTINUE,
}
}
pub async fn respond_photo(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
let first_photo = match msg.photo().and_then(|x| x.first()) {
Some(p) => p,
None => {
return ControlFlow::CONTINUE;
}
};
let f = ok_or_break!(bot.get_file(&first_photo.file_id).await);
let mut buf: Vec<u8> = Vec::with_capacity(f.file_size as usize);
ok_or_break!(teloxide::net::Download::download_file(&bot, &f.file_path, &mut buf).await);
let search_result: SaucenaoOutput = ok_or_break!(self.searcher.search(buf).await);
let mut url_sim = None;
let threshold = if msg.chat.is_private() {
MIN_SIMILARITY_PRIVATE
} else {
MIN_SIMILARITY
};
for element in search_result
.data
.into_iter()
.filter(|x| x.similarity >= threshold)
{
match element.parsed {
SaucenaoParsed::EHentai(f_hash) => {
url_sim = Some((
ok_or_break!(self.convertor.convert_to_gallery(&f_hash).await),
element.similarity,
));
break;
}
SaucenaoParsed::NHentai(nid) => {
url_sim = Some((format!("https://nhentai.net/g/{nid}/"), element.similarity));
break;
}
_ => continue,
}
}
let (url, sim) = match url_sim {
Some(u) => u,
None => {
trace!("[photo handler] image not found");
return ControlFlow::CONTINUE;
}
};
info!(
"[photo handler] receive sync request from {:?} for {url} with similarity {sim}",
PrettyChat(&msg.chat)
);
if let Ok(msg) = bot
.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
{
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
ControlFlow::BREAK
}
pub async fn respond_default(
&'static self,
bot: AutoSend<DefaultParseMode<Bot>>,
msg: Message,
) -> ControlFlow<()> {
if msg.chat.is_private() {
ok_or_break!(
bot.send_message(msg.chat.id, escape("Unrecognized message."))
.reply_to_message_id(msg.id)
.await
);
}
#[cfg(debug_assertions)]
tracing::warn!("{:?}", msg);
ControlFlow::BREAK
}
async fn sync_response(&self, url: &str) -> String {
self.single_flight
.work(url, || async {
match self.route_sync(url).await {
Ok(url) => {
format!("Sync to telegraph finished: {}", link(&url, &escape(&url)))
}
Err(e) => {
format!("Sync to telegraph failed: {}", escape(&e.to_string()))
}
}
})
.await
}
async fn route_sync(&self, url: &str) -> anyhow::Result<String> {
let u = Url::parse(url).map_err(|_| anyhow::anyhow!("Invalid url"))?;
let host = u.host_str().unwrap_or_default();
let path = u.path().to_string();
// TODO: use macro to generate them
#[allow(clippy::single_match)]
match host {
"e-hentai.org" => {
info!("[registry] sync e-hentai for path {}", path);
self.synchronizer
.sync::<EHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"nhentai.to" | "nhentai.net" => {
info!("[registry] sync nhentai for path {}", path);
self.synchronizer
.sync::<NHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"exhentai.org" => {
info!("[registry] sync exhentai for path {}", path);
self.synchronizer
.sync::<EXCollector>(path)
.await
.map_err(anyhow::Error::from)
}
_ => Err(anyhow::anyhow!("no matching collector")),
}
}
}

216
bot/src/main.rs Normal file

@ -0,0 +1,216 @@
#![feature(control_flow_enum)]
use eh2telegraph::{
collector::Registry,
config::{self},
http_proxy::ProxiedClient,
storage,
sync::Synchronizer,
telegraph::Telegraph,
};
use clap::Parser;
use teloxide::{
adaptors::DefaultParseMode,
dispatching::update_listeners,
error_handlers::IgnoringErrorHandler,
prelude2::*,
types::{AllowedUpdate, ChatPermissions, ParseMode, UpdateKind},
};
use handler::{Command, Handler};
use crate::{
handler::AdminCommand,
util::{wrap_endpoint, PrettyChat},
};
mod handler;
mod util;
mod version;
#[derive(Debug, serde::Deserialize)]
pub struct BaseConfig {
pub bot_token: String,
pub telegraph: TelegraphConfig,
#[serde(default)]
pub admins: Vec<i64>,
}
#[derive(Debug, serde::Deserialize)]
pub struct TelegraphConfig {
pub tokens: Vec<String>,
pub author_name: Option<String>,
pub author_url: Option<String>,
}
#[derive(Parser, Debug)]
#[clap(author, version=version::VERSION, about, long_about = "eh2telegraph sync bot")]
struct Args {
#[clap(short, long, help = "Config file path")]
config: Option<String>,
}
#[tokio::main]
async fn main() {
let args = Args::parse();
let timer = tracing_subscriber::fmt::time::LocalTime::new(time::macros::format_description!(
"[month]-[day] [hour]:[minute]:[second]"
));
tracing_subscriber::fmt().with_timer(timer).init();
tracing::info!("initializing...");
config::init(args.config);
let base_config: BaseConfig = config::parse("base")
.expect("unable to parse base config")
.expect("base config can not be empty");
let telegraph_config = base_config.telegraph;
let telegraph =
Telegraph::new(telegraph_config.tokens).with_proxy(ProxiedClient::new_from_config());
let registry = Registry::new_from_config();
#[cfg(debug_assertions)]
let cache = storage::SimpleMemStorage::default();
#[cfg(not(debug_assertions))]
let cache =
storage::cloudflare_kv::CFStorage::new_from_config().expect("unable to build storage");
let mut synchronizer = Synchronizer::new(telegraph, registry, cache);
if telegraph_config.author_name.is_some() {
synchronizer =
synchronizer.with_author(telegraph_config.author_name, telegraph_config.author_url);
}
let admins = base_config.admins.into_iter().collect();
let handler = Box::leak(Box::new(Handler::new(synchronizer, admins))) as &Handler<_>;
// === Bot related ===
let command_handler = move |bot: AutoSend<DefaultParseMode<Bot>>,
message: Message,
command: Command| async move {
handler.respond_cmd(bot, message, command).await
};
let admin_command_handler = move |bot: AutoSend<DefaultParseMode<Bot>>,
message: Message,
command: AdminCommand| async move {
handler.respond_admin_cmd(bot, message, command).await
};
let text_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_text(bot, message).await
};
let caption_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_caption(bot, message).await
};
let photo_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_photo(bot, message).await
};
let default_handler = move |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
handler.respond_default(bot, message).await
};
let permission_filter = |bot: AutoSend<DefaultParseMode<Bot>>, message: Message| async move {
// If the bot is blocked, we will leave chat and not respond.
let blocked = message
.chat
.permissions()
.map(|p| !p.contains(ChatPermissions::SEND_MESSAGES))
.unwrap_or_default();
if blocked {
tracing::info!(
"[permission filter] leave chat {:?}",
PrettyChat(&message.chat)
);
let _ = bot.leave_chat(message.chat.id).await;
None
} else {
Some(message)
}
};
let bot = Bot::new(base_config.bot_token)
.parse_mode(ParseMode::MarkdownV2)
.auto_send();
let mut bot_dispatcher = Dispatcher::builder(
bot.clone(),
dptree::entry()
.chain(dptree::filter_map(move |update: Update| {
match update.kind {
UpdateKind::Message(x) | UpdateKind::EditedMessage(x) => Some(x),
_ => None,
}
}))
.chain(dptree::filter_map_async(permission_filter))
.branch(
dptree::entry()
.chain(dptree::filter(move |message: Message| {
handler.admins.contains(&message.chat.id)
}))
.filter_command::<AdminCommand>()
.branch(wrap_endpoint(admin_command_handler)),
)
.branch(
dptree::entry()
.filter_command::<Command>()
.branch(wrap_endpoint(command_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.text() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(text_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.caption_entities() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(caption_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.photo() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(photo_handler)),
)
.branch(wrap_endpoint(default_handler)),
)
.default_handler(Box::new(|_upd| {
#[cfg(debug_assertions)]
tracing::warn!("Unhandled update: {:?}", _upd);
Box::pin(async {})
}))
.error_handler(std::sync::Arc::new(IgnoringErrorHandler))
.build();
bot_dispatcher.setup_ctrlc_handler();
let bot_listener = update_listeners::polling(
bot,
Some(std::time::Duration::from_secs(10)),
None,
Some(vec![AllowedUpdate::Message]),
);
tracing::info!("initializing finished, bot is running");
bot_dispatcher
.dispatch_with_listener(
bot_listener,
LoggingErrorHandler::with_custom_text("An error from the update listener"),
)
.await;
}

69
bot/src/util.rs Normal file

@ -0,0 +1,69 @@
use std::{convert::Infallible, ops::ControlFlow, sync::Arc};
use dptree::{di::Injectable, from_fn, Handler};
pub struct PrettyChat<'a>(pub &'a teloxide::types::Chat);
impl<'a> std::fmt::Debug for PrettyChat<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0.is_group() || self.0.is_supergroup() {
write!(f, "GroupChat")?;
self.0.title().map(|x| write!(f, " title: {}", x));
self.0
.description()
.map(|x| write!(f, " description: {}", x));
} else if self.0.is_private() {
write!(f, "PrivateChat")?;
self.0.username().map(|x| write!(f, " username: @{}", x));
self.0.first_name().map(|x| write!(f, " first_name: {}", x));
self.0.last_name().map(|x| write!(f, " last_name: {}", x));
self.0.bio().map(|x| write!(f, " bio: {}", x));
} else if self.0.is_channel() {
write!(f, "Channel")?;
self.0.username().map(|x| write!(f, " username: @{}", x));
self.0.title().map(|x| write!(f, " title: {}", x));
self.0
.description()
.map(|x| write!(f, ", description: {}", x));
}
Ok(())
}
}
pub fn wrap_endpoint<'a, F, Input, Output, FnArgs>(
f: F,
) -> Handler<'a, Input, Result<Output, Infallible>, Infallible>
where
F: Injectable<Input, ControlFlow<Output>, FnArgs> + Send + Sync + 'a,
Input: Send + Sync + 'a,
Output: Send + Sync + 'a,
{
let f = Arc::new(f);
from_fn(move |event, _cont| {
let f = Arc::clone(&f);
async move {
let f = f.inject(&event);
let cf = f().await;
drop(f);
match cf {
ControlFlow::Continue(_) => ControlFlow::Continue(event),
ControlFlow::Break(out) => ControlFlow::Break(Ok(out)),
}
}
})
}
#[macro_export]
macro_rules! ok_or_break {
($e: expr) => {
match $e {
Ok(r) => r,
Err(_) => {
return ControlFlow::BREAK;
}
}
};
}

18
bot/src/version.rs Normal file

@ -0,0 +1,18 @@
pub(crate) static VERSION: &str = concat!(
"\n",
"Build Timestamp: \t",
env!("VERGEN_BUILD_TIMESTAMP"),
"\n",
"Package Version: \t",
env!("VERGEN_BUILD_SEMVER"),
"\n",
"rustc Version: \t\t",
env!("VERGEN_RUSTC_SEMVER"),
"\n",
"cargo Profile: \t\t",
env!("VERGEN_CARGO_PROFILE"),
"\n",
"cargo Target: \t\t",
env!("VERGEN_CARGO_TARGET_TRIPLE"),
"\n",
);

27
config_example.yaml Normal file

@ -0,0 +1,27 @@
base:
bot_token: xxx:xxxx
admins:
- 0
telegraph:
tokens:
- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
author_name: Test Name
author_url: https://github.com/qini7-sese/eh2telegraph
proxy:
endpoint: https://proxy.xxx.workers.dev/
authorization: xxx
http:
ipv6_prefix:
exhentai:
ipb_pass_hash: xxx
ipb_member_id: xxx
igneous: xxx
worker_kv:
endpoint: https://kv.xxx.workers.dev
token: xxx
cache_size: 10240
expire_sec: 5184000 # 60 days

14
docker-compose.yml Normal file

@ -0,0 +1,14 @@
version: "3"
services:
ehbot:
image: ghcr.io/qini7-sese/ehbot:latest
container_name: ehbot
restart: always
network_mode: "host"
environment:
CONFIG_FILE: "/config.yaml"
TZ: Asia/Shanghai
volumes:
- "./config.yaml:/config.yaml:ro"
logging:
driver: journald

30
eh2telegraph/Cargo.toml Normal file

@ -0,0 +1,30 @@
[package]
edition = "2021"
name = "eh2telegraph"
version = "0.1.0"
[dependencies]
again = {version = "0.1", default_features = false, features = ["rand"]}
anyhow = "1"
bytes = "1"
clap = "3"
cloudflare-kv-proxy = "0.1"
derive_more = {version = "0.99", features = ["from_str"]}
futures = "0.3"
hashlink = "0.8"
ipnet = "2"
lazy_static = "1"
once_cell = "1"
parking_lot = {version = "0.12", features = ["hardware-lock-elision"]}
rand = "0.8"
regex = "1"
reqwest = {version = "0.11", default-features = false, features = ["json", "multipart", "rustls-tls"]}
rustls = {version = "0.20", features = ["dangerous_configuration"]}
serde = {version = "1", features = ["derive"]}
serde_with = {version = "1", features = ["macros", "json"]}
serde_yaml = "0.8"
thiserror = "1"
tokio = {version = "1", default-features = false, features = ["rt-multi-thread", "macros", "net", "sync", "time", "parking_lot"]}
tracing = "0.1"
webpki = "0.22"
webpki-roots = "0.22"

95
eh2telegraph/src/buffer.rs Normal file

@ -0,0 +1,95 @@
/// ImageBuffer for uploading images in batches.
pub struct ImageBuffer<T> {
buf: Vec<T>,
size: usize,
}
impl<T> Default for ImageBuffer<T> {
#[inline]
fn default() -> Self {
Self {
buf: Vec::new(),
size: 0,
}
}
}
impl<T> ImageBuffer<T>
where
T: DataSized,
{
#[inline]
pub fn new() -> Self {
Self::default()
}
#[inline]
pub fn with_capacity(n: usize) -> Self {
Self {
buf: Vec::with_capacity(n),
size: 0,
}
}
#[inline]
pub fn push(&mut self, data: T) {
self.size += data.size();
self.buf.push(data);
}
#[inline]
pub fn swap(&mut self) -> (Vec<T>, usize) {
let mut out = Vec::with_capacity(self.buf.len() * 2);
std::mem::swap(&mut self.buf, &mut out);
let mut size = 0;
std::mem::swap(&mut self.size, &mut size);
(out, size)
}
#[inline]
pub fn len(&self) -> usize {
self.buf.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.buf.len() == 0
}
#[inline]
pub fn size(&self) -> usize {
self.size
}
#[inline]
pub fn clear(&mut self) {
self.size = 0;
self.buf.clear();
}
}
pub trait DataSized {
fn size(&self) -> usize;
}
impl DataSized for bytes::Bytes {
#[inline]
fn size(&self) -> usize {
self.len()
}
}
impl DataSized for Vec<u8> {
#[inline]
fn size(&self) -> usize {
self.len()
}
}
impl<const N: usize> DataSized for Box<[u8; N]> {
#[inline]
fn size(&self) -> usize {
N
}
}
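// Hypothetical usage sketch (illustrative, not in this commit): accumulate
// images until a size threshold, then take the whole batch out with `swap`,
// which leaves the buffer empty and ready for reuse. The 5 MiB threshold is
// made up.
fn flush_demo(incoming: Vec<bytes::Bytes>) {
    let mut buf: ImageBuffer<bytes::Bytes> = ImageBuffer::new();
    for img in incoming {
        buf.push(img);
        if buf.size() >= 5 * 1024 * 1024 {
            let (batch, total) = buf.swap();
            println!("uploading {} images ({total} bytes)", batch.len());
        }
    }
}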

254
eh2telegraph/src/collector/e_hentai.rs Normal file

@ -0,0 +1,254 @@
/// e-hentai collector.
/// Host matching: e-hentai.org
use crate::{
http_client::{GhostClient, GhostClientBuilder, UA},
stream::AsyncStream,
util::match_first_group,
util::{get_bytes, get_string},
};
use again::RetryPolicy;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::header;
use std::time::Duration;
use super::{
utils::paged::{PageFormatter, PageIndicator, Paged},
AlbumMeta, Collector, ImageData, ImageMeta,
};
lazy_static::lazy_static! {
static ref PAGE_RE: Regex = Regex::new(r#"<a href="(https://e-hentai\.org/s/\w+/[\w-]+)">"#).unwrap();
static ref IMG_RE: Regex = Regex::new(r#"<img id="img" src="(.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<h1 id="gn">(.*?)</h1>"#).unwrap();
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
#[derive(Debug, Clone, Default)]
pub struct EHCollector {
client: GhostClient,
raw_client: reqwest::Client,
}
impl EHCollector {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str("nw=1").unwrap(),
);
Self {
client: GhostClientBuilder::default()
.with_default_headers(request_headers)
.with_cf_resolve(&["e-hentai.org"])
.build(prefix),
raw_client: reqwest::Client::builder().user_agent(UA).build().unwrap(),
}
}
pub fn new_from_config() -> anyhow::Result<Self> {
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str("nw=1").unwrap(),
);
Ok(Self {
client: GhostClientBuilder::default()
.with_default_headers(request_headers)
.with_cf_resolve(&["e-hentai.org"])
.build_from_config()?,
raw_client: reqwest::Client::builder().user_agent(UA).build().unwrap(),
})
}
}
impl Collector for EHCollector {
type FetchError = anyhow::Error;
type FetchFuture<'a> =
impl std::future::Future<Output = anyhow::Result<(AlbumMeta, Self::ImageStream)>>;
type StreamError = anyhow::Error;
type ImageStream = EHImageStream;
#[inline]
fn name() -> &'static str {
"e-hentai"
}
fn fetch(&self, path: String) -> Self::FetchFuture<'_> {
async move {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_token = parts.next();
let (album_id, album_token) = match (g, album_id, album_token) {
(Some("g"), Some(album_id), Some(album_token)) => (album_id, album_token),
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://e-hentai.org/g/2127986/da1deffea5)"));
}
};
let url = format!("https://e-hentai.org/g/{album_id}/{album_token}");
tracing::info!("[e-hentai] process {url}");
// clone client to force changing ip
let client = self.client.clone();
let mut paged = Paged::new(0, EHPageIndicator { base: url.clone() });
let gallery_pages = paged.pages(&client).await?;
// Since paged returns at least one page, we can safely get it.
let title = match_first_group(&TITLE_RE, &gallery_pages[0])
.unwrap_or("No Title")
.to_string();
let mut image_page_links = Vec::new();
for gallery_page in gallery_pages.iter() {
PAGE_RE.captures_iter(gallery_page).for_each(|c| {
let matching = c.get(1).expect("regexp is matched but no group 1 found");
image_page_links.push(matching.as_str().to_string());
});
}
if image_page_links.is_empty() {
return Err(anyhow::anyhow!(
"invalid url, maybe resource has been deleted."
));
}
Ok((
AlbumMeta {
link: url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
EHImageStream {
client,
raw_client: self.raw_client.clone(),
image_page_links: image_page_links.into_iter(),
},
))
}
}
}
#[derive(Debug)]
pub struct EHImageStream {
client: GhostClient,
raw_client: reqwest::Client,
image_page_links: std::vec::IntoIter<String>,
}
impl EHImageStream {
async fn load_image(
client: &GhostClient,
raw_client: &reqwest::Client,
link: String,
) -> anyhow::Result<(ImageMeta, ImageData)> {
let content = RETRY_POLICY
.retry(|| async { get_string(client, &link).await })
.await?;
let img_url = match_first_group(&IMG_RE, &content)
.ok_or_else(|| anyhow::anyhow!("unable to find image in page"))?;
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(raw_client, img_url).await })
.await?;
tracing::trace!(
"download e-hentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link,
url: img_url.to_string(),
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for EHImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_page_links.next()?;
let client = self.client.clone();
let raw_client = self.raw_client.clone();
Some(async move { Self::load_image(&client, &raw_client, link).await })
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_page_links.size_hint()
}
}
struct EHPageIndicator {
base: String,
}
impl PageFormatter for EHPageIndicator {
fn format_n(&self, n: usize) -> String {
format!("{}/?p={}", self.base, n)
}
}
impl PageIndicator for EHPageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool {
let html = format!(
"<a href=\"{}/?p={}\" onclick=\"return false\">",
self.base, next_page
);
!content.contains(&html)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let collector = EHCollector {
raw_client: Default::default(),
client: Default::default(),
};
let (album, mut image_stream) = collector
.fetch("/g/2122174/fd2525031e".to_string())
.await
.unwrap();
println!("album: {:?}", album);
let maybe_first_image = image_stream.next().unwrap().await;
if let Ok((meta, data)) = maybe_first_image {
println!("first image meta: {meta:?}");
println!("first image data length: {}", data.len());
}
}
#[ignore]
#[test]
fn regex_match() {
// test page: https://e-hentai.org/g/2122174/fd2525031e
let r = Regex::new(r#"<a href="(https://e-hentai\.org/s/\w+/[\w-]+)">"#).unwrap();
let h = r#"<div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:140px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -600px 0 no-repeat"><a href="https://e-hentai.org/s/bd2b37d829/2122174-7"><img alt="007" title="Page 7: 2.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:139px; margin:-1px 0 0 -1px" /></a></div></div><div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:100px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -700px 0 no-repeat"><a href="https://e-hentai.org/s/4ca72f757d/2122174-8"><img alt="008" title="Page 8: 3.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:99px; margin:-1px 0 0 -1px" />"#;
let mut iter = r.captures_iter(h);
let first = iter.next().unwrap();
println!("{}", first.get(1).unwrap().as_str());
let second = iter.next().unwrap();
println!("{}", second.get(1).unwrap().as_str());
}
}

282
eh2telegraph/src/collector/exhentai.rs Normal file

@ -0,0 +1,282 @@
use std::time::Duration;
use again::RetryPolicy;
use regex::Regex;
use reqwest::header;
use serde::Deserialize;
use crate::{
config,
http_client::UA,
http_proxy::ProxiedClient,
stream::AsyncStream,
util::match_first_group,
util::{get_bytes, get_string},
};
use super::{
utils::paged::{PageFormatter, PageIndicator, Paged},
AlbumMeta, Collector, ImageData, ImageMeta,
};
lazy_static::lazy_static! {
static ref PAGE_RE: Regex = Regex::new(r#"<a href="(https://exhentai\.org/s/\w+/[\w-]+)">"#).unwrap();
static ref IMG_RE: Regex = Regex::new(r#"<img id="img" src="(.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<h1 id="gn">(.*?)</h1>"#).unwrap();
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
const CONFIG_KEY: &str = "exhentai";
#[derive(Debug, Clone)]
pub struct EXCollector {
proxy_client: ProxiedClient,
client: reqwest::Client,
}
#[derive(Debug, Deserialize)]
pub struct ExConfig {
pub ipb_pass_hash: String,
pub ipb_member_id: String,
pub igneous: String,
}
impl EXCollector {
pub fn new(config: &ExConfig, proxy_client: ProxiedClient) -> anyhow::Result<Self> {
let cookie_value = format!(
"ipb_pass_hash={};ipb_member_id={};igneous={};nw=1",
config.ipb_pass_hash, config.ipb_member_id, config.igneous
);
// set headers with exhentai cookies
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str(&cookie_value)?,
);
Ok(Self {
client: {
reqwest::Client::builder()
.user_agent(UA)
.default_headers(request_headers.clone())
.build()
.expect("build reqwest client failed")
},
proxy_client: proxy_client.with_default_headers(request_headers),
})
}
pub fn new_from_config() -> anyhow::Result<Self> {
let config: ExConfig = config::parse(CONFIG_KEY)?
.ok_or_else(|| anyhow::anyhow!("exhentai config(key: exhentai) not found"))?;
let proxy_client = ProxiedClient::new_from_config();
Self::new(&config, proxy_client)
}
pub fn get_client(&self) -> reqwest::Client {
self.client.clone()
}
}
impl Collector for EXCollector {
type FetchError = anyhow::Error;
type FetchFuture<'a> =
impl std::future::Future<Output = anyhow::Result<(AlbumMeta, Self::ImageStream)>>;
type StreamError = anyhow::Error;
type ImageStream = EXImageStream;
#[inline]
fn name() -> &'static str {
"exhentai"
}
fn fetch(&self, path: String) -> Self::FetchFuture<'_> {
async move {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_token = parts.next();
let (album_id, album_token) = match (g, album_id, album_token) {
(Some("g"), Some(album_id), Some(album_token)) => (album_id, album_token),
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://exhentai.org/g/2129939/01a6e086b9)"));
}
};
let url = format!("https://exhentai.org/g/{album_id}/{album_token}");
tracing::info!("[exhentai] process {url}");
let mut paged = Paged::new(0, EXPageIndicator { base: url.clone() });
let gallery_pages = paged.pages(&self.proxy_client).await?;
// Since paged returns at least one page, we can safely get it.
let title = match_first_group(&TITLE_RE, &gallery_pages[0])
.unwrap_or("No Title")
.to_string();
let mut image_page_links = Vec::new();
for gallery_page in gallery_pages.iter() {
PAGE_RE.captures_iter(gallery_page).for_each(|c| {
let matching = c.get(1).expect("regexp is matched but no group 1 found");
image_page_links.push(matching.as_str().to_string());
});
}
if image_page_links.is_empty() {
return Err(anyhow::anyhow!(
"invalid url, maybe resource has been deleted, or our ip is blocked."
));
}
Ok((
AlbumMeta {
link: url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
EXImageStream {
client: self.client.clone(),
proxy_client: self.proxy_client.clone(),
image_page_links: image_page_links.into_iter(),
},
))
}
}
}
#[derive(Debug)]
pub struct EXImageStream {
client: reqwest::Client,
proxy_client: ProxiedClient,
image_page_links: std::vec::IntoIter<String>,
}
impl EXImageStream {
async fn load_image(
proxy_client: ProxiedClient,
client: reqwest::Client,
link: String,
) -> anyhow::Result<(ImageMeta, ImageData)> {
let content = RETRY_POLICY
.retry(|| async { get_string(&proxy_client, &link).await })
.await?;
let img_url = match_first_group(&IMG_RE, &content)
.ok_or_else(|| anyhow::anyhow!("unable to find image in page"))?;
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(&client, img_url).await })
.await?;
tracing::trace!(
"download exhentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link,
url: img_url.to_string(),
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for EXImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_page_links.next()?;
let client = self.client.clone();
let proxy_client = self.proxy_client.clone();
Some(async move { Self::load_image(proxy_client, client, link).await })
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_page_links.size_hint()
}
}
struct EXPageIndicator {
base: String,
}
impl PageFormatter for EXPageIndicator {
fn format_n(&self, n: usize) -> String {
format!("{}/?p={}", self.base, n)
}
}
impl PageIndicator for EXPageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool {
let html = format!(
"<a href=\"{}/?p={}\" onclick=\"return false\">",
self.base, next_page
);
!content.contains(&html)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let config = ExConfig {
ipb_pass_hash: "balabala".to_string(),
ipb_member_id: "balabala".to_string(),
igneous: "balabala".to_string(),
};
println!("config {:#?}", config);
let collector = EXCollector::new(&config, ProxiedClient::default()).unwrap();
let (album, mut image_stream) = collector
.fetch("/g/2129939/01a6e086b9".to_string())
.await
.unwrap();
println!("album: {:?}", album);
let maybe_first_image = image_stream.next().unwrap().await;
if let Ok((meta, data)) = maybe_first_image {
println!("first image meta: {meta:?}");
println!("first image data length: {}", data.len());
}
}
#[ignore]
#[tokio::test]
async fn invalid_url() {
let config = ExConfig {
ipb_pass_hash: "balabala".to_string(),
ipb_member_id: "balabala".to_string(),
igneous: "balabala".to_string(),
};
println!("config {:#?}", config);
let collector = EXCollector::new(&config, ProxiedClient::default()).unwrap();
let output = collector.fetch("/g/2129939/00000".to_string()).await;
assert!(output.is_err());
println!("output err {:?}", output);
}
#[ignore]
#[test]
fn regex_match() {
// test page: https://exhentai.org/g/2122174/fd2525031e
let r = Regex::new(r#"<a href="(https://exhentai\.org/s/\w+/[\w-]+)">"#).unwrap();
let h = r#"<div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:140px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -600px 0 no-repeat"><a href="https://exhentai.org/s/bd2b37d829/2122174-7"><img alt="007" title="Page 7: 2.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:139px; margin:-1px 0 0 -1px" /></a></div></div><div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:100px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -700px 0 no-repeat"><a href="https://exhentai.org/s/4ca72f757d/2122174-8"><img alt="008" title="Page 8: 3.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:99px; margin:-1px 0 0 -1px" />"#;
let mut iter = r.captures_iter(h);
let first = iter.next().unwrap();
println!("{}", first.get(1).unwrap().as_str());
let second = iter.next().unwrap();
println!("{}", second.get(1).unwrap().as_str());
}
}

99
eh2telegraph/src/collector/mod.rs Normal file

@ -0,0 +1,99 @@
//! Built-in collectors and trait.
use once_cell::sync::Lazy;
use regex::Regex;
use crate::stream::AsyncStream;
use self::{e_hentai::EHCollector, exhentai::EXCollector, nhentai::NHCollector};
pub mod utils;
pub mod e_hentai;
pub mod exhentai;
pub mod nhentai;
pub mod pixiv;
#[derive(Debug, Clone)]
pub struct ImageMeta {
pub id: String,
pub url: String,
pub description: Option<String>,
}
pub type ImageData = bytes::Bytes;
#[derive(Debug, Clone)]
pub struct AlbumMeta {
pub link: String,
pub name: String,
pub class: Option<String>,
pub description: Option<String>,
pub authors: Option<Vec<String>>,
pub tags: Option<Vec<String>>,
}
/// Generic collector.
/// The async `fetch` returns an `AlbumMeta` and an `ImageStream`.
/// By exposing `ImageStream`, we can fetch the images lazily; on a
/// low-memory VM, only a small amount of data is held in memory at once.
pub trait Collector {
type FetchError;
type FetchFuture<'a>: std::future::Future<
Output = Result<(AlbumMeta, Self::ImageStream), Self::FetchError>,
>
where
Self: 'a;
type StreamError;
type ImageStream: AsyncStream<Item = Result<(ImageMeta, ImageData), Self::StreamError>>;
fn name() -> &'static str;
fn fetch(&self, path: String) -> Self::FetchFuture<'_>;
}
pub(crate) static URL_FROM_TEXT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"((https://exhentai\.org/g/\w+/[\w-]+)|(https://e-hentai\.org/g/\w+/[\w-]+)|(https://nhentai\.net/g/\d+)|(https://nhentai\.to/g/\d+))"#).unwrap()
});
pub(crate) static URL_FROM_URL_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"^((https://exhentai\.org/g/\w+/[\w-]+)|(https://e-hentai\.org/g/\w+/[\w-]+)|(https://nhentai\.net/g/\d+)|(https://nhentai\.to/g/\d+))"#).unwrap()
});
#[derive(Debug, Clone)]
pub struct Registry {
eh: EHCollector,
nh: NHCollector,
ex: EXCollector,
}
pub trait Param<T> {
fn get(&self) -> &T;
}
impl Param<EHCollector> for Registry {
fn get(&self) -> &EHCollector {
&self.eh
}
}
impl Param<NHCollector> for Registry {
fn get(&self) -> &NHCollector {
&self.nh
}
}
impl Param<EXCollector> for Registry {
fn get(&self) -> &EXCollector {
&self.ex
}
}
impl Registry {
pub fn new_from_config() -> Self {
Self {
eh: EHCollector::new_from_config().expect("unable to build e-hentai collector"),
nh: NHCollector::new_from_config().expect("unable to build nhentai collector"),
ex: EXCollector::new_from_config().expect("unable to build exhentai collector"),
}
}
}
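// Hypothetical driver sketch (illustrative, not in this commit) showing how a
// collector is meant to be consumed: `fetch` yields the album metadata plus a
// lazy image stream, and awaiting one future at a time keeps at most one
// image in memory.
async fn dump_album(collector: &EHCollector, path: String) -> anyhow::Result<()> {
    let (album, mut images) = collector.fetch(path).await?;
    tracing::info!("syncing {}", album.name);
    while let Some(image) = images.next() {
        let (meta, data) = image.await?;
        tracing::info!("fetched {} ({} bytes)", meta.url, data.len());
    }
    Ok(())
}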

176
eh2telegraph/src/collector/nhentai.rs Normal file

@ -0,0 +1,176 @@
/// nhentai collector.
/// Host matching: nhentai.to or nhentai.net
use again::RetryPolicy;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::Response;
use std::time::Duration;
use crate::{
http_client::{GhostClient, GhostClientBuilder},
stream::AsyncStream,
util::get_bytes,
util::match_first_group,
};
use super::{AlbumMeta, Collector, ImageData, ImageMeta};
lazy_static::lazy_static! {
static ref TITLE_RE: Regex = Regex::new(r#"<span class="pretty">(.*?)</span>"#).unwrap();
static ref PAGE_RE: Regex = Regex::new(r#"<noscript><img src="(https://t\d?\.nhentai\.net/galleries/\d+/\d+t\.\w+)"#).unwrap();
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
const DOMAIN_LIST: [&str; 10] = [
"nhentai.net",
"i.nhentai.net",
"i2.nhentai.net",
"i3.nhentai.net",
"i4.nhentai.net",
"i5.nhentai.net",
"i6.nhentai.net",
"i7.nhentai.net",
"i8.nhentai.net",
"i9.nhentai.net",
];
#[derive(Debug, Clone, Default)]
pub struct NHCollector {
client: GhostClient,
}
impl NHCollector {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&DOMAIN_LIST)
.build(prefix),
}
}
pub fn new_from_config() -> anyhow::Result<Self> {
Ok(Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&DOMAIN_LIST)
.build_from_config()?,
})
}
}
impl Collector for NHCollector {
type FetchError = anyhow::Error;
type FetchFuture<'a> =
impl std::future::Future<Output = anyhow::Result<(AlbumMeta, Self::ImageStream)>>;
type StreamError = anyhow::Error;
type ImageStream = NHImageStream;
#[inline]
fn name() -> &'static str {
"nhentai"
}
fn fetch(&self, path: String) -> Self::FetchFuture<'_> {
async move {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_id = match (g, album_id) {
(Some("g"), Some(album_id)) => album_id,
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://nhentai.net/g/333678)"));
}
};
let url = format!("https://nhentai.net/g/{album_id}");
tracing::info!("[nhentai] process {url}");
// clone client to force changing ip
let client = self.client.clone();
let index = client
.get(&url)
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await?;
let title = match_first_group(&TITLE_RE, &index)
.unwrap_or("No Title")
.to_string();
let image_urls = PAGE_RE
.captures_iter(&index)
.map(|c| {
let thumb_url = c
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str();
thumb_url
.replace("https://t", "https://i")
.replace("t.", ".")
})
.collect::<Vec<_>>()
.into_iter();
Ok((
AlbumMeta {
link: url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
NHImageStream { client, image_urls },
))
}
}
}
#[derive(Debug)]
pub struct NHImageStream {
client: GhostClient,
image_urls: std::vec::IntoIter<String>,
}
impl NHImageStream {
async fn load_image(
client: GhostClient,
link: String,
) -> anyhow::Result<(ImageMeta, ImageData)> {
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(&client, &link).await })
.await?;
tracing::trace!(
"download nhentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link.clone(),
url: link,
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for NHImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_urls.next()?;
let client = self.client.clone();
Some(async move { Self::load_image(client, link).await })
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_urls.size_hint()
}
}

1
eh2telegraph/src/collector/pixiv.rs Normal file

@ -0,0 +1 @@

1
eh2telegraph/src/collector/utils/mod.rs Normal file

@ -0,0 +1 @@
pub mod paged;

74
eh2telegraph/src/collector/utils/paged.rs Normal file

@ -0,0 +1,74 @@
use reqwest::Response;
use crate::http_proxy::HttpRequestBuilder;
pub trait PageFormatter {
fn format_n(&self, n: usize) -> String;
}
pub trait PageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool;
}
#[derive(thiserror::Error, Debug)]
pub enum PagedError {
#[error("reqwest error")]
Reqwest(#[from] reqwest::Error),
}
pub struct Paged<T> {
next_page: usize,
page_indicator: T,
}
impl<T> Paged<T> {
pub fn new(init_page: usize, page_indicator: T) -> Self {
Self {
next_page: init_page,
page_indicator,
}
}
}
impl<T> Paged<T>
where
T: PageFormatter,
{
pub async fn next<C>(&mut self, client: &C) -> Result<String, PagedError>
where
C: HttpRequestBuilder,
{
let url = self.page_indicator.format_n(self.next_page);
let content = client
.get_builder(&url)
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await?;
self.next_page += 1;
Ok(content)
}
}
impl<T> Paged<T>
where
T: PageFormatter + PageIndicator,
{
    /// `pages` returns at least one element if the result is Ok.
pub async fn pages<C>(&mut self, client: &C) -> Result<Vec<String>, PagedError>
where
C: HttpRequestBuilder,
{
let mut results = Vec::new();
loop {
let content = self.next(client).await?;
let terminated = self.page_indicator.is_last_page(&content, self.next_page);
results.push(content);
if terminated {
return Ok(results);
}
}
}
}
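// A sketch of a pager implementation (the URL pattern and stop condition below
// are illustrative, not from the original code): `format_n` builds the URL of
// page n, and `is_last_page` stops paging once the fetched page no longer links
// to the next index.
#[allow(dead_code)]
struct DemoPager {
    base: &'static str,
}
impl PageFormatter for DemoPager {
    fn format_n(&self, n: usize) -> String {
        format!("{}?p={n}", self.base)
    }
}
impl PageIndicator for DemoPager {
    fn is_last_page(&self, content: &str, next_page: usize) -> bool {
        !content.contains(&format!("?p={next_page}"))
    }
}
// Usage sketch: `Paged::new(0, DemoPager { base: "https://example.com/gallery" })
// .pages(&client).await` collects every page until `is_last_page` fires.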

View File

@ -0,0 +1,45 @@
use std::{collections::HashMap, env};
use once_cell::sync::OnceCell;
static CFG_PATH: OnceCell<String> = OnceCell::new();
lazy_static::lazy_static! {
static ref CONFIG_MAPPING: HashMap<String, serde_yaml::Value> = {
let file_path = CFG_PATH.get_or_init(get_config_path);
let file_content = std::fs::read_to_string(file_path).expect("config file not found");
serde_yaml::from_str(&file_content).expect("unable to parse config file")
};
}
fn get_config_path() -> String {
// read from env
if let Ok(p) = env::var("CONFIG_FILE") {
if !p.is_empty() {
return p;
}
}
// default
"config.yaml".to_string()
}
/// Initialize config, will panic on failure.
pub fn init(config_path: Option<String>) {
if let Some(p) = config_path {
let _ = CFG_PATH.set(p);
}
lazy_static::initialize(&CONFIG_MAPPING);
}
/// Parse struct from global config.
pub fn parse<T>(key: &str) -> serde_yaml::Result<Option<T>>
where
T: serde::de::DeserializeOwned,
{
CONFIG_MAPPING
.get(key)
.cloned()
.map(|v| serde_yaml::from_value(v))
.transpose()
}
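// Example (sketch): with a config.yaml such as
//
//   http:
//     ipv6_prefix: "2001:db8::/48"
//
// a module reads its own section like this (the struct here is illustrative):
//
//   #[derive(serde::Deserialize, Default)]
//   struct HttpSection { ipv6_prefix: Option<String> }
//
//   let section: Option<HttpSection> = config::parse("http").expect("bad config");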

View File

@ -0,0 +1,205 @@
// A wrapper for reqwest that provides the ability to bind to a random IP.
// Since I apparently cannot afford an IPv4 subnet, IPv6 is assumed here.
// Using the he.net tunnel broker works fine.
// Setup:
// 1. sudo ip addr add local 2001:x:x::/48 dev lo
// 2. sudo ip route add local 2001:x:x::/48 dev he-ipv6
// 3. Set net.ipv6.ip_nonlocal_bind=1
pub const UA: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36";
const CONFIG_KEY: &str = "http";
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
ops::{Deref, DerefMut},
sync::Arc,
};
use ipnet::Ipv6Net;
use reqwest::header;
use rustls::ClientConfig;
use crate::{config, tls::WhitelistVerifier};
const CF_ADDR: Ipv6Addr = Ipv6Addr::new(0x2606, 0x4700, 0x4700, 0, 0, 0, 0, 0x1111);
const TG_ADDR: Ipv6Addr = Ipv6Addr::new(0x2001, 0x67c, 0x4e8, 0x1033, 0x1, 0x100, 0, 0xa);
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, derive_more::From, derive_more::Into)]
pub struct Ipv6Net2(Ipv6Net);
impl<'de> serde::Deserialize<'de> for Ipv6Net2 {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
use std::str::FromStr;
let data = String::deserialize(deserializer)?;
Ipv6Net::from_str(&data)
.map(Ipv6Net2)
.map_err(serde::de::Error::custom)
}
}
#[derive(serde::Deserialize, Clone, Debug, Default)]
struct HTTPConfig {
ipv6_prefix: Option<Ipv6Net2>,
}
#[derive(Debug, Default)]
pub struct GhostClientBuilder {
mapping: Vec<(&'static str, SocketAddr)>,
headers: Option<header::HeaderMap>,
}
impl GhostClientBuilder {
pub fn with_default_headers(self, headers: header::HeaderMap) -> Self {
Self {
headers: Some(headers),
..self
}
}
pub fn with_cf_resolve(mut self, domains: &[&'static str]) -> Self {
let cf = SocketAddr::new(IpAddr::V6(CF_ADDR), 443);
for &domain in domains.iter() {
self.mapping.push((domain, cf));
}
self
}
#[deprecated = "telegra.ph has fixed it and returns 501 when using ipv6"]
pub fn with_tg_resolve(mut self) -> Self {
let tg = SocketAddr::new(IpAddr::V6(TG_ADDR), 443);
self.mapping.push(("telegra.ph", tg));
self.mapping.push(("api.telegra.ph", tg));
self
}
pub fn build(self, prefix: Option<Ipv6Net>) -> GhostClient {
let inner = GhostClient::build_raw(&prefix, &self.mapping, self.headers.clone());
GhostClient {
prefix,
mapping: Arc::new(self.mapping),
headers: self.headers,
inner,
}
}
pub fn build_from_config(self) -> anyhow::Result<GhostClient> {
let config: HTTPConfig = config::parse(CONFIG_KEY)?.unwrap_or_default();
let prefix = config.ipv6_prefix.map(Into::into);
Ok(self.build(prefix))
}
}
#[derive(Debug, Default)]
pub struct GhostClient {
prefix: Option<Ipv6Net>,
mapping: Arc<Vec<(&'static str, SocketAddr)>>,
headers: Option<header::HeaderMap>,
inner: reqwest::Client,
}
impl GhostClient {
pub fn builder() -> GhostClientBuilder {
GhostClientBuilder::default()
}
}
impl Clone for GhostClient {
fn clone(&self) -> Self {
let inner = Self::build_raw(&self.prefix, &self.mapping, self.headers.clone());
Self {
prefix: self.prefix,
mapping: self.mapping.clone(),
headers: self.headers.clone(),
inner,
}
}
}
impl Deref for GhostClient {
type Target = reqwest::Client;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
impl DerefMut for GhostClient {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl GhostClient {
fn build_raw(
net: &Option<Ipv6Net>,
mapping: &[(&'static str, SocketAddr)],
headers: Option<header::HeaderMap>,
) -> reqwest::Client {
let mut builder = reqwest::Client::builder().user_agent(UA);
if let Some(headers) = headers {
builder = builder.default_headers(headers);
}
if let Some(net) = net {
let addr: u128 = net.addr().into();
let prefix_len = net.prefix_len();
let mask = !u128::max_value()
.checked_shl((128 - prefix_len) as u32)
.unwrap_or(u128::min_value());
// use random ipv6
let rand: u128 = rand::Rng::gen(&mut rand::thread_rng());
let addr = IpAddr::V6(Ipv6Addr::from(rand & mask | addr));
builder = builder.local_address(addr);
// apply resolve
for (domain, addr) in mapping {
builder = builder.resolve(*domain, *addr);
}
            // don't add the preconfigured tls
// let tls_config = TLS_CFG.clone();
// builder = builder.use_preconfigured_tls(tls_config);
}
builder.build().expect("build reqwest client failed")
}
pub fn refresh(&mut self) {
self.inner = Self::build_raw(&self.prefix, &self.mapping, self.headers.clone());
}
}
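// A sanity-check sketch (not from the original commit) for the prefix math in
// `build_raw`: a random u128 masked to the host bits and OR-ed with the network
// address must stay inside the configured net.
#[cfg(test)]
mod addr_tests {
    use std::net::Ipv6Addr;

    #[test]
    fn random_addr_stays_in_prefix() {
        let net: ipnet::Ipv6Net = "2001:db8::/48".parse().unwrap();
        let addr: u128 = net.addr().into();
        let mask = !u128::MAX
            .checked_shl((128 - net.prefix_len()) as u32)
            .unwrap_or(0);
        let rand: u128 = rand::Rng::gen(&mut rand::thread_rng());
        let candidate = Ipv6Addr::from(rand & mask | addr);
        assert!(net.contains(&candidate));
    }
}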
lazy_static::lazy_static! {
    // So far only telegra.ph has been seen with a wrong TLS config, so the whitelist is written as a fixed value.
static ref TLS_CFG: ClientConfig = WhitelistVerifier::new(["telegram.org"]).into();
}
#[cfg(test)]
mod tests {
use super::{TLS_CFG, UA};
#[ignore]
#[tokio::test]
async fn test_tls() {
let tls_config = TLS_CFG.clone();
// use a telegram.org ip address(normally it fails in browser)
let cli = reqwest::Client::builder()
.user_agent(UA)
.resolve("api.telegra.ph", "149.154.167.99:443".parse().unwrap())
.use_preconfigured_tls(tls_config)
.build()
.unwrap();
let resp = cli
.get("https://api.telegra.ph/getPage")
.send()
.await
.unwrap();
assert_eq!(resp.status(), 200);
}
}

View File

@ -0,0 +1,127 @@
use reqwest::header::HeaderValue;
use crate::{
config,
http_client::{GhostClient, UA},
};
const CONFIG_KEY: &str = "proxy";
#[derive(serde::Deserialize, Clone, Debug, Default)]
struct ProxyConfig {
endpoint: String,
authorization: String,
}
/// ProxiedClient builds requests that are sent through the web proxy.
/// Note: users should not overwrite the X-Forwarded-For / X-Authorization headers it sets.
#[derive(Debug, Clone, Default)]
pub struct ProxiedClient {
proxy: Option<Proxy>,
inner: reqwest::Client,
}
#[derive(Debug, Clone)]
pub struct Proxy {
endpoint: reqwest::Url,
authorization: HeaderValue,
}
impl ProxiedClient {
pub fn new(endpoint: &str, authorization: &str) -> Self {
let proxy = Some(Proxy {
endpoint: endpoint.parse().expect("unable to parse proxy endpoint"),
authorization: authorization
.parse()
.expect("unable to parse proxy authorization"),
});
Self {
proxy,
inner: reqwest::Client::builder()
.user_agent(UA)
.build()
.expect("unable to build reqwest client"),
}
}
pub fn new_from_config() -> Self {
match config::parse::<ProxyConfig>(CONFIG_KEY)
.expect("unable to parse proxy config(key is {CONFIG_KEY})")
{
Some(cfg) => Self::new(&cfg.endpoint, &cfg.authorization),
None => {
tracing::warn!("initialized ProxiedClient without proxy config");
Self::default()
}
}
}
pub fn with_default_headers(self, headers: reqwest::header::HeaderMap) -> Self {
Self {
inner: reqwest::Client::builder()
.user_agent(UA)
.default_headers(headers)
.build()
.expect("unable to build reqwest client"),
..self
}
}
}
macro_rules! impl_method {
($method: ident) => {
pub fn $method(&self, url: &str) -> reqwest::RequestBuilder {
match &self.proxy {
Some(p) => self
.inner
.$method(p.endpoint.clone())
.header("X-Forwarded-For", url)
.header("X-Authorization", p.authorization.clone()),
None => self.inner.$method(url),
}
}
};
}
impl ProxiedClient {
impl_method!(get);
impl_method!(post);
impl_method!(head);
impl_method!(put);
impl_method!(delete);
impl_method!(patch);
pub fn request(&self, method: reqwest::Method, url: &str) -> reqwest::RequestBuilder {
match &self.proxy {
Some(p) => self
.inner
.request(method, p.endpoint.clone())
.header("X-Forwarded-For", url)
.header("X-Authorization", p.authorization.clone()),
None => self.inner.request(method, url),
}
}
}
pub trait HttpRequestBuilder {
fn get_builder(&self, url: &str) -> reqwest::RequestBuilder;
fn post_builder(&self, url: &str) -> reqwest::RequestBuilder;
}
macro_rules! gen_impl {
($ty: ty) => {
impl HttpRequestBuilder for $ty {
fn get_builder(&self, url: &str) -> reqwest::RequestBuilder {
self.get(url)
}
fn post_builder(&self, url: &str) -> reqwest::RequestBuilder {
self.post(url)
}
}
};
}
gen_impl!(reqwest::Client);
gen_impl!(ProxiedClient);
gen_impl!(GhostClient);
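// Usage sketch (the endpoint and key below are hypothetical): the target URL
// travels in X-Forwarded-For and the key in X-Authorization; the worker strips
// both and forwards the request (see worker/web_proxy.js).
#[allow(dead_code)]
async fn demo_proxied_get() -> reqwest::Result<String> {
    let client = ProxiedClient::new("https://proxy.example.workers.dev", "secret-key");
    client
        .get("https://api.telegra.ph/getPage")
        .send()
        .await?
        .text()
        .await
}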

View File

@ -0,0 +1,15 @@
// Indexer + Filters(FilterType+Value) -> EntryStream
#[derive(Debug, Clone)]
pub enum Filter {
Name(String),
Category(String),
}
#[derive(Debug, Clone)]
pub enum OrderBy {
TimeDesc,
ClickDesc,
}
pub trait Indexer {}

16
eh2telegraph/src/lib.rs Normal file
View File

@ -0,0 +1,16 @@
#![feature(generic_associated_types)]
#![feature(type_alias_impl_trait)]
pub mod buffer;
pub mod collector;
pub mod config;
pub mod http_client;
pub mod http_proxy;
pub mod indexer;
pub mod searcher;
pub mod storage;
pub mod stream;
pub mod sync;
pub mod telegraph;
pub mod tls;
pub mod util;

View File

@ -0,0 +1,68 @@
use ipnet::Ipv6Net;
use regex::Regex;
use crate::{
collector::exhentai::EXCollector,
http_client::{GhostClient, GhostClientBuilder},
util::{get_string, match_first_group},
};
lazy_static::lazy_static! {
static ref EHENTAI_URL_RE: Regex = Regex::new(r#"<a href="(https://e(-|x)hentai\.org/g/\w+/[\w-]+)/">"#).unwrap();
}
/// FHashConvertor converts an f-hash (usually from a search result) to the first matching gallery URL.
/// Works for both e-hentai and exhentai.
pub struct FHashConvertor {
client: GhostClient,
raw_client: reqwest::Client,
}
impl FHashConvertor {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&["e-hentai.org"])
.build(prefix),
raw_client: EXCollector::new_from_config()
.expect("unable to build ex-client")
.get_client(),
}
}
pub fn new_from_config() -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&["e-hentai.org"])
.build_from_config()
.expect("unable to build client for f-hash convertor"),
raw_client: EXCollector::new_from_config()
.expect("unable to build ex-client")
.get_client(),
}
}
// TODO: impl a trait?
pub async fn convert_to_gallery(&self, f_hash: &str) -> anyhow::Result<String> {
tracing::info!("[f-hash] converting hash {f_hash}");
// find in e-hentai
let url = format!("https://e-hentai.org/?f_shash={f_hash}&f_sh=on&f_sname=on&f_stags=on&f_sh=on&f_spf=&f_spt=&f_sfl=on&f_sfu=on&f_sft=on");
let text = get_string(&self.client, &url).await?;
if let Some(url) = match_first_group(&EHENTAI_URL_RE, &text) {
tracing::info!("[f-hash] hash {f_hash} -> {url}");
return Ok(url.to_string());
}
// find in exhentai
let url = format!("https://exhentai.org/?f_shash={f_hash}&f_sh=on&f_sname=on&f_stags=on&f_sh=on&f_spf=&f_spt=&f_sfl=on&f_sfu=on&f_sft=on");
let text = get_string(&self.raw_client, &url).await?;
if let Some(url) = match_first_group(&EHENTAI_URL_RE, &text) {
tracing::info!("[f-hash] hash {f_hash} -> {url}");
return Ok(url.to_string());
}
tracing::info!("[f-hash] hash {f_hash} not found");
Err(anyhow::anyhow!("not found in e-hentai or exhentai"))
}
}
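// Demo sketch (ignored: needs network and a valid config). The hash below is
// the sample f-hash from the saucenao module's comments.
#[cfg(test)]
mod tests {
    use super::*;

    #[ignore]
    #[tokio::test]
    async fn demo_convert() {
        let convertor = FHashConvertor::new_from_config();
        let url = convertor
            .convert_to_gallery("c517710f0654ea883df1e0fea7117c671fb03bc1")
            .await
            .unwrap();
        println!("gallery: {url}");
    }
}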

View File

@ -0,0 +1,24 @@
pub mod f_hash;
pub mod saucenao;
pub trait ImageSearcher {
    type SearchError;
    type SearchOutput;
    type FetchFuture<T>: std::future::Future<Output = Result<Self::SearchOutput, Self::SearchError>>;
fn search<T: Into<std::borrow::Cow<'static, [u8]>>>(&self, data: T) -> Self::FetchFuture<T>;
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let data = std::fs::read("./image.png").unwrap();
let searcher = saucenao::SaucenaoSearcher::new(None);
let r = searcher.search(data).await;
println!("result: {r:?}");
}
}

View File

@ -0,0 +1,189 @@
use std::{borrow::Cow, str::FromStr};
use futures::Future;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::{
multipart::{self, Part},
Response,
};
use crate::http_client::GhostClient;
use super::ImageSearcher;
lazy_static::lazy_static! {
static ref SEARCH_ELEMENT_RE: Regex = Regex::new(r#"<tr><td class="resulttableimage">(.*?)</tr>"#).unwrap();
static ref S_URL_RE: Regex = Regex::new(r#"src="(https://.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<div class="resulttitle"><strong>(.*?)</strong>"#).unwrap();
static ref SIM_RE: Regex = Regex::new(r#"<div class="resultsimilarityinfo">(\d+)\.?\d*%</div>"#).unwrap();
static ref SITE_PARSE_RE: Regex = Regex::new(r#"saucenao\.com/(res/pixiv(_historical)?/\d+/manga/(?P<pixiv_id>\d+)_)|(ehentai/\w+/\w+/(?P<ehentai_fhash>\w+))|(res/nhentai/(?P<nhentai_id>\d+))"#).unwrap();
}
macro_rules! extract_first {
($re: expr, $input: expr, $err_msg: expr) => {
$re.captures($input)
.ok_or_else(|| anyhow::anyhow!($err_msg))?
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str()
};
}
macro_rules! extract_first_opt {
($re: expr, $input: expr, $default: expr) => {
match $re.captures($input) {
Some(t) => t
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str(),
None => $default,
}
};
}
/// Saucenao searcher.
/// Note: even though saucenao resolves to an IPv6 address, we still force-resolve it.
#[derive(Debug, Clone)]
pub struct SaucenaoSearcher {
client: GhostClient,
}
impl SaucenaoSearcher {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
Self {
client: GhostClient::builder()
.with_cf_resolve(&["saucenao.com", "e-hentai.org"])
.build(prefix),
}
}
pub fn new_from_config() -> Self {
Self {
client: GhostClient::builder()
.with_cf_resolve(&["saucenao.com", "e-hentai.org"])
.build_from_config()
.expect("unable to build client for saucenao"),
}
}
async fn search(client: &reqwest::Client, file: Part) -> anyhow::Result<SaucenaoOutput> {
let response = client
.post("https://saucenao.com/search.php")
.multipart(multipart::Form::new().part("file", file))
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await?;
// check if the response is as expected
if !response.contains("<title>Sauce Found?</title>") {
return Err(anyhow::anyhow!("saucenao response is not as expected"));
}
SaucenaoOutput::from_str(&response)
}
}
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum SaucenaoParsed {
EHentai(String),
NHentai(String),
Pixiv(String),
Other,
}
#[derive(Debug, Clone)]
pub struct SaucenaoOutputElement {
pub raw_url: String,
pub name: String,
pub similarity: u8,
pub parsed: SaucenaoParsed,
}
#[derive(Debug, Clone)]
pub struct SaucenaoOutput {
    pub data: Vec<SaucenaoOutputElement>,
}
impl IntoIterator for SaucenaoOutput {
    type Item = <Vec<SaucenaoOutputElement> as IntoIterator>::Item;
    type IntoIter = <Vec<SaucenaoOutputElement> as IntoIterator>::IntoIter;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
impl ImageSearcher for SaucenaoSearcher {
    type SearchError = anyhow::Error;
type SearchOutput = SaucenaoOutput;
    type FetchFuture<T> = impl Future<Output = Result<Self::SearchOutput, Self::SearchError>>;
fn search<T: Into<Cow<'static, [u8]>>>(&self, data: T) -> Self::FetchFuture<T> {
let file_part = Part::bytes(data).file_name("image.jpg");
let client = self.client.clone();
async move { Self::search(&client, file_part).await }
}
}
impl FromStr for SaucenaoOutput {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut data = Vec::new();
// match all
for cap in SEARCH_ELEMENT_RE.captures_iter(s) {
let s = cap
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str();
            let element = SaucenaoOutputElement::from_str(s)?;
data.push(element);
}
// sort
data.sort_unstable_by(|a, b| b.similarity.cmp(&a.similarity));
Ok(Self { data })
}
}
impl FromStr for SaucenaoOutputElement {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// raw_url examples:
// https://img1.saucenao.com/res/pixiv/7594/manga/75943246_p1.jpg?auth=dKnHvUUPQ0wi8G6yv-HWZQ&exp=1645560000
// https://img1.saucenao.com/res/seiga_illust/157/1574075.jpg?auth=KKGjLqCUyouLUKieJ5g4Rw&exp=1645560000
// https://img3.saucenao.com/ehentai/c5/17/c517710f0654ea883df1e0fea7117c671fb03bc1.jpg?auth=Hu-H_4c3lTKdh_rtZJv50w&exp=1645560000
let raw_url =
extract_first!(S_URL_RE, s, "unable to parse saucenao result url").to_string();
let name = extract_first_opt!(TITLE_RE, s, "NO TITLE").to_string();
let similarity =
extract_first!(SIM_RE, s, "unable to parse saucenao result similarity").parse()?;
let parsed = SITE_PARSE_RE
.captures(&raw_url)
.and_then(|cap| {
if let Some(pixiv) = cap.name("pixiv_id") {
return Some(SaucenaoParsed::Pixiv(pixiv.as_str().to_string()));
}
if let Some(eh) = cap.name("ehentai_fhash") {
return Some(SaucenaoParsed::EHentai(eh.as_str().to_string()));
}
if let Some(nh) = cap.name("nhentai_id") {
return Some(SaucenaoParsed::NHentai(nh.as_str().to_string()));
}
None
})
.unwrap_or(SaucenaoParsed::Other);
Ok(Self {
raw_url,
name,
similarity,
parsed,
})
}
}
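// A parse sketch against a hand-written fragment shaped like saucenao's result
// HTML (the markup is illustrative and only contains what the regexes look for).
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    #[test]
    fn parse_element_fragment() {
        let fragment = concat!(
            r#"<img src="https://img3.saucenao.com/ehentai/c5/17/c517710f0654ea883df1e0fea7117c671fb03bc1.jpg?auth=x">"#,
            r#"<div class="resulttitle"><strong>Some Title</strong></div>"#,
            r#"<div class="resultsimilarityinfo">93.5%</div>"#
        );
        let element = SaucenaoOutputElement::from_str(fragment).unwrap();
        assert_eq!(element.name, "Some Title");
        assert_eq!(element.similarity, 93);
        assert!(matches!(element.parsed, SaucenaoParsed::EHentai(_)));
    }
}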

View File

@ -0,0 +1,73 @@
use std::{sync::Arc, time::Duration};
use cloudflare_kv_proxy::{Client, ClientError, NotFoundMapping};
use futures::Future;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use crate::config;
use super::KVStorage;
const CONFIG_KEY: &str = "worker_kv";
#[derive(Debug, Deserialize)]
pub struct CFConfig {
pub endpoint: String,
pub token: String,
pub cache_size: usize,
pub expire_sec: u64,
}
#[derive(Clone, Debug)]
pub struct CFStorage(Arc<Client>);
impl CFStorage {
pub fn new<T: Into<String>, E: Into<String>>(
endpoint: E,
token: T,
cache_size: usize,
expire: Duration,
) -> Result<Self, ClientError> {
Ok(Self(Arc::new(Client::new(
endpoint, token, cache_size, expire,
)?)))
}
pub fn new_from_config() -> anyhow::Result<Self> {
let config: CFConfig = config::parse(CONFIG_KEY)?
.ok_or_else(|| anyhow::anyhow!("cloudflare worker config(key: worker_kv) not found"))?;
Self::new(
config.endpoint,
config.token,
config.cache_size,
Duration::from_secs(config.expire_sec),
)
.map_err(Into::into)
}
}
impl<T> KVStorage<T> for CFStorage
where
T: DeserializeOwned + Serialize + Send + Sync,
{
type GetFuture<'a> = impl Future<Output = anyhow::Result<Option<T>>> where Self: 'a;
fn get<'a>(&'a self, key: &'a str) -> Self::GetFuture<'_> {
async move {
self.0
.get(key)
.await
.map_not_found_to_option()
.map_err(Into::into)
}
}
type SetFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn set<'a>(&self, key: String, value: T, _expire_ttl: Option<usize>) -> Self::SetFuture<'_> {
async move { self.0.put(&key, &value).await.map_err(Into::into) }
}
type DeleteFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn delete<'a>(&'a self, key: &'a str) -> Self::DeleteFuture<'_> {
async move { self.0.delete(key).await.map_err(Into::into) }
}
}
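// Expected config section for new_from_config (sketch; values are placeholders):
//
// worker_kv:
//   endpoint: "https://kv.example.workers.dev"
//   token: "secret"
//   cache_size: 1024
//   expire_sec: 3888000  # 45 days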

View File

@ -0,0 +1,41 @@
use std::sync::Arc;
use futures::Future;
use hashlink::LruCache;
use parking_lot::Mutex;
use super::KVStorage;
#[derive(Clone, Debug)]
pub struct LruStorage(Arc<Mutex<LruCache<String, String>>>);
impl LruStorage {
pub fn new(capacity: usize) -> Self {
Self(Arc::new(Mutex::new(LruCache::new(capacity))))
}
}
impl KVStorage<String> for LruStorage {
type GetFuture<'a> = impl Future<Output = anyhow::Result<Option<String>>> where Self: 'a;
fn get<'a>(&'a self, key: &'a str) -> Self::GetFuture<'_> {
let v = self.0.lock().get(key).cloned();
async move { Ok(v) }
}
type SetFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn set<'a>(
&self,
key: String,
value: String,
_expire_ttl: Option<usize>,
) -> Self::SetFuture<'_> {
self.0.lock().insert(key, value);
async move { Ok(()) }
}
type DeleteFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn delete<'a>(&'a self, key: &'a str) -> Self::DeleteFuture<'_> {
self.0.lock().remove(key);
async move { Ok(()) }
}
}

View File

@ -0,0 +1,57 @@
use futures::Future;
use parking_lot::RwLock;
use std::{collections::HashMap, sync::Arc};
pub mod cloudflare_kv;
pub mod lru;
pub trait KVStorage<V> {
type GetFuture<'a>: Future<Output = anyhow::Result<Option<V>>> + Send
where
Self: 'a;
fn get<'a>(&'a self, key: &'a str) -> Self::GetFuture<'_>;
type SetFuture<'a>: Future<Output = anyhow::Result<()>> + Send
where
Self: 'a;
fn set(&self, key: String, value: V, expire_ttl: Option<usize>) -> Self::SetFuture<'_>;
type DeleteFuture<'a>: Future<Output = anyhow::Result<()>> + Send
where
Self: 'a;
fn delete<'a>(&'a self, key: &'a str) -> Self::DeleteFuture<'_>;
}
#[derive(Default, Clone, Debug)]
pub struct SimpleMemStorage(Arc<RwLock<HashMap<String, String>>>);
impl SimpleMemStorage {
pub fn with_capacity(capacity: usize) -> Self {
Self(Arc::new(RwLock::new(HashMap::with_capacity(capacity))))
}
}
impl KVStorage<String> for SimpleMemStorage {
type GetFuture<'a> = impl Future<Output = anyhow::Result<Option<String>>> where Self: 'a;
fn get<'a>(&'a self, key: &'a str) -> Self::GetFuture<'_> {
let v = self.0.read().get(key).cloned();
async move { Ok(v) }
}
type SetFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn set<'a>(
&self,
key: String,
value: String,
_expire_ttl: Option<usize>,
) -> Self::SetFuture<'_> {
self.0.write().insert(key, value);
async move { Ok(()) }
}
type DeleteFuture<'a> = impl Future<Output = anyhow::Result<()>> where Self: 'a;
fn delete<'a>(&'a self, key: &'a str) -> Self::DeleteFuture<'_> {
self.0.write().remove(key);
async move { Ok(()) }
}
}
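// A round-trip sketch of the KVStorage contract using the in-memory backend.
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn simple_mem_storage_roundtrip() {
        let storage = SimpleMemStorage::default();
        storage
            .set("key".to_string(), "value".to_string(), None)
            .await
            .unwrap();
        assert_eq!(storage.get("key").await.unwrap(), Some("value".to_string()));
        storage.delete("key").await.unwrap();
        assert_eq!(storage.get("key").await.unwrap(), None);
    }
}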

108
eh2telegraph/src/stream.rs Normal file
View File

@ -0,0 +1,108 @@
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use futures::FutureExt;
use tokio::sync::oneshot;
/// We define an AsyncStream to replace futures::Stream since we want neither to
/// implement poll_next nor to use async_stream.
/// Although we use GAT, we don't want the future to capture a reference to self.
/// We did it that way before, and it made it hard to load the stream in parallel
/// as Buffered does.
/// Also, our AsyncStream differs from Stream in its signature: we return
/// `Option<Future>` instead of `Future<Output = Option<_>>`.
pub trait AsyncStream {
type Item;
type Future: Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future>;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(0, None)
}
}
/// Buffered Stream.
/// By wrapping a stream in Buffered, its output futures are polled
/// concurrently.
/// Here I implement it by spawning tasks. It is indeed not as efficient as
/// the `FuturesOrdered` used by `futures-util::stream::Buffered`.
/// As a decorator of an async trait, it is hard to implement in a poll-based
/// way. We could do that, but it would break the safety boundary: the user
/// would have to make sure the AsyncStream is alive while polling the future,
/// since in our trait definition the future has no relation to self.
/// And without poll, we cannot drive multiple futures from one future.
pub struct Buffered<St>
where
St: AsyncStream,
{
stream: Option<St>,
queue: VecDeque<oneshot::Receiver<St::Item>>,
max: usize,
}
impl<St> Buffered<St>
where
St: AsyncStream,
{
pub fn new(stream: St, buffer_size: usize) -> Self {
Self {
stream: Some(stream),
queue: VecDeque::with_capacity(buffer_size),
max: buffer_size,
}
}
}
impl<St> fmt::Debug for Buffered<St>
where
St: AsyncStream + fmt::Debug,
St::Item: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Buffered")
.field("stream", &self.stream)
.field("queue", &self.queue)
.field("max", &self.max)
.finish()
}
}
impl<St> AsyncStream for Buffered<St>
where
St: AsyncStream,
St::Item: Send + 'static,
St::Future: Send + 'static,
{
type Item = St::Item;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
while self.queue.len() < self.max {
let item = match self.stream.as_mut() {
Some(st) => match st.next() {
Some(item) => Some(item),
None => {
self.stream = None;
None
}
},
None => None,
};
match item {
Some(f) => {
let (tx, rx) = oneshot::channel::<Self::Item>();
tokio::spawn(async move {
let _ = tx.send(f.await);
});
self.queue.push_back(rx);
}
None => break,
}
}
self.queue
.pop_front()
.map(|x| x.map(|xx| xx.expect("oneshot tx dropped which is unexpected")))
}
}
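// An ordering sketch (not from the original commit): a trivial AsyncStream over
// ready futures keeps its order through Buffered, since futures are spawned
// eagerly but dequeued FIFO.
#[cfg(test)]
mod tests {
    use super::*;

    struct Numbers(std::ops::Range<u32>);
    impl AsyncStream for Numbers {
        type Item = u32;
        type Future = std::future::Ready<u32>;
        fn next(&mut self) -> Option<Self::Future> {
            self.0.next().map(std::future::ready)
        }
    }

    #[tokio::test]
    async fn buffered_preserves_order() {
        let mut stream = Buffered::new(Numbers(0..5), 3);
        let mut out = Vec::new();
        while let Some(fut) = stream.next() {
            out.push(fut.await);
        }
        assert_eq!(out, vec![0, 1, 2, 3, 4]);
    }
}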

265
eh2telegraph/src/sync.rs Normal file
View File

@ -0,0 +1,265 @@
use crate::{
buffer::{DataSized, ImageBuffer},
collector::{
AlbumMeta, Collector, ImageData, ImageMeta, Param, Registry, URL_FROM_TEXT_RE,
URL_FROM_URL_RE,
},
http_proxy::ProxiedClient,
storage::{cloudflare_kv::CFStorage, KVStorage},
stream::{AsyncStream, Buffered},
telegraph::{
types::{Node, Page, PageCreate},
RandomAccessToken, Telegraph, TelegraphError, MAX_SINGLE_FILE_SIZE,
},
util::match_first_group,
};
const ERR_THRESHOLD: usize = 10;
const BATCH_LEN_THRESHOLD: usize = 20;
const BATCH_SIZE_THRESHOLD: usize = 5 * 1024 * 1024;
const DEFAULT_CONCURRENT: usize = 20;
#[derive(thiserror::Error, Debug)]
pub enum UploadError<SE> {
#[error("stream error {0}")]
Stream(SE),
#[error("telegraph error")]
Reqwest(#[from] TelegraphError),
}
pub struct Synchronizer<C = CFStorage> {
tg: Telegraph<RandomAccessToken, ProxiedClient>,
limit: Option<usize>,
author_name: Option<String>,
author_url: Option<String>,
cache_ttl: Option<usize>,
registry: Registry,
cache: C,
}
impl<CACHE> Synchronizer<CACHE>
where
CACHE: KVStorage<String>,
{
// cache ttl is 45 days
const DEFAULT_CACHE_TTL: usize = 3600 * 24 * 45;
pub fn new(
tg: Telegraph<RandomAccessToken, ProxiedClient>,
registry: Registry,
cache: CACHE,
) -> Self {
Self {
tg,
limit: None,
author_name: None,
author_url: None,
cache_ttl: None,
registry,
cache,
}
}
pub fn with_concurrent_limit(mut self, limit: usize) -> Self {
self.limit = Some(limit);
self
}
pub fn with_author<S: Into<String>>(mut self, name: Option<S>, url: Option<S>) -> Self {
self.author_name = name.map(Into::into);
self.author_url = url.map(Into::into);
self
}
pub fn with_cache_ttl(mut self, ttl: Option<usize>) -> Self {
self.cache_ttl = ttl;
self
}
pub async fn delete_cache(&self, key: &str) -> anyhow::Result<()> {
self.cache.delete(key).await
}
pub async fn sync<C: Collector>(&self, path: String) -> anyhow::Result<String>
where
Registry: Param<C>,
C::FetchError: Into<anyhow::Error> + Send + 'static,
C::StreamError:
Into<anyhow::Error> + std::fmt::Debug + std::fmt::Display + Send + Sync + 'static,
C::ImageStream: Send + 'static,
<C::ImageStream as AsyncStream>::Future: Send + 'static,
{
// check cache
let cache_key = format!("{}|{}", C::name(), path);
if let Ok(Some(v)) = self.cache.get(&cache_key).await {
tracing::info!("[cache] hit key {cache_key}");
return Ok(v);
}
tracing::info!("[cache] miss key {cache_key}");
let collector: &C = self.registry.get();
let (meta, stream) = collector.fetch(path).await.map_err(Into::into)?;
let page = self
.sync_stream(meta, stream)
.await
.map_err(anyhow::Error::from)?;
// set cache
let _ = self
.cache
.set(
cache_key,
page.url.clone(),
Some(self.cache_ttl.unwrap_or(Self::DEFAULT_CACHE_TTL)),
)
.await;
Ok(page.url)
}
pub async fn sync_stream<S, SE>(
&self,
meta: AlbumMeta,
stream: S,
) -> Result<Page, UploadError<SE>>
where
SE: Send + std::fmt::Debug + 'static,
S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
S::Future: Send + 'static,
{
let buffered_stream = Buffered::new(stream, self.limit.unwrap_or(DEFAULT_CONCURRENT));
let r = self.inner_sync_stream(meta, buffered_stream).await;
match &r {
Ok(p) => {
tracing::info!("[sync] sync success with url {}", p.url);
}
Err(e) => {
tracing::error!("[sync] sync fail! {e:?}");
}
}
r
}
async fn inner_sync_stream<S, SE>(
&self,
meta: AlbumMeta,
mut stream: S,
) -> Result<Page, UploadError<SE>>
where
S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
{
let mut err_count = 0;
let mut uploaded = Vec::new();
let mut buffer = ImageBuffer::new();
        // In this big loop, we download images and upload them in batches.
        // All meta info is then saved in `uploaded`.
loop {
            // TODO: loading images one by one is too slow!
            // We could spawn a background task (FuturesUnordered) and use a channel, while
            // still exposing it as an AsyncStream, so the consuming side needs no changes.
// 1. download images in batch
while let Some(fut) = stream.next() {
let data = match fut.await {
Err(e) => {
err_count += 1;
if err_count > ERR_THRESHOLD {
return Err(UploadError::Stream(e));
}
continue;
}
Ok(d) => {
err_count = 0;
d
}
};
// if the data size is too big to upload, we will discard it.
if data.1.len() >= MAX_SINGLE_FILE_SIZE {
tracing::error!("Too big file, discarded. Meta: {:?}", data.0);
continue;
}
buffer.push(data);
if buffer.len() > BATCH_LEN_THRESHOLD || buffer.size() > BATCH_SIZE_THRESHOLD {
break;
}
}
            // All data has been uploaded and there is nothing left to process,
            // so break the big loop.
if buffer.is_empty() {
break;
}
// 2. upload the batch
let (full_data, size) = buffer.swap();
let image_count = full_data.len();
tracing::debug!("download {image_count} images with size {size}, will upload them",);
let (meta, data) = full_data
.into_iter()
.map(|(a, b)| (a, b.as_ref().to_owned()))
.unzip::<_, _, Vec<_>, Vec<_>>();
let medium = self.tg.upload(data).await?;
err_count = 0;
// 3. add to uploaded
tracing::debug!("upload {image_count} images with size {size}, medium: {medium:?}");
uploaded.extend(
meta.into_iter()
.zip(medium.into_iter().map(|x| x.src))
.map(|(meta, src)| UploadedImage { meta, src }),
);
}
// create telegraph page
let mut content: Vec<_> = uploaded.into_iter().map(Into::into).collect();
content.push(Node::new_p_text("Generated by eh2telegraph."));
content.push(Node::new_p_text(format!("Original link: {}", meta.link)));
self.tg
.create_page(&PageCreate {
title: meta.name,
content,
author_name: self
.author_name
.clone()
.or_else(|| meta.authors.map(|x| x.join(", "))),
author_url: self.author_url.clone(),
})
.await
.map_err(Into::into)
}
}
impl Synchronizer {
pub fn match_url_from_text(content: &str) -> Option<&str> {
match_first_group(&URL_FROM_TEXT_RE, content)
}
pub fn match_url_from_url(content: &str) -> Option<&str> {
match_first_group(&URL_FROM_URL_RE, content)
}
}
impl DataSized for (ImageMeta, ImageData) {
#[inline]
fn size(&self) -> usize {
self.1.size()
}
}
struct UploadedImage {
#[allow(unused)]
meta: ImageMeta,
src: String,
}
impl From<UploadedImage> for Node {
fn from(i: UploadedImage) -> Self {
Node::new_image(format!("https://telegra.ph{}", i.src))
}
}

View File

@ -0,0 +1,47 @@
// Partly borrowed from https://github.com/Aloxaf/telegraph-rs/blob/master/src/error.rs
use serde::Deserialize;
use super::types::MediaInfo;
#[derive(thiserror::Error, Debug)]
pub enum TelegraphError {
#[error("api error {0}")]
Api(String),
#[error("reqwest error")]
Reqwest(#[from] reqwest::Error),
#[error("unexpected server result")]
Server,
}
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub(crate) enum ApiResult<T> {
Ok { result: T },
Err { error: String },
}
impl<T> From<ApiResult<T>> for Result<T, TelegraphError> {
fn from(r: ApiResult<T>) -> Self {
match r {
ApiResult::Ok { result: v } => Ok(v),
ApiResult::Err { error: e, .. } => Err(TelegraphError::Api(e)),
}
}
}
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub(crate) enum UploadResult {
Ok(Vec<MediaInfo>),
Err { error: String },
}
impl From<UploadResult> for Result<Vec<MediaInfo>, TelegraphError> {
fn from(r: UploadResult) -> Self {
match r {
UploadResult::Ok(v) => Ok(v),
UploadResult::Err { error } => Err(TelegraphError::Api(error)),
}
}
}
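// A deserialization sketch of the untagged results (the JSON bodies are
// simplified; assumes serde_json is available, as the client already consumes
// JSON responses).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn api_result_variants() {
        let ok: ApiResult<u32> = serde_json::from_str(r#"{"result": 3}"#).unwrap();
        let ok: Result<u32, TelegraphError> = ok.into();
        assert!(matches!(ok, Ok(3)));

        let err: ApiResult<u32> = serde_json::from_str(r#"{"error": "boom"}"#).unwrap();
        let err: Result<u32, TelegraphError> = err.into();
        assert!(matches!(err, Err(TelegraphError::Api(_))));
    }
}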

View File

@ -0,0 +1,284 @@
//! Telegraph API client.
pub use error::TelegraphError;
pub mod types;
pub const MAX_SINGLE_FILE_SIZE: usize = 5 * 1024 * 1024;
mod error;
use std::{borrow::Cow, sync::Arc};
use reqwest::{
multipart::{Form, Part},
Client, Response,
};
use serde::Serialize;
use crate::http_proxy::HttpRequestBuilder;
use self::{
error::{ApiResult, UploadResult},
types::{MediaInfo, Page, PageCreate, PageEdit},
};
#[derive(Debug, Clone)]
pub struct Telegraph<T, C = Client> {
// http client
client: C,
// access token
access_token: T,
}
pub trait AccessToken {
fn token(&self) -> &str;
fn select_token(&self, _path: &str) -> &str {
Self::token(self)
}
}
#[derive(Debug, Clone)]
pub struct SingleAccessToken(pub Arc<String>);
#[derive(Debug, Clone)]
pub struct RandomAccessToken(pub Arc<Vec<String>>);
impl AccessToken for SingleAccessToken {
fn token(&self) -> &str {
&self.0
}
}
impl From<String> for SingleAccessToken {
fn from(s: String) -> Self {
Self(Arc::new(s))
}
}
impl AccessToken for RandomAccessToken {
fn token(&self) -> &str {
use rand::prelude::SliceRandom;
self.0
.choose(&mut rand::thread_rng())
.expect("token list must contains at least one element")
}
}
impl From<String> for RandomAccessToken {
fn from(s: String) -> Self {
Self(Arc::new(vec![s]))
}
}
impl From<Vec<String>> for RandomAccessToken {
fn from(ts: Vec<String>) -> Self {
assert!(!ts.is_empty());
Self(Arc::new(ts))
}
}
macro_rules! execute {
($send: expr) => {
$send
.send()
.await
.and_then(Response::error_for_status)?
.json::<ApiResult<_>>()
.await?
.into()
};
}
#[derive(Debug, Clone, PartialEq, Eq, derive_more::From, derive_more::Into)]
pub struct TelegraphToken(Arc<String>);
impl<T> Telegraph<T, Client> {
pub fn new<AT>(access_token: AT) -> Telegraph<T, Client>
where
AT: Into<T>,
{
Telegraph {
client: Client::new(),
access_token: access_token.into(),
}
}
}
impl<T, C> Telegraph<T, C> {
pub fn with_proxy<P: HttpRequestBuilder + 'static>(self, proxy: P) -> Telegraph<T, P> {
Telegraph {
client: proxy,
access_token: self.access_token,
}
}
}
impl<T, C> Telegraph<T, C>
where
T: AccessToken,
C: HttpRequestBuilder,
{
/// Create page.
pub async fn create_page(&self, page: &PageCreate) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PagePostWithToken<'a> {
access_token: &'a str,
#[serde(flatten)]
page: &'a PageCreate,
}
let to_post = PagePostWithToken {
access_token: self.access_token.token(),
page,
};
execute!(self
.client
.post_builder("https://api.telegra.ph/createPage")
.form(&to_post))
}
/// Edit page.
pub async fn edit_page(&self, page: &PageEdit) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PageEditWithToken<'a> {
access_token: &'a str,
#[serde(flatten)]
page: &'a PageEdit,
}
let to_post = PageEditWithToken {
access_token: self.access_token.select_token(&page.path),
page,
};
execute!(self
.client
.post_builder("https://api.telegra.ph/editPage")
.form(&to_post))
}
/// Get page.
/// path: Path to the Telegraph page (in the format Title-12-31, i.e. everything
/// that comes after http://telegra.ph/)
pub async fn get_page(&self, path: &str) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PageGet<'a> {
path: &'a str,
            #[serde(skip_serializing_if = "Option::is_none")]
            return_content: Option<bool>,
}
let to_post = PageGet {
path,
return_content: Some(true),
};
execute!(self
.client
.post_builder("https://api.telegra.ph/getPage")
.form(&to_post))
}
    /// Upload files.
    /// If the result is Ok, its length must equal the number of input files.
pub async fn upload<IT, I>(&self, files: IT) -> Result<Vec<MediaInfo>, TelegraphError>
where
IT: IntoIterator<Item = I>,
I: Into<Cow<'static, [u8]>>,
{
let mut form = Form::new();
let mut cnt = 0;
for (idx, data) in files.into_iter().enumerate() {
let part = Part::bytes(data).file_name(idx.to_string());
form = form.part(idx.to_string(), part);
cnt += 1;
}
let r: Result<Vec<MediaInfo>, TelegraphError> = self
.client
.post_builder("https://telegra.ph/upload")
.multipart(form)
.send()
.await
.and_then(Response::error_for_status)?
.json::<UploadResult>()
.await?
.into();
// Here we check if server returns the same amount as files posted
r.and_then(|x| {
if x.len() != cnt {
Err(TelegraphError::Server)
} else {
Ok(x)
}
})
}
}
#[cfg(test)]
mod tests {
use crate::telegraph::{
types::{Node, PageCreate},
SingleAccessToken, Telegraph,
};
use super::types::{NodeElement, NodeElementAttr, Tag};
pub const TELEGRAPH_TOKEN: &str =
"f42d3570f95412b59b08d64450049e4d609b1f2a57657fce6ce8acc908aa";
#[ignore]
#[tokio::test]
async fn demo_create_page() {
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let page = PageCreate {
title: "title".to_string(),
content: vec![Node::Text("test text".to_string())],
author_name: Some("test_author".to_string()),
author_url: Some("https://t.co".to_string()),
};
let page = telegraph.create_page(&page).await.unwrap();
println!("test page: {:?}", page);
}
#[ignore]
#[tokio::test]
async fn demo_upload() {
let demo_image: Vec<u8> = reqwest::get("https://t.co/static/images/bird.png")
.await
.unwrap()
.bytes()
.await
.unwrap()
.as_ref()
.to_owned();
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let ret = telegraph
.upload(Some(demo_image))
.await
.unwrap()
.pop()
.unwrap();
println!("uploaded file link: {}", ret.src);
}
#[ignore]
#[tokio::test]
async fn demo_create_images_page() {
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let node = Node::NodeElement(NodeElement {
tag: Tag::Img,
attrs: Some(NodeElementAttr {
src: Some("https://telegra.ph/file/e31b40e99b0c028601ccb.png".to_string()),
href: None,
}),
children: None,
});
let page = PageCreate {
title: "title".to_string(),
content: vec![node],
author_name: Some("test_author".to_string()),
author_url: None,
};
let page = telegraph.create_page(&page).await.unwrap();
println!("test page: {:?}", page);
}
}

View File

@ -0,0 +1,215 @@
// Partly borrowed from https://github.com/Aloxaf/telegraph-rs/blob/master/src/types.rs
use serde::{Deserialize, Serialize};
/// This object represents a Telegraph account.
#[derive(Debug, Clone, Deserialize)]
pub struct Account {
/// Account name, helps users with several accounts remember which they are currently using.
///
/// Displayed to the user above the "Edit/Publish" button on Telegra.ph, other users don't see this name.
pub short_name: Option<String>,
/// Default author name used when creating new articles.
pub author_name: Option<String>,
/// Profile link, opened when users click on the author's name below the title.
///
/// Can be any link, not necessarily to a Telegram profile or channel.
pub author_url: Option<String>,
/// Optional. Only returned by the createAccount and revokeAccessToken method.
///
/// Access token of the Telegraph account.
pub access_token: Option<String>,
/// Optional. URL to authorize a browser on telegra.ph and connect it to a Telegraph account.
///
/// This URL is valid for only one use and for 5 minutes only.
pub auth_url: Option<String>,
/// Optional. Number of pages belonging to the Telegraph account.
pub page_count: Option<i32>,
}
/// This object represents a list of Telegraph articles belonging to an account. Most recently created articles first.
#[derive(Debug, Clone, Deserialize)]
pub struct PageList {
/// Total number of pages belonging to the target Telegraph account.
pub total_count: i32,
/// Requested pages of the target Telegraph account.
pub pages: Vec<Page>,
}
/// This object represents a page to create on Telegraph.
#[derive(Debug, Clone, Serialize)]
pub struct PageCreate {
/// Title of the page.
pub title: String,
/// Content of the page.
#[serde(with = "serde_with::json::nested")]
pub content: Vec<Node>,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: Option<String>,
}
/// This object represents a page to edit on Telegraph.
#[derive(Debug, Clone, Serialize)]
pub struct PageEdit {
/// Title of the page.
pub title: String,
/// Path to the page.
pub path: String,
/// Content of the page.
#[serde(with = "serde_with::json::nested")]
pub content: Vec<Node>,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: Option<String>,
}
/// This object represents a page on Telegraph.
#[derive(Debug, Clone, Deserialize)]
pub struct Page {
/// Path to the page.
pub path: String,
/// URL of the page.
pub url: String,
/// Title of the page.
pub title: String,
/// Description of the page.
pub description: String,
/// Optional. Name of the author, displayed below the title.
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
///
/// Can be any link, not necessarily to a Telegram profile or channel.
pub author_url: Option<String>,
/// Optional. Image URL of the page.
pub image_url: Option<String>,
/// Optional. Content of the page.
pub content: Option<Vec<Node>>,
/// Number of page views for the page.
pub views: i32,
/// Optional. Only returned if access_token passed.
///
/// True, if the target Telegraph account can edit the page.
pub can_edit: Option<bool>,
}
/// This object represents the number of page views for a Telegraph article.
#[derive(Debug, Clone, Deserialize)]
pub struct PageViews {
/// Number of page views for the target page.
pub views: i32,
}
/// This abstract object represents a DOM Node.
///
/// It can be a String which represents a DOM text node or a NodeElement object.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum Node {
Text(String),
NodeElement(NodeElement),
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum Tag {
A,
Aside,
B,
Blockquote,
Br,
Code,
Em,
Figcaption,
Figure,
H3,
H4,
Hr,
I,
Iframe,
Img,
Li,
Ol,
P,
Pre,
S,
Strong,
U,
Ul,
Video,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct NodeElementAttr {
#[serde(skip_serializing_if = "Option::is_none")]
pub href: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub src: Option<String>,
}
/// This object represents a DOM element node.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct NodeElement {
/// Name of the DOM element.
/// Available tags: a, aside, b, blockquote, br, code, em, figcaption, figure, h3, h4, hr,
/// i, iframe, img, li, ol, p, pre, s, strong, u, ul, video.
pub tag: Tag,
/// Optional. Attributes of the DOM element.
///
/// Key of object represents name of attribute, value represents value of attribute.
///
/// Available attributes: href, src.
#[serde(skip_serializing_if = "Option::is_none")]
pub attrs: Option<NodeElementAttr>,
/// Optional. List of child nodes for the DOM element.
#[serde(skip_serializing_if = "Option::is_none")]
pub children: Option<Vec<Node>>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct MediaInfo {
/// Path of the file uploaded.
pub src: String,
}
impl From<Page> for PageEdit {
fn from(p: Page) -> Self {
Self {
title: p.title,
path: p.path,
content: p.content.unwrap_or_default(),
author_name: p.author_name,
author_url: p.author_url,
}
}
}
impl Node {
pub fn new_p_text<S: Into<String>>(text: S) -> Self {
Node::NodeElement(NodeElement {
tag: Tag::P,
attrs: None,
children: Some(vec![Node::Text(text.into())]),
})
}
pub fn new_image<S: Into<String>>(src: S) -> Self {
Node::NodeElement(NodeElement {
tag: Tag::Img,
attrs: Some(NodeElementAttr {
src: Some(src.into()),
href: None,
}),
children: None,
})
}
}

83
eh2telegraph/src/tls.rs Normal file
View File

@ -0,0 +1,83 @@
use std::sync::Arc;
use rustls::{
client::{ServerCertVerifier, WebPkiVerifier},
Certificate, ClientConfig, RootCertStore, ServerName,
};
/// Custom verifier that tolerates a hostname mismatch as long as the
/// certificate is valid for one of the whitelisted DNS names.
pub struct WhitelistVerifier<const N: usize> {
    verifier: WebPkiVerifier,
    dns_names: [&'static str; N],
}
impl<const N: usize> WhitelistVerifier<N> {
pub fn new(dns_names: [&'static str; N]) -> Self {
use rustls::OwnedTrustAnchor;
let mut root_cert_store = RootCertStore::empty();
let trust_anchors = webpki_roots::TLS_SERVER_ROOTS.0.iter().map(|trust_anchor| {
OwnedTrustAnchor::from_subject_spki_name_constraints(
trust_anchor.subject,
trust_anchor.spki,
trust_anchor.name_constraints,
)
});
root_cert_store.add_server_trust_anchors(trust_anchors);
let verifier = WebPkiVerifier::new(root_cert_store, None);
Self {
verifier,
dns_names,
}
}
}
impl<const N: usize> From<WhitelistVerifier<N>> for ClientConfig {
fn from(v: WhitelistVerifier<N>) -> Self {
let mut cfg = ClientConfig::builder()
.with_safe_defaults()
.with_root_certificates(RootCertStore::empty())
.with_no_client_auth();
cfg.dangerous().set_certificate_verifier(Arc::new(v));
cfg
}
}
impl<const N: usize> ServerCertVerifier for WhitelistVerifier<N> {
fn verify_server_cert(
&self,
end_entity: &Certificate,
intermediates: &[Certificate],
server_name: &rustls::ServerName,
scts: &mut dyn Iterator<Item = &[u8]>,
ocsp_response: &[u8],
now: std::time::SystemTime,
) -> Result<rustls::client::ServerCertVerified, rustls::Error> {
let original_validate_result = self.verifier.verify_server_cert(
end_entity,
intermediates,
server_name,
scts,
ocsp_response,
now,
);
if original_validate_result.is_ok() {
return original_validate_result;
}
for dns_name in self.dns_names.iter() {
if let Ok(dns_name) = ServerName::try_from(*dns_name) {
let whitelist_validate_result = self.verifier.verify_server_cert(
end_entity,
intermediates,
&dns_name,
scts,
ocsp_response,
now,
);
if whitelist_validate_result.is_ok() {
return whitelist_validate_result;
}
}
}
original_validate_result
}
}

36
eh2telegraph/src/util.rs Normal file
View File

@ -0,0 +1,36 @@
use bytes::Bytes;
use regex::Regex;
use reqwest::Response;
use crate::http_proxy::HttpRequestBuilder;
#[inline]
pub fn match_first_group<'a>(regexp: &Regex, content: &'a str) -> Option<&'a str> {
regexp.captures(content).map(|c| {
c.get(1)
.expect("regexp is matched but no group 1 found")
.as_str()
})
}
#[inline]
pub async fn get_bytes<C: HttpRequestBuilder>(client: &C, link: &str) -> reqwest::Result<Bytes> {
client
.get_builder(link)
.send()
.await
.and_then(Response::error_for_status)?
.bytes()
.await
}
#[inline]
pub async fn get_string<C: HttpRequestBuilder>(client: &C, link: &str) -> reqwest::Result<String> {
client
.get_builder(link)
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await
}
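// A sketch of match_first_group's contract: Some(first capture) on a match,
// None otherwise.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn first_group() {
        let re = Regex::new(r"id=(\d+)").unwrap();
        assert_eq!(match_first_group(&re, "path?id=42"), Some("42"));
        assert_eq!(match_first_group(&re, "no match"), None);
    }
}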

1
rust-toolchain Normal file
View File

@ -0,0 +1 @@
nightly

49
worker/web_proxy.js Normal file
View File

@ -0,0 +1,49 @@
/*
Cloudflare Workers telegraph proxy.
Deploy it and set the `KEY` environment variable in the Workers dashboard.
*/
addEventListener('fetch', event => {
event.respondWith(handleRequest(event.request))
})
const RESPONSE_HEADERS = {
"Server": "web-proxy",
};
async function handleRequest(request) {
// validate request key
if (request.headers.get("X-Authorization") != KEY) {
return new Response(null, {
status: 401,
headers: RESPONSE_HEADERS
});
}
// read original url
var url = request.headers.get("X-Forwarded-For");
if (url == null || url == "") {
return new Response(null, {
status: 400,
headers: RESPONSE_HEADERS
});
}
// construct new url and request
var req = new Request(new URL(url), {
method: request.method,
headers: request.headers,
body: request.body
});
// remove headers
req.headers.delete("X-Authorization");
req.headers.delete("X-Forwarded-For");
req.headers.delete("CF-Connecting-IP");
req.headers.delete("CF-Worker");
req.headers.delete("CF-EW-Via");
// send request
var result = await fetch(req);
return result;
}