explosiver 2024-04-20 16:21:36 +08:00
commit 5f8dbb9678
49 changed files with 7548 additions and 0 deletions

2
.cargo/config.toml Normal file

@ -0,0 +1,2 @@
[build]
rustflags = ["--cfg", "unsound_local_offset"]

1
.dockerignore Normal file

@ -0,0 +1 @@
/target/

73
.github/workflows/ci.yaml vendored Normal file

@ -0,0 +1,73 @@
name: CI
on:
push:
paths-ignore:
- "**.md"
- "**.png"
pull_request:
paths-ignore:
- "**.md"
- "**.png"
env:
RUST_TOOLCHAIN: nightly
TOOLCHAIN_PROFILE: minimal
jobs:
lints:
name: Run cargo fmt and cargo clippy
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
components: rustfmt, clippy
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo fmt
uses: actions-rs/cargo@v1
with:
command: fmt
args: --all -- --check
- name: Run cargo check with no default features
uses: actions-rs/cargo@v1
with:
command: check
args: --no-default-features
- name: Run cargo check with all features
uses: actions-rs/cargo@v1
with:
command: check
args: --all-features
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings
test:
name: Run cargo test
runs-on: ubuntu-latest
steps:
- name: Checkout sources
uses: actions/checkout@v2
- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
profile: ${{ env.TOOLCHAIN_PROFILE }}
toolchain: ${{ env.RUST_TOOLCHAIN }}
override: true
- name: Cache
uses: Swatinem/rust-cache@v1
- name: Run cargo test --no-run
uses: actions-rs/cargo@v1
with:
command: test
args: --all-features --no-run
- name: Run cargo test
run: sudo bash -c "ulimit -Sl 512 && ulimit -Hl 512 && sudo -u runner RUSTUP_TOOLCHAIN=nightly /home/runner/.cargo/bin/cargo test --all-features"

45
.github/workflows/docker-build.yml vendored Normal file

@ -0,0 +1,45 @@
name: docker build and push
on:
push:
tags:
- 'v*'
env:
CARGO_NET_GIT_FETCH_WITH_CLI: true
jobs:
build:
name: 'Build'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Extract tag
id: prep
if: "startsWith(github.ref, 'refs/tags/v')"
run: |
echo "tags=ghcr.io/qini7-sese/ehbot:${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Login to GHCR
uses: docker/login-action@v1
with:
registry: ghcr.io
username: qini7-sese
password: ${{ secrets.GITHUB_TOKEN }}
- name: Generate App Version
run: echo VERSIONED_TAG=`git describe --tags --always` >> $GITHUB_ENV
- name: Build docker image
uses: docker/build-push-action@v2
with:
push: true
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/qini7-sese/ehbot:latest
ghcr.io/qini7-sese/ehbot:${{ env.VERSIONED_TAG }}

2
.gitignore vendored Normal file

@ -0,0 +1,2 @@
/target
config.yaml

2816
Cargo.lock generated Normal file

File diff suppressed because it is too large

7
Cargo.toml Normal file

@ -0,0 +1,7 @@
[workspace]
members = ["bot", "eh2telegraph"]
resolver = "2"
[profile.release]
lto = true
opt-level = 3

10
Dockerfile Normal file

@ -0,0 +1,10 @@
FROM rust:1-bullseye as builder
WORKDIR /usr/src/eh2telegraph
COPY . .
RUN cargo update
RUN cargo build --release
FROM debian:bullseye-slim
RUN apt-get update && apt-get -y install ca-certificates && rm -rf /var/lib/apt/lists/*
COPY --from=builder /usr/src/eh2telegraph/target/release/bot /usr/local/bin/bot
CMD ["/usr/local/bin/bot"]

201
LICENSE-APACHE Normal file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

23
LICENSE-MIT Normal file

@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

95
README-zh.md Normal file

@ -0,0 +1,95 @@
# eh2telegraph
Chinese | [English](README.md)
A bot that automatically downloads image sets from EH/EX/NH and uploads them to Telegraph.
This code is only guaranteed to work correctly on macOS (partial functionality) and Linux.
## Deployment Guide
1. Install Docker and docker-compose.
2. Create a new folder `ehbot`.
3. Copy `config_example.yaml` from this project into `ehbot`, rename it to `config.yaml`, and fill in the configuration details (see the next section).
4. Copy `docker-compose.yml` into `ehbot`.
5. Start and stop:
    1. Start: run `docker-compose up -d` in this folder.
    2. Stop: run `docker-compose down` in this folder.
    3. View logs: run `docker-compose logs` in this folder.
    4. Update the image: run `docker-compose pull` in this folder.
## Configuration Guide
1. Basic configuration:
    1. Bot Token: ask @BotFather in Telegram for one.
    2. Admin (can be empty): your Telegram ID; you can get it from any relevant bot (this bot's `/id` command works too).
    3. Telegraph: use your browser to create a Telegraph token via [this link](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author) and fill it in. You can also change the author name and URL.
2. Proxy configuration:
    1. Deploy `worker/web_proxy.js` of this repository to Cloudflare Workers and set the `KEY` environment variable to a random string (the `KEY` prevents unauthorized requests to the proxy).
    2. Fill the URL and KEY into the config.
    3. The proxy is used to request some rate-limited services; do not abuse it.
3. IPv6 configuration:
    1. You can fill in an IPv6 segment; if you do not own a large one (meaning larger than `/64`), leave it blank.
    2. If you fill it in, you need to enable the `net.ipv6.ip_nonlocal_bind` kernel parameter (see the later section).
    3. Configuring IPv6 somewhat alleviates per-IP rate limiting.
4. Configure cookies for some collectors:
    1. Currently only exhentai needs them.
5. KV configuration:
    1. This project uses a built-in caching service to avoid repeated synchronization of an image set.
    2. Please refer to [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) for deployment, and fill it into the config file.
    3. If you don't want to use remote caching, you can use a pure in-memory cache instead (it is lost on restart); this requires modifying the code and recompiling (see the sketch below).
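A minimal sketch of such a pure in-memory replacement, for illustration only; the real storage layer is the project's `storage` module (`SimpleMemStorage` / `CFOrMemStorage`):

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

/// Minimal sketch of an in-memory cache with expiry. Unlike the
/// Cloudflare KV proxy, all entries are lost when the bot restarts.
struct MemCache {
    ttl: Duration,
    map: HashMap<String, (Instant, String)>,
}

impl MemCache {
    fn new(ttl: Duration) -> Self {
        Self { ttl, map: HashMap::new() }
    }

    fn put(&mut self, key: &str, value: String) {
        self.map.insert(key.to_string(), (Instant::now(), value));
    }

    fn get(&self, key: &str) -> Option<&String> {
        self.map
            .get(key)
            .filter(|(created, _)| created.elapsed() < self.ttl)
            .map(|(_, value)| value)
    }
}
```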
## Development Guide
### Environment
Requires the latest nightly Rust. VSCode or CLion is recommended for development.
For users in mainland China, [RsProxy](https://rsproxy.cn/) is recommended as the crates.io mirror and toolchain installation source.
### Releases
Any tag starting with `v` triggers a Docker build. You can create the tag directly in git and push it, but it is easier to publish a release on GitHub and name it with a `v` prefix.
## Technical Details
Although this project is just a simple crawler, a few points deserve explanation.
### GitHub Actions Builds
GitHub Actions automatically builds the Docker images; this project auto-builds for the `x86_64` platform.
An `arm64` build is also possible, but since it uses qemu to emulate the arm environment on x86_64 it is extremely slow (more than an hour per build), so it is not enabled.
### IPv6 Ghost Client (a made-up name)
Some sites enforce per-IP access frequency limits, which using multiple IPs can mitigate. The most common approach in practice is a proxy pool, but proxy pools are often very unstable, need maintenance, and may cost money.
Looking at this project's target sites, many sit behind Cloudflare, which supports IPv6 and rate-limits at `/64` granularity. If we bind a larger IPv6 segment to the machine and randomly pick source IPs from it as client exit addresses, we can steadily make more frequent requests.
Since the NIC only binds a single IPv6 address, we need to enable `net.ipv6.ip_nonlocal_bind`.
With IPv6 configured, this project sends requests from random IPs in the segment to target sites reachable over IPv6.
Configuration (the NIC part can be written into `if-up` for persistence):
1. `sudo ip addr add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. Set `net.ipv6.ip_nonlocal_bind=1` via sysctl. This step varies by distribution (for example, the familiar `/etc/sysctl.conf` does not exist on Arch Linux).
Where to get IPv6? he.net offers a free service, and buying an IPv6 segment yourself is not expensive either.
You can verify the setup with `curl --interface 2001:***** ifconfig.co`.
### Forcing IPv6
The sites mentioned above use Cloudflare but do not actually enable IPv6: force an IPv6 request with curl and you will find there is no AAAA record at all. But because Cloudflare's infrastructure is anycast, as long as the target site does not explicitly reject IPv6 visitors in its code, it can still be reached over IPv6.
1. telegra.ph: no AAAA record, but forcing resolution to Telegram's entry IP works; the certificate served is for `*.telegram.org`, though.
~~This project included a TLS verifier that validates the certificate against a designated domain, tolerating the misconfigured certificate without sacrificing security.~~
However, Telegraph fixed the problem very quickly, so this TLS verifier is currently disabled.
2. EH/NH: forcing IPv6 works.
3. EX: does not use Cloudflare and has no IPv6 service.
### Proxy
This project uses Cloudflare Workers as a proxy for some APIs, easing rate limits when IPv6 is unavailable. See `src/http_proxy.rs` and `worker/web_proxy.js`; a rough sketch of the idea follows.
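The sketch below is illustrative only: the actual wire format is defined by `src/http_proxy.rs` and `worker/web_proxy.js`, and the query parameter and header usage here are made up for the sketch.

```rust
use reqwest::header;

/// Hypothetical sketch: fetch a rate-limited URL through the worker.
/// `endpoint` and `key` correspond to the `proxy` section of config.yaml.
async fn get_via_proxy(
    endpoint: &str, // e.g. https://proxy.xxx.workers.dev/
    key: &str,      // the KEY environment variable set on the worker
    target: &str,   // the URL the worker should fetch on our behalf
) -> reqwest::Result<String> {
    reqwest::Client::new()
        .get(endpoint)
        .query(&[("target", target)])
        .header(header::AUTHORIZATION, key)
        .send()
        .await?
        .text()
        .await
}
```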
### Caching
To minimize duplicate pulls, this project uses an in-memory cache plus a remote persistent cache. The remote persistent cache is built with a Cloudflare Worker backed by Cloudflare KV; the main code is [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy).
Since synchronizing an image set takes some time, this project uses [singleflight-async](https://github.com/ihciah/singleflight-async) to avoid duplicate concurrent synchronizations.
## Contributing
Contributions to this project are welcome, no matter how small!

94
README.md Normal file

@ -0,0 +1,94 @@
# eh2telegraph
[Chinese](README-zh.md) | English
A bot that automatically downloads image sets from EH/EX/NH and uploads them to Telegraph.
This code is only guaranteed to work correctly on macOS (partial functionality) and Linux.
## Deployment Guidelines
1. Install Docker and docker-compose.
2. Create a new folder `ehbot`.
3. Copy `config_example.yaml` from the project to `ehbot`, rename it to `config.yaml`, then fill in the configuration details (see the next section).
4. Copy `docker-compose.yml` to `ehbot`.
5. Start and shutdown:
    1. Start: run `docker-compose up -d` in this folder.
    2. Shutdown: run `docker-compose down` in this folder.
    3. View logs: run `docker-compose logs` in this folder.
    4. Update the image: run `docker-compose pull` in this folder.
## Configuration Guidelines
1. Basic configuration:
    1. Bot Token: ask @BotFather in Telegram for one.
    2. Admin (can be empty): your Telegram ID; you can get it from any relevant bot (this bot's `/id` command works too).
    3. Telegraph: use your browser to create a Telegraph token via [this link](https://api.telegra.ph/createAccount?short_name=test_account&author_name=test_author) and fill it in. You can also change the author name and URL.
2. Proxy configuration:
    1. Deploy `worker/web_proxy.js` of this repository to Cloudflare Workers and set the `KEY` environment variable to a random string (the `KEY` prevents unauthorized requests to the proxy).
    2. Fill the URL and KEY into the yaml.
    3. The proxy is used to request some rate-limited services; do not abuse it.
3. IPv6 configuration:
    1. You can specify an IPv6 segment; if you do not have a large one (meaning larger than `/64`), leave it blank.
    2. If you fill it in, you need to enable the `net.ipv6.ip_nonlocal_bind` kernel parameter (see the later section).
    3. Configuring IPv6 somewhat alleviates per-IP rate limiting.
4. Configure cookies for some collectors:
    1. Currently only exhentai needs them.
5. KV configuration:
    1. This project uses a built-in caching service to avoid repeated synchronization of an image set.
    2. Please refer to [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy) for deployment and fill in the yaml file.
    3. If you don't want to use remote caching, you can use a pure in-memory cache instead (it is lost on restart); this requires modifying the code and recompiling it yourself.
## Development Guidelines
### Environment
Requires the latest nightly Rust. VSCode or CLion is recommended for development.
[RsProxy](https://rsproxy.cn/) is recommended as the crates.io mirror and toolchain installation source for users in mainland China.
### Version Release
A Docker build is triggered by pushing a tag that starts with `v`. You can create the tag directly in git and push it; however, it is easier to publish a release on GitHub and name it with a `v` prefix.
## Technical Details
Although this project is a simple crawler, there are still some considerations that need to be explained.
### GitHub Actions Builds
GitHub Actions is used to automatically build Docker images, and this project supports automatic builds for the `x86_64` platform.
An `arm64` build is also possible, but it is not enabled because it uses qemu to emulate the arm environment on x86_64 and is therefore extremely slow (more than an hour for a single build).
### IPv6 Ghost Client (a made-up name)
Some sites enforce per-IP access frequency limits, which can be mitigated by using multiple IPs. The most common approach in practice is a proxy pool, but proxy pools are often extremely unstable, require maintenance, and may cost money.
Observing the target sites of this project, many sit behind Cloudflare, which supports IPv6 and rate-limits at `/64` granularity. If we bind a larger IPv6 segment to the local machine and randomly select source IPs from it as client exit addresses, we can steadily make more frequent requests.
Since the NIC only binds a single IPv6 address, we need to enable `net.ipv6.ip_nonlocal_bind`.
After configuring IPv6, this project will send requests from random IPs in the segment to target sites that are reachable over IPv6.
Configuration (the NIC configuration can be written in `if-up` for persistence):
1. `sudo ip addr add local 2001:x:x::/48 dev lo`
2. `sudo ip route add local 2001:x:x::/48 dev your-interface`
3. Configure `net.ipv6.ip_nonlocal_bind=1` in sysctl. This step varies by distribution (for example, the common `/etc/sysctl.conf` does not exist in Arch Linux).
Where to get IPv6? he.net offers a free service for this, and buying an IPv6 segment yourself is not expensive either.
You can test whether the configuration is correct with `curl --interface 2001:***** ifconfig.co`.
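A minimal sketch of the trick, using the `ipnet` and `rand` crates (both already dependencies of `eh2telegraph`); the real logic lives in the project's `GhostClient`:

```rust
use std::net::{IpAddr, Ipv6Addr};

use ipnet::Ipv6Net;
use rand::Rng;

/// Pick a random address inside the configured prefix, e.g. 2001:db8::/48.
/// (Assumes a non-zero prefix length.)
fn random_addr(net: Ipv6Net) -> Ipv6Addr {
    let base = u128::from(net.network());
    let span = 1u128 << (128 - net.prefix_len()); // addresses in the prefix
    Ipv6Addr::from(base + rand::thread_rng().gen_range(0..span))
}

/// Bind a fresh random source address per client. With
/// net.ipv6.ip_nonlocal_bind=1 the kernel accepts binding an address
/// that is not assigned to any interface.
fn client_with_random_ip(net: Ipv6Net) -> reqwest::Result<reqwest::Client> {
    reqwest::Client::builder()
        .local_address(IpAddr::V6(random_addr(net)))
        .build()
}
```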
### Forcing IPv6
The sites mentioned in the previous subsection use Cloudflare but in fact do not really enable IPv6: when you force an IPv6 request with curl, you will find they have no AAAA records at all. But because Cloudflare's infrastructure is anycast, if the target site does not explicitly reject IPv6 visitors in its code, it can still be reached over IPv6 (a sketch follows the list below).
1. telegra.ph: no AAAA records, but forcing resolution to Telegram's entry IP works; the certificate served is for `*.telegram.org`, though.
~~This project included a TLS verifier that validates the certificate against a designated domain, tolerating the misconfigured certificate without sacrificing security.~~
However, Telegraph fixed the problem very quickly, so this TLS verifier is currently disabled.
2. EH/NH: forcing IPv6 works.
3. EX: does not use Cloudflare and has no IPv6 service.
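For illustration, this kind of forced resolution can be expressed with reqwest's `resolve` override. The address below is a placeholder, not a real entry IP; the project's actual implementation lives in its `GhostClient` / CF-resolve machinery:

```rust
use std::net::SocketAddr;

/// Sketch: skip DNS (there is no AAAA record anyway) and pin the host
/// to a hand-picked IPv6 entry address. Placeholder address shown.
fn forced_ipv6_client() -> reqwest::Result<reqwest::Client> {
    let entry: SocketAddr = "[2001:db8::1]:443".parse().unwrap();
    reqwest::Client::builder()
        .resolve("e-hentai.org", entry)
        .build()
}
```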
### Proxy
This project uses Cloudflare Workers as a proxy for some APIs to alleviate rate limiting when IPv6 is not available. See `src/http_proxy.rs` and `worker/web_proxy.js`.
### Caching
To minimize duplicate pulls, this project uses an in-memory cache and a remote persistent cache. The remote persistent cache is built with a Cloudflare Worker backed by Cloudflare KV; the main code is [cloudflare-kv-proxy](https://github.com/ihciah/cloudflare-kv-proxy).
Since synchronizing an image set takes some time, this project uses [singleflight-async](https://github.com/ihciah/singleflight-async) to avoid repeated concurrent synchronization of the same set; a usage sketch follows.
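A usage sketch of the pattern, matching how the API is used in `bot/src/handler.rs`: concurrent callers with the same key share one execution and all receive its result.

```rust
use singleflight_async::SingleFlight;

async fn expensive_sync(path: &str) -> String {
    // Stand-in for the real "download gallery, upload to Telegraph" work.
    format!("https://telegra.ph/synced{path}")
}

#[tokio::main]
async fn main() {
    let group: SingleFlight<String> = SingleFlight::default();
    // Both calls use the same key, so the work runs only once.
    let (a, b) = tokio::join!(
        group.work("/g/123", || expensive_sync("/g/123")),
        group.work("/g/123", || expensive_sync("/g/123")),
    );
    assert_eq!(a, b);
}
```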
## Contribution Guidelines
You are welcome to contribute code to this project, no matter how small the commit is!

49
bot/Cargo.toml Normal file

@ -0,0 +1,49 @@
[package]
edition = "2021"
name = "bot"
version = "0.1.17"
[dependencies]
eh2telegraph = { path = "../eh2telegraph" }
anyhow = "1"
chrono = "0.4"
clap = { version = "4", features = ["derive"] }
dptree = "0.3"
once_cell = "1"
regex = "1"
reqwest = { version = "0.12", default-features = false, features = [
"json",
"multipart",
"rustls-tls",
] }
serde = { version = "1", features = ["derive"] }
singleflight-async = { version = "0.1", features = ["hardware-lock-elision"] }
teloxide = { version = "0.12", features = [
"macros",
"ctrlc_handler",
"auto-send",
] }
time = { version = "0.3.34", features = ["local-offset", "std", "macros"] }
tokio = { version = "1", default-features = false, features = [
"rt-multi-thread",
"macros",
"net",
"sync",
"time",
"parking_lot",
] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = [
"local-time",
"parking_lot",
"time",
"env-filter",
] }
[build-dependencies]
vergen = { version = "8", default-features = false, features = [
"build",
"cargo",
"rustc",
] }

11
bot/build.rs Normal file

@ -0,0 +1,11 @@
use vergen::EmitBuilder;
fn main() {
// Generate the default 'cargo:' instruction output
EmitBuilder::builder()
.all_build()
.all_cargo()
.all_rustc()
.emit()
.unwrap();
}

412
bot/src/handler.rs Normal file

@ -0,0 +1,412 @@
use std::{borrow::Cow, collections::HashSet};
use eh2telegraph::{
collector::{e_hentai::EHCollector, exhentai::EXCollector, nhentai::NHCollector},
searcher::{
f_hash::FHashConvertor,
saucenao::{SaucenaoOutput, SaucenaoParsed, SaucenaoSearcher},
ImageSearcher,
},
storage::KVStorage,
sync::Synchronizer,
};
use reqwest::Url;
use teloxide::{
adaptors::DefaultParseMode,
prelude::*,
utils::{
command::BotCommands,
markdown::{code_inline, escape, link},
},
};
use tracing::{info, trace};
use crate::{ok_or_break, util::PrettyChat};
const MIN_SIMILARITY: u8 = 70;
const MIN_SIMILARITY_PRIVATE: u8 = 50;
#[derive(BotCommands, Clone)]
#[command(
rename_rule = "lowercase",
description = "\
This is a gallery synchronization robot that is convenient for users to view pictures directly in Telegram.\n\
便 Telegram \n\
Bot supports sync with command, text url, or image(private chat search thrashold is lower).\n\
() \n\n\
Bot develop group / Bot https://t.me/TGSyncBotWorkGroup\n\
And welcome to join image channel / https://t.me/sesecollection\n\n\
These commands are supported:\n\
:"
)]
pub enum Command {
#[command(description = "Display this help. 显示这条帮助信息。")]
Help,
#[command(description = "Show bot verison. 显示机器人版本。")]
Version,
#[command(description = "Show your account id. 显示你的账号 ID。")]
Id,
#[command(
description = "Sync a gallery (e-hentai/exhentai/nhentai are supported now)."
)]
Sync(String),
}
#[derive(BotCommands, Clone)]
#[command(rename_rule = "lowercase", description = "Command for admins")]
pub enum AdminCommand {
#[command(description = "Delete cache with given key.")]
Delete(String),
}
pub struct Handler<C> {
pub synchronizer: Synchronizer<C>,
pub searcher: SaucenaoSearcher,
pub convertor: FHashConvertor,
pub admins: HashSet<i64>,
single_flight: singleflight_async::SingleFlight<String>,
}
impl<C> Handler<C>
where
C: KVStorage<String> + Send + Sync + 'static,
{
pub fn new(synchronizer: Synchronizer<C>, admins: HashSet<i64>) -> Self {
Self {
synchronizer,
searcher: SaucenaoSearcher::new_from_config(),
convertor: FHashConvertor::new_from_config(),
admins,
single_flight: Default::default(),
}
}
/// Executed when a command comes in and is parsed successfully.
pub async fn respond_cmd(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
command: Command,
) -> ControlFlow<()> {
match command {
Command::Help => {
let _ = bot
.send_message(msg.chat.id, escape(&Command::descriptions().to_string()))
.reply_to_message_id(msg.id)
.await;
}
Command::Version => {
let _ = bot
.send_message(msg.chat.id, escape(crate::version::VERSION))
.reply_to_message_id(msg.id)
.await;
}
Command::Id => {
let _ = bot
.send_message(
msg.chat.id,
format!(
"Current chat id is {} \\(in private chat this is your account id\\)",
code_inline(&msg.chat.id.to_string())
),
)
.reply_to_message_id(msg.id)
.await;
}
Command::Sync(url) => {
if url.is_empty() {
let _ = bot
.send_message(msg.chat.id, escape("Usage: /sync url"))
.reply_to_message_id(msg.id)
.await;
return ControlFlow::Break(());
}
info!(
"[cmd handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
};
ControlFlow::Break(())
}
pub async fn respond_admin_cmd(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
command: AdminCommand,
) -> ControlFlow<()> {
match command {
AdminCommand::Delete(key) => {
tokio::spawn(async move {
let _ = self.synchronizer.delete_cache(&key).await;
let _ = bot
.send_message(msg.chat.id, escape(&format!("Key {key} deleted.")))
.reply_to_message_id(msg.id)
.await;
});
ControlFlow::Break(())
}
}
}
pub async fn respond_text(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
) -> ControlFlow<()> {
let maybe_link = {
let entries = msg
.entities()
.map(|es| {
es.iter().filter_map(|e| {
if let teloxide::types::MessageEntityKind::TextLink { url } = &e.kind {
Synchronizer::match_url_from_text(url.as_ref()).map(ToOwned::to_owned)
} else {
None
}
})
})
.into_iter()
.flatten();
msg.text()
.and_then(|content| {
Synchronizer::match_url_from_text(content).map(ToOwned::to_owned)
})
.into_iter()
.chain(entries)
.next()
};
if let Some(url) = maybe_link {
info!(
"[text handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
return ControlFlow::Break(());
}
// fallback to the next branch
ControlFlow::Continue(())
}
pub async fn respond_caption(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
) -> ControlFlow<()> {
let caption_entities = msg.caption_entities();
let mut final_url = None;
for entry in caption_entities.map(|x| x.iter()).into_iter().flatten() {
let url = match &entry.kind {
teloxide::types::MessageEntityKind::Url => {
let raw = msg
.caption()
.expect("Url MessageEntry found but caption is None");
let encoded: Vec<_> = raw
.encode_utf16()
.skip(entry.offset)
.take(entry.length)
.collect();
let content = ok_or_break!(String::from_utf16(&encoded));
Cow::from(content)
}
teloxide::types::MessageEntityKind::TextLink { url } => Cow::from(url.as_ref()),
_ => {
continue;
}
};
let url = if let Some(c) = Synchronizer::match_url_from_url(&url) {
c
} else {
continue;
};
final_url = Some(url.to_string());
break;
}
match final_url {
Some(url) => {
info!(
"[caption handler] receive sync request from {:?} for {url}",
PrettyChat(&msg.chat)
);
let msg: Message = ok_or_break!(
bot.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
);
let url = url.to_string();
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
ControlFlow::Break(())
}
None => ControlFlow::Continue(()),
}
}
pub async fn respond_photo(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
) -> ControlFlow<()> {
let first_photo = match msg.photo().and_then(|x| x.first()) {
Some(p) => p,
None => {
return ControlFlow::Continue(());
}
};
let f = ok_or_break!(bot.get_file(&first_photo.file.id).await);
let mut buf: Vec<u8> = Vec::with_capacity(f.size as usize);
ok_or_break!(teloxide::net::Download::download_file(&bot, &f.path, &mut buf).await);
let search_result: SaucenaoOutput = ok_or_break!(self.searcher.search(buf).await);
let mut url_sim = None;
let threshold = if msg.chat.is_private() {
MIN_SIMILARITY_PRIVATE
} else {
MIN_SIMILARITY
};
for element in search_result
.data
.into_iter()
.filter(|x| x.similarity >= threshold)
{
match element.parsed {
SaucenaoParsed::EHentai(f_hash) => {
url_sim = Some((
ok_or_break!(self.convertor.convert_to_gallery(&f_hash).await),
element.similarity,
));
break;
}
SaucenaoParsed::NHentai(nid) => {
url_sim = Some((format!("https://nhentai.net/g/{nid}/"), element.similarity));
break;
}
_ => continue,
}
}
let (url, sim) = match url_sim {
Some(u) => u,
None => {
trace!("[photo handler] image not found");
return ControlFlow::Continue(());
}
};
info!(
"[photo handler] receive sync request from {:?} for {url} with similarity {sim}",
PrettyChat(&msg.chat)
);
if let Ok(msg) = bot
.send_message(msg.chat.id, escape(&format!("Syncing url {url}")))
.reply_to_message_id(msg.id)
.await
{
tokio::spawn(async move {
let _ = bot
.edit_message_text(msg.chat.id, msg.id, self.sync_response(&url).await)
.await;
});
}
ControlFlow::Break(())
}
pub async fn respond_default(
&'static self,
bot: DefaultParseMode<Bot>,
msg: Message,
) -> ControlFlow<()> {
if msg.chat.is_private() {
ok_or_break!(
bot.send_message(msg.chat.id, escape("Unrecognized message."))
.reply_to_message_id(msg.id)
.await
);
}
#[cfg(debug_assertions)]
tracing::warn!("{:?}", msg);
ControlFlow::Break(())
}
async fn sync_response(&self, url: &str) -> String {
self.single_flight
.work(url, || async {
match self.route_sync(url).await {
Ok(url) => {
format!("Sync to telegraph finished: {}", link(&url, &escape(&url)))
}
Err(e) => {
format!("Sync to telegraph failed: {}", escape(&e.to_string()))
}
}
})
.await
}
async fn route_sync(&self, url: &str) -> anyhow::Result<String> {
let u = Url::parse(url).map_err(|_| anyhow::anyhow!("Invalid url"))?;
let host = u.host_str().unwrap_or_default();
let path = u.path().to_string();
// TODO: use macro to generate them
#[allow(clippy::single_match)]
match host {
"e-hentai.org" => {
info!("[registry] sync e-hentai for path {}", path);
self.synchronizer
.sync::<EHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"nhentai.to" | "nhentai.net" => {
info!("[registry] sync nhentai for path {}", path);
self.synchronizer
.sync::<NHCollector>(path)
.await
.map_err(anyhow::Error::from)
}
"exhentai.org" => {
info!("[registry] sync exhentai for path {}", path);
self.synchronizer
.sync::<EXCollector>(path)
.await
.map_err(anyhow::Error::from)
}
_ => Err(anyhow::anyhow!("no matching collector")),
}
}
}

239
bot/src/main.rs Normal file

@ -0,0 +1,239 @@
#![feature(control_flow_enum)]
use eh2telegraph::{
collector::Registry,
config::{self},
http_proxy::ProxiedClient,
storage,
sync::Synchronizer,
telegraph::Telegraph,
};
use clap::Parser;
use once_cell::sync::OnceCell;
use teloxide::{
adaptors::DefaultParseMode,
error_handlers::IgnoringErrorHandler,
prelude::*,
types::{AllowedUpdate, ChatPermissions, ParseMode, UpdateKind},
update_listeners,
};
use tracing::level_filters::LevelFilter;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use handler::{Command, Handler};
use crate::{
handler::AdminCommand,
util::{wrap_endpoint, PrettyChat},
};
mod handler;
mod util;
mod version;
#[derive(Debug, serde::Deserialize)]
pub struct BaseConfig {
pub bot_token: String,
pub telegraph: TelegraphConfig,
#[serde(default)]
pub admins: Vec<i64>,
}
#[derive(Debug, serde::Deserialize)]
pub struct TelegraphConfig {
pub tokens: Vec<String>,
pub author_name: Option<String>,
pub author_url: Option<String>,
}
#[derive(Parser, Debug)]
#[clap(author, version=version::VERSION, about, long_about = "eh2telegraph sync bot")]
struct Args {
#[clap(short, long, help = "Config file path")]
config: Option<String>,
}
static PROCESS_MESSAGE_DATE: OnceCell<chrono::DateTime<chrono::Utc>> = OnceCell::new();
#[tokio::main]
async fn main() {
let args = Args::parse();
let timer = tracing_subscriber::fmt::time::LocalTime::new(time::macros::format_description!(
"[month]-[day] [hour]:[minute]:[second]"
));
// We only process messages sent within the last day.
PROCESS_MESSAGE_DATE
.set(
chrono::Utc::now()
.checked_sub_signed(chrono::Duration::try_days(1).unwrap())
.expect("illegal current date"),
)
.expect("unable to set global date");
tracing_subscriber::registry()
.with(fmt::layer().with_timer(timer))
.with(
EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy(),
)
.init();
tracing::info!("initializing...");
config::init(args.config);
let base_config: BaseConfig = config::parse("base")
.expect("unable to parse base config")
.expect("base config can not be empty");
let telegraph_config = base_config.telegraph;
let telegraph =
Telegraph::new(telegraph_config.tokens).with_proxy(ProxiedClient::new_from_config());
let registry = Registry::new_from_config();
#[cfg(debug_assertions)]
let cache = storage::SimpleMemStorage::default();
#[cfg(not(debug_assertions))]
let cache = storage::cloudflare_kv::CFOrMemStorage::new_from_config();
let mut synchronizer = Synchronizer::new(telegraph, registry, cache);
if telegraph_config.author_name.is_some() {
synchronizer =
synchronizer.with_author(telegraph_config.author_name, telegraph_config.author_url);
}
let admins = base_config.admins.into_iter().collect();
let handler = Box::leak(Box::new(Handler::new(synchronizer, admins))) as &Handler<_>;
// === Bot related ===
let command_handler = move |bot: DefaultParseMode<Bot>, message: Message, command: Command| async move {
handler.respond_cmd(bot, message, command).await
};
let admin_command_handler =
move |bot: DefaultParseMode<Bot>, message: Message, command: AdminCommand| async move {
handler.respond_admin_cmd(bot, message, command).await
};
let text_handler = move |bot: DefaultParseMode<Bot>, message: Message| async move {
handler.respond_text(bot, message).await
};
let caption_handler = move |bot: DefaultParseMode<Bot>, message: Message| async move {
handler.respond_caption(bot, message).await
};
let photo_handler = move |bot: DefaultParseMode<Bot>, message: Message| async move {
handler.respond_photo(bot, message).await
};
let default_handler = move |bot: DefaultParseMode<Bot>, message: Message| async move {
handler.respond_default(bot, message).await
};
let permission_filter = |bot: DefaultParseMode<Bot>, message: Message| async move {
// If the bot is not allowed to send messages, leave the chat and do not respond.
let blocked = message
.chat
.permissions()
.map(|p| !p.contains(ChatPermissions::SEND_MESSAGES))
.unwrap_or_default();
if blocked {
tracing::info!(
"[permission filter] leave chat {:?}",
PrettyChat(&message.chat)
);
let _ = bot.leave_chat(message.chat.id).await;
None
} else {
Some(message)
}
};
let time_filter = |message: Message| async move {
// Ignore old message.
// # Safety:
// We already set PROCESS_MESSAGE_DATE.
if &message.date > unsafe { PROCESS_MESSAGE_DATE.get_unchecked() } {
Some(message)
} else {
None
}
};
let bot = Bot::new(base_config.bot_token).parse_mode(ParseMode::MarkdownV2);
let mut bot_dispatcher = Dispatcher::builder(
bot.clone(),
dptree::entry()
.chain(dptree::filter_map(move |update: Update| {
match update.kind {
UpdateKind::Message(x) | UpdateKind::EditedMessage(x) => Some(x),
_ => None,
}
}))
.chain(dptree::filter_map_async(time_filter))
.chain(dptree::filter_map_async(permission_filter))
.branch(
dptree::entry()
.chain(dptree::filter(move |message: Message| {
handler.admins.contains(&message.chat.id.0)
}))
.filter_command::<AdminCommand>()
.branch(wrap_endpoint(admin_command_handler)),
)
.branch(
dptree::entry()
.filter_command::<Command>()
.branch(wrap_endpoint(command_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.text() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(text_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.caption_entities() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(caption_handler)),
)
.branch(
dptree::entry()
.chain(dptree::filter_map(move |message: Message| {
// Ownership mechanism does not allow using map.
#[allow(clippy::manual_map)]
match message.photo() {
Some(v) if !v.is_empty() => Some(message),
_ => None,
}
}))
.branch(wrap_endpoint(photo_handler)),
)
.branch(wrap_endpoint(default_handler)),
)
.default_handler(Box::new(|_upd| {
#[cfg(debug_assertions)]
tracing::warn!("Unhandled update: {:?}", _upd);
Box::pin(async {})
}))
.error_handler(std::sync::Arc::new(IgnoringErrorHandler))
.enable_ctrlc_handler()
.build();
let bot_listener = update_listeners::Polling::builder(bot)
.allowed_updates(vec![AllowedUpdate::Message])
.timeout(std::time::Duration::from_secs(10))
.build();
tracing::info!("initializing finished, bot is running");
bot_dispatcher
.dispatch_with_listener(
bot_listener,
LoggingErrorHandler::with_custom_text("An error from the update listener"),
)
.await;
}

67
bot/src/util.rs Normal file

@ -0,0 +1,67 @@
use std::{convert::Infallible, ops::ControlFlow, sync::Arc};
use dptree::{di::Injectable, from_fn_with_description, Handler, HandlerDescription};
pub struct PrettyChat<'a>(pub &'a teloxide::types::Chat);
impl<'a> std::fmt::Debug for PrettyChat<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0.is_group() || self.0.is_supergroup() {
write!(f, "GroupChat")?;
self.0.title().map(|x| write!(f, " title: {x}"));
self.0.description().map(|x| write!(f, " description: {x}"));
} else if self.0.is_private() {
write!(f, "PrivateChat")?;
self.0.username().map(|x| write!(f, " username: @{x}"));
self.0.first_name().map(|x| write!(f, " first_name: {x}"));
self.0.last_name().map(|x| write!(f, " last_name: {x}"));
self.0.bio().map(|x| write!(f, " bio: {x}"));
} else if self.0.is_channel() {
write!(f, "Channel")?;
self.0.username().map(|x| write!(f, " username: @{x}"));
self.0.title().map(|x| write!(f, " title: {x}"));
self.0
.description()
.map(|x| write!(f, ", description: {x}"));
}
Ok(())
}
}
pub fn wrap_endpoint<'a, F, Input, Output, FnArgs, Descr>(
f: F,
) -> Handler<'a, Input, Result<Output, Infallible>, Descr>
where
Input: Send + Sync + 'a,
Output: Send + Sync + 'a,
Descr: HandlerDescription,
F: Injectable<Input, ControlFlow<Output>, FnArgs> + Send + Sync + 'a,
{
let f = Arc::new(f);
from_fn_with_description(Descr::endpoint(), move |event, _cont| {
let f = Arc::clone(&f);
async move {
let f = f.inject(&event);
let cf = f().await;
drop(f);
match cf {
ControlFlow::Continue(_) => ControlFlow::Continue(event),
ControlFlow::Break(out) => ControlFlow::Break(Ok(out)),
}
}
})
}
#[macro_export]
macro_rules! ok_or_break {
($e: expr) => {
match $e {
Ok(r) => r,
Err(_) => {
return ControlFlow::Break(());
}
}
};
}

16
bot/src/version.rs Normal file

@ -0,0 +1,16 @@
pub(crate) static VERSION: &str = concat!(
"\n",
"Build Timestamp:\t",
env!("VERGEN_BUILD_TIMESTAMP"),
"\n",
"Package Version:\t",
env!("CARGO_PKG_VERSION"),
"\n",
"rustc Version: \t",
env!("VERGEN_RUSTC_SEMVER"),
"\n",
"Cargo Target: \t",
env!("VERGEN_CARGO_TARGET_TRIPLE"),
"\n",
"Source code: \thttps://github.com/qini7-sese/eh2telegraph"
);

27
config_example.yaml Normal file

@ -0,0 +1,27 @@
base:
bot_token: xxx:xxxx
admins:
- 0
telegraph:
tokens:
- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
author_name: Test Name
author_url: https://github.com/qini7-sese/eh2telegraph
proxy:
endpoint: https://proxy.xxx.workers.dev/
authorization: xxx
http:
ipv6_prefix:
exhentai:
ipb_pass_hash: xxx
ipb_member_id: xxx
igneous: xxx
worker_kv:
endpoint: https://kv.xxx.workers.dev
token: xxx
cache_size: 10240
expire_sec: 5184000 # 60 days

14
docker-compose.yml Normal file

@ -0,0 +1,14 @@
version: "3"
services:
ehbot:
image: ghcr.io/qini7-sese/ehbot:latest
container_name: ehbot
restart: always
network_mode: "host"
environment:
CONFIG_FILE: "/config.yaml"
TZ: Asia/Shanghai
volumes:
- "./config.yaml:/config.yaml:ro"
logging:
driver: journald

40
eh2telegraph/Cargo.toml Normal file

@ -0,0 +1,40 @@
[package]
edition = "2021"
name = "eh2telegraph"
version = "0.1.0"
[dependencies]
again = { version = "0.1", default-features = false, features = ["rand"] }
anyhow = "1"
bytes = "1"
cloudflare-kv-proxy = "0.2"
derive_more = { version = "0.99", features = ["from_str"] }
futures = "0.3"
hashlink = "0.9"
ipnet = "2"
lazy_static = "1"
once_cell = "1"
parking_lot = { version = "0.12", features = ["hardware-lock-elision"] }
rand = "0.8"
regex = "1"
reqwest = { version = "0.12", default-features = false, features = [
"json",
"multipart",
"rustls-tls",
] }
rustls = { version = "0.20", features = ["dangerous_configuration"] }
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
serde_yaml = "0.9"
thiserror = "1"
tokio = { version = "1", default-features = false, features = [
"rt-multi-thread",
"macros",
"net",
"sync",
"time",
"parking_lot",
] }
tracing = "0.1"
webpki = "0.22"
webpki-roots = "0.22"


@ -0,0 +1,95 @@
/// ImageBuffer for uploading images in batches.
pub struct ImageBuffer<T> {
buf: Vec<T>,
size: usize,
}
impl<T> Default for ImageBuffer<T> {
#[inline]
fn default() -> Self {
Self {
buf: Vec::new(),
size: 0,
}
}
}
impl<T> ImageBuffer<T>
where
T: DataSized,
{
#[inline]
pub fn new() -> Self {
Self::default()
}
#[inline]
pub fn with_capacity(n: usize) -> Self {
Self {
buf: Vec::with_capacity(n),
size: 0,
}
}
#[inline]
pub fn push(&mut self, data: T) {
self.size += data.size();
self.buf.push(data);
}
#[inline]
pub fn swap(&mut self) -> (Vec<T>, usize) {
let mut out = Vec::with_capacity(self.buf.len() * 2);
std::mem::swap(&mut self.buf, &mut out);
let mut size = 0;
std::mem::swap(&mut self.size, &mut size);
(out, size)
}
#[inline]
pub fn len(&self) -> usize {
self.buf.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.buf.len() == 0
}
#[inline]
pub fn size(&self) -> usize {
self.size
}
#[inline]
pub fn clear(&mut self) {
self.size = 0;
self.buf.clear();
}
}
pub trait DataSized {
fn size(&self) -> usize;
}
impl DataSized for bytes::Bytes {
#[inline]
fn size(&self) -> usize {
self.len()
}
}
impl DataSized for Vec<u8> {
#[inline]
fn size(&self) -> usize {
self.len()
}
}
impl<const N: usize> DataSized for Box<[u8; N]> {
#[inline]
fn size(&self) -> usize {
N
}
}
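// Illustrative usage sketch (not part of the original file): callers can
// accumulate downloaded images and flush them in one batch via swap().
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn push_then_flush() {
        let mut buf: ImageBuffer<Vec<u8>> = ImageBuffer::new();
        buf.push(vec![0u8; 1024]);
        buf.push(vec![0u8; 2048]);
        assert_eq!((buf.len(), buf.size()), (2, 3072));
        // swap() hands back the accumulated batch and resets the buffer.
        let (batch, size) = buf.swap();
        assert_eq!((batch.len(), size), (2, 3072));
        assert!(buf.is_empty());
    }
}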


@ -0,0 +1,253 @@
/// e-hentai collector.
/// Host matching: e-hentai.org
use crate::{
http_client::{GhostClient, GhostClientBuilder},
stream::AsyncStream,
util::match_first_group,
util::{get_bytes, get_string},
};
use again::RetryPolicy;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::header;
use std::time::Duration;
use super::{
utils::paged::{PageFormatter, PageIndicator, Paged},
AlbumMeta, Collector, ImageData, ImageMeta,
};
lazy_static::lazy_static! {
static ref PAGE_RE: Regex = Regex::new(r#"<a href="(https://e-hentai\.org/s/\w+/[\w-]+)">"#).unwrap();
static ref IMG_RE: Regex = Regex::new(r#"<img id="img" src="(.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<h1 id="gn">(.*?)</h1>"#).unwrap();
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
const TIMEOUT: Duration = Duration::from_secs(30);
#[derive(Debug, Clone, Default)]
pub struct EHCollector {
client: GhostClient,
raw_client: reqwest::Client,
}
impl EHCollector {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str("nw=1").unwrap(),
);
Self {
client: GhostClientBuilder::default()
.with_default_headers(request_headers)
.with_cf_resolve(&["e-hentai.org"])
.build(prefix),
raw_client: reqwest::Client::builder().timeout(TIMEOUT).build().unwrap(),
}
}
pub fn new_from_config() -> anyhow::Result<Self> {
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str("nw=1").unwrap(),
);
Ok(Self {
client: GhostClientBuilder::default()
.with_default_headers(request_headers)
.with_cf_resolve(&["e-hentai.org"])
.build_from_config()?,
raw_client: reqwest::Client::builder().timeout(TIMEOUT).build().unwrap(),
})
}
}
impl Collector for EHCollector {
type FetchError = anyhow::Error;
type StreamError = anyhow::Error;
type ImageStream = EHImageStream;
#[inline]
fn name() -> &'static str {
"e-hentai"
}
async fn fetch(
&self,
path: String,
) -> Result<(AlbumMeta, Self::ImageStream), Self::FetchError> {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_token = parts.next();
let (album_id, album_token) = match (g, album_id, album_token) {
(Some("g"), Some(album_id), Some(album_token)) => (album_id, album_token),
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://e-hentai.org/g/2127986/da1deffea5)"));
}
};
let url = format!("https://e-hentai.org/g/{album_id}/{album_token}");
tracing::info!("[e-hentai] process {url}");
// clone client to force changing ip
let client = self.client.clone();
let mut paged = Paged::new(0, EHPageIndicator { base: url.clone() });
let gallery_pages = paged.pages(&client).await?;
// Since paged returns at least one page, we can safely get it.
let title = match_first_group(&TITLE_RE, &gallery_pages[0])
.unwrap_or("No Title")
.to_string();
let mut image_page_links = Vec::new();
for gallery_page in gallery_pages.iter() {
PAGE_RE.captures_iter(gallery_page).for_each(|c| {
let matching = c.get(1).expect("regexp is matched but no group 1 found");
image_page_links.push(matching.as_str().to_string());
});
}
if image_page_links.is_empty() {
return Err(anyhow::anyhow!(
"invalid url, maybe resource has been deleted."
));
}
Ok((
AlbumMeta {
link: url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
EHImageStream {
client,
raw_client: self.raw_client.clone(),
image_page_links: image_page_links.into_iter(),
},
))
}
}
#[derive(Debug)]
pub struct EHImageStream {
client: GhostClient,
raw_client: reqwest::Client,
image_page_links: std::vec::IntoIter<String>,
}
impl EHImageStream {
async fn load_image(
client: &GhostClient,
raw_client: &reqwest::Client,
link: String,
) -> anyhow::Result<(ImageMeta, ImageData)> {
let content = RETRY_POLICY
.retry(|| async { get_string(client, &link).await })
.await?;
let img_url = match_first_group(&IMG_RE, &content)
.ok_or_else(|| anyhow::anyhow!("unable to find image in page"))?;
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(raw_client, img_url).await })
.await?;
tracing::trace!(
"download e-hentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link,
url: img_url.to_string(),
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for EHImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_page_links.next()?;
let client = self.client.clone();
let raw_client = self.raw_client.clone();
Some(async move { Self::load_image(&client, &raw_client, link).await })
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_page_links.size_hint()
}
}
struct EHPageIndicator {
base: String,
}
impl PageFormatter for EHPageIndicator {
fn format_n(&self, n: usize) -> String {
format!("{}/?p={}", self.base, n)
}
}
impl PageIndicator for EHPageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool {
let html = format!(
"<a href=\"{}/?p={}\" onclick=\"return false\">",
self.base, next_page
);
!content.contains(&html)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let collector = EHCollector {
raw_client: Default::default(),
client: Default::default(),
};
let (album, mut image_stream) = collector
.fetch("/g/2122174/fd2525031e".to_string())
.await
.unwrap();
println!("album: {album:?}");
let maybe_first_image = image_stream.next().unwrap().await;
if let Ok((meta, data)) = maybe_first_image {
println!("first image meta: {meta:?}");
println!("first image data length: {}", data.len());
}
}
#[ignore]
#[test]
fn regex_match() {
// test page: https://e-hentai.org/g/2122174/fd2525031e
let r = Regex::new(r#"<a href="(https://e-hentai\.org/s/\w+/[\w-]+)">"#).unwrap();
let h = r#"<div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:140px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -600px 0 no-repeat"><a href="https://e-hentai.org/s/bd2b37d829/2122174-7"><img alt="007" title="Page 7: 2.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:139px; margin:-1px 0 0 -1px" /></a></div></div><div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:100px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -700px 0 no-repeat"><a href="https://e-hentai.org/s/4ca72f757d/2122174-8"><img alt="008" title="Page 8: 3.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:99px; margin:-1px 0 0 -1px" />"#;
let mut iter = r.captures_iter(h);
let first = iter.next().unwrap();
println!("{}", first.get(1).unwrap().as_str());
let second = iter.next().unwrap();
println!("{}", second.get(1).unwrap().as_str());
}
}


@ -0,0 +1,284 @@
use std::time::Duration;
use again::RetryPolicy;
use regex::Regex;
use reqwest::header;
use serde::Deserialize;
use crate::{
config,
http_proxy::ProxiedClient,
stream::AsyncStream,
util::match_first_group,
util::{get_bytes, get_string},
};
use super::{
utils::paged::{PageFormatter, PageIndicator, Paged},
AlbumMeta, Collector, ImageData, ImageMeta,
};
lazy_static::lazy_static! {
static ref PAGE_RE: Regex = Regex::new(r#"<a href="(https://exhentai\.org/s/\w+/[\w-]+)">"#).unwrap();
static ref IMG_RE: Regex = Regex::new(r#"<img id="img" src="(.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<h1 id="gn">(.*?)</h1>"#).unwrap();
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
const CONFIG_KEY: &str = "exhentai";
const TIMEOUT: Duration = Duration::from_secs(30);
#[derive(Debug, Clone)]
pub struct EXCollector {
proxy_client: ProxiedClient,
client: reqwest::Client,
}
#[derive(Debug, Deserialize)]
pub struct ExConfig {
pub ipb_pass_hash: String,
pub ipb_member_id: String,
pub igneous: String,
}
impl EXCollector {
pub fn new(config: &ExConfig, proxy_client: ProxiedClient) -> anyhow::Result<Self> {
let cookie_value = format!(
"ipb_pass_hash={};ipb_member_id={};igneous={};nw=1",
config.ipb_pass_hash, config.ipb_member_id, config.igneous
);
// set headers with exhentai cookies
let mut request_headers = header::HeaderMap::new();
request_headers.insert(
header::COOKIE,
header::HeaderValue::from_str(&cookie_value)?,
);
Ok(Self {
client: {
reqwest::Client::builder()
.default_headers(request_headers.clone())
.timeout(TIMEOUT)
.build()
.expect("build reqwest client failed")
},
proxy_client: proxy_client.with_default_headers(request_headers),
})
}
pub fn new_from_config() -> anyhow::Result<Self> {
let config: ExConfig = config::parse(CONFIG_KEY)?
.ok_or_else(|| anyhow::anyhow!("exhentai config(key: exhentai) not found"))?;
let proxy_client = ProxiedClient::new_from_config();
Self::new(&config, proxy_client)
}
pub fn get_client(&self) -> reqwest::Client {
self.client.clone()
}
}
impl Collector for EXCollector {
type FetchError = anyhow::Error;
type StreamError = anyhow::Error;
type ImageStream = EXImageStream;
#[inline]
fn name() -> &'static str {
"exhentai"
}
async fn fetch(
&self,
path: String,
) -> Result<(AlbumMeta, Self::ImageStream), Self::FetchError> {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_token = parts.next();
let (album_id, album_token) = match (g, album_id, album_token) {
(Some("g"), Some(album_id), Some(album_token)) => (album_id, album_token),
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://exhentai.org/g/2129939/01a6e086b9)"));
}
};
let url = format!("https://exhentai.org/g/{album_id}/{album_token}");
tracing::info!("[exhentai] process {url}");
let mut paged = Paged::new(0, EXPageIndicator { base: url.clone() });
let gallery_pages = paged.pages(&self.proxy_client).await.map_err(|e| {
tracing::error!("[exhentai] load page failed: {e:?}");
e
})?;
tracing::info!("[exhentai] pages loaded for {album_id}/{album_token}");
// Since paged returns at least one page, we can safely get it.
let title = match_first_group(&TITLE_RE, &gallery_pages[0])
.unwrap_or("No Title")
.to_string();
let mut image_page_links = Vec::new();
for gallery_page in gallery_pages.iter() {
PAGE_RE.captures_iter(gallery_page).for_each(|c| {
let matching = c.get(1).expect("regexp is matched but no group 1 found");
image_page_links.push(matching.as_str().to_string());
});
}
if image_page_links.is_empty() {
return Err(anyhow::anyhow!(
"invalid url, maybe resource has been deleted, or our ip is blocked."
));
}
Ok((
AlbumMeta {
link: url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
EXImageStream {
client: self.client.clone(),
proxy_client: self.proxy_client.clone(),
image_page_links: image_page_links.into_iter(),
},
))
}
}
#[derive(Debug)]
pub struct EXImageStream {
client: reqwest::Client,
proxy_client: ProxiedClient,
image_page_links: std::vec::IntoIter<String>,
}
impl EXImageStream {
async fn load_image(
proxy_client: ProxiedClient,
client: reqwest::Client,
link: String,
) -> anyhow::Result<(ImageMeta, ImageData)> {
let content = RETRY_POLICY
.retry(|| async { get_string(&proxy_client, &link).await })
.await?;
let img_url = match_first_group(&IMG_RE, &content)
.ok_or_else(|| anyhow::anyhow!("unable to find image in page"))?;
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(&client, img_url).await })
.await?;
tracing::trace!(
"download exhentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link,
url: img_url.to_string(),
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for EXImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_page_links.next()?;
let client = self.client.clone();
let proxy_client = self.proxy_client.clone();
Some(async move { Self::load_image(proxy_client, client, link).await })
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_page_links.size_hint()
}
}
struct EXPageIndicator {
base: String,
}
impl PageFormatter for EXPageIndicator {
fn format_n(&self, n: usize) -> String {
format!("{}/?p={}", self.base, n)
}
}
impl PageIndicator for EXPageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool {
let html = format!(
"<a href=\"{}/?p={}\" onclick=\"return false\">",
self.base, next_page
);
!content.contains(&html)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let config = ExConfig {
ipb_pass_hash: "balabala".to_string(),
ipb_member_id: "balabala".to_string(),
igneous: "balabala".to_string(),
};
println!("config {config:#?}");
let collector = EXCollector::new(&config, ProxiedClient::default()).unwrap();
let (album, mut image_stream) = collector
.fetch("/g/2129939/01a6e086b9".to_string())
.await
.unwrap();
println!("album: {album:?}");
let maybe_first_image = image_stream.next().unwrap().await;
if let Ok((meta, data)) = maybe_first_image {
println!("first image meta: {meta:?}");
println!("first image data length: {}", data.len());
}
}
#[ignore]
#[tokio::test]
async fn invalid_url() {
let config = ExConfig {
ipb_pass_hash: "balabala".to_string(),
ipb_member_id: "balabala".to_string(),
igneous: "balabala".to_string(),
};
println!("config {config:#?}");
let collector = EXCollector::new(&config, ProxiedClient::default()).unwrap();
let output = collector.fetch("/g/2129939/00000".to_string()).await;
assert!(output.is_err());
println!("output err {output:?}");
}
#[ignore]
#[test]
fn regex_match() {
// test page: https://exhentai.org/g/2122174/fd2525031e
let r = Regex::new(r#"<a href="(https://exhentai\.org/s/\w+/[\w-]+)">"#).unwrap();
let h = r#"<div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:140px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -600px 0 no-repeat"><a href="https://exhentai.org/s/bd2b37d829/2122174-7"><img alt="007" title="Page 7: 2.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:139px; margin:-1px 0 0 -1px" /></a></div></div><div class="gdtm" style="height:170px"><div style="margin:1px auto 0; width:100px; height:100px; background:transparent url(https://ehgt.org/m/002122/2122174-00.jpg) -700px 0 no-repeat"><a href="https://exhentai.org/s/4ca72f757d/2122174-8"><img alt="008" title="Page 8: 3.png" src="https://ehgt.org/g/blank.gif" style="width:100px; height:99px; margin:-1px 0 0 -1px" />"#;
let mut iter = r.captures_iter(h);
let first = iter.next().unwrap();
println!("{}", first.get(1).unwrap().as_str());
let second = iter.next().unwrap();
println!("{}", second.get(1).unwrap().as_str());
}
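    // A minimal sketch of the pagination terminator (hand-written markup, not a
    // real gallery page): the gallery HTML links page N with an
    // onclick="return false" anchor; when the anchor for `next_page` is absent,
    // we are on the last page.
    #[test]
    fn page_indicator() {
        let ind = EXPageIndicator {
            base: "https://exhentai.org/g/2129939/01a6e086b9".to_string(),
        };
        let body =
            r#"<a href="https://exhentai.org/g/2129939/01a6e086b9/?p=1" onclick="return false">"#;
        assert!(!ind.is_last_page(body, 1));
        assert!(ind.is_last_page(body, 2));
    }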
}

View File

@ -0,0 +1,97 @@
//! Built-in collectors and trait.
use once_cell::sync::Lazy;
use regex::Regex;
use std::future::Future;
use crate::stream::AsyncStream;
use self::{e_hentai::EHCollector, exhentai::EXCollector, nhentai::NHCollector};
pub mod utils;
pub mod e_hentai;
pub mod exhentai;
pub mod nhentai;
pub mod pixiv;
#[derive(Debug, Clone)]
pub struct ImageMeta {
pub id: String,
pub url: String,
pub description: Option<String>,
}
pub type ImageData = bytes::Bytes;
#[derive(Debug, Clone)]
pub struct AlbumMeta {
pub link: String,
pub name: String,
pub class: Option<String>,
pub description: Option<String>,
pub authors: Option<Vec<String>>,
pub tags: Option<Vec<String>>,
}
/// Generic collector.
/// The `async fetch` returns the result of `AlbumMeta` and `ImageStream`.
/// By exposing `ImageStream`, we can fetch the images lazily. For low
/// memory VM, it will keep only a small amount in memory.
pub trait Collector {
type FetchError;
type StreamError;
type ImageStream: AsyncStream<Item = Result<(ImageMeta, ImageData), Self::StreamError>>;
fn name() -> &'static str;
fn fetch(
&self,
path: String,
) -> impl Future<Output = Result<(AlbumMeta, Self::ImageStream), Self::FetchError>>;
}
pub(crate) static URL_FROM_TEXT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"((https://exhentai\.org/g/\w+/[\w-]+)|(https://e-hentai\.org/g/\w+/[\w-]+)|(https://nhentai\.net/g/\d+)|(https://nhentai\.to/g/\d+))"#).unwrap()
});
pub(crate) static URL_FROM_URL_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"^((https://exhentai\.org/g/\w+/[\w-]+)|(https://e-hentai\.org/g/\w+/[\w-]+)|(https://nhentai\.net/g/\d+)|(https://nhentai\.to/g/\d+))"#).unwrap()
});
#[derive(Debug, Clone)]
pub struct Registry {
eh: EHCollector,
nh: NHCollector,
ex: EXCollector,
}
pub trait Param<T> {
fn get(&self) -> &T;
}
impl Param<EHCollector> for Registry {
fn get(&self) -> &EHCollector {
&self.eh
}
}
impl Param<NHCollector> for Registry {
fn get(&self) -> &NHCollector {
&self.nh
}
}
impl Param<EXCollector> for Registry {
fn get(&self) -> &EXCollector {
&self.ex
}
}
impl Registry {
pub fn new_from_config() -> Self {
Self {
eh: EHCollector::new_from_config().expect("unable to build e-hentai collector"),
nh: NHCollector::new_from_config().expect("unable to build nhentai collector"),
ex: EXCollector::new_from_config().expect("unable to build exhentai collector"),
}
}
}
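#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of what `Param` buys us: generic code can select a
    // collector from the Registry by type alone.
    fn collector_of<C>(registry: &Registry) -> &C
    where
        Registry: Param<C>,
    {
        registry.get()
    }

    // Ignored because building the Registry needs real collector configs.
    #[ignore]
    #[test]
    fn select_by_type() {
        let registry = Registry::new_from_config();
        let _eh: &EHCollector = collector_of(&registry);
        let _ex: &EXCollector = collector_of(&registry);
    }
}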

View File

@ -0,0 +1,269 @@
/// nhentai collector.
/// Host matching: nhentai.to or nhentai.net
///
/// Since nhentai.net always enables the Cloudflare firewall, we use
/// nhapi.cat42.uk instead (there may be some syncing latency).
use again::RetryPolicy;
use rand::seq::SliceRandom;
use reqwest::Response;
use serde::Deserialize;
use std::time::Duration;
use crate::{
http_client::{GhostClient, GhostClientBuilder},
stream::AsyncStream,
util::get_bytes,
};
use super::{AlbumMeta, Collector, ImageData, ImageMeta};
const NHAPI: &str = "https://nhapi.cat42.uk/gallery/";
lazy_static::lazy_static! {
static ref RETRY_POLICY: RetryPolicy = RetryPolicy::fixed(Duration::from_millis(200))
.with_max_retries(5)
.with_jitter(true);
}
const DOMAIN_LIST: [&str; 0] = [];
const NH_CDN_LIST: [&str; 5] = [
"https://i.nhentai.net/galleries",
"https://i2.nhentai.net/galleries",
"https://i3.nhentai.net/galleries",
"https://i5.nhentai.net/galleries",
"https://i7.nhentai.net/galleries",
];
#[derive(Debug, Clone, Default)]
pub struct NHCollector {
client: GhostClient,
}
impl NHCollector {
pub fn new() -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&DOMAIN_LIST)
.build(None),
}
}
pub fn new_from_config() -> anyhow::Result<Self> {
Ok(Self::new())
}
}
#[derive(Deserialize)]
struct NhAlbum {
// id: u32,
media_id: String,
title: Title,
images: Images,
// tags: Vec<Tag>,
// num_pages: usize,
}
#[derive(Deserialize)]
struct Title {
pretty: Option<String>,
english: Option<String>,
japanese: Option<String>,
}
impl Title {
fn title(&self, f: impl Fn() -> String) -> String {
if let Some(pretty) = &self.pretty {
return pretty.clone();
}
if let Some(english) = &self.english {
return english.clone();
}
if let Some(japanese) = &self.japanese {
return japanese.clone();
}
f()
}
}
#[derive(Deserialize)]
struct Images {
pages: Vec<Image>,
}
#[derive(Deserialize, Clone, Copy)]
struct Image {
t: ImageType,
}
#[derive(Debug, Deserialize, Clone, Copy)]
enum ImageType {
#[serde(rename = "j")]
Jpg,
#[serde(rename = "p")]
Png,
#[serde(rename = "g")]
Gif,
}
impl ImageType {
fn as_str(&self) -> &'static str {
match self {
ImageType::Jpg => ".jpg",
ImageType::Png => ".png",
ImageType::Gif => ".gif",
}
}
}
// #[derive(Deserialize)]
// struct Tag {
// #[serde(rename = "type")]
// typ: String,
// name: String,
// }
impl Collector for NHCollector {
type FetchError = anyhow::Error;
type StreamError = anyhow::Error;
type ImageStream = NHImageStream;
#[inline]
fn name() -> &'static str {
"nhentai"
}
async fn fetch(
&self,
path: String,
) -> Result<(AlbumMeta, Self::ImageStream), Self::FetchError> {
// normalize url
let mut parts = path.trim_matches(|c| c == '/').split('/');
let g = parts.next();
let album_id = parts.next();
let album_id = match (g, album_id) {
(Some("g"), Some(album_id)) => album_id,
_ => {
return Err(anyhow::anyhow!("invalid input path({path}), gallery url is expected(like https://nhentai.net/g/333678)"));
}
};
// Note: since nhentai.net enables the CF firewall, we query the NHAPI mirror instead.
let api_url = format!("{NHAPI}{album_id}");
let original_url = format!("https://nhentai.net/g/{album_id}");
tracing::info!("[nhentai] process {api_url}(original url {original_url})");
// clone the client to force a new random egress IP
let client = self.client.clone();
let album: NhAlbum = client
.get(&api_url)
.send()
.await
.and_then(Response::error_for_status)?
.json()
.await?;
let title = album.title.title(|| format!("Nhentai-{album_id}"));
let image_urls = album
.images
.pages
.iter()
.enumerate()
.map(|(idx, page)| ImageURL::new(album.media_id.clone(), idx + 1, page.t))
.collect::<Vec<_>>()
.into_iter();
Ok((
AlbumMeta {
link: original_url,
name: title,
class: None,
description: None,
authors: None,
tags: None,
},
NHImageStream { client, image_urls },
))
}
}
#[derive(Debug)]
struct ImageURL {
raw: String,
media: String,
id: usize,
typ: ImageType,
}
impl ImageURL {
fn new(media: String, id: usize, typ: ImageType) -> Self {
Self {
raw: Self::random_cdn_link(&media, id, typ),
media,
id,
typ,
}
}
fn raw(&self) -> &str {
&self.raw
}
fn fallback(&self) -> String {
Self::random_cdn_link(&self.media, self.id, self.typ)
}
fn random_cdn_link(media: &str, id: usize, typ: ImageType) -> String {
let cdn = NH_CDN_LIST
.choose(&mut rand::thread_rng())
.expect("empty CDN list");
format!("{cdn}/{media}/{id}{}", typ.as_str())
}
}
#[derive(Debug)]
pub struct NHImageStream {
client: GhostClient,
image_urls: std::vec::IntoIter<ImageURL>,
}
impl NHImageStream {
async fn load_image(client: GhostClient, link: &str) -> anyhow::Result<(ImageMeta, ImageData)> {
let image_data = RETRY_POLICY
.retry(|| async { get_bytes(&client, link).await })
.await?;
tracing::trace!(
"download nhentai image with size {}, link: {link}",
image_data.len()
);
let meta = ImageMeta {
id: link.to_string(),
url: link.to_string(),
description: None,
};
Ok((meta, image_data))
}
}
impl AsyncStream for NHImageStream {
type Item = anyhow::Result<(ImageMeta, ImageData)>;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
let link = self.image_urls.next()?;
let client = self.client.clone();
Some(async move {
match Self::load_image(client.clone(), link.raw()).await {
Ok(r) => Ok(r),
Err(e) => {
tracing::error!("fallback for nh image {link:?}: {e}");
Self::load_image(client, &link.fallback()).await
}
}
})
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.image_urls.size_hint()
}
}
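#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the CDN link layout (the media id below is made up):
    // the host is picked at random from NH_CDN_LIST and the path keeps the
    // "{media}/{page}{ext}" shape.
    #[test]
    fn cdn_link_shape() {
        let url = ImageURL::random_cdn_link("1234567", 3, ImageType::Png);
        assert!(NH_CDN_LIST.iter().any(|cdn| url.starts_with(cdn)));
        assert!(url.ends_with("/1234567/3.png"));
    }
}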

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
pub mod paged;

View File

@ -0,0 +1,74 @@
use reqwest::Response;
use crate::http_client::HttpRequestBuilder;
pub trait PageFormatter {
fn format_n(&self, n: usize) -> String;
}
pub trait PageIndicator {
fn is_last_page(&self, content: &str, next_page: usize) -> bool;
}
#[derive(thiserror::Error, Debug)]
pub enum PagedError {
#[error("reqwest error")]
Reqwest(#[from] reqwest::Error),
}
pub struct Paged<T> {
next_page: usize,
page_indicator: T,
}
impl<T> Paged<T> {
pub fn new(init_page: usize, page_indicator: T) -> Self {
Self {
next_page: init_page,
page_indicator,
}
}
}
impl<T> Paged<T>
where
T: PageFormatter,
{
pub async fn next<C>(&mut self, client: &C) -> Result<String, PagedError>
where
C: HttpRequestBuilder,
{
let url = self.page_indicator.format_n(self.next_page);
let content = client
.get_builder(&url)
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await?;
self.next_page += 1;
Ok(content)
}
}
impl<T> Paged<T>
where
T: PageFormatter + PageIndicator,
{
/// pages returns at least one element if it is Ok
pub async fn pages<C>(&mut self, client: &C) -> Result<Vec<String>, PagedError>
where
C: HttpRequestBuilder,
{
let mut results = Vec::new();
loop {
let content = self.next(client).await?;
let terminated = self.page_indicator.is_last_page(&content, self.next_page);
results.push(content);
if terminated {
return Ok(results);
}
}
}
}
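#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the two traits with a hypothetical site: pages live
    // at "base?p=N", and a page is the last one when it does not link to N+1.
    struct Demo;
    impl PageFormatter for Demo {
        fn format_n(&self, n: usize) -> String {
            format!("https://example.com/?p={n}")
        }
    }
    impl PageIndicator for Demo {
        fn is_last_page(&self, content: &str, next_page: usize) -> bool {
            !content.contains(&format!("?p={next_page}"))
        }
    }

    #[test]
    fn demo_traits() {
        let d = Demo;
        assert_eq!(d.format_n(2), "https://example.com/?p=2");
        assert!(!d.is_last_page(r#"<a href="?p=3">next</a>"#, 3));
        assert!(d.is_last_page("<html>no more</html>", 3));
    }
}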

View File

@ -0,0 +1,45 @@
use std::{collections::HashMap, env};
use once_cell::sync::OnceCell;
static CFG_PATH: OnceCell<String> = OnceCell::new();
lazy_static::lazy_static! {
static ref CONFIG_MAPPING: HashMap<String, serde_yaml::Value> = {
let file_path = CFG_PATH.get_or_init(get_config_path);
let file_content = std::fs::read_to_string(file_path).expect("config file not found");
serde_yaml::from_str(&file_content).expect("unable to parse config file")
};
}
fn get_config_path() -> String {
// read from env
if let Ok(p) = env::var("CONFIG_FILE") {
if !p.is_empty() {
return p;
}
}
// default
"config.yaml".to_string()
}
/// Initialize config, will panic on failure.
pub fn init(config_path: Option<String>) {
if let Some(p) = config_path {
let _ = CFG_PATH.set(p);
}
lazy_static::initialize(&CONFIG_MAPPING);
}
/// Parse struct from global config.
pub fn parse<T>(key: &str) -> serde_yaml::Result<Option<T>>
where
T: serde::de::DeserializeOwned,
{
CONFIG_MAPPING
.get(key)
.cloned()
.map(|v| serde_yaml::from_value(v))
.transpose()
}
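#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the intended call pattern, assuming a local
    // `config.yaml` exists; ignored since CI has no such file.
    #[ignore]
    #[test]
    fn parse_demo() {
        init(Some("config.yaml".to_string()));
        let v: Option<serde_yaml::Value> = parse("proxy").unwrap();
        println!("proxy section: {v:?}");
    }
}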

View File

@ -0,0 +1,284 @@
// A wrapper for reqwest that provides the ability to bind to a random IP.
// Since I apparently cannot afford an IPv4 subnet, IPv6 is assumed here.
// Using a he.net tunnel broker works fine.
// Setup:
// 1. sudo ip addr add local 2001:x:x::/48 dev lo
// 2. sudo ip route add local 2001:x:x::/48 dev he-ipv6
// 3. Set net.ipv6.ip_nonlocal_bind=1
pub const UAS: [&str; 45] = [
"Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
"Mozilla/5.0 (Linux; Android 12; SM-G988B Build/SP1A.210812.016; wv) Gecko/20100101 Firefox/107.0 Mobile/15E148",
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/107.0 Mobile/15E148",
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/108.0.5359.95 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Linux; Android 12; SM-G988B Build/SP1A.210812.016; wv) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Mobile Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; x64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.2365.80",
"Mozilla/5.0 (X11; CrOS x86_64 15633.69.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.212 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OPR/108.0.0.0",
"Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.105 Mobile Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3.1 Mobile/15E148 Safari/604.1",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Linux; U; Android 11; zh-cn; PDRM00 Build/RKQ1.200903.002) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/70.0.3538.80 Mobile Safari/537.36 HeyTapBrowser/40.7.27.2",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.71 Safari/537.36 Edg/108.0.1462.42",
"Mozilla/5.0 (Linux; U; Android 12; zh-cn; 2201122C Build/SKQ1.211006.001) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/89.0.4389.116 Mobile Safari/537.36 XiaoMi/MiuiBrowser/15.9.18 swan-mibrowser",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36 OPR/74.0.3911.104",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 QQBrowser/10.8.4313.400",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 360SE/13.0.1920.1000",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 UCBrowser/13.2.8.1300",
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 QQ/10.8.4313.400 NetType/WIFI",
"Mozilla/5.0 (Linux; Android 12; SM-G988B Build/SP1A.210812.016; wv) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36 QQBrowser/10.8.4313.400",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 QQBrowser/10.8.4313.400",
"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36 MicroMessenger/7.0.1",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13.3; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Linux; Android 13; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.63 Mobile Safari/537.36",
"Mozilla/5.0 (Linux; Android 9; SM-G960F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.117 Mobile Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (iPad; CPU OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 YaBrowser/19.1.3.322 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3706.400 QQBrowser/10.4.3620.400",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Whale/2.7.99.13 Safari/537.36",
];
const CONFIG_KEY: &str = "http";
const TIMEOUT: Duration = Duration::from_secs(30);
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
ops::{Deref, DerefMut},
sync::Arc,
time::Duration,
};
use ipnet::Ipv6Net;
use reqwest::header;
use rustls::ClientConfig;
use crate::{config, tls::WhitelistVerifier};
const CF_ADDR: Ipv6Addr = Ipv6Addr::new(0x2606, 0x4700, 0x4700, 0, 0, 0, 0, 0x1111);
const TG_ADDR: Ipv6Addr = Ipv6Addr::new(0x2001, 0x67c, 0x4e8, 0x1033, 0x1, 0x100, 0, 0xa);
pub fn rand_ua() -> &'static str {
use rand::seq::SliceRandom;
use rand::thread_rng;
UAS.choose(&mut thread_rng()).expect("Empty UA List!")
}
pub trait HttpRequestBuilder {
fn get_builder(&self, url: &str) -> reqwest::RequestBuilder;
fn post_builder(&self, url: &str) -> reqwest::RequestBuilder;
}
macro_rules! gen_impl {
($ty: ty) => {
impl HttpRequestBuilder for $ty {
#[inline]
fn get_builder(&self, url: &str) -> reqwest::RequestBuilder {
self.get(url).header(reqwest::header::USER_AGENT, rand_ua())
}
#[inline]
fn post_builder(&self, url: &str) -> reqwest::RequestBuilder {
self.post(url)
.header(reqwest::header::USER_AGENT, rand_ua())
}
}
};
}
gen_impl!(reqwest::Client);
gen_impl!(crate::http_proxy::ProxiedClient);
gen_impl!(GhostClient);
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, derive_more::From, derive_more::Into)]
pub struct Ipv6Net2(Ipv6Net);
impl<'de> serde::Deserialize<'de> for Ipv6Net2 {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
use std::str::FromStr;
let data = String::deserialize(deserializer)?;
Ipv6Net::from_str(&data)
.map(Ipv6Net2)
.map_err(serde::de::Error::custom)
}
}
#[derive(serde::Deserialize, Clone, Debug, Default)]
struct HTTPConfig {
ipv6_prefix: Option<Ipv6Net2>,
}
#[derive(Debug, Default)]
pub struct GhostClientBuilder {
mapping: Vec<(&'static str, SocketAddr)>,
headers: Option<header::HeaderMap>,
}
impl GhostClientBuilder {
pub fn with_default_headers(self, headers: header::HeaderMap) -> Self {
Self {
headers: Some(headers),
..self
}
}
pub fn with_cf_resolve(mut self, domains: &[&'static str]) -> Self {
let cf = SocketAddr::new(IpAddr::V6(CF_ADDR), 443);
for &domain in domains.iter() {
self.mapping.push((domain, cf));
}
self
}
#[deprecated = "telegra.ph has fixed it and returns 501 when using ipv6"]
pub fn with_tg_resolve(mut self) -> Self {
let tg = SocketAddr::new(IpAddr::V6(TG_ADDR), 443);
self.mapping.push(("telegra.ph", tg));
self.mapping.push(("api.telegra.ph", tg));
self
}
pub fn build(self, prefix: Option<Ipv6Net>) -> GhostClient {
let inner = GhostClient::build_raw(&prefix, &self.mapping, self.headers.clone());
GhostClient {
prefix,
mapping: Arc::new(self.mapping),
headers: self.headers,
inner,
}
}
pub fn build_from_config(self) -> anyhow::Result<GhostClient> {
let config: HTTPConfig = config::parse(CONFIG_KEY)?.unwrap_or_default();
let prefix = config.ipv6_prefix.map(Into::into);
Ok(self.build(prefix))
}
}
#[derive(Debug, Default)]
pub struct GhostClient {
prefix: Option<Ipv6Net>,
mapping: Arc<Vec<(&'static str, SocketAddr)>>,
headers: Option<header::HeaderMap>,
inner: reqwest::Client,
}
impl GhostClient {
pub fn builder() -> GhostClientBuilder {
GhostClientBuilder::default()
}
}
impl Clone for GhostClient {
fn clone(&self) -> Self {
let inner = Self::build_raw(&self.prefix, &self.mapping, self.headers.clone());
Self {
prefix: self.prefix,
mapping: self.mapping.clone(),
headers: self.headers.clone(),
inner,
}
}
}
impl Deref for GhostClient {
type Target = reqwest::Client;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
impl DerefMut for GhostClient {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl GhostClient {
fn build_raw(
net: &Option<Ipv6Net>,
mapping: &[(&'static str, SocketAddr)],
headers: Option<header::HeaderMap>,
) -> reqwest::Client {
let mut builder = reqwest::Client::builder().timeout(TIMEOUT);
if let Some(headers) = headers {
builder = builder.default_headers(headers);
}
if let Some(net) = net {
let addr: u128 = net.addr().into();
let prefix_len = net.prefix_len();
let mask = !u128::MAX
.checked_shl((128 - prefix_len) as u32)
.unwrap_or(u128::MIN);
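// e.g. for a /48 prefix the mask keeps the low 80 bits set, so the lines
// below graft random host bits onto the configured network prefix.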
// use random ipv6
let rand: u128 = rand::Rng::gen(&mut rand::thread_rng());
let addr = IpAddr::V6(Ipv6Addr::from(rand & mask | addr));
builder = builder.local_address(addr);
// apply resolve
for (domain, addr) in mapping {
builder = builder.resolve(domain, *addr);
}
// do not add the preconfigured TLS
// let tls_config = TLS_CFG.clone();
// builder = builder.use_preconfigured_tls(tls_config);
}
builder.build().expect("build reqwest client failed")
}
pub fn refresh(&mut self) {
self.inner = Self::build_raw(&self.prefix, &self.mapping, self.headers.clone());
}
}
lazy_static::lazy_static! {
// so far we have only met telegra.ph with a wrong TLS config, so we hard-code the values here.
static ref TLS_CFG: ClientConfig = WhitelistVerifier::new(["telegram.org"]).into();
}
#[cfg(test)]
mod tests {
use super::TLS_CFG;
#[ignore]
#[tokio::test]
async fn test_tls() {
let tls_config = TLS_CFG.clone();
// use a telegram.org IP address (normally this fails in a browser)
let cli = reqwest::Client::builder()
.resolve("api.telegra.ph", "149.154.167.99:443".parse().unwrap())
.use_preconfigured_tls(tls_config)
.build()
.unwrap();
let resp = cli
.get("https://api.telegra.ph/getPage")
.send()
.await
.unwrap();
assert_eq!(resp.status(), 200);
}
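    // A minimal sketch of binding to a random address inside a prefix; the
    // prefix below is a documentation range and must be replaced with one that
    // is actually routed to this host (see the setup notes at the top).
    #[ignore]
    #[tokio::test]
    async fn demo_ghost_client() {
        use std::str::FromStr;
        let prefix = ipnet::Ipv6Net::from_str("2001:db8::/48").unwrap();
        let client = super::GhostClientBuilder::default().build(Some(prefix));
        let resp = client.get("https://icanhazip.com").send().await.unwrap();
        println!("egress ip: {}", resp.text().await.unwrap());
    }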
}

View File

@ -0,0 +1,104 @@
use std::time::Duration;
use reqwest::header::HeaderValue;
use crate::config;
const CONFIG_KEY: &str = "proxy";
const TIMEOUT: Duration = Duration::from_secs(30);
#[derive(serde::Deserialize, Clone, Debug, Default)]
struct ProxyConfig {
endpoint: String,
authorization: String,
}
/// ProxiedClient helps create requests that go through a proxy.
/// Note: users should not replace headers.
#[derive(Debug, Clone, Default)]
pub struct ProxiedClient {
proxy: Option<Proxy>,
inner: reqwest::Client,
}
#[derive(Debug, Clone)]
pub struct Proxy {
endpoint: reqwest::Url,
authorization: HeaderValue,
}
impl ProxiedClient {
pub fn new(endpoint: &str, authorization: &str) -> Self {
let proxy = Some(Proxy {
endpoint: endpoint.parse().expect("unable to parse proxy endpoint"),
authorization: authorization
.parse()
.expect("unable to parse proxy authorization"),
});
Self {
proxy,
inner: reqwest::Client::builder()
.timeout(TIMEOUT)
.build()
.expect("unable to build reqwest client"),
}
}
pub fn new_from_config() -> Self {
match config::parse::<ProxyConfig>(CONFIG_KEY)
.expect("unable to parse proxy config(key is {CONFIG_KEY})")
{
Some(cfg) => Self::new(&cfg.endpoint, &cfg.authorization),
None => {
tracing::warn!("initialized ProxiedClient without proxy config");
Self::default()
}
}
}
pub fn with_default_headers(self, headers: reqwest::header::HeaderMap) -> Self {
Self {
inner: reqwest::Client::builder()
.timeout(TIMEOUT)
.default_headers(headers)
.build()
.expect("unable to build reqwest client"),
..self
}
}
}
macro_rules! impl_method {
($method: ident) => {
pub fn $method(&self, url: &str) -> reqwest::RequestBuilder {
match &self.proxy {
Some(p) => self
.inner
.$method(p.endpoint.clone())
.header("X-Forwarded-For", url)
.header("X-Authorization", p.authorization.clone()),
None => self.inner.$method(url),
}
}
};
}
impl ProxiedClient {
impl_method!(get);
impl_method!(post);
impl_method!(head);
impl_method!(put);
impl_method!(delete);
impl_method!(patch);
pub fn request(&self, method: reqwest::Method, url: &str) -> reqwest::RequestBuilder {
match &self.proxy {
Some(p) => self
.inner
.request(method, p.endpoint.clone())
.header("X-Forwarded-For", url)
.header("X-Authorization", p.authorization.clone()),
None => self.inner.request(method, url),
}
}
}
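#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the pass-through path: without a proxy configured,
    // requests go straight to the target URL. With a proxy, the same call
    // would instead target the proxy endpoint and carry the real URL in
    // `X-Forwarded-For` plus the token in `X-Authorization`.
    #[test]
    fn builds_without_proxy() {
        let client = ProxiedClient::default();
        let req = client.get("https://example.com").build().unwrap();
        assert_eq!(req.url().as_str(), "https://example.com/");
    }
}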

View File

@ -0,0 +1,15 @@
// Indexer + Filters(FilterType+Value) -> EntryStream
#[derive(Debug, Clone)]
pub enum Filter {
Name(String),
Category(String),
}
#[derive(Debug, Clone)]
pub enum OrderBy {
TimeDesc,
ClickDesc,
}
pub trait Indexer {}

18
eh2telegraph/src/lib.rs Normal file
View File

@ -0,0 +1,18 @@
#![feature(type_alias_impl_trait)]
#![feature(impl_trait_in_assoc_type)]
#[macro_use]
pub mod telegraph;
pub mod buffer;
pub mod collector;
pub mod config;
pub mod http_client;
pub mod http_proxy;
pub mod indexer;
pub mod searcher;
pub mod storage;
pub mod stream;
pub mod sync;
pub mod tls;
pub mod util;

View File

@ -0,0 +1,68 @@
use ipnet::Ipv6Net;
use regex::Regex;
use crate::{
collector::exhentai::EXCollector,
http_client::{GhostClient, GhostClientBuilder},
util::{get_string, match_first_group},
};
lazy_static::lazy_static! {
static ref EHENTAI_URL_RE: Regex = Regex::new(r#"<a href="(https://e(-|x)hentai\.org/g/\w+/[\w-]+)/">"#).unwrap();
}
/// FHashConvertor converts an f-hash (usually from a search result) to the
/// first matching gallery URL.
/// Works for both e-hentai and exhentai.
pub struct FHashConvertor {
client: GhostClient,
raw_client: reqwest::Client,
}
impl FHashConvertor {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&["e-hentai.org"])
.build(prefix),
raw_client: EXCollector::new_from_config()
.expect("unable to build ex-client")
.get_client(),
}
}
pub fn new_from_config() -> Self {
Self {
client: GhostClientBuilder::default()
.with_cf_resolve(&["e-hentai.org"])
.build_from_config()
.expect("unable to build client for f-hash convertor"),
raw_client: EXCollector::new_from_config()
.expect("unable to build ex-client")
.get_client(),
}
}
// TODO: impl a trait?
pub async fn convert_to_gallery(&self, f_hash: &str) -> anyhow::Result<String> {
tracing::info!("[f-hash] converting hash {f_hash}");
// find in e-hentai
let url = format!("https://e-hentai.org/?f_shash={f_hash}&f_sh=on&f_sname=on&f_stags=on&f_sh=on&f_spf=&f_spt=&f_sfl=on&f_sfu=on&f_sft=on");
let text = get_string(&self.client, &url).await?;
if let Some(url) = match_first_group(&EHENTAI_URL_RE, &text) {
tracing::info!("[f-hash] hash {f_hash} -> {url}");
return Ok(url.to_string());
}
// find in exhentai
let url = format!("https://exhentai.org/?f_shash={f_hash}&f_sh=on&f_sname=on&f_stags=on&f_sh=on&f_spf=&f_spt=&f_sfl=on&f_sfu=on&f_sft=on");
let text = get_string(&self.raw_client, &url).await?;
if let Some(url) = match_first_group(&EHENTAI_URL_RE, &text) {
tracing::info!("[f-hash] hash {f_hash} -> {url}");
return Ok(url.to_string());
}
tracing::info!("[f-hash] hash {f_hash} not found");
Err(anyhow::anyhow!("not found in e-hentai or exhentai"))
}
}
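#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the gallery-link regex against hand-written snippets
    // (the ids and token below are made up).
    #[test]
    fn url_regex_matches_both_hosts() {
        let eh = r#"<a href="https://e-hentai.org/g/123456/0123abcdef/">"#;
        let ex = r#"<a href="https://exhentai.org/g/123456/0123abcdef/">"#;
        assert!(EHENTAI_URL_RE.is_match(eh));
        assert!(EHENTAI_URL_RE.is_match(ex));
    }
}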

View File

@ -0,0 +1,24 @@
pub mod f_hash;
pub mod saucenao;
pub trait ImageSearcher<T> {
type SeacheError;
type SearchOutput;
type FetchFuture: std::future::Future<Output = Result<Self::SearchOutput, Self::SeacheError>>;
fn search(&self, data: T) -> Self::FetchFuture;
}
#[cfg(test)]
mod tests {
use super::*;
#[ignore]
#[tokio::test]
async fn demo() {
let data = std::fs::read("./image.png").unwrap();
let searcher = saucenao::SaucenaoSearcher::new(None);
let r = searcher.search(data).await;
println!("result: {r:?}");
}
}

View File

@ -0,0 +1,195 @@
use std::{borrow::Cow, str::FromStr};
use futures::Future;
use ipnet::Ipv6Net;
use regex::Regex;
use reqwest::{
multipart::{self, Part},
Response,
};
use crate::http_client::{GhostClient, HttpRequestBuilder};
use super::ImageSearcher;
lazy_static::lazy_static! {
static ref SEARCH_ELEMENT_RE: Regex = Regex::new(r#"<tr><td class="resulttableimage">(.*?)</tr>"#).unwrap();
static ref S_URL_RE: Regex = Regex::new(r#"src="(https://.*?)""#).unwrap();
static ref TITLE_RE: Regex = Regex::new(r#"<div class="resulttitle"><strong>(.*?)</strong>"#).unwrap();
static ref SIM_RE: Regex = Regex::new(r#"<div class="resultsimilarityinfo">(\d+)\.?\d*%</div>"#).unwrap();
static ref SITE_PARSE_RE: Regex = Regex::new(r#"saucenao\.com/(res/pixiv(_historical)?/\d+/manga/(?P<pixiv_id>\d+)_)|(ehentai/\w+/\w+/(?P<ehentai_fhash>\w+))|(res/nhentai/(?P<nhentai_id>\d+))"#).unwrap();
}
macro_rules! extract_first {
($re: expr, $input: expr, $err_msg: expr) => {
$re.captures($input)
.ok_or_else(|| anyhow::anyhow!($err_msg))?
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str()
};
}
macro_rules! extract_first_opt {
($re: expr, $input: expr, $default: expr) => {
match $re.captures($input) {
Some(t) => t
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str(),
None => $default,
}
};
}
/// Saucenao searcher.
/// Note: even though saucenao resolves to an IPv6 address, we still force-resolve it.
#[derive(Debug, Clone)]
pub struct SaucenaoSearcher {
client: GhostClient,
}
impl SaucenaoSearcher {
pub fn new(prefix: Option<Ipv6Net>) -> Self {
Self {
client: GhostClient::builder()
.with_cf_resolve(&["saucenao.com", "e-hentai.org"])
.build(prefix),
}
}
pub fn new_from_config() -> Self {
Self {
client: GhostClient::builder()
.with_cf_resolve(&["saucenao.com", "e-hentai.org"])
.build_from_config()
.expect("unable to build client for saucenao"),
}
}
async fn do_search<C: HttpRequestBuilder>(
client: &C,
file: Part,
) -> anyhow::Result<SaucenaoOutput> {
let response = client
.post_builder("https://saucenao.com/search.php")
.multipart(multipart::Form::new().part("file", file))
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await?;
// check if the response is as expected
if !response.contains("<title>Sauce Found?</title>") {
return Err(anyhow::anyhow!("saucenao response is not as expected"));
}
SaucenaoOutput::from_str(&response)
}
}
#[non_exhaustive]
#[derive(Debug, Clone)]
pub enum SaucenaoParsed {
EHentai(String),
NHentai(String),
Pixiv(String),
Other,
}
#[derive(Debug, Clone)]
pub struct SaucenaoOuputElement {
pub raw_url: String,
pub name: String,
pub similarity: u8,
pub parsed: SaucenaoParsed,
}
#[derive(Debug, Clone)]
pub struct SaucenaoOutput {
pub data: Vec<SaucenaoOuputElement>,
}
impl IntoIterator for SaucenaoOutput {
type Item = <Vec<SaucenaoOuputElement> as IntoIterator>::Item;
type IntoIter = <Vec<SaucenaoOuputElement> as IntoIterator>::IntoIter;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
impl<T> ImageSearcher<T> for SaucenaoSearcher
where
T: Into<Cow<'static, [u8]>>,
{
type SeacheError = anyhow::Error;
type SearchOutput = SaucenaoOutput;
type FetchFuture = impl Future<Output = Result<Self::SearchOutput, Self::SeacheError>>;
fn search(&self, data: T) -> Self::FetchFuture {
let file_part = Part::bytes(data).file_name("image.jpg");
let client = self.client.clone();
async move { Self::do_search(&client, file_part).await }
}
}
impl FromStr for SaucenaoOutput {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut data = Vec::new();
// match all
for cap in SEARCH_ELEMENT_RE.captures_iter(s) {
let s = cap
.get(1)
.expect("regexp is matched but no group 1 found")
.as_str();
let element = SaucenaoOuputElement::from_str(s)?;
data.push(element);
}
// sort by similarity, descending
data.sort_unstable_by(|a, b| b.similarity.cmp(&a.similarity));
Ok(Self { data })
}
}
impl FromStr for SaucenaoOuputElement {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// raw_url examples:
// https://img1.saucenao.com/res/pixiv/7594/manga/75943246_p1.jpg?auth=dKnHvUUPQ0wi8G6yv-HWZQ&exp=1645560000
// https://img1.saucenao.com/res/seiga_illust/157/1574075.jpg?auth=KKGjLqCUyouLUKieJ5g4Rw&exp=1645560000
// https://img3.saucenao.com/ehentai/c5/17/c517710f0654ea883df1e0fea7117c671fb03bc1.jpg?auth=Hu-H_4c3lTKdh_rtZJv50w&exp=1645560000
let raw_url =
extract_first!(S_URL_RE, s, "unable to parse saucenao result url").to_string();
let name = extract_first_opt!(TITLE_RE, s, "NO TITLE").to_string();
let similarity =
extract_first!(SIM_RE, s, "unable to parse saucenao result similarity").parse()?;
let parsed = SITE_PARSE_RE
.captures(&raw_url)
.and_then(|cap| {
if let Some(pixiv) = cap.name("pixiv_id") {
return Some(SaucenaoParsed::Pixiv(pixiv.as_str().to_string()));
}
if let Some(eh) = cap.name("ehentai_fhash") {
return Some(SaucenaoParsed::EHentai(eh.as_str().to_string()));
}
if let Some(nh) = cap.name("nhentai_id") {
return Some(SaucenaoParsed::NHentai(nh.as_str().to_string()));
}
None
})
.unwrap_or(SaucenaoParsed::Other);
Ok(Self {
raw_url,
name,
similarity,
parsed,
})
}
}
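#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the thumbnail-url parser, reusing the example urls
    // documented above (auth query parameters stripped).
    #[test]
    fn parse_thumbnail_urls() {
        let pixiv = "https://img1.saucenao.com/res/pixiv/7594/manga/75943246_p1.jpg";
        let cap = SITE_PARSE_RE.captures(pixiv).unwrap();
        assert_eq!(cap.name("pixiv_id").unwrap().as_str(), "75943246");

        let eh = "https://img3.saucenao.com/ehentai/c5/17/c517710f0654ea883df1e0fea7117c671fb03bc1.jpg";
        let cap = SITE_PARSE_RE.captures(eh).unwrap();
        assert_eq!(
            cap.name("ehentai_fhash").unwrap().as_str(),
            "c517710f0654ea883df1e0fea7117c671fb03bc1"
        );
    }
}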

View File

@ -0,0 +1,115 @@
use std::{sync::Arc, time::Duration};
use cloudflare_kv_proxy::{Client, ClientError, NotFoundMapping};
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use crate::config;
use super::{KVStorage, SimpleMemStorage};
const CONFIG_KEY: &str = "worker_kv";
const TIMEOUT: Duration = Duration::from_secs(3);
#[derive(Debug, Deserialize)]
pub struct CFConfig {
pub endpoint: String,
pub token: String,
pub cache_size: usize,
pub expire_sec: u64,
}
#[derive(Clone, Debug)]
pub struct CFStorage(Arc<Client>);
impl CFStorage {
pub fn new<T: Into<String>, E: Into<String>>(
endpoint: E,
token: T,
cache_size: usize,
expire: Duration,
) -> Result<Self, ClientError> {
Ok(Self(Arc::new(Client::new(
endpoint, token, TIMEOUT, cache_size, expire,
)?)))
}
pub fn new_from_config() -> anyhow::Result<Self> {
let config: CFConfig = config::parse(CONFIG_KEY)?
.ok_or_else(|| anyhow::anyhow!("cloudflare worker config(key: worker_kv) not found"))?;
Self::new(
config.endpoint,
config.token,
config.cache_size,
Duration::from_secs(config.expire_sec),
)
.map_err(Into::into)
}
}
impl<T> KVStorage<T> for CFStorage
where
T: DeserializeOwned + Serialize + Send + Sync,
{
async fn get(&self, key: &str) -> anyhow::Result<Option<T>> {
self.0
.get(key)
.await
.map_not_found_to_option()
.map_err(Into::into)
}
async fn set(&self, key: String, value: T, _expire_ttl: Option<usize>) -> anyhow::Result<()> {
self.0.put(&key, &value).await.map_err(Into::into)
}
async fn delete(&self, key: &str) -> anyhow::Result<()> {
self.0.delete(key).await.map_err(Into::into)
}
}
#[derive(Clone, Debug)]
pub enum CFOrMemStorage<T> {
Mem(SimpleMemStorage<T>),
CF(CFStorage),
}
impl<T> CFOrMemStorage<T> {
pub fn new_from_config() -> Self {
match CFStorage::new_from_config() {
Ok(s) => CFOrMemStorage::CF(s),
Err(e) => {
tracing::error!(
"unable to read cloudflare cache settings, will use in memory cache: {e:?}"
);
CFOrMemStorage::Mem(SimpleMemStorage::<T>::default())
}
}
}
}
impl<T> KVStorage<T> for CFOrMemStorage<T>
where
T: Clone + Send + Sync,
CFStorage: KVStorage<T>,
{
async fn get(&self, key: &str) -> anyhow::Result<Option<T>> {
match self {
CFOrMemStorage::Mem(inner) => inner.get(key).await,
CFOrMemStorage::CF(inner) => inner.get(key).await,
}
}
async fn set(&self, key: String, value: T, expire_ttl: Option<usize>) -> anyhow::Result<()> {
match self {
CFOrMemStorage::Mem(inner) => inner.set(key, value, expire_ttl).await,
CFOrMemStorage::CF(inner) => inner.set(key, value, expire_ttl).await,
}
}
async fn delete(&self, key: &str) -> anyhow::Result<()> {
match self {
CFOrMemStorage::Mem(inner) => inner.delete(key).await,
CFOrMemStorage::CF(inner) => inner.delete(key).await,
}
}
}

View File

@ -0,0 +1,37 @@
use std::sync::Arc;
use hashlink::LruCache;
use parking_lot::Mutex;
use super::KVStorage;
#[derive(Clone, Debug)]
pub struct LruStorage(Arc<Mutex<LruCache<String, String>>>);
impl LruStorage {
pub fn new(capacity: usize) -> Self {
Self(Arc::new(Mutex::new(LruCache::new(capacity))))
}
}
impl KVStorage<String> for LruStorage {
async fn get(&self, key: &str) -> anyhow::Result<Option<String>> {
let v = self.0.lock().get(key).cloned();
Ok(v)
}
async fn set(
&self,
key: String,
value: String,
_expire_ttl: Option<usize>,
) -> anyhow::Result<()> {
self.0.lock().insert(key, value);
Ok(())
}
async fn delete(&self, key: &str) -> anyhow::Result<()> {
self.0.lock().remove(key);
Ok(())
}
}

View File

@ -0,0 +1,52 @@
use futures::Future;
use parking_lot::RwLock;
use std::{collections::HashMap, sync::Arc};
pub mod cloudflare_kv;
pub mod lru;
pub trait KVStorage<V> {
fn get(&self, key: &str) -> impl Future<Output = anyhow::Result<Option<V>>> + Send;
fn set(
&self,
key: String,
value: V,
expire_ttl: Option<usize>,
) -> impl Future<Output = anyhow::Result<()>> + Send;
fn delete(&self, key: &str) -> impl Future<Output = anyhow::Result<()>> + Send;
}
#[derive(Clone, Debug)]
pub struct SimpleMemStorage<T>(Arc<RwLock<HashMap<String, T>>>);
impl<T> Default for SimpleMemStorage<T> {
fn default() -> Self {
Self(Arc::new(RwLock::new(HashMap::new())))
}
}
impl<T> SimpleMemStorage<T> {
pub fn with_capacity(capacity: usize) -> Self {
Self(Arc::new(RwLock::new(HashMap::with_capacity(capacity))))
}
}
impl<T> KVStorage<T> for SimpleMemStorage<T>
where
T: Clone + Send + Sync,
{
async fn get(&self, key: &str) -> anyhow::Result<Option<T>> {
let v = self.0.read().get(key).cloned();
Ok(v)
}
async fn set(&self, key: String, value: T, _expire_ttl: Option<usize>) -> anyhow::Result<()> {
self.0.write().insert(key, value);
Ok(())
}
async fn delete(&self, key: &str) -> anyhow::Result<()> {
self.0.write().remove(key);
Ok(())
}
}
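#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the KVStorage contract against the in-memory backend.
    #[tokio::test]
    async fn mem_storage_roundtrip() {
        let s = SimpleMemStorage::<String>::default();
        s.set("k".to_string(), "v".to_string(), None).await.unwrap();
        assert_eq!(s.get("k").await.unwrap(), Some("v".to_string()));
        s.delete("k").await.unwrap();
        assert_eq!(s.get("k").await.unwrap(), None);
    }
}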

108
eh2telegraph/src/stream.rs Normal file
View File

@ -0,0 +1,108 @@
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use futures::FutureExt;
use tokio::sync::oneshot;
/// We define an AsyncStream to replace futures::Stream since we want neither to
/// implement poll_next nor to use async_stream.
/// Although we use GAT, we don't want the future to capture a reference to self.
/// We did it that way before, and it made it hard to load the stream in parallel
/// as Buffered does.
/// Also, our AsyncStream differs from Stream in its signature: we return
/// `Option<Future>` instead of `Future<Output = Option<_>>`.
pub trait AsyncStream {
type Item;
type Future: Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future>;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(0, None)
}
}
/// Buffered stream.
/// By decorating a stream with Buffered, its output futures are polled
/// concurrently.
/// Here it is implemented by spawning tasks, which is indeed not as efficient
/// as the `FuturesOrdered` used by `futures-util::stream::Buffered`.
/// As a decorator of an async trait, it is hard to implement in a poll-based
/// way. We could do that, but it would break the safety boundary: the user
/// would have to make sure the AsyncStream is alive while polling the future,
/// since in our trait definition the future has no relation to self.
/// And without poll, we cannot drive multiple futures from one future.
pub struct Buffered<St>
where
St: AsyncStream,
{
stream: Option<St>,
queue: VecDeque<oneshot::Receiver<St::Item>>,
max: usize,
}
impl<St> Buffered<St>
where
St: AsyncStream,
{
pub fn new(stream: St, buffer_size: usize) -> Self {
Self {
stream: Some(stream),
queue: VecDeque::with_capacity(buffer_size),
max: buffer_size,
}
}
}
impl<St> fmt::Debug for Buffered<St>
where
St: AsyncStream + fmt::Debug,
St::Item: fmt::Debug,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Buffered")
.field("stream", &self.stream)
.field("queue", &self.queue)
.field("max", &self.max)
.finish()
}
}
impl<St> AsyncStream for Buffered<St>
where
St: AsyncStream,
St::Item: Send + 'static,
St::Future: Send + 'static,
{
type Item = St::Item;
type Future = impl std::future::Future<Output = Self::Item>;
fn next(&mut self) -> Option<Self::Future> {
while self.queue.len() < self.max {
let item = match self.stream.as_mut() {
Some(st) => match st.next() {
Some(item) => Some(item),
None => {
self.stream = None;
None
}
},
None => None,
};
match item {
Some(f) => {
let (tx, rx) = oneshot::channel::<Self::Item>();
tokio::spawn(async move {
let _ = tx.send(f.await);
});
self.queue.push_back(rx);
}
None => break,
}
}
self.queue
.pop_front()
.map(|x| x.map(|xx| xx.expect("oneshot tx dropped which is unexpected")))
}
}
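#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch: a countdown stream whose futures are already ready,
    // wrapped in Buffered so several of them are spawned ahead of consumption.
    struct Countdown(u32);
    impl AsyncStream for Countdown {
        type Item = u32;
        type Future = std::future::Ready<u32>;
        fn next(&mut self) -> Option<Self::Future> {
            if self.0 == 0 {
                return None;
            }
            self.0 -= 1;
            Some(std::future::ready(self.0))
        }
    }

    #[tokio::test]
    async fn buffered_preserves_order() {
        let mut st = Buffered::new(Countdown(3), 4);
        let mut out = Vec::new();
        while let Some(fut) = st.next() {
            out.push(fut.await);
        }
        assert_eq!(out, vec![2, 1, 0]);
    }
}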

307
eh2telegraph/src/sync.rs Normal file
View File

@ -0,0 +1,307 @@
use crate::{
buffer::{DataSized, ImageBuffer},
collector::{
AlbumMeta, Collector, ImageData, ImageMeta, Param, Registry, URL_FROM_TEXT_RE,
URL_FROM_URL_RE,
},
http_proxy::ProxiedClient,
storage::{cloudflare_kv::CFStorage, KVStorage},
stream::{AsyncStream, Buffered},
telegraph::{
types::{Node, NodeElement, NodeElementAttr, Page, PageCreate, Tag},
RandomAccessToken, Telegraph, TelegraphError, MAX_SINGLE_FILE_SIZE,
},
util::match_first_group,
};
const ERR_THRESHOLD: usize = 10;
const BATCH_LEN_THRESHOLD: usize = 20;
const BATCH_SIZE_THRESHOLD: usize = 5 * 1024 * 1024;
const DEFAULT_CONCURRENT: usize = 20;
#[derive(thiserror::Error, Debug)]
pub enum UploadError<SE> {
#[error("stream error {0}")]
Stream(SE),
#[error("telegraph error {0}")]
Reqwest(#[from] TelegraphError),
}
pub struct Synchronizer<C = CFStorage> {
tg: Telegraph<RandomAccessToken, ProxiedClient>,
limit: Option<usize>,
author_name: Option<String>,
author_url: Option<String>,
cache_ttl: Option<usize>,
registry: Registry,
cache: C,
}
impl<CACHE> Synchronizer<CACHE>
where
CACHE: KVStorage<String>,
{
// cache ttl is 45 days
const DEFAULT_CACHE_TTL: usize = 3600 * 24 * 45;
pub fn new(
tg: Telegraph<RandomAccessToken, ProxiedClient>,
registry: Registry,
cache: CACHE,
) -> Self {
Self {
tg,
limit: None,
author_name: None,
author_url: None,
cache_ttl: None,
registry,
cache,
}
}
pub fn with_concurrent_limit(mut self, limit: usize) -> Self {
self.limit = Some(limit);
self
}
pub fn with_author<S: Into<String>>(mut self, name: Option<S>, url: Option<S>) -> Self {
self.author_name = name.map(Into::into);
self.author_url = url.map(Into::into);
self
}
pub fn with_cache_ttl(mut self, ttl: Option<usize>) -> Self {
self.cache_ttl = ttl;
self
}
pub async fn delete_cache(&self, key: &str) -> anyhow::Result<()> {
self.cache.delete(key).await
}
pub async fn sync<C: Collector>(&self, path: String) -> anyhow::Result<String>
where
Registry: Param<C>,
C::FetchError: Into<anyhow::Error> + Send + 'static,
C::StreamError:
Into<anyhow::Error> + std::fmt::Debug + std::fmt::Display + Send + Sync + 'static,
C::ImageStream: Send + 'static,
<C::ImageStream as AsyncStream>::Future: Send + 'static,
{
// check cache
let cache_key = format!("{}|{}", C::name(), path);
if let Ok(Some(v)) = self.cache.get(&cache_key).await {
tracing::info!("[cache] hit key {cache_key}");
return Ok(v);
}
tracing::info!("[cache] miss key {cache_key}");
let collector: &C = self.registry.get();
let (meta, stream) = collector.fetch(path).await.map_err(Into::into)?;
let page = self
.sync_stream(meta, stream)
.await
.map_err(anyhow::Error::from)?;
// set cache
let _ = self
.cache
.set(
cache_key,
page.url.clone(),
Some(self.cache_ttl.unwrap_or(Self::DEFAULT_CACHE_TTL)),
)
.await;
Ok(page.url)
}
pub async fn sync_stream<S, SE>(
&self,
meta: AlbumMeta,
stream: S,
) -> Result<Page, UploadError<SE>>
where
SE: Send + std::fmt::Debug + 'static,
S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
S::Future: Send + 'static,
{
let buffered_stream = Buffered::new(stream, self.limit.unwrap_or(DEFAULT_CONCURRENT));
let r = self.inner_sync_stream(meta, buffered_stream).await;
match &r {
Ok(p) => {
tracing::info!("[sync] sync success with url {}", p.url);
}
Err(e) => {
tracing::error!("[sync] sync fail! {e:?}");
}
}
r
}
async fn inner_sync_stream<S, SE>(
&self,
meta: AlbumMeta,
mut stream: S,
) -> Result<Page, UploadError<SE>>
where
S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
{
let mut err_count = 0;
let mut uploaded = Vec::new();
let mut buffer = ImageBuffer::new();
// In this big loop we download images and upload them in batches;
// all meta info is then collected in `uploaded`.
loop {
// TODO: loading images one by one is too slow!
// We could spawn a background task (FuturesUnordered) and use a channel, while
// still exposing it as an AsyncStream, which would not require changes on the consuming side.
// 1. download images in batch
while let Some(fut) = stream.next() {
let data = match fut.await {
Err(e) => {
err_count += 1;
if err_count > ERR_THRESHOLD {
return Err(UploadError::Stream(e));
}
continue;
}
Ok(d) => {
err_count = 0;
d
}
};
// if the data size is too big to upload, we will discard it.
if data.1.len() >= MAX_SINGLE_FILE_SIZE {
tracing::error!("Too big file, discarded. Meta: {:?}", data.0);
continue;
}
buffer.push(data);
if buffer.len() > BATCH_LEN_THRESHOLD || buffer.size() > BATCH_SIZE_THRESHOLD {
break;
}
}
// all data has been uploaded and there is nothing left to process,
// so break the big loop.
if buffer.is_empty() {
break;
}
// 2. upload the batch
let (full_data, size) = buffer.swap();
let image_count = full_data.len();
tracing::debug!("download {image_count} images with size {size}, will upload them",);
let (meta, data) = full_data
.into_iter()
.map(|(a, b)| (a, b.as_ref().to_owned()))
.unzip::<_, _, Vec<_>, Vec<_>>();
let medium = self.tg.upload(data).await?;
err_count = 0;
// 3. add to uploaded
tracing::debug!("upload {image_count} images with size {size}, medium: {medium:?}");
uploaded.extend(
meta.into_iter()
.zip(medium.into_iter().map(|x| x.src))
.map(|(meta, src)| UploadedImage { meta, src }),
);
}
// create the telegraph page, or multiple pages
// Telegraph has a 64 KiB page limit; since our size estimate is not accurate, we use 48 KiB here.
const PAGE_SIZE_LIMIT: usize = 48 * 1024;
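// Greedy packing: append nodes to the current chunk until the next node's
// estimated size would push the chunk past the limit, then start a new chunk.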
let mut chunks = Vec::with_capacity(8);
chunks.push(Vec::new());
let mut last_chunk_size = 0;
for item in uploaded.into_iter().map(Into::<Node>::into) {
let item_size = item.estimate_size();
if last_chunk_size + item_size > PAGE_SIZE_LIMIT {
chunks.push(Vec::new());
last_chunk_size = 0;
}
last_chunk_size += item_size;
chunks.last_mut().unwrap().push(item);
}
let mut last_page: Option<Page> = None;
let title = meta.name.replace('|', "");
while let Some(last_chunk) = chunks.pop() {
let mut content = last_chunk;
write_footer(
&mut content,
meta.link.as_str(),
last_page.as_ref().map(|p| p.url.as_str()),
);
let title = match chunks.len() {
0 => title.clone(),
n => format!("{}-Page{}", title, n + 1),
};
tracing::debug!("create page with content: {content:?}");
let page = self
.tg
.create_page(&PageCreate {
title,
content,
author_name: self
.author_name
.clone()
.or_else(|| meta.authors.as_ref().map(|x| x.join(", "))),
author_url: self.author_url.clone(),
})
.await
.map_err(UploadError::Reqwest)?;
last_page = Some(page);
}
Ok(last_page.unwrap())
}
}
fn write_footer(content: &mut Vec<Node>, original_link: &str, next_page: Option<&str>) {
if let Some(page) = next_page {
content.push(np!(na!(@page, nt!("Next Page"))));
}
content.push(np!(nt!("Generated by eh2telegraph.")));
content.push(np!(
nt!("Original link: "),
na!(@original_link, nt!(original_link))
));
}
impl Synchronizer {
pub fn match_url_from_text(content: &str) -> Option<&str> {
match_first_group(&URL_FROM_TEXT_RE, content)
}
pub fn match_url_from_url(content: &str) -> Option<&str> {
match_first_group(&URL_FROM_URL_RE, content)
}
}
impl DataSized for (ImageMeta, ImageData) {
#[inline]
fn size(&self) -> usize {
self.1.size()
}
}
struct UploadedImage {
#[allow(unused)]
meta: ImageMeta,
src: String,
}
// Size: {"tag":"img","attrs":{"src":"https://telegra.ph..."}}
impl From<UploadedImage> for Node {
fn from(i: UploadedImage) -> Self {
Node::new_image(format!("https://telegra.ph{}", i.src))
}
}

View File

@ -0,0 +1,47 @@
// Partly borrowed from https://github.com/Aloxaf/telegraph-rs/blob/master/src/error.rs
use serde::Deserialize;
use super::types::MediaInfo;
#[derive(thiserror::Error, Debug)]
pub enum TelegraphError {
#[error("api error {0}")]
Api(String),
#[error("reqwest error {0}")]
Reqwest(#[from] reqwest::Error),
#[error("unexpected server result")]
Server,
}
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub(crate) enum ApiResult<T> {
Ok { result: T },
Err { error: String },
}
impl<T> From<ApiResult<T>> for Result<T, TelegraphError> {
fn from(r: ApiResult<T>) -> Self {
match r {
ApiResult::Ok { result: v } => Ok(v),
ApiResult::Err { error: e, .. } => Err(TelegraphError::Api(e)),
}
}
}
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub(crate) enum UploadResult {
Ok(Vec<MediaInfo>),
Err { error: String },
}
impl From<UploadResult> for Result<Vec<MediaInfo>, TelegraphError> {
fn from(r: UploadResult) -> Self {
match r {
UploadResult::Ok(v) => Ok(v),
UploadResult::Err { error } => Err(TelegraphError::Api(error)),
}
}
}
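#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of the untagged decoding: the same response shape
    // carries either a `result` payload or an `error` string.
    #[test]
    fn decode_api_result() {
        let ok: ApiResult<u32> = serde_json::from_str(r#"{"result": 1}"#).unwrap();
        let r: Result<u32, TelegraphError> = ok.into();
        assert_eq!(r.unwrap(), 1);

        let err: ApiResult<u32> = serde_json::from_str(r#"{"error": "boom"}"#).unwrap();
        let r: Result<u32, TelegraphError> = err.into();
        assert!(matches!(r, Err(TelegraphError::Api(_))));
    }
}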

View File

@ -0,0 +1,354 @@
//! Telegraph API client.
pub use error::TelegraphError;
#[macro_use]
pub mod types;
pub const MAX_SINGLE_FILE_SIZE: usize = 5 * 1024 * 1024;
mod error;
use std::{borrow::Cow, sync::Arc};
use reqwest::{
multipart::{Form, Part},
Client, Response,
};
use serde::Serialize;
use crate::http_client::HttpRequestBuilder;
use self::{
error::{ApiResult, UploadResult},
types::{MediaInfo, Node, Page, PageCreate, PageEdit},
};
const TITLE_LENGTH_MAX: usize = 200;
#[derive(Debug, Clone)]
pub struct Telegraph<T, C = Client> {
// http client
client: C,
// access token
access_token: T,
}
pub trait AccessToken {
fn token(&self) -> &str;
fn select_token(&self, _path: &str) -> &str {
Self::token(self)
}
}
#[derive(Debug, Clone)]
pub struct SingleAccessToken(pub Arc<String>);
#[derive(Debug, Clone)]
pub struct RandomAccessToken(pub Arc<Vec<String>>);
impl AccessToken for SingleAccessToken {
fn token(&self) -> &str {
&self.0
}
}
impl From<String> for SingleAccessToken {
fn from(s: String) -> Self {
Self(Arc::new(s))
}
}
impl AccessToken for RandomAccessToken {
fn token(&self) -> &str {
use rand::prelude::SliceRandom;
self.0
.choose(&mut rand::thread_rng())
.expect("token list must contains at least one element")
}
}
impl From<String> for RandomAccessToken {
fn from(s: String) -> Self {
Self(Arc::new(vec![s]))
}
}
impl From<Vec<String>> for RandomAccessToken {
fn from(ts: Vec<String>) -> Self {
assert!(!ts.is_empty());
Self(Arc::new(ts))
}
}
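// A minimal sketch of token selection: a RandomAccessToken built from several
// tokens hands out one of them per call (the values below are placeholders).
#[cfg(test)]
mod token_tests {
    use super::*;

    #[test]
    fn random_token_picks_from_pool() {
        let tokens = RandomAccessToken::from(vec!["a".to_string(), "b".to_string()]);
        let t = tokens.token();
        assert!(t == "a" || t == "b");
    }
}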
macro_rules! execute {
($send: expr) => {
$send
.send()
.await
.and_then(Response::error_for_status)?
.json::<ApiResult<_>>()
.await?
.into()
};
}
#[derive(Debug, Clone, PartialEq, Eq, derive_more::From, derive_more::Into)]
pub struct TelegraphToken(Arc<String>);
impl<T> Telegraph<T, Client> {
pub fn new<AT>(access_token: AT) -> Telegraph<T, Client>
where
AT: Into<T>,
{
Telegraph {
client: Client::new(),
access_token: access_token.into(),
}
}
}
impl<T, C> Telegraph<T, C> {
pub fn with_proxy<P: HttpRequestBuilder + 'static>(self, proxy: P) -> Telegraph<T, P> {
Telegraph {
client: proxy,
access_token: self.access_token,
}
}
}
impl<T, C> Telegraph<T, C>
where
T: AccessToken,
C: HttpRequestBuilder,
{
/// Create page.
pub async fn create_page(&self, page: &PageCreate) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PageCreateShadow<'a> {
/// Title of the page.
pub title: &'a str,
/// Content of the page.
pub content: &'a str,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: &'a Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: &'a Option<String>,
}
#[derive(Serialize)]
struct PagePostWithToken<'a> {
access_token: &'a str,
#[serde(flatten)]
page: &'a PageCreateShadow<'a>,
}
let title = page
.title
.chars()
.take(TITLE_LENGTH_MAX)
.collect::<String>();
let content =
serde_json::to_string(&page.content).expect("unable to content serialize json");
let to_post = PagePostWithToken {
access_token: self.access_token.token(),
page: &PageCreateShadow {
title: &title,
content: &content,
author_name: &page.author_name,
author_url: &page.author_url,
},
};
execute!(self
.client
.post_builder("https://api.telegra.ph/createPage")
.form(&to_post))
}
/// Edit page.
pub async fn edit_page(&self, page: &PageEdit) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PageEditShadow<'a> {
/// Title of the page.
pub title: &'a str,
/// Path to the page.
pub path: &'a str,
/// Content of the page.
pub content: &'a Vec<Node>,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: &'a Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: &'a Option<String>,
}
#[derive(Serialize)]
struct PageEditWithToken<'a> {
access_token: &'a str,
#[serde(flatten)]
page: &'a PageEditShadow<'a>,
}
let title = page
.title
.chars()
.take(TITLE_LENGTH_MAX)
.collect::<String>();
let to_post = PageEditWithToken {
access_token: self.access_token.select_token(&page.path),
page: &PageEditShadow {
title: &title,
path: &page.path,
content: &page.content,
author_name: &page.author_name,
author_url: &page.author_url,
},
};
execute!(self
.client
.post_builder("https://api.telegra.ph/editPage")
.form(&to_post))
}
/// Get page.
/// path: Path to the Telegraph page (in the format Title-12-31, i.e. everything
/// that comes after http://telegra.ph/)
pub async fn get_page(&self, path: &str) -> Result<Page, TelegraphError> {
#[derive(Serialize)]
struct PageGet<'a> {
path: &'a str,
#[serde(skip_serializing_if = "Option::is_none")]
return_content: Option<bool>,
}
let to_post = PageGet {
path,
return_content: Some(true),
};
execute!(self
.client
.post_builder("https://api.telegra.ph/getPage")
.form(&to_post))
}
/// Upload files.
/// If the result is Ok, its length is guaranteed to equal the number of files posted.
pub async fn upload<IT, I>(&self, files: IT) -> Result<Vec<MediaInfo>, TelegraphError>
where
IT: IntoIterator<Item = I>,
I: Into<Cow<'static, [u8]>>,
{
let mut form = Form::new();
let mut cnt = 0;
for (idx, data) in files.into_iter().enumerate() {
let part = Part::bytes(data).file_name(idx.to_string());
form = form.part(idx.to_string(), part);
cnt += 1;
}
let r: Result<Vec<MediaInfo>, TelegraphError> = self
.client
.post_builder("https://telegra.ph/upload")
.multipart(form)
.send()
.await
.and_then(Response::error_for_status)?
.json::<UploadResult>()
.await?
.into();
// Check that the server returned exactly one result per uploaded file.
r.and_then(|x| {
if x.len() != cnt {
Err(TelegraphError::Server)
} else {
Ok(x)
}
})
}
}
#[cfg(test)]
mod tests {
use crate::telegraph::{
types::{Node, PageCreate},
SingleAccessToken, Telegraph,
};
use super::types::{NodeElement, NodeElementAttr, Tag};
pub const TELEGRAPH_TOKEN: &str =
"f42d3570f95412b59b08d64450049e4d609b1f2a57657fce6ce8acc908aa";
#[ignore]
#[tokio::test]
async fn demo_create_page() {
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let page = PageCreate {
title: "title".to_string(),
content: vec![
Node::Text("test text".to_string()),
Node::NodeElement(NodeElement {
tag: Tag::A,
attrs: Some(NodeElementAttr {
href: Some("https://google.com".to_string()),
src: None,
}),
children: Some(vec![Node::Text("link".to_string())]),
}),
],
author_name: Some("test_author".to_string()),
author_url: Some("https://t.co".to_string()),
};
let page = telegraph.create_page(&page).await.unwrap();
println!("test page: {page:?}");
}
#[ignore]
#[tokio::test]
async fn demo_upload() {
let demo_image: Vec<u8> = reqwest::get("https://t.co/static/images/bird.png")
.await
.unwrap()
.bytes()
.await
.unwrap()
.as_ref()
.to_owned();
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let ret = telegraph
.upload(Some(demo_image))
.await
.unwrap()
.pop()
.unwrap();
println!("uploaded file link: {}", ret.src);
}
#[ignore]
#[tokio::test]
async fn demo_create_images_page() {
let telegraph = Telegraph::<SingleAccessToken>::new(TELEGRAPH_TOKEN.to_string());
let node = Node::NodeElement(NodeElement {
tag: Tag::Img,
attrs: Some(NodeElementAttr {
src: Some("https://telegra.ph/file/e31b40e99b0c028601ccb.png".to_string()),
href: None,
}),
children: None,
});
let page = PageCreate {
title: "title".to_string(),
content: vec![node],
author_name: Some("test_author".to_string()),
author_url: None,
};
let page = telegraph.create_page(&page).await.unwrap();
println!("test page: {page:?}");
}
}

280
eh2telegraph/src/telegraph/types.rs Normal file
View File

@ -0,0 +1,280 @@
// Partly borrowed from https://github.com/Aloxaf/telegraph-rs/blob/master/src/types.rs
use serde::{Deserialize, Serialize};
/// This object represents a Telegraph account.
#[derive(Debug, Clone, Deserialize)]
pub struct Account {
/// Account name, helps users with several accounts remember which they are currently using.
///
/// Displayed to the user above the "Edit/Publish" button on Telegra.ph, other users don't see this name.
pub short_name: Option<String>,
/// Default author name used when creating new articles.
pub author_name: Option<String>,
/// Profile link, opened when users click on the author's name below the title.
///
/// Can be any link, not necessarily to a Telegram profile or channel.
pub author_url: Option<String>,
/// Optional. Only returned by the createAccount and revokeAccessToken method.
///
/// Access token of the Telegraph account.
pub access_token: Option<String>,
/// Optional. URL to authorize a browser on telegra.ph and connect it to a Telegraph account.
///
/// This URL is valid for only one use and for 5 minutes only.
pub auth_url: Option<String>,
/// Optional. Number of pages belonging to the Telegraph account.
pub page_count: Option<i32>,
}
/// This object represents a list of Telegraph articles belonging to an account. Most recently created articles first.
#[derive(Debug, Clone, Deserialize)]
pub struct PageList {
/// Total number of pages belonging to the target Telegraph account.
pub total_count: i32,
/// Requested pages of the target Telegraph account.
pub pages: Vec<Page>,
}
/// This object represents a page to create on Telegraph.
#[derive(Debug, Clone, Serialize)]
pub struct PageCreate {
/// Title of the page.
pub title: String,
/// Content of the page.
pub content: Vec<Node>,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: Option<String>,
}
/// This object represents a page to edit on Telegraph.
#[derive(Debug, Clone, Serialize)]
pub struct PageEdit {
/// Title of the page.
pub title: String,
/// Path to the page.
pub path: String,
/// Content of the page.
pub content: Vec<Node>,
/// Optional. Name of the author, displayed below the title.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
/// Can be any link, not necessarily to a Telegram profile or channel.
#[serde(skip_serializing_if = "Option::is_none")]
pub author_url: Option<String>,
}
/// This object represents a page on Telegraph.
#[derive(Debug, Clone, Deserialize)]
pub struct Page {
/// Path to the page.
pub path: String,
/// URL of the page.
pub url: String,
/// Title of the page.
pub title: String,
/// Description of the page.
pub description: String,
/// Optional. Name of the author, displayed below the title.
pub author_name: Option<String>,
/// Optional. Profile link, opened when users click on the author's name below the title.
///
/// Can be any link, not necessarily to a Telegram profile or channel.
pub author_url: Option<String>,
/// Optional. Image URL of the page.
pub image_url: Option<String>,
/// Optional. Content of the page.
pub content: Option<Vec<Node>>,
/// Number of page views for the page.
pub views: i32,
/// Optional. Only returned if access_token passed.
///
/// True, if the target Telegraph account can edit the page.
pub can_edit: Option<bool>,
}
/// This object represents the number of page views for a Telegraph article.
#[derive(Debug, Clone, Deserialize)]
pub struct PageViews {
/// Number of page views for the target page.
pub views: i32,
}
/// This abstract object represents a DOM Node.
///
/// It can be a String which represents a DOM text node or a NodeElement object.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum Node {
Text(String),
NodeElement(NodeElement),
}
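// Serialization note: with `#[serde(untagged)]`, `Node::Text("hi")` serializes
// to the bare JSON string "hi", while a `NodeElement` serializes to an object
// such as {"tag":"P","children":["hi"]}, matching Telegraph's Node type.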
impl Node {
// Estimate the approximate size of the serialized string.
// Escaping is not taken into account, so the result is only a heuristic.
pub fn estimate_size(&self) -> usize {
match self {
Node::Text(s) => s.len(),
Node::NodeElement(e) => {
// {"tag":"?","attrs":?,"children":?}
// Initial size: {"tag":""} (10 bytes) plus a maximum tag-name length of 11.
let mut size = 21;
if let Some(attrs) = &e.attrs {
// size add: ,"attrs":{}
size += 11;
if let Some(href) = &attrs.href {
// size add: "href":""
size += 9 + href.len();
}
if let Some(src) = &attrs.src {
// size add: ,"src":""
size += 9 + src.len();
}
}
if let Some(children) = &e.children {
// size add: ,"children":[]
size += 14;
for child in children {
size += child.estimate_size() + 1;
}
}
size
}
}
}
}
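// Worked example (sketch): for `Node::new_p_text("hi")` the serialized JSON is
// {"tag":"P","children":["hi"]} (29 bytes), while `estimate_size` returns
// 21 + 14 + (2 + 1) = 38. It overestimates here, though strings full of
// characters that need escaping can still push the real size above the guess.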
#[derive(Debug, Clone, Deserialize, Serialize)]
pub enum Tag {
A,
Aside,
B,
Blockquote,
Br,
Code,
Em,
Figcaption,
Figure,
H3,
H4,
Hr,
I,
Iframe,
Img,
Li,
Ol,
P,
Pre,
S,
Strong,
U,
Ul,
Video,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct NodeElementAttr {
#[serde(skip_serializing_if = "Option::is_none")]
pub href: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub src: Option<String>,
}
/// This object represents a DOM element node.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct NodeElement {
/// Name of the DOM element.
/// Available tags: a, aside, b, blockquote, br, code, em, figcaption, figure, h3, h4, hr,
/// i, iframe, img, li, ol, p, pre, s, strong, u, ul, video.
pub tag: Tag,
/// Optional. Attributes of the DOM element.
///
/// Key of object represents name of attribute, value represents value of attribute.
///
/// Available attributes: href, src.
#[serde(skip_serializing_if = "Option::is_none")]
pub attrs: Option<NodeElementAttr>,
/// Optional. List of child nodes for the DOM element.
#[serde(skip_serializing_if = "Option::is_none")]
pub children: Option<Vec<Node>>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct MediaInfo {
/// Path of the file uploaded.
pub src: String,
}
impl From<Page> for PageEdit {
fn from(p: Page) -> Self {
Self {
title: p.title,
path: p.path,
content: p.content.unwrap_or_default(),
author_name: p.author_name,
author_url: p.author_url,
}
}
}
impl Node {
pub fn new_p_text<S: Into<String>>(text: S) -> Self {
Node::NodeElement(NodeElement {
tag: Tag::P,
attrs: None,
children: Some(vec![Node::Text(text.into())]),
})
}
pub fn new_image<S: Into<String>>(src: S) -> Self {
Node::NodeElement(NodeElement {
tag: Tag::Img,
attrs: Some(NodeElementAttr {
src: Some(src.into()),
href: None,
}),
children: None,
})
}
}
macro_rules! nt {
($s:expr) => {
Node::Text($s.into())
};
}
macro_rules! np {
($($n:expr),+) => {
Node::NodeElement(NodeElement {
tag: Tag::P,
attrs: Some(NodeElementAttr {
src: None,
href: None,
}),
children: Some(vec![$($n),+]),
})
};
}
macro_rules! na {
(@$href:expr,$($n:expr),+) => {
Node::NodeElement(NodeElement {
tag: Tag::A,
attrs: Some(NodeElementAttr {
src: None,
href: Some($href.into()),
}),
children: Some(vec![$($n),+]),
})
};
}
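// Usage sketch for the macros above (crate-internal, since they are not
// exported):
//
//     let para = np![nt!("see "), na![@"https://telegra.ph", nt!("telegraph")]];
//
// This builds a P element whose children are a text node and an A element
// linking to https://telegra.ph.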

83
eh2telegraph/src/tls.rs Normal file
View File

@ -0,0 +1,83 @@
use std::sync::Arc;
use rustls::{
client::{ServerCertVerifier, WebPkiVerifier},
Certificate, ClientConfig, RootCertStore, ServerName,
};
/// Certificate verifier that falls back to validating the certificate against
/// a whitelist of DNS names when it does not match the requested hostname.
pub struct WhitelistVerifier<const N: usize> {
verifier: WebPkiVerifier,
dns_names: [&'static str; N],
}
impl<const N: usize> WhitelistVerifier<N> {
pub fn new(dns_names: [&'static str; N]) -> Self {
use rustls::OwnedTrustAnchor;
let mut root_cert_store = RootCertStore::empty();
let trust_anchors = webpki_roots::TLS_SERVER_ROOTS.0.iter().map(|trust_anchor| {
OwnedTrustAnchor::from_subject_spki_name_constraints(
trust_anchor.subject,
trust_anchor.spki,
trust_anchor.name_constraints,
)
});
root_cert_store.add_server_trust_anchors(trust_anchors);
let verifier = WebPkiVerifier::new(root_cert_store, None);
Self {
verifier,
dns_names,
}
}
}
impl<const N: usize> From<WhitelistVerifier<N>> for ClientConfig {
fn from(v: WhitelistVerifier<N>) -> Self {
let mut cfg = ClientConfig::builder()
.with_safe_defaults()
.with_root_certificates(RootCertStore::empty())
.with_no_client_auth();
cfg.dangerous().set_certificate_verifier(Arc::new(v));
cfg
}
}
impl<const N: usize> ServerCertVerifier for WhitelistVerifier<N> {
fn verify_server_cert(
&self,
end_entity: &Certificate,
intermediates: &[Certificate],
server_name: &rustls::ServerName,
scts: &mut dyn Iterator<Item = &[u8]>,
ocsp_response: &[u8],
now: std::time::SystemTime,
) -> Result<rustls::client::ServerCertVerified, rustls::Error> {
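// First run standard WebPKI validation against the requested server name.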
let original_validate_result = self.verifier.verify_server_cert(
end_entity,
intermediates,
server_name,
scts,
ocsp_response,
now,
);
if original_validate_result.is_ok() {
return original_validate_result;
}
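// On failure, retry validation as if the connection had been made to each
// whitelisted DNS name, and accept the certificate if any of them verifies.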
for dns_name in self.dns_names.iter() {
if let Ok(dns_name) = ServerName::try_from(*dns_name) {
let whitelist_validate_result = self.verifier.verify_server_cert(
end_entity,
intermediates,
&dns_name,
scts,
ocsp_response,
now,
);
if whitelist_validate_result.is_ok() {
return whitelist_validate_result;
}
}
}
original_validate_result
}
}
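// Integration sketch (assumes reqwest is built with its rustls backend; the
// hostname below is illustrative):
//
//     let cfg: ClientConfig = WhitelistVerifier::new(["telegra.ph"]).into();
//     let client = reqwest::Client::builder()
//         .use_preconfigured_tls(cfg)
//         .build()
//         .expect("failed to build client");
//
// Certificates issued to telegra.ph then validate even when the request was
// addressed to a different hostname.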

36
eh2telegraph/src/util.rs Normal file
View File

@ -0,0 +1,36 @@
use bytes::Bytes;
use regex::Regex;
use reqwest::Response;
use crate::http_client::HttpRequestBuilder;
#[inline]
pub fn match_first_group<'a>(regexp: &'a Regex, content: &'a str) -> Option<&'a str> {
regexp.captures(content).map(|c| {
c.get(1)
.expect("regexp is matched but no group 1 found")
.as_str()
})
}
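// Example (sketch; the pattern is illustrative):
//
//     let re = Regex::new(r"/g/(\d+)/").unwrap();
//     assert_eq!(match_first_group(&re, "/g/12345/abc/"), Some("12345"));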
#[inline]
pub async fn get_bytes<C: HttpRequestBuilder>(client: &C, link: &str) -> reqwest::Result<Bytes> {
client
.get_builder(link)
.send()
.await
.and_then(Response::error_for_status)?
.bytes()
.await
}
#[inline]
pub async fn get_string<C: HttpRequestBuilder>(client: &C, link: &str) -> reqwest::Result<String> {
client
.get_builder(link)
.send()
.await
.and_then(Response::error_for_status)?
.text()
.await
}

1
rust-toolchain Normal file
View File

@ -0,0 +1 @@
nightly

57
worker/web_proxy.js Normal file
View File

@ -0,0 +1,57 @@
/*
Cloudflare Workers Telegraph proxy.
Deploy it, then set the `KEY` environment variable in the Cloudflare dashboard.
*/
addEventListener('fetch', event => {
event.respondWith(handleRequest(event.request))
})
const RESPONSE_HEADERS = {
"Server": "web-proxy",
};
async function handleRequest(request) {
// validate request key
if (request.headers.get("X-Authorization") !== KEY) {
return new Response(null, {
status: 401,
headers: RESPONSE_HEADERS
});
}
// read the original url
const url = request.headers.get("X-Forwarded-For");
if (!url) {
return new Response(null, {
status: 400,
headers: RESPONSE_HEADERS
});
}
// construct new url and request
let req;
if (request.body && request.method != 'GET' && request.method != 'HEAD') {
req = new Request(new URL(url), {
method: request.method,
headers: request.headers,
body: request.body
});
} else {
req = new Request(new URL(url), {
method: request.method,
headers: request.headers,
});
}
// strip the proxy's own headers and Cloudflare metadata before forwarding
req.headers.delete("X-Authorization");
req.headers.delete("X-Forwarded-For");
req.headers.delete("CF-Connecting-IP");
req.headers.delete("CF-Worker");
req.headers.delete("CF-EW-Via");
// send request
const result = await fetch(req);
return result;
}
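/*
Invocation sketch (hypothetical worker domain and key):

  curl https://proxy.example.workers.dev/ \
    -H "X-Authorization: $KEY" \
    -H "X-Forwarded-For: https://api.telegra.ph/getPage"

The worker strips its own headers plus Cloudflare metadata and relays the
request to the URL given in X-Forwarded-For, returning the upstream response.
*/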