// eh2telegraph/eh2telegraph/src/sync.rs

use crate::{
    buffer::{DataSized, ImageBuffer},
    collector::{
        AlbumMeta, Collector, ImageData, ImageMeta, Param, Registry, URL_FROM_TEXT_RE,
        URL_FROM_URL_RE,
    },
    http_proxy::ProxiedClient,
    storage::{cloudflare_kv::CFStorage, KVStorage},
    stream::{AsyncStream, Buffered},
    telegraph::{
        types::{Node, Page, PageCreate},
        RandomAccessToken, Telegraph, TelegraphError, MAX_SINGLE_FILE_SIZE,
    },
    util::match_first_group,
};

// Give up after this many consecutive stream errors.
const ERR_THRESHOLD: usize = 10;
// Flush the upload buffer once it holds more than this many images.
const BATCH_LEN_THRESHOLD: usize = 20;
// Flush the upload buffer once it exceeds this size (5 MiB).
const BATCH_SIZE_THRESHOLD: usize = 5 * 1024 * 1024;
// Default number of images downloaded concurrently.
const DEFAULT_CONCURRENT: usize = 20;

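/// Error produced while synchronizing an album: either the image stream
/// failed too many times in a row, or Telegraph rejected a request.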
#[derive(thiserror::Error, Debug)]
pub enum UploadError<SE> {
    #[error("stream error {0}")]
    Stream(SE),
    #[error("telegraph error {0}")]
    Reqwest(#[from] TelegraphError),
}
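
/// Synchronizes a remote album to a Telegraph page: images are pulled from a
/// `Collector`, uploaded to Telegraph in batches, and the resulting page URL
/// is cached in a `KVStorage` backend (Cloudflare KV by default).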
pub struct Synchronizer<C = CFStorage> {
    tg: Telegraph<RandomAccessToken, ProxiedClient>,
    limit: Option<usize>,
    author_name: Option<String>,
    author_url: Option<String>,
    cache_ttl: Option<usize>,
    registry: Registry,
    cache: C,
}

impl<CACHE> Synchronizer<CACHE>
where
    CACHE: KVStorage<String>,
{
    // cache ttl is 45 days
    const DEFAULT_CACHE_TTL: usize = 3600 * 24 * 45;
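
    /// Builds a `Synchronizer`; optional settings are applied with the
    /// `with_*` builder methods. A minimal construction sketch (hypothetical:
    /// it assumes `tg`, `registry`, and `cache` are built elsewhere):
    ///
    /// ```ignore
    /// let synchronizer = Synchronizer::new(tg, registry, cache)
    ///     .with_concurrent_limit(10)
    ///     .with_author(Some("my_bot"), Some("https://t.me/my_bot"))
    ///     .with_cache_ttl(None);
    /// ```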
    pub fn new(
        tg: Telegraph<RandomAccessToken, ProxiedClient>,
        registry: Registry,
        cache: CACHE,
    ) -> Self {
        Self {
            tg,
            limit: None,
            author_name: None,
            author_url: None,
            cache_ttl: None,
            registry,
            cache,
        }
    }

    pub fn with_concurrent_limit(mut self, limit: usize) -> Self {
        self.limit = Some(limit);
        self
    }

    pub fn with_author<S: Into<String>>(mut self, name: Option<S>, url: Option<S>) -> Self {
        self.author_name = name.map(Into::into);
        self.author_url = url.map(Into::into);
        self
    }

    pub fn with_cache_ttl(mut self, ttl: Option<usize>) -> Self {
        self.cache_ttl = ttl;
        self
    }

    pub async fn delete_cache(&self, key: &str) -> anyhow::Result<()> {
        self.cache.delete(key).await
    }
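
    /// Synchronizes one album identified by `path` using collector `C`.
    /// The cache key is `"{collector_name}|{path}"`, so the same path fetched
    /// through a different collector is cached independently; a cache hit
    /// returns the stored page URL without re-downloading anything.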
    pub async fn sync<C: Collector>(&self, path: String) -> anyhow::Result<String>
    where
        Registry: Param<C>,
        C::FetchError: Into<anyhow::Error> + Send + 'static,
        C::StreamError:
            Into<anyhow::Error> + std::fmt::Debug + std::fmt::Display + Send + Sync + 'static,
        C::ImageStream: Send + 'static,
        <C::ImageStream as AsyncStream>::Future: Send + 'static,
    {
        // Check the cache first.
        let cache_key = format!("{}|{}", C::name(), path);
        if let Ok(Some(v)) = self.cache.get(&cache_key).await {
            tracing::info!("[cache] hit key {cache_key}");
            return Ok(v);
        }
        tracing::info!("[cache] miss key {cache_key}");

        let collector: &C = self.registry.get();
        let (meta, stream) = collector.fetch(path).await.map_err(Into::into)?;
        let page = self
            .sync_stream(meta, stream)
            .await
            .map_err(anyhow::Error::from)?;

        // Cache the resulting page URL; a failed cache write is not fatal.
        let _ = self
            .cache
            .set(
                cache_key,
                page.url.clone(),
                Some(self.cache_ttl.unwrap_or(Self::DEFAULT_CACHE_TTL)),
            )
            .await;
        Ok(page.url)
    }
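
    /// Uploads an already-fetched image stream and creates the page. The
    /// stream is wrapped in `Buffered` so up to `limit` downloads (default
    /// `DEFAULT_CONCURRENT`) are driven concurrently; the outcome is logged
    /// either way.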
    pub async fn sync_stream<S, SE>(
        &self,
        meta: AlbumMeta,
        stream: S,
    ) -> Result<Page, UploadError<SE>>
    where
        SE: Send + std::fmt::Debug + 'static,
        S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
        S::Future: Send + 'static,
    {
        let buffered_stream = Buffered::new(stream, self.limit.unwrap_or(DEFAULT_CONCURRENT));
        let r = self.inner_sync_stream(meta, buffered_stream).await;
        match &r {
            Ok(p) => {
                tracing::info!("[sync] sync success with url {}", p.url);
            }
            Err(e) => {
                tracing::error!("[sync] sync failed! {e:?}");
            }
        }
        r
    }
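
    /// Core loop: (1) drain the stream into an `ImageBuffer` until a batch
    /// threshold is hit, (2) upload the batch to Telegraph, (3) remember each
    /// uploaded image, then assemble everything into a `PageCreate` request.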
    async fn inner_sync_stream<S, SE>(
        &self,
        meta: AlbumMeta,
        mut stream: S,
    ) -> Result<Page, UploadError<SE>>
    where
        S: AsyncStream<Item = Result<(ImageMeta, ImageData), SE>>,
    {
        let mut err_count = 0;
        let mut uploaded = Vec::new();
        let mut buffer = ImageBuffer::new();
        // In this loop we download images and upload them in batches; the meta
        // info of every uploaded image is collected into `uploaded`.
        loop {
            // TODO: loading images one by one is too slow!
            // We could spawn a background task (FuturesUnordered) fed through a
            // channel while still exposing it as an AsyncStream, which would
            // require no changes on the consuming side.
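            //
            // A rough sketch of that idea (hypothetical, not part of this
            // crate), using a bounded tokio channel so downloads overlap with
            // uploads:
            //
            //     let (tx, mut rx) = tokio::sync::mpsc::channel(DEFAULT_CONCURRENT);
            //     tokio::spawn(async move {
            //         while let Some(fut) = stream.next() {
            //             if tx.send(fut.await).await.is_err() {
            //                 break; // receiver dropped, stop fetching
            //             }
            //         }
            //     });
            //     // ...and the batching loop below would read from `rx`.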
            // 1. Download images in batches.
            while let Some(fut) = stream.next() {
                let data = match fut.await {
                    Err(e) => {
                        err_count += 1;
                        if err_count > ERR_THRESHOLD {
                            return Err(UploadError::Stream(e));
                        }
                        continue;
                    }
                    Ok(d) => {
                        err_count = 0;
                        d
                    }
                };
                // If the image is too large to upload, discard it.
                if data.1.len() >= MAX_SINGLE_FILE_SIZE {
                    tracing::error!("File too big, discarded. Meta: {:?}", data.0);
                    continue;
                }
                buffer.push(data);
                if buffer.len() > BATCH_LEN_THRESHOLD || buffer.size() > BATCH_SIZE_THRESHOLD {
                    break;
                }
            }
            // Nothing was buffered: everything has been downloaded and
            // uploaded, so leave the loop.
            if buffer.is_empty() {
                break;
            }

            // 2. Upload the batch.
            let (full_data, size) = buffer.swap();
            let image_count = full_data.len();
            tracing::debug!("downloaded {image_count} images with size {size}, will upload them");
            let (meta, data) = full_data
                .into_iter()
                .map(|(a, b)| (a, b.as_ref().to_owned()))
                .unzip::<_, _, Vec<_>, Vec<_>>();
            let medium = self.tg.upload(data).await?;
            err_count = 0;

            // 3. Record the uploaded images.
            tracing::debug!("uploaded {image_count} images with size {size}, medium: {medium:?}");
            uploaded.extend(
                meta.into_iter()
                    .zip(medium.into_iter().map(|x| x.src))
                    .map(|(meta, src)| UploadedImage { meta, src }),
            );
        }
        // Create the Telegraph page.
        let mut content: Vec<_> = uploaded.into_iter().map(Into::into).collect();
        content.push(Node::new_p_text("Generated by eh2telegraph."));
        content.push(Node::new_p_text(format!("Original link: {}", meta.link)));
        self.tg
            .create_page(&PageCreate {
                title: meta.name,
                content,
                author_name: self
                    .author_name
                    .clone()
                    .or_else(|| meta.authors.map(|x| x.join(", "))),
                author_url: self.author_url.clone(),
            })
            .await
            .map_err(Into::into)
    }
}

impl Synchronizer {
    /// Extracts the first supported URL from free-form text via `URL_FROM_TEXT_RE`.
    pub fn match_url_from_text(content: &str) -> Option<&str> {
        match_first_group(&URL_FROM_TEXT_RE, content)
    }

    /// Extracts the first supported URL from a link via `URL_FROM_URL_RE`.
    pub fn match_url_from_url(content: &str) -> Option<&str> {
        match_first_group(&URL_FROM_URL_RE, content)
    }
}

impl DataSized for (ImageMeta, ImageData) {
    #[inline]
    fn size(&self) -> usize {
        // Only the image payload counts toward batch size, not its metadata.
        self.1.size()
    }
}

/// An image that has been uploaded to Telegraph: the original metadata plus
/// the `src` path returned by the upload.
struct UploadedImage {
    #[allow(unused)]
    meta: ImageMeta,
    src: String,
}

impl From<UploadedImage> for Node {
    fn from(i: UploadedImage) -> Self {
        // Telegraph returns a relative `src` path; prefix the host to form a
        // full URL.
        Node::new_image(format!("https://telegra.ph{}", i.src))
    }
}
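
// End-to-end usage sketch (hypothetical: the collector type `EHCollector` and
// the construction of `tg`, `registry`, and `cache` are assumed to live
// elsewhere in the crate):
//
//     let synchronizer = Synchronizer::new(tg, registry, cache)
//         .with_author(Some("eh2telegraph"), None);
//     let url = synchronizer.sync::<EHCollector>(path).await?;
//     println!("published: {url}");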