diff --git a/Dockerfile b/Dockerfile index e30b1ac8..697a176d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,18 @@ ARG PYTHON_VERSION=3.8 +ARG CURL_IMPERSONATE_VERSION=0.5-chrome +FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl + # Builder FROM python:${PYTHON_VERSION}-alpine as builder -RUN apk add --update git build-base libffi-dev +RUN apk add --update git build-base libffi-dev curl-dev WORKDIR /root +COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/ +COPY --from=curl /usr/local/lib/ /usr/local/lib/ + # Install requirements COPY requirements.txt /root RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt @@ -34,9 +40,18 @@ RUN apk add --no-cache curl # Install FFmpeg COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/ +# cURL Impersonate libraries +COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/ +COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/ + # Copy pip requirements COPY --from=builder /install /usr/local +# Copy CA certificates for curl_cffi, can be removed once v0.6 is officially released +RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" &&\ + CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \ + cp $CA_FILE $PYTHON_LIB_PATH/curl_cffi/ + WORKDIR /app COPY nazurin ./nazurin diff --git a/Dockerfile.debian b/Dockerfile.debian index d7c3b440..e7838312 100644 --- a/Dockerfile.debian +++ b/Dockerfile.debian @@ -1,12 +1,20 @@ ARG PYTHON_VERSION=3.8 +ARG CURL_IMPERSONATE_VERSION=0.5-chrome +FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl + # Builder FROM python:${PYTHON_VERSION}-slim as builder -RUN apt-get update && apt-get install -y --no-install-recommends git wget gcc xz-utils +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git wget gcc xz-utils libcurl4-openssl-dev WORKDIR /root +COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/ +COPY --from=curl /usr/local/lib/ 
/usr/local/lib/ + # Install requirements COPY requirements.txt /root RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt @@ -34,9 +42,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl # Install FFmpeg COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/ +# cURL Impersonate libraries +COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/ +COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/ + # Copy pip requirements COPY --from=builder /install /usr/local +# Copy CA certificates for curl_cffi, can be removed once v0.6 is officially released +RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" &&\ + CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \ + cp $CA_FILE $PYTHON_LIB_PATH/curl_cffi/ + WORKDIR /app COPY nazurin ./nazurin diff --git a/nazurin/config.py b/nazurin/config.py index 16f454be..abc6eee7 100644 --- a/nazurin/config.py +++ b/nazurin/config.py @@ -1,4 +1,5 @@ from os import path +from typing import List, Optional from environs import Env @@ -6,45 +7,45 @@ # read config from .env file if exists env.read_env() -ENV = env.str("ENV", default="production") -TOKEN = env.str("TOKEN") +ENV: str = env.str("ENV", default="production") +TOKEN: str = env.str("TOKEN") # Webhook url, eg: https://xxx.fly.dev/, should end with '/' -WEBHOOK_URL = env.str("WEBHOOK_URL", default=None) -HOST = env.str("HOST", default="0.0.0.0") +WEBHOOK_URL: Optional[str] = env.str("WEBHOOK_URL", default=None) +HOST: str = env.str("HOST", default="0.0.0.0") # Port is automatically set if on Heroku or fly.io -PORT = env.int("PORT", default=80) +PORT: int = env.int("PORT", default=80) -STORAGE = env.list("STORAGE", subcast=str, default=["Local"]) -STORAGE_DIR = env.str("STORAGE_DIR", default="Pictures") +STORAGE: List[str] = env.list("STORAGE", subcast=str, default=["Local"]) +STORAGE_DIR: str = env.str("STORAGE_DIR", default="Pictures") -DATABASE = env.str("DATABASE", 
default="Local") +DATABASE: str = env.str("DATABASE", default="Local") # Nazurin data collection in database -NAZURIN_DATA = "nazurin" +NAZURIN_DATA: str = "nazurin" # Ignored items in image caption -CAPTION_IGNORE = env.list("CAPTION_IGNORE", subcast=str, default=[]) +CAPTION_IGNORE: List[str] = env.list("CAPTION_IGNORE", subcast=str, default=[]) -GALLERY_ID = env.int("GALLERY_ID", default=None) +GALLERY_ID: Optional[int] = env.int("GALLERY_ID", default=None) -ADMIN_ID = env.int("ADMIN_ID") -IS_PUBLIC = env.bool("IS_PUBLIC", default=False) +ADMIN_ID: int = env.int("ADMIN_ID") +IS_PUBLIC: bool = env.bool("IS_PUBLIC", default=False) # If IS_PUBLIC is True, the following items will be ignored -ALLOW_ID = env.list("ALLOW_ID", subcast=int, default=[]) -ALLOW_USERNAME = env.list("ALLOW_USERNAME", default=[]) -ALLOW_GROUP = env.list("ALLOW_GROUP", subcast=int, default=[]) - -RETRIES = env.int("RETRIES", default=5) -TIMEOUT = env.int("TIMEOUT", default=20) -DOWNLOAD_CHUNK_SIZE = env.int("DOWNLOAD_CHUNK_SIZE", default=4096) -PROXY = env.str("HTTP_PROXY", default=None) -UA = ( +ALLOW_ID: List[int] = env.list("ALLOW_ID", subcast=int, default=[]) +ALLOW_USERNAME: List[str] = env.list("ALLOW_USERNAME", default=[]) +ALLOW_GROUP: List[int] = env.list("ALLOW_GROUP", subcast=int, default=[]) + +RETRIES: int = env.int("RETRIES", default=5) +TIMEOUT: int = env.int("TIMEOUT", default=20) +DOWNLOAD_CHUNK_SIZE: int = env.int("DOWNLOAD_CHUNK_SIZE", default=4096) +PROXY: Optional[str] = env.str("HTTP_PROXY", default=None) +UA: str = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ) # Local directory to store database and temporary files -DATA_DIR = "data" -TEMP_DIR = path.join(DATA_DIR, "temp") -CLEANUP_INTERVAL = env.int("CLEANUP_INTERVAL", default=7) -ACCESS_LOG_FORMAT = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"' +DATA_DIR: str = "data" +TEMP_DIR: str = path.join(DATA_DIR, "temp") +CLEANUP_INTERVAL: int = 
env.int("CLEANUP_INTERVAL", default=7) +ACCESS_LOG_FORMAT: str = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"' diff --git a/nazurin/models/file.py b/nazurin/models/file.py index 653b1085..1134065e 100644 --- a/nazurin/models/file.py +++ b/nazurin/models/file.py @@ -4,9 +4,8 @@ import aiofiles import aiofiles.os -import aiohttp -from nazurin.config import DOWNLOAD_CHUNK_SIZE, STORAGE_DIR, TEMP_DIR +from nazurin.config import STORAGE_DIR, TEMP_DIR from nazurin.utils import logger from nazurin.utils.decorators import network_retry from nazurin.utils.helpers import ( @@ -14,6 +13,7 @@ sanitize_filename, sanitize_path, ) +from nazurin.utils.network import NazurinRequestSession @dataclass @@ -63,15 +63,11 @@ async def exists(self) -> bool: return False @network_retry - async def download(self, session: aiohttp.ClientSession): + async def download(self, session: NazurinRequestSession): if await self.exists(): logger.info("File {} already exists", self.path) return True await ensure_existence_async(TEMP_DIR) - async with session.get(self.url) as response: - logger.info("Downloading {} to {}...", self.url, self.path) - response.raise_for_status() - async with aiofiles.open(self.path, "wb") as f: - async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE): - await f.write(chunk) + logger.info("Downloading {} to {}...", self.url, self.path) + await session.download(self.url, self.path) logger.info("Downloaded to {}", self.path) diff --git a/nazurin/models/illust.py b/nazurin/models/illust.py index 2039756f..1be2c356 100644 --- a/nazurin/models/illust.py +++ b/nazurin/models/illust.py @@ -3,6 +3,7 @@ from typing import List from nazurin.utils import Request +from nazurin.utils.network import NazurinRequestSession from .caption import Caption from .file import File @@ -26,8 +27,10 @@ def has_image(self) -> bool: def has_multiple_images(self) -> bool: return len(self.images) > 1 - async def download(self, **kwargs): - async with Request(**kwargs) as session: + async 
def download( + self, *, request_class: NazurinRequestSession = Request, **kwargs + ): + async with request_class(**kwargs) as session: tasks = [] for file in self.all_files: if not file.url: diff --git a/nazurin/models/image.py b/nazurin/models/image.py index f2bbf4b4..1d3340a7 100644 --- a/nazurin/models/image.py +++ b/nazurin/models/image.py @@ -98,7 +98,9 @@ async def download(self, session: aiohttp.ClientSession): i + 1, RETRIES, ) - os.remove(self.path) + if i < RETRIES - 1: + # Keep the last one for debugging + os.remove(self.path) if not is_valid: raise NazurinError( "Download failed with invalid image, please check logs for details" diff --git a/nazurin/sites/danbooru/api.py b/nazurin/sites/danbooru/api.py index 49a0291e..ebb3c652 100644 --- a/nazurin/sites/danbooru/api.py +++ b/nazurin/sites/danbooru/api.py @@ -7,7 +7,8 @@ from pybooru import Danbooru as danbooru from pybooru import PybooruHTTPError -from nazurin.models import Caption, File, Illust, Image +from nazurin.models import Caption, File, Image +from nazurin.sites.danbooru.models import DanbooruIllust from nazurin.utils.decorators import async_wrap from nazurin.utils.exceptions import NazurinError from nazurin.utils.helpers import is_image @@ -43,12 +44,12 @@ async def get_post(self, post_id: Optional[int] = None, md5: Optional[str] = Non async def view( self, post_id: Optional[int] = None, md5: Optional[str] = None - ) -> Illust: + ) -> DanbooruIllust: post = await self.get_post(post_id, md5) illust = self.parse_post(post) return illust - def parse_post(self, post) -> Illust: + def parse_post(self, post) -> DanbooruIllust: """Get images and build caption.""" # Get images url = post["file_url"] @@ -88,7 +89,7 @@ def parse_post(self, post) -> Illust: "has_children": post["has_children"], } ) - return Illust(imgs, caption, post, files) + return DanbooruIllust(imgs, caption, post, files) @staticmethod def get_storage_dest(post: dict, filename: str) -> Tuple[str, str]: diff --git 
a/nazurin/sites/danbooru/models.py b/nazurin/sites/danbooru/models.py new file mode 100644 index 00000000..59ad4648 --- /dev/null +++ b/nazurin/sites/danbooru/models.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + +from nazurin.models import Illust +from nazurin.utils.network import CurlRequest + + +@dataclass +class DanbooruIllust(Illust): + async def download(self, **kwargs): + await super().download(request_class=CurlRequest, **kwargs) diff --git a/nazurin/utils/helpers.py b/nazurin/utils/helpers.py index 22ebde51..567c2045 100644 --- a/nazurin/utils/helpers.py +++ b/nazurin/utils/helpers.py @@ -208,6 +208,6 @@ def check_image(path: Union[str, os.PathLike]) -> bool: image = Image.open(path) image.load() return True - except Exception as error: + except OSError as error: logger.warning("Invalid image {}: {}", path, error) return False diff --git a/nazurin/utils/network.py b/nazurin/utils/network.py index 06384c52..8421d53b 100644 --- a/nazurin/utils/network.py +++ b/nazurin/utils/network.py @@ -1,17 +1,53 @@ +import abc +import os +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import AsyncContextManager, Generator, Optional, Union + +import aiofiles from aiohttp import ClientSession, ClientTimeout, TCPConnector +from curl_cffi.requests import AsyncSession as CurlSession +from curl_cffi.requests import Response as CurlResponse + +from nazurin.config import DOWNLOAD_CHUNK_SIZE, PROXY, TIMEOUT, UA +from nazurin.utils.logging import logger + -from nazurin.config import PROXY, TIMEOUT, UA +class NazurinRequestSession(AbstractAsyncContextManager): + def __init__( + self, + cookies: Optional[dict] = None, + headers: Optional[dict] = None, + timeout: int = TIMEOUT, + **kwargs + ): + raise NotImplementedError + @abc.abstractmethod + async def get(self, *args, **kwargs) -> AsyncContextManager: + raise NotImplementedError -class Request(ClientSession): - """Wrapped ClientSession with default user agent, timeout and proxy 
support.""" + @abc.abstractmethod + async def download(self, url: str, destination: Union[str, os.PathLike]): + raise NotImplementedError + + +class Request(ClientSession, NazurinRequestSession): + """ + Wrapped ClientSession with default user agent, + timeout and proxy support. + """ def __init__( - self, cookies=None, headers=None, timeout=ClientTimeout(total=TIMEOUT), **kwargs + self, + cookies: Optional[dict] = None, + headers: Optional[dict] = None, + timeout: int = TIMEOUT, + **kwargs ): - if not headers: - headers = {} + headers = headers or {} headers.update({"User-Agent": UA}) + timeout = ClientTimeout(total=timeout) + connector = None if PROXY: connector = TCPConnector(ssl=False) @@ -23,3 +59,60 @@ def __init__( timeout=timeout, **kwargs ) + + async def download(self, url: str, destination: Union[str, os.PathLike]): + async with self.get(url) as response: + if not response.ok: + logger.error("Download failed with status code {}", response.status) + logger.info("Response: {}", await response.content.read()) + response.raise_for_status() + async with aiofiles.open(destination, "wb") as f: + async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE): + await f.write(chunk) + + +class CurlRequest(CurlSession, NazurinRequestSession): + """ + Wrapped curl_cffi AsyncSession to impersonate a browser, + with timeout and proxy support. 
+ """ + + def __init__( + self, + cookies: Optional[dict] = None, + headers: Optional[dict] = None, + timeout: int = TIMEOUT, + **kwargs + ): + self.cookies = cookies + self.headers = headers + self.timeout = timeout + self.proxies = {"https": PROXY, "http": PROXY} if PROXY else None + super().__init__(**kwargs) + + @asynccontextmanager + async def get( + self, *args, impersonate: str = "chrome110", **kwargs + ) -> Generator[CurlResponse, None, None]: + yield await super().request( + "GET", + *args, + cookies=self.cookies, + headers=self.headers, + timeout=self.timeout, + impersonate=impersonate, + proxies=self.proxies, + **kwargs + ) + + async def download(self, url: str, destination: Union[str, os.PathLike]): + async with self.get(url, stream=True) as response: + if not response.ok: + logger.error( + "Download failed with status code {}", response.status_code + ) + logger.info("Response: {}", response.content) + response.raise_for_status() + async with aiofiles.open(destination, "wb") as f: + async for chunk in response.aiter_content(): + await f.write(chunk) diff --git a/requirements.txt b/requirements.txt index 08808ac6..2b923b3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ async_lru~=2.0.2 loguru~=0.6.0 humanize~=4.8.0 pillow~=10.1.0 +curl_cffi~=0.5.10 pixivpy3~=3.7.2 beautifulsoup4~=4.10.0