From 540eaa5addf0e8489deccc9f14c2d84c49802aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 23 Jul 2024 20:31:04 +0200 Subject: [PATCH] [tumblr] implement 'pagination' option (#5880) restore pagination behavior from before de670bd7de8600a1481ee6366680d3e08659f0f4 --- docs/configuration.rst | 17 ++++++++++++ gallery_dl/extractor/tumblr.py | 47 ++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c3ed562629..eb4cfd91f5 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3735,6 +3735,23 @@ Description use an extra HTTP request to find the URL to its full-resolution version. +extractor.tumblr.pagination +--------------------------- +Type + ``string`` +Default + ``"offset"`` +Description + Controls how to paginate over blog posts. + + * ``"api"``: ``next`` parameter provided by the API + (potentially misses posts due to a + `bug `__ + in Tumblr's API) + * ``"before"``: timestamp of last post + * ``"offset"``: post offset number + + extractor.tumblr.ratelimit -------------------------- Type diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index c34910f815..ff29c046f1 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -386,7 +386,7 @@ def avatar(self, blog, size="512"): def posts(self, blog, params): """Retrieve published posts""" params["offset"] = self.extractor.config("offset") - params["limit"] = "50" + params["limit"] = 50 params["reblog_info"] = "true" params["type"] = self.posts_type params["before"] = self.before @@ -398,8 +398,14 @@ def posts(self, blog, params): def likes(self, blog): """Retrieve liked posts""" + endpoint = "/v2/blog/{}/likes".format(blog) params = {"limit": "50", "before": self.before} - return self._pagination(blog, "/likes", params, key="liked_posts") + while True: + posts = self._call(endpoint, params)["liked_posts"] + if not posts: + return + yield from posts + params["before"] = posts[-1]["liked_timestamp"] def _call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint @@ -474,6 +480,7 @@ def _pagination(self, blog, endpoint, params, key="posts", cache=False): if self.api_key: params["api_key"] = self.api_key + strategy = self.extractor.config("pagination") while True: data = self._call(endpoint, params) @@ -481,13 +488,31 @@ def _pagination(self, blog, endpoint, params, key="posts", cache=False): self.BLOG_CACHE[blog] = data["blog"] cache = False - yield from data[key] - - try: - endpoint = data["_links"]["next"]["href"] - except KeyError: - return + posts = data[key] + yield from posts - params = None - if self.api_key: - endpoint += "&api_key=" + self.api_key + if strategy == "api": + try: + endpoint = data["_links"]["next"]["href"] + except KeyError: + return + + params = None + if self.api_key: + endpoint += "&api_key=" + self.api_key + + elif strategy == "before": + if not posts: + return + timestamp = posts[-1]["timestamp"] + 1 + if params["before"] and timestamp >= params["before"]: + return + params["before"] = timestamp + params["offset"] = None + + else: # offset + params["offset"] = \ + text.parse_int(params["offset"]) + params["limit"] + params["before"] = None + if params["offset"] >= data["total_posts"]: + return