[tumblr] implement 'pagination' option (#5880)

restore pagination behavior from before de670bd
mikf · Jul 23, 2024 · 540eaa5
1 parent 7b445ec
commit 540eaa5
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 11 deletions.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -3735,6 +3735,23 @@ Description
     use an extra HTTP request to find the URL to its full-resolution version.
 
 
+extractor.tumblr.pagination
+---------------------------
+Type
+    ``string``
+Default
+    ``"offset"``
+Description
+    Controls how to paginate over blog posts.
+
+    * ``"api"``: ``next`` parameter provided by the API
+      (potentially misses posts due to a
+      `bug <https://github.com/tumblr/docs/issues/76>`__
+      in Tumblr's API)
+    * ``"before"``: ``before`` parameter set from the timestamp of the last post
+    * ``"offset"``: ``offset`` parameter advanced by the number of posts per request
+
+
 extractor.tumblr.ratelimit
 --------------------------
 Type

diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
@@ -386,7 +386,7 @@ def avatar(self, blog, size="512"):
     def posts(self, blog, params):
         """Retrieve published posts"""
         params["offset"] = self.extractor.config("offset")
-        params["limit"] = "50"
+        params["limit"] = 50
         params["reblog_info"] = "true"
         params["type"] = self.posts_type
         params["before"] = self.before
@@ -398,8 +398,14 @@ def posts(self, blog, params):
 
     def likes(self, blog):
         """Retrieve liked posts"""
+        endpoint = "/v2/blog/{}/likes".format(blog)
         params = {"limit": "50", "before": self.before}
-        return self._pagination(blog, "/likes", params, key="liked_posts")
+        while True:
+            posts = self._call(endpoint, params)["liked_posts"]
+            if not posts:
+                return
+            yield from posts
+            params["before"] = posts[-1]["liked_timestamp"]
 
     def _call(self, endpoint, params, **kwargs):
         url = self.ROOT + endpoint
@@ -474,20 +480,39 @@ def _pagination(self, blog, endpoint, params, key="posts", cache=False):
         if self.api_key:
             params["api_key"] = self.api_key
 
+        strategy = self.extractor.config("pagination")
         while True:
             data = self._call(endpoint, params)
 
             if cache:
                 self.BLOG_CACHE[blog] = data["blog"]
                 cache = False
 
-            yield from data[key]
-
-            try:
-                endpoint = data["_links"]["next"]["href"]
-            except KeyError:
-                return
+            posts = data[key]
+            yield from posts
 
-            params = None
-            if self.api_key:
-                endpoint += "&api_key=" + self.api_key
+            if strategy == "api":
+                try:
+                    endpoint = data["_links"]["next"]["href"]
+                except KeyError:
+                    return
+
+                params = None
+                if self.api_key:
+                    endpoint += "&api_key=" + self.api_key
+
+            elif strategy == "before":
+                if not posts:
+                    return
+                timestamp = posts[-1]["timestamp"] + 1
+                if params["before"] and timestamp >= params["before"]:
+                    return
+                params["before"] = timestamp
+                params["offset"] = None
+
+            else:  # offset
+                params["offset"] = \
+                    text.parse_int(params["offset"]) + params["limit"]
+                params["before"] = None
+                if params["offset"] >= data["total_posts"]:
+                    return