Skip to content

Commit

Permalink
[photos18] do pagination
Browse files Browse the repository at this point in the history
Photos18ListExtractor stops paginating when a page contains no posts, because
(as of this commit) some pages have incorrect navigation bars (for example,
https://www.photos18.com/cat/2?page=75 still has a link to the next page even
though that page number is far beyond the last page that actually lists posts)
  • Loading branch information
the-blank-x committed Jan 19, 2024
1 parent 2b3b08b commit 3f82852
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 41 deletions.
90 changes: 49 additions & 41 deletions gallery_dl/extractor/photos18.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,41 +22,6 @@ class Photos18Extractor(Extractor):
archive_fmt = "{filename}"
root = "https://www.photos18.com"

def items(self):
    """Yield one Directory message and one Url message per image, per post.

    Each post ID produced by ``self.posts()`` is turned into its
    ``/v/<post_id>`` page under ``self.root``; that page is scraped for the
    category, publication date, title and image links.
    """
    for post_id in self.posts():
        html = self.request(self.root + "/v/" + post_id).text
        extr = text.extract_from(html)

        # NOTE(review): extr appears to continue from where the previous
        # match ended (the image loop below relies on repeated identical
        # calls returning different values), so keep these lookups in
        # page order.
        category_id = int(
            extr('<li class="breadcrumb-item"><a href="/cat/', '"'))
        category_name = text.unescape(extr('>', '<'))
        date = text.parse_datetime(extr('"datePublished":"', '"'))
        title = text.unescape(
            extr('<h1 class="title py-1">', '</h1>')).strip()

        # Collect image links until the extractor yields a falsy value.
        image_urls = []
        src = text.unescape(
            extr('<div class="my-2 imgHolder"><a href="', '"'))
        while src:
            image_urls.append(src)
            src = text.unescape(
                extr('<div class="my-2 imgHolder"><a href="', '"'))

        # A single dict is shared by every message of this post; only
        # "num" is overwritten for each image.
        data = {
            "post_id": post_id,
            "title": title,
            "category_id": category_id,
            "category_name": category_name,
            "date": date,
            "count": len(image_urls),
            "_http_headers": {"Referer": self.root},
        }
        yield Message.Directory, data

        for num, src in enumerate(image_urls, 1):
            data["num"] = num
            yield Message.Url, src, text.nameext_from_url(src, data)


class Photos18AlbumExtractor(Photos18Extractor):
"""Extractor for a single album URL"""
Expand All @@ -68,8 +33,39 @@ def __init__(self, match):
Photos18Extractor.__init__(self, match)
self.post_id = match.group(1)

def posts(self):
    """Return a one-element tuple holding this album's post ID."""
    post_ids = (self.post_id,)
    return post_ids
def items(self):
    """Yield a Directory message, then one Url message per album image.

    The album page ``/v/<post_id>`` under ``self.root`` is fetched and
    scraped for the category, publication date, title and image links.
    """
    html = self.request(self.root + "/v/" + self.post_id).text
    extr = text.extract_from(html)

    # NOTE(review): extr appears to continue from where the previous match
    # ended (the image loop below relies on repeated identical calls
    # returning different values), so keep these lookups in page order.
    category_id = int(
        extr('<li class="breadcrumb-item"><a href="/cat/', '"'))
    category_name = text.unescape(extr('>', '<'))
    date = text.parse_datetime(extr('"datePublished":"', '"'))
    title = text.unescape(
        extr('<h1 class="title py-1">', '</h1>')).strip()

    # Collect image links until the extractor yields a falsy value.
    image_urls = []
    src = text.unescape(
        extr('<div class="my-2 imgHolder"><a href="', '"'))
    while src:
        image_urls.append(src)
        src = text.unescape(
            extr('<div class="my-2 imgHolder"><a href="', '"'))

    # A single dict is shared by every message of this album; only "num"
    # is overwritten for each image.
    data = {
        "post_id": self.post_id,
        "title": title,
        "category_id": category_id,
        "category_name": category_name,
        "date": date,
        "count": len(image_urls),
        "_http_headers": {"Referer": self.root},
    }
    yield Message.Directory, data

    for num, src in enumerate(image_urls, 1):
        data["num"] = num
        yield Message.Url, src, text.nameext_from_url(src, data)


class Photos18ListExtractor(Photos18Extractor):
Expand All @@ -86,9 +82,9 @@ def __init__(self, match):
self.q = text.unquote(match.group(4) or "") or query.get("q")
self.category_id = match.group(1) or query.get("category_id")
self.sort = match.group(2) or match.group(3) or query.get("sort")
self.page = query.get("page")
self.page = int(query.get("page") or 1)

def posts(self):
def items(self):
query = {}
if self.q:
query["q"] = self.q
Expand All @@ -99,5 +95,17 @@ def posts(self):
if self.page:
query["page"] = self.page

page = self.request(self.root, params=query).text
return text.extract_iter(page, '<a class="visited" href="/v/', '"')
while True:
has_post = False
page = self.request(self.root, params=query).text

for i in text.extract_iter(
page, '<a class="visited" href="/v/', '"'):
has_post = True
url = self.root + "/v/" + i
data = {"_extractor": Photos18AlbumExtractor}
yield Message.Queue, url, data

if not has_post or '<li class="page-item next">' not in page:
break
query["page"] += 1
8 changes: 8 additions & 0 deletions test/results/photos18.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,12 @@
"category_name": "歐美寫真",
},

{
"#url" : "https://www.photos18.com",
"#category": ("", "photos18", "list"),
"#class" : photos18.Photos18ListExtractor,
"#range" : "1-200",
"#count" : 200,
},

)

0 comments on commit 3f82852

Please sign in to comment.