feat: ettoday crawler & adjust the return URL for factcheckcenter news
FlianLiu committed Oct 6, 2024
1 parent 8c90796 commit c8fe566
Showing 3 changed files with 161 additions and 2 deletions.
1 change: 1 addition & 0 deletions sync_crawler/crawlers/__init__.py
@@ -3,5 +3,6 @@
 from .base_crawler import BaseCrawler as BaseCrawler
 from .cts_crawler import CtsCrawler as CtsCrawler
 from .ebc_crawler import EbcCrawler as EbcCrawler
+from .ettoday_crawler import EttodayCrawler as EttodayCrawler
 from .factcheckcenter_crawler import FactcheckcenterCrawler as FactcheckcenterCrawler
 from .ltn_crawler import LtnCrawler as LtnCrawler
158 changes: 158 additions & 0 deletions sync_crawler/crawlers/ettoday_crawler.py
@@ -0,0 +1,158 @@
from collections.abc import Iterable
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime
from itertools import count
from typing import Optional

import bs4
import requests
from dateutil.parser import parse

from sync_crawler.crawlers.base_crawler import BaseCrawler
from sync_crawler.models.news import News


@dataclass
class EttodayNewsMetadata:
publish_time: datetime
category: Optional[str]
url: str


class EttodayCrawler(BaseCrawler):
media_name = "ettoday"
metadata_api = "https://www.ettoday.net"

    def read(self, start_from: datetime) -> Iterable[News]:
        for page in count():
            try:
                metadatas = self._fetch_metadata(page)
            except Exception as e:
                self.logger.error(
                    f'Stop crawling, because of {type(e).__name__}: "{e}"'
                )
                # Bail out: falling through here would reuse a stale or
                # undefined `metadatas`.
                return

            if not metadatas or metadatas[0].publish_time < start_from:
                self.logger.info("Finish crawling.")
                return

with ThreadPoolExecutor() as executor:
metadatas = filter(lambda x: x.publish_time >= start_from, metadatas)

future_to_url = {
executor.submit(self._crawl_news, metadata): metadata
for metadata in metadatas
}
for future in as_completed(future_to_url):
try:
news = future.result()
except Exception as e:
self.logger.error(
f'{future_to_url[future].url}: {type(e).__name__}: "{e}"'
)
else:
yield news

def _crawl_news(self, metadata: EttodayNewsMetadata) -> News:
response = requests.get(
metadata.url,
allow_redirects=False,
)
response.raise_for_status()
response.encoding = "utf-8"

soup = bs4.BeautifulSoup(response.text, "html.parser")
html = soup.select("div.story")

if not html:
raise ValueError("Article element not found.")

        title_element = soup.select("header > h1.title") or soup.select(
            "header > h1.title_article"
        )
        if not title_element:
            raise ValueError("Title element not found.")
        title = title_element[0].get_text(strip=True)

        # Keep only text paragraphs; skip paragraphs that wrap images.
        paragraphs = soup.select("div.story > p:not(:has(img))")
        content = [p.string.strip() for p in paragraphs if p.string]

return News(
title=title,
content=content,
content_hash=self.hash("".join(content)),
category=metadata.category,
modified_date=metadata.publish_time,
media=self.media_name,
tags=[metadata.category],
url=metadata.url,
url_hash=self.hash(metadata.url),
)

    def _fetch_metadata(self, page: int) -> list[EttodayNewsMetadata]:
        now = datetime.now()

        if page == 0:  # Page 0 is the daily news-list portal page.
            response = requests.post(
                f"{self.metadata_api}/news/news-list-"
                f"{now.year}-{now.month}-{now.day}-0.htm",
                allow_redirects=False,
            )
        else:
            response = requests.post(
                self.metadata_api + "/show_roll.php",
                allow_redirects=False,
                data={
                    "offset": page,
                    "tPage": 3,
                    # %Y%m%d (e.g. 20241006); %Y%M%D would mix in minutes
                    # and a mm/dd/yy date.
                    "tFile": now.strftime("%Y%m%d") + ".xml",
                    "tOt": 0,
                    "tSi": 100,
                    "tAr": 0,
                },
            )

response.raise_for_status()
response.encoding = "utf-8"
soup = bs4.BeautifulSoup(response.text, "html.parser")
        if page == 0:
            items = soup.select("div.part_list_2 > h3")
        else:
            items = soup.select("h3")

        if not items:
            raise ValueError(f"Invalid response: {response}, no news items found.")

        return list(map(self._parse_ettoday_api_response, items))

    def _parse_ettoday_api_response(self, html: bs4.element.Tag) -> EttodayNewsMetadata:
        date_element = html.find("span", {"class": "date"})
        category_element = html.find("em", {"class": "tag"})
        link_element = html.a

        # Validate before dereferencing, so a missing element raises a clear
        # ValueError instead of an AttributeError.
        if date_element is None or not date_element.string:
            raise ValueError("News [publish time] not found.")
        if category_element is None or not category_element.string:
            raise ValueError("News [category] not found.")
        if link_element is None or not link_element.get("href"):
            raise ValueError("News [url] not found.")

        return EttodayNewsMetadata(
            publish_time=parse(date_element.string),
            category=category_element.string,
            url=link_element["href"],
        )
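
For context, here is a minimal usage sketch of the new crawler. This is hypothetical driver code, not part of the commit, and it assumes EttodayCrawler can be constructed without arguments (the BaseCrawler constructor is not shown in this diff):

from datetime import datetime, timedelta

from sync_crawler.crawlers import EttodayCrawler

# Hypothetical example: yield every Ettoday article published in the last hour.
crawler = EttodayCrawler()
for news in crawler.read(start_from=datetime.now() - timedelta(hours=1)):
    print(news.title, news.url)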
4 changes: 2 additions & 2 deletions sync_crawler/crawlers/factcheckcenter_crawler.py
@@ -55,7 +55,7 @@ def read(self, start_from: datetime) -> Iterable[News]:
                         yield news
 
     def _crawl_news(self, metadata: FactcheckcenterNewsMetadata) -> News:
-        response = requests.get(self.metadata_api + metadata.url, allow_redirects=False)
+        response = requests.get(metadata.url, allow_redirects=False)
         response.raise_for_status()
         response.encoding = "utf-8"
 
@@ -127,5 +127,5 @@ def _parse_factcheckcenter_api_response(
         return FactcheckcenterNewsMetadata(
             publish_time,
             category,
-            url,
+            url=self.metadata_api + url,
         )
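
Net effect of the factcheckcenter change: the metadata_api prefix is now applied once, when the metadata is parsed, so metadata.url is already an absolute URL by the time _crawl_news fetches it; the url and url_hash stored on each News record therefore reference the full article URL instead of a site-relative path.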
