From c8fe5667d63fcaff031ef311932c24ebcb7531c1 Mon Sep 17 00:00:00 2001
From: FlianLiu
Date: Sun, 6 Oct 2024 22:47:09 +0800
Subject: [PATCH] feat: add ettoday crawler & return absolute URLs for factcheckcenter news

---
 sync_crawler/crawlers/__init__.py        |   1 +
 sync_crawler/crawlers/ettoday_crawler.py | 151 ++++++++++++++++++
 .../crawlers/factcheckcenter_crawler.py  |   4 +-
 3 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 sync_crawler/crawlers/ettoday_crawler.py

diff --git a/sync_crawler/crawlers/__init__.py b/sync_crawler/crawlers/__init__.py
index 614b4ee..e9e914e 100644
--- a/sync_crawler/crawlers/__init__.py
+++ b/sync_crawler/crawlers/__init__.py
@@ -3,5 +3,6 @@
 from .base_crawler import BaseCrawler as BaseCrawler
 from .cts_crawler import CtsCrawler as CtsCrawler
 from .ebc_crawler import EbcCrawler as EbcCrawler
+from .ettoday_crawler import EttodayCrawler as EttodayCrawler
 from .factcheckcenter_crawler import FactcheckcenterCrawler as FactcheckcenterCrawler
 from .ltn_crawler import LtnCrawler as LtnCrawler

diff --git a/sync_crawler/crawlers/ettoday_crawler.py b/sync_crawler/crawlers/ettoday_crawler.py
new file mode 100644
index 0000000..b16710f
--- /dev/null
+++ b/sync_crawler/crawlers/ettoday_crawler.py
@@ -0,0 +1,151 @@
+from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from datetime import datetime
+from itertools import count
+from typing import Optional
+
+import bs4
+import requests
+from dateutil.parser import parse
+
+from sync_crawler.crawlers.base_crawler import BaseCrawler
+from sync_crawler.models.news import News
+
+
+@dataclass
+class EttodayNewsMetadata:
+    publish_time: datetime
+    category: Optional[str]
+    url: str
+
+
+class EttodayCrawler(BaseCrawler):
+    media_name = "ettoday"
+    metadata_api = "https://www.ettoday.net"
+
+    def read(self, start_from: datetime) -> Iterable[News]:
+        for page in count():
+            try:
+                metadatas = self._fetch_metadata(page)
+            except Exception as e:
+                self.logger.error(
+                    f'Stop crawling, because of {type(e).__name__}: "{e}"'
+                )
+                return
+
+            # Metadata pages are newest-first, so once the newest item on a
+            # page is older than `start_from`, all later pages are too.
+            if not metadatas or metadatas[0].publish_time < start_from:
+                self.logger.info("Finish crawling.")
+                return
+
+            with ThreadPoolExecutor() as executor:
+                metadatas = filter(lambda x: x.publish_time >= start_from, metadatas)
+
+                future_to_url = {
+                    executor.submit(self._crawl_news, metadata): metadata
+                    for metadata in metadatas
+                }
+                for future in as_completed(future_to_url):
+                    try:
+                        news = future.result()
+                    except Exception as e:
+                        self.logger.error(
+                            f'{future_to_url[future].url}: {type(e).__name__}: "{e}"'
+                        )
+                    else:
+                        yield news
+
+    def _crawl_news(self, metadata: EttodayNewsMetadata) -> News:
+        response = requests.get(metadata.url, allow_redirects=False)
+        response.raise_for_status()
+        response.encoding = "utf-8"
+
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+        if not soup.select("div.story"):
+            raise ValueError("Article element not found.")
+
+        # Most articles use `h1.title`; some layouts use `h1.title_article`.
+        title_element = soup.select("header > h1.title") or soup.select(
+            "header > h1.title_article"
+        )
+        if not title_element:
+            raise ValueError("Title element not found.")
+        title = title_element[0].get_text(strip=True)
+
+        # Keep only plain-text paragraphs; paragraphs wrapping images are skipped.
+        paragraphs = soup.select("div.story > p:not(:has(img))")
+        content = [p.string.strip() for p in paragraphs if p.string]
+
+        return News(
+            title=title,
+            content=content,
+            content_hash=self.hash("".join(content)),
+            category=metadata.category,
+            modified_date=metadata.publish_time,
+            media=self.media_name,
+            tags=[metadata.category],
+            url=metadata.url,
+            url_hash=self.hash(metadata.url),
+        )
+
+    def _fetch_metadata(self, page: int) -> list[EttodayNewsMetadata]:
+        now = datetime.now()
+
+        if page == 0:
+            # Page 0 is today's news-list portal page.
+            response = requests.post(
+                self.metadata_api
+                + f"/news/news-list-{now.year}-{now.month}-{now.day}-0.htm",
+                allow_redirects=False,
+            )
+        else:
+            # Later pages come from the AJAX endpoint the portal itself uses
+            # for infinite scrolling.
+            response = requests.post(
+                self.metadata_api + "/show_roll.php",
+                allow_redirects=False,
+                data={
+                    "offset": page,
+                    "tPage": 3,
+                    "tFile": now.strftime("%Y%m%d") + ".xml",
+                    "tOt": 0,
+                    "tSi": 100,
+                    "tAr": 0,
+                },
+            )
+
+        response.raise_for_status()
+        response.encoding = "utf-8"
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+        if page == 0:
+            html = soup.select("div.part_list_2 > h3")
+        else:
+            html = soup.select("h3")
+
+        if not html:
+            raise ValueError(f"Invalid response: {response}, no news items found.")
+
+        return [self._parse_ettoday_api_response(item) for item in html]
+
+    def _parse_ettoday_api_response(self, html: bs4.element.Tag) -> EttodayNewsMetadata:
+        date_tag = html.find("span", {"class": "date"})
+        category_tag = html.find("em", {"class": "tag"})
+        link_tag = html.a
+
+        # Validate before dereferencing: `find` returns None on a miss.
+        if date_tag is None or not date_tag.string:
+            raise ValueError("News [publish time] not found.")
+        if category_tag is None or not category_tag.string:
+            raise ValueError("News [category] not found.")
+        if link_tag is None or not link_tag.get("href"):
+            raise ValueError("News [url] not found.")
+
+        return EttodayNewsMetadata(
+            publish_time=parse(date_tag.string),
+            category=category_tag.string,
+            url=link_tag["href"],
+        )

diff --git a/sync_crawler/crawlers/factcheckcenter_crawler.py b/sync_crawler/crawlers/factcheckcenter_crawler.py
index b732da7..0d38448 100644
--- a/sync_crawler/crawlers/factcheckcenter_crawler.py
+++ b/sync_crawler/crawlers/factcheckcenter_crawler.py
@@ -55,7 +55,7 @@ def read(self, start_from: datetime) -> Iterable[News]:
                         yield news
 
     def _crawl_news(self, metadata: FactcheckcenterNewsMetadata) -> News:
-        response = requests.get(self.metadata_api + metadata.url, allow_redirects=False)
+        response = requests.get(metadata.url, allow_redirects=False)
         response.raise_for_status()
         response.encoding = "utf-8"
 
@@ -127,5 +127,5 @@ def _parse_factcheckcenter_api_response(
         return FactcheckcenterNewsMetadata(
             publish_time,
             category,
-            url,
+            url=self.metadata_api + url,
         )
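
Reviewer note: a minimal smoke test for the new crawler could look like the sketch below. It assumes only what this diff shows -- `EttodayCrawler` is re-exported from `sync_crawler.crawlers`, `read(start_from)` yields `News` objects, and `News` carries `modified_date`, `title`, and `url`. Whether `EttodayCrawler()` can be constructed with no arguments depends on `BaseCrawler`, which this patch does not touch, so treat the zero-argument construction as an assumption.

    from datetime import datetime, timedelta

    from sync_crawler.crawlers import EttodayCrawler

    # Assumption: BaseCrawler subclasses are constructible with no arguments.
    crawler = EttodayCrawler()

    # Pull everything ettoday published in the last hour and print a summary.
    for news in crawler.read(start_from=datetime.now() - timedelta(hours=1)):
        print(news.modified_date, news.title, news.url)

Because `read` stops as soon as the newest item on a metadata page is older than `start_from`, narrowing the window keeps the test fast.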