Commit edfe5dc

feat: logging
david20571015 committed Aug 23, 2024
1 parent c06a9bd commit edfe5dc
Showing 4 changed files with 41 additions and 16 deletions.
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ pymongo = "^4.5.0"
pydantic = "^2.7.3"
requests = "^2.32.3"
beautifulsoup4 = "^4.12.3"
python-dateutil = "^2.9.0"

[tool.poetry.group.chroma]
optional = true
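The new python-dateutil dependency backs the simplified timestamp handling in ltn_crawler.py below. As a minimal standalone sketch (not part of the commit), dateutil's parse() covers both timestamp shapes the old code special-cased with two strptime calls:

from dateutil.parser import parse

print(parse("2024/08/23 10:30"))  # full timestamp -> 2024-08-23 10:30:00
print(parse("10:30"))             # time only -> today's date at 10:30,
                                  # matching the old datetime.combine() fallback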
7 changes: 7 additions & 0 deletions sync_crawler/crawlers/base_crawler.py
@@ -1,4 +1,5 @@
import hashlib
import logging
from abc import ABC, abstractmethod
from collections.abc import Iterable
from datetime import datetime
@@ -19,6 +20,8 @@ def wrapper(*args, **kwargs):


class BaseCrawler(DataReader, ABC):
media_name: str

@override
@abstractmethod
def read(self, start_from: datetime) -> Iterable[News]:
@@ -34,3 +37,7 @@ def read(self, start_from: datetime) -> Iterable[News]:

def hash(self, data: str) -> str:
return hashlib.sha1(data.encode("utf-8")).hexdigest()

@property
def logger(self) -> logging.Logger:
return logging.getLogger(self.media_name)
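For context, a minimal sketch (hypothetical subclass, not part of the commit) of how the new logger property is meant to be consumed; it assumes read is the only abstract method left to implement:

import logging
from datetime import datetime

from sync_crawler.crawlers.base_crawler import BaseCrawler

logging.basicConfig(level=logging.INFO)

class ExampleCrawler(BaseCrawler):
    media_name = "example"  # also names the logger returned by self.logger

    def read(self, start_from: datetime):
        # self.logger is logging.getLogger("example"), so log records
        # are attributable to this source in shared log output
        self.logger.info("reading news since %s", start_from)
        return []

ExampleCrawler().read(datetime.now())  # INFO:example:reading news since ...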
47 changes: 32 additions & 15 deletions sync_crawler/crawlers/ltn_crawler.py
@@ -1,5 +1,3 @@
import json
import logging
from collections.abc import Iterable
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
@@ -8,6 +6,7 @@

import bs4
import requests
from dateutil.parser import parse

from sync_crawler.crawlers.base_crawler import BaseCrawler, ignore_exception
from sync_crawler.models.news import News
@@ -28,8 +27,10 @@ def read(self, start_from: datetime) -> Iterable[News]:
for page in count(1, step=1):
try:
metadatas = list(self._fetch_metadata(page))
except json.JSONDecodeError:
logging.error(f"Failed to fetch metadata from page {page}.")
except Exception as e:
self.logger.error(
f"Failed to fetch metadata from page {page}, because {e}"
)
break

with ThreadPoolExecutor() as executor:
@@ -41,7 +42,7 @@

yield from news

if metadatas[-1].publish_time < start_from:
if not metadatas or metadatas[-1].publish_time < start_from:
break

@ignore_exception
@@ -73,22 +74,38 @@ def _crawl_news(self, metadata: LtnNewsMetadata) -> News:

def _fetch_metadata(self, page: int) -> Iterable[LtnNewsMetadata]:
url = self.metadata_api.format(page)
response = requests.get(url).json()

if page == 1:
try:
response = requests.get(url)
response.raise_for_status()
except requests.RequestException as e:
self.logger.error(f"HTTP request failed: {e}")
return []

response = response.json()
if "data" not in response:
self.logger.error(f"Invalid response: {response}, missing 'data' field.")
return []

if isinstance(response["data"], list):
responses = response["data"]
else:
elif isinstance(response["data"], dict):
responses = response["data"].values()
else:
self.logger.error(
f"Invalid response: {response}, expected 'data' to be list or dict, but got {type(response['data'])}."
)
return []

try:
parsed_news = map(self._parse_ltn_api_response, responses)
except Exception as e:
self.logger.error(f"Failed to parse response: {e}")
return []

parsed_news = map(self._parse_ltn_api_response, responses)
return parsed_news

def _parse_ltn_api_response(self, response: dict) -> LtnNewsMetadata:
try:
publish_time = datetime.strptime(response["time"], "%Y/%m/%d %H:%M")
except ValueError:
publish_time = datetime.strptime(response["time"], "%H:%M")
publish_time = datetime.combine(datetime.now(), publish_time.time())
publish_time = parse(response["time"])

return LtnNewsMetadata(
publish_time=publish_time,
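Judging from the replaced if page == 1 branch, the endpoint appears to return "data" as a list on the first page and as a dict on later ones; the isinstance dispatch above accepts both shapes and logs anything else instead of raising. A self-contained sketch with fabricated payloads (not real API responses):

page1 = {"data": [{"title": "a", "time": "2024/08/23 10:30"}]}
pageN = {"data": {"123": {"title": "b", "time": "10:30"}}}

for response in (page1, pageN):
    data = response["data"]
    if isinstance(data, list):
        responses = data
    elif isinstance(data, dict):
        responses = data.values()
    else:
        responses = []  # the crawler logs an error and returns [] instead
    print([item["title"] for item in responses])  # ['a'] then ['b']

One subtlety: map() is lazy, so the try/except around map(self._parse_ltn_api_response, responses) cannot catch parse errors itself; they surface when read() materializes the iterator with list(...), where its own except Exception handler applies.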
