Graceful handling of connection errors #35

Merged 2 commits on Dec 8, 2023
paperscraper/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
"""Initialize the module."""
__name__ = "paperscraper"
__version__ = "0.2.7"
__version__ = "0.2.8"

import logging
import os
paperscraper/get_dumps/chemrxiv.py (6 changes: 3 additions & 3 deletions)
@@ -28,11 +28,11 @@ def chemrxiv(
     stored in jsonl format in save_path.

     Args:
-        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
+        begin_date (Optional[str]): begin date expressed as YYYY-MM-DD.
             Defaults to None.
-        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
+        end_date (Optional[str]): end date expressed as YYYY-MM-DD.
             Defaults to None.
-        save_path (str, optional): Path where the dump is stored.
+        save_path (str): Path where the dump is stored.
             Defaults to save_path.
     """

paperscraper/xrxiv/xrxiv_api.py (57 changes: 42 additions & 15 deletions)
@@ -1,10 +1,14 @@
"""API for bioRxiv and medRXiv."""
import logging
from datetime import datetime
from time import sleep
from typing import Generator, List, Optional

import requests
from requests.exceptions import ConnectionError, Timeout

launch_dates = {"biorxiv": "2013-01-01", "medrxiv": "2019-06-01"}
logger = logging.getLogger(__name__)


class XRXivApi:
@@ -38,15 +42,17 @@ def get_papers(
         begin_date: Optional[str] = None,
         end_date: Optional[str] = None,
         fields: List[str] = ["title", "doi", "authors", "abstract", "date", "journal"],
+        max_retries: int = 10,
     ) -> Generator:
         """
         Get paper metadata.

         Args:
-            begin_date (Optional[str], optional): begin date. Defaults to None, a.k.a. launch date.
-            end_date (Optional[str], optional): end date. Defaults to None, a.k.a. today.
+            begin_date (Optional[str]): begin date. Defaults to None, a.k.a. launch date.
+            end_date (Optional[str]): end date. Defaults to None, a.k.a. today.
             fields (List[str], optional): fields to return per paper.
                 Defaults to ['title', 'doi', 'authors', 'abstract', 'date', 'journal'].
+            max_retries (int): Number of retries on connection failure. Defaults to 10.

         Yields:
             Generator: a generator of paper metadata (dict) with the desired fields.
@@ -68,20 +74,41 @@ def get_papers(
             do_loop = True
             cursor = 0
             while do_loop:
-                json_response = requests.get(
-                    self.get_papers_url.format(
-                        begin_date=begin_date, end_date=end_date, cursor=cursor
-                    )
-                ).json()
-                do_loop = json_response["messages"][0]["status"] == "ok"
-                if do_loop:
-                    cursor += json_response["messages"][0]["count"]
-                    for paper in json_response["collection"]:
-                        processed_paper = {
-                            field: paper.get(field, "") for field in fields
-                        }
-                        yield processed_paper
+                papers = []
+                for attempt in range(max_retries):
+                    try:
+                        json_response = requests.get(
+                            self.get_papers_url.format(
+                                begin_date=begin_date, end_date=end_date, cursor=cursor
+                            )
+                        ).json()
+                        do_loop = json_response["messages"][0]["status"] == "ok"
+                        if do_loop:
+                            cursor += json_response["messages"][0]["count"]
+                            for paper in json_response["collection"]:
+                                processed_paper = {
+                                    field: paper.get(field, "") for field in fields
+                                }
+                                papers.append(processed_paper)
+
+                        if do_loop:
+                            yield from papers
+                        break
+                    except (ConnectionError, Timeout) as e:
+                        logger.error(
+                            f"Connection error: {e}. Retrying ({attempt + 1}/{max_retries})"
+                        )
+                        sleep(5)
+                        continue
+                    except Exception as exc:
+                        logger.exception(f"Failed getting papers: {exc}")
+                        raise RuntimeError(
+                            "Failed getting papers: {} - {}".format(
+                                exc.__class__.__name__, exc
+                            )
+                        )
         except Exception as exc:
             logger.exception(f"Failed getting papers: {exc}")
             raise RuntimeError(
                 "Failed getting papers: {} - {}".format(exc.__class__.__name__, exc)
             )
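
To make the new behavior concrete, a hedged usage sketch follows. Only the get_papers signature and its generator and retry semantics are taken from the diff above; the api object is assumed to be an already constructed XRXivApi (or subclass) instance, and the field values are illustrative. The design choice visible in the diff: each page is buffered in papers and only yielded once the request for that cursor succeeds, so a retried request does not yield partial or duplicate results; transient ConnectionError/Timeout failures are retried up to max_retries times with a 5 second pause, while any other exception is logged and re-raised as a RuntimeError.

# Illustrative only, not part of this PR. `api` is assumed to be an existing
# XRXivApi instance; get_papers and max_retries follow the diff above.
papers = api.get_papers(
    begin_date="2023-01-01",
    end_date="2023-06-30",
    fields=["title", "doi", "date"],
    max_retries=3,  # retry transient connection failures up to 3 times per page
)
for paper in papers:  # lazy generator: each page is fetched (and retried) on demand
    print(paper["doi"], paper["title"])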