Skip to content

Commit

Permalink
fix: lower default max_results (#41)
Browse files Browse the repository at this point in the history
* fix: lower default max_results

* chore: logging message for pubmed max_results

* doc: pubmed upper bound

* chore: enable logging
  • Loading branch information
jannisborn authored Feb 22, 2024
1 parent 06c553f commit 8cca29c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 6 deletions.
3 changes: 2 additions & 1 deletion paperscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Initialize the module."""

__name__ = "paperscraper"
__version__ = "0.2.10"
__version__ = "0.2.11"

import logging
import os
Expand Down
1 change: 0 additions & 1 deletion paperscraper/impact.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logging.disable(logging.INFO)


class Impactor:
Expand Down
23 changes: 19 additions & 4 deletions paperscraper/pubmed/pubmed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import logging
from typing import List, Union

import pandas as pd
Expand All @@ -7,6 +8,9 @@
from ..utils import dump_papers
from .utils import get_emails, get_query_from_keywords_and_date

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

PUBMED = PubMed(tool="MyTool", email="[email protected]")

pubmed_field_mapper = {"publication_date": "date"}
Expand All @@ -28,9 +32,9 @@
def get_pubmed_papers(
query: str,
fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
max_results: int = 999999,
max_results: int = 9998,
*args,
**kwargs
**kwargs,
) -> pd.DataFrame:
"""
Performs PubMed API request of a query and returns list of papers with
Expand All @@ -41,13 +45,24 @@ def get_pubmed_papers(
fields (list[str]): List of strings with fields to keep in output.
NOTE: If 'emails' is passed, an attempt is made to extract author mail
addresses.
max_results (int): Maximal number of results retrieved from DB.
max_results (int): Maximal number of results retrieved from DB. Defaults
to 9998; higher values are likely to cause errors because of PubMed API limits, see:
https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit
NOTE: *args, **kwargs are additional arguments for pubmed.query
Returns:
pd.DataFrame. One paper per row.
"""
if max_results > 9998:
logger.warning(
f"\nmax_results cannot be larger than 9998, received {max_results}."
"This will likely result in a JSONDecodeError. Considering lowering `max_results`.\n"
"For PubMed, ESearch can only retrieve the first 9,999 records matching the query. "
"To obtain more than 9,999 PubMed records, consider using EDirect that contains additional"
"logic to batch PubMed search results automatically so that an arbitrary number can be retrieved"
)
raw = list(PUBMED.query(query, max_results=max_results, *args, **kwargs))

get_mails = "emails" in fields
Expand Down Expand Up @@ -78,7 +93,7 @@ def get_and_dump_pubmed_papers(
start_date: str = "None",
end_date: str = "None",
*args,
**kwargs
**kwargs,
) -> None:
"""
Combines get_pubmed_papers and dump_papers.
Expand Down

0 comments on commit 8cca29c

Please sign in to comment.