fix: lower default max_results (#41)

* fix: lower default max_results
* chore: logging message for pubmed max_results
* doc: pubmed upper bound
* chore: enable logging
jannisborn · Feb 22, 2024 · 8cca29c · 8cca29c
1 parent 06c553f
commit 8cca29c
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 6 deletions.
diff --git a/paperscraper/__init__.py b/paperscraper/__init__.py
@@ -1,6 +1,7 @@
 """Initialize the module."""
+
 __name__ = "paperscraper"
-__version__ = "0.2.10"
+__version__ = "0.2.11"
 
 import logging
 import os

diff --git a/paperscraper/impact.py b/paperscraper/impact.py
@@ -7,7 +7,6 @@
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
-logging.disable(logging.INFO)
 
 
 class Impactor:

diff --git a/paperscraper/pubmed/pubmed.py b/paperscraper/pubmed/pubmed.py
@@ -1,4 +1,5 @@
 import datetime
+import logging
 from typing import List, Union
 
 import pandas as pd
@@ -7,6 +8,9 @@
 from ..utils import dump_papers
 from .utils import get_emails, get_query_from_keywords_and_date
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
 PUBMED = PubMed(tool="MyTool", email="[email protected]")
 
 pubmed_field_mapper = {"publication_date": "date"}
@@ -28,9 +32,9 @@
 def get_pubmed_papers(
     query: str,
     fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
-    max_results: int = 999999,
+    max_results: int = 9998,
     *args,
-    **kwargs
+    **kwargs,
 ) -> pd.DataFrame:
     """
     Performs PubMed API request of a query and returns list of papers with
@@ -41,13 +45,24 @@ def get_pubmed_papers(
         fields (list[str]): List of strings with fields to keep in output.
             NOTE: If 'emails' is passed, an attempt is made to extract author mail
             addresses.
-        max_results (int): Maximal number of results retrieved from DB.
+        max_results (int): Maximal number of results retrieved from DB. Defaults
+            to 9998, higher values likely raise problems due to PubMedAPI, see:
+            https://stackoverflow.com/questions/75353091/biopython-entrez-article-limit
+
         NOTE: *args, **kwargs are additional arguments for pubmed.query
 
     Returns:
         pd.DataFrame. One paper per row.
 
     """
+    if max_results > 9998:
+        logger.warning(
+            f"\nmax_results cannot be larger than 9998, received {max_results}."
+            "This will likely result in a JSONDecodeError. Considering lowering `max_results`.\n"
+            "For PubMed, ESearch can only retrieve the first 9,999 records matching the query. "
+            "To obtain more than 9,999 PubMed records, consider using EDirect that contains additional"
+            "logic to batch PubMed search results automatically so that an arbitrary number can be retrieved"
+        )
     raw = list(PUBMED.query(query, max_results=max_results, *args, **kwargs))
 
     get_mails = "emails" in fields
@@ -78,7 +93,7 @@ def get_and_dump_pubmed_papers(
     start_date: str = "None",
     end_date: str = "None",
     *args,
-    **kwargs
+    **kwargs,
 ) -> None:
     """
     Combines get_pubmed_papers and dump_papers.