Commit

Expand unit tests (#49)

jannisborn authored Jul 6, 2024
1 parent 86f7bcb commit dc681c5
Showing 17 changed files with 333 additions and 70 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test_tip.yml
@@ -34,11 +34,11 @@ jobs:
coverage report
coverage xml -o coverage.xml
- name: Upload to Codecov
if: matrix.python-version == '3.9'
if: matrix.python-version == '3.8'
uses: codecov/codecov-action@v2
with:
files: coverage.xml
token: ${{ secrets.CODECOV_TOKEN }} # Use the token here
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true

test-potential-wheel-install:
3 changes: 1 addition & 2 deletions paperscraper/arxiv/arxiv.py
@@ -1,10 +1,9 @@
from typing import Dict, List, Union

import arxiv
import pandas as pd
from tqdm import tqdm

import arxiv

from ..utils import dump_papers
from .utils import get_query_from_keywords

1 change: 1 addition & 0 deletions paperscraper/get_dumps/utils/chemrxiv/utils.py
@@ -1,4 +1,5 @@
"""Misc utils to download chemRxiv dump"""

import json
import logging
import os
16 changes: 9 additions & 7 deletions paperscraper/impact.py
@@ -97,13 +97,15 @@ def calculate_fuzziness_score(row):

# Prepare the final result
results = [
row.to_dict()
if return_all
else {
"journal": row["journal"],
"factor": row["factor"],
"score": row["score"],
}
(
row.to_dict()
if return_all
else {
"journal": row["journal"],
"factor": row["factor"],
"score": row["score"],
}
)
for _, row in matched_df.iterrows()
]

2 changes: 1 addition & 1 deletion paperscraper/load_dumps.py
@@ -40,6 +40,6 @@

if len(QUERY_FN_DICT) == 2:
logger.warning(
" No dumps found for either biorxiv or medrxiv."
" No dumps found for either biorxiv, medrxiv and chemrxiv."
" Consider using paperscraper.get_dumps.* to fetch the dumps."
)
6 changes: 4 additions & 2 deletions paperscraper/plotting.py
@@ -304,8 +304,10 @@ def plot_single(
plt.bar(ind, np.zeros((len(ind),)), color="k", bottom=bottom)
)

plt.ylabel("Counts", size=17) if not logscale else plt.ylabel(
"Counts (log scale)", size=17
(
plt.ylabel("Counts", size=17)
if not logscale
else plt.ylabel("Counts (log scale)", size=17)
)
plt.xlabel("Years", size=17)
plt.title(title_text, size=17)
Empty file.
34 changes: 34 additions & 0 deletions paperscraper/pubmed/tests/test_pubmed.py
@@ -0,0 +1,34 @@
import os
import tempfile
from unittest.mock import patch

import pytest

from paperscraper.pubmed import get_and_dump_pubmed_papers, get_pubmed_papers
from paperscraper.pubmed.utils import get_query_from_keywords_and_date

KEYWORDS = [["machine learning", "deep learning"], ["zoology"]]


class TestPubMed:

def test_get_and_dump_pubmed(self):
with tempfile.TemporaryDirectory() as temp_dir:
output_filepath = os.path.join(temp_dir, "tmp.jsonl")
get_and_dump_pubmed_papers(KEYWORDS, output_filepath=output_filepath)
assert os.path.exists(output_filepath), "File was not created"

def test_email(self):
query = get_query_from_keywords_and_date(KEYWORDS, start_date="2020/07/20")
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns

query = get_query_from_keywords_and_date(KEYWORDS, end_date="2020/07/20")
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns

query = get_query_from_keywords_and_date(
KEYWORDS, start_date="2020/07/10", end_date="2020/07/20"
)
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns
14 changes: 9 additions & 5 deletions paperscraper/scholar/tests/test_scholar.py
@@ -1,8 +1,9 @@
import functools
import logging

import pandas as pd
import pytest
from scholarly._proxy_generator import MaxTriesExceededException
import functools

from paperscraper.scholar import (
get_and_dump_scholar_papers,
@@ -30,8 +31,7 @@ class TestScholar:
@handle_scholar_exception
def test_citations(self):
num = get_citations_from_title("GT4SD")
assert isinstance(num, int)
assert num > 0
assert isinstance(num, int) and num > 0

@handle_scholar_exception
def test_dump_search(self, tmpdir):
@@ -43,8 +43,7 @@ def test_dump_search(self, tmpdir):
@handle_scholar_exception
def test_basic_search(self):
results = get_scholar_papers("GT4SD")
assert len(results) > 0 # Ensure we get some results
assert isinstance(results, pd.DataFrame)
assert len(results) > 0 and isinstance(results, pd.DataFrame)
assert all(
[
x in results.columns
@@ -58,3 +57,8 @@ def test_basic_search(self):
]
]
)

@handle_scholar_exception
def test_bad_search(self):
results = get_scholar_papers("GT4SDfsdhfiobfpsdfbsdp")
assert len(results) == 0
3 changes: 3 additions & 0 deletions paperscraper/tests/test_dump.jsonl
@@ -0,0 +1,3 @@
{"title": "Population genomics of Saccharomyces cerevisiae human isolates: passengers, colonizers, invaders.", "doi": "10.1101/001891", "authors": "Carlotta De Filippo;Monica Di Paola;Irene Stefanini;Lisa Rizzetto;Luisa Bern\u00e1;Matteo Ramazzotti;Leonardo Dapporto;Damariz Rivero;Ivo G Gut;Marta Gut;M\u00f3nica Bay\u00e9s;Jean-Luc Legras;Roberto Viola;Cristina Massi-Benedetti;Antonella De Luca;Luigina Romani;Paolo Lionetti;Duccio Cavalieri;", "abstract": "The quest for the ecological niches of Saccharomyces cerevisiae ranged from wineries to oaks and more recently to the gut of Crabro Wasps. Here we propose the role of the human gut in shaping S. cerevisiae evolution, presenting the genetic structure of a previously unknown population of yeasts, associated with Crohns disease, providing evidence for clonal expansion within humans gut. To understand the role of immune function in the human-yeast interaction we classified strains according to their immunomodulatory properties, discovering a set of genetically homogeneous isolates, capable of inducing anti-inflammatory signals via regulatory T cells proliferation, and on the contrary, a positive association between strain mosaicism and ability to elicit inflammatory, IL-17 driven, immune responses. The approach integrating genomics with immune phenotyping showed selection on genes involved in sporulation and cell wall remodeling as central for the evolution of S. cerevisiae Crohns strains from passengers to commensals to potential pathogens.", "date": "2014-01-17", "journal": ""}
{"title": "Estimating seed bank accumulation and dynamics in three obligate-seeder Proteaceae species", "doi": "10.1101/001867", "authors": "Meaghan E. Jenkins;David Morrison;Tony D. Auld;", "abstract": "The seed bank dynamics of the three co-occurring obligate-seeder (i.e. fire-sensitive) Proteaceae species, Banksia ericifolia, Banksia marginata and Petrophile pulchella, were examined at sites of varying time since the most recent fire (i.e. plant age) in the Sydney region. Significant variation among species was found in the number of cones produced, the position of the cones within the canopy, the percentage of barren cones produced (Banksia species only), the number of follicles/bracts produced per cone, and the number of seeds lost/released due to spontaneous fruit rupture. Thus, three different regeneration strategies were observed, highlighting the variation in reproductive strategies of co-occurring Proteaceae species. Ultimately, B. marginata potentially accumulated a seed bank of [~]3000 seeds per plant after 20 years, with [~]1500 seeds per plant for P. pulchella and [~]500 for B. ericifolia. Based on these data, B. marginata and B. ericifolia require a minimum fire-free period of 8-10 years, with 7-8 years for P. pulchella, to allow for an adequate seed bank to accumulate and thus ensure local persistence of these species in fire-prone habitats.", "date": "2014-01-17", "journal": ""}
{"title": "How and where to look for tRNAs in Metazoan mitochondrial genomes, and what you might find when you get there", "doi": "10.1101/001875", "authors": "David Morrison;", "abstract": "The ability to locate and annotate mitochondrial genes is an important practical issue, given the rapidly increasing number of mitogenomes appearing in the public databases. Unfortunately, tRNA genes in Metazoan mitochondria have proved to be problematic because they often vary in number (genes missing or duplicated) and also in the secondary structure of the transcribed tRNAs (T or D arms missing). I have performed a series of comparative analyses of the tRNA genes of a broad range of Metazoan mitogenomes in order to address this issue. I conclude that no single computer program is necessarily capable of finding all of the tRNA genes in any given mitogenome, and that use of both the ARWEN and DOGMA programs is sometimes necessary because they produce complementary false negatives. There are apparently a very large number of erroneous annotations in the databased mitogenome sequences, including missed genes, wrongly annotated locations, false complements, and inconsistent criteria for assigning the 5' and 3' boundaries; and I have listed many of these. The extent of overlap between genes is often greatly exaggerated due to inconsistent annotations, although notable overlaps involving tRNAs are apparently real. Finally, three novel hypotheses were examined and found to have support from the comparative analyses: (1) some organisms have mitogenomic locations that simultaneously code for multiple tRNAs; (2) some organisms have mitogenomic locations that simultaneously code for tRNAs and proteins (but not rRNAs); and (3) one group of nematodes has several genes that code for tRNAs lacking both the D and T arms.", "date": "2014-01-17", "journal": ""}
97 changes: 97 additions & 0 deletions paperscraper/tests/test_dump.py
@@ -0,0 +1,97 @@
import logging
import importlib
import os
import threading

import pytest

from paperscraper import dump_queries
from paperscraper.arxiv import get_and_dump_arxiv_papers
from paperscraper.get_dumps import biorxiv, chemrxiv, medrxiv
from paperscraper.load_dumps import QUERY_FN_DICT
import paperscraper.load_dumps as load_dumps_module


logging.disable(logging.INFO)

covid19 = ["COVID-19", "SARS-CoV-2"]
ai = ["Artificial intelligence", "Deep learning", "Machine learning"]
mi = ["Medical imaging"]


class TestDumper:

def test_dump_existence_initial(self):
# This test checks the initial state, should be run first if order matters
assert len(QUERY_FN_DICT) == 2, "Initial length of QUERY_FN_DICT should be 2"


@pytest.fixture
def setup_medrxiv(self):
return medrxiv

@pytest.fixture
def setup_biorxiv(self):
return lambda: biorxiv(max_retries=2)

@pytest.fixture
def setup_chemrxiv(self):
return chemrxiv

def run_function_with_timeout(self, func, timeout):
# Define the target function for the thread
def target():
func()

# Create a daemon thread that runs the target function
thread = threading.Thread(target=target)
thread.daemon = True # This makes the thread exit when the main thread exits
thread.start()
thread.join(
timeout=timeout
) # Wait for the specified time or until the function finishes
if thread.is_alive():
return True # Function is still running, which is our success condition
return False # Function has completed or failed within the timeout, which we don't expect

@pytest.mark.timeout(30)
def test_medrxiv(self, setup_medrxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_medrxiv, 15
), "medrxiv should still be running after 15 seconds"

@pytest.mark.timeout(30)
def test_biorxiv(self, setup_biorxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_biorxiv, 15
), "biorxiv should still be running after 15 seconds"

@pytest.mark.timeout(30)
def test_chemrxiv(self, setup_chemrxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_chemrxiv, 15
), "chemrxiv should still be running after 15 seconds"

def test_chemrxiv_date(self):
chemrxiv(begin_date="2024-06-01", end_date="2024-06-02")

def test_biorxiv_date(self):
biorxiv(begin_date="2024-06-01", end_date="2024-06-02")

def test_dumping(self):
queries = [[covid19, ai, mi]]
dump_queries(queries, "tmpdir")
assert os.path.exists("tmpdir/pubmed")

def test_arxiv_dumping(self):
query = [covid19, ai, mi]
get_and_dump_arxiv_papers(query, output_filepath="covid19_ai_imaging.jsonl")
assert os.path.exists("covid19_ai_imaging.jsonl")

def test_dump_existence(self):
importlib.reload(load_dumps_module)
from paperscraper.load_dumps import QUERY_FN_DICT
assert len(QUERY_FN_DICT) > 2, "Expected QUERY_FN_DICT to be updated by previous tests"
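
For reference, a minimal standalone sketch of the daemon-thread timeout pattern used by run_function_with_timeout above. The helper name, the slow_task stand-in, and the 2-second timeout are illustrative assumptions, not part of the commit.

import threading
import time


def still_running_after(func, timeout: float) -> bool:
    # Run func in a daemon thread so it cannot block interpreter exit.
    thread = threading.Thread(target=func, daemon=True)
    thread.start()
    thread.join(timeout=timeout)
    # True means func was still running when the timeout elapsed.
    return thread.is_alive()


def slow_task():
    # Illustrative stand-in for the medrxiv/biorxiv/chemrxiv dump functions.
    time.sleep(10)


assert still_running_after(slow_task, 2)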
48 changes: 0 additions & 48 deletions paperscraper/tests/test_dumpy.py

This file was deleted.

21 changes: 18 additions & 3 deletions paperscraper/tests/test_impactor.py
@@ -40,9 +40,10 @@ def test_impact_factor_filtering(self, impactor: Impactor):

def test_return_all_fields(self, impactor: Impactor):
results = impactor.search("nature chem", return_all=True)
assert all(
len(r) > 3 for r in results
) # Check if more than the basic fields are returned
for sorting in ["impact", "journal", "score"]:
assert all(
len(r) > 3 for r in results
) # Check if more than the basic fields are returned

def test_quantum_information_search(self, impactor):
expected_results = [
@@ -67,3 +68,17 @@ def test_quantum_information_search(self, impactor):
assert (
expected["score"] == actual["score"]
), f"Score does not match for {expected['journal']}"

def test_type_error(self, impactor: Impactor):
with pytest.raises(TypeError):
impactor.search(123, threshold=99) # query is not a str
with pytest.raises(TypeError):
impactor.search("Nature", threshold="99") # threshold is not an int

def test_value_error(self, impactor: Impactor):
with pytest.raises(ValueError):
impactor.search("Nature", threshold=-1)

def test_nlm_id(self, impactor: Impactor):
results = impactor.search("101528555", return_all=True)
assert len(results) > 0