Commit

Expand unit tests (#49)

jannisborn authored Jul 6, 2024
1 parent 86f7bcb commit dc681c5
Showing 17 changed files with 333 additions and 70 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test_tip.yml
@@ -34,11 +34,11 @@ jobs:
coverage report
coverage xml -o coverage.xml
- name: Upload to Codecov
if: matrix.python-version == '3.9'
if: matrix.python-version == '3.8'
uses: codecov/codecov-action@v2
with:
files: coverage.xml
token: ${{ secrets.CODECOV_TOKEN }} # Use the token here
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true

test-potential-wheel-install:
3 changes: 1 addition & 2 deletions paperscraper/arxiv/arxiv.py
@@ -1,10 +1,9 @@
from typing import Dict, List, Union

import arxiv
import pandas as pd
from tqdm import tqdm

import arxiv

from ..utils import dump_papers
from .utils import get_query_from_keywords

1 change: 1 addition & 0 deletions paperscraper/get_dumps/utils/chemrxiv/utils.py
@@ -1,4 +1,5 @@
"""Misc utils to download chemRxiv dump"""

import json
import logging
import os
16 changes: 9 additions & 7 deletions paperscraper/impact.py
@@ -97,13 +97,15 @@ def calculate_fuzziness_score(row):

# Prepare the final result
results = [
row.to_dict()
if return_all
else {
"journal": row["journal"],
"factor": row["factor"],
"score": row["score"],
}
(
row.to_dict()
if return_all
else {
"journal": row["journal"],
"factor": row["factor"],
"score": row["score"],
}
)
for _, row in matched_df.iterrows()
]

2 changes: 1 addition & 1 deletion paperscraper/load_dumps.py
@@ -40,6 +40,6 @@

if len(QUERY_FN_DICT) == 2:
logger.warning(
" No dumps found for either biorxiv or medrxiv."
" No dumps found for either biorxiv, medrxiv and chemrxiv."
" Consider using paperscraper.get_dumps.* to fetch the dumps."
)
6 changes: 4 additions & 2 deletions paperscraper/plotting.py
@@ -304,8 +304,10 @@ def plot_single(
plt.bar(ind, np.zeros((len(ind),)), color="k", bottom=bottom)
)

plt.ylabel("Counts", size=17) if not logscale else plt.ylabel(
"Counts (log scale)", size=17
(
plt.ylabel("Counts", size=17)
if not logscale
else plt.ylabel("Counts (log scale)", size=17)
)
plt.xlabel("Years", size=17)
plt.title(title_text, size=17)
Empty file.
34 changes: 34 additions & 0 deletions paperscraper/pubmed/tests/test_pubmed.py
@@ -0,0 +1,34 @@
import os
import tempfile
from unittest.mock import patch

import pytest

from paperscraper.pubmed import get_and_dump_pubmed_papers, get_pubmed_papers
from paperscraper.pubmed.utils import get_query_from_keywords_and_date

KEYWORDS = [["machine learning", "deep learning"], ["zoology"]]


class TestPubMed:

def test_get_and_dump_pubmed(self):
with tempfile.TemporaryDirectory() as temp_dir:
output_filepath = os.path.join(temp_dir, "tmp.jsonl")
get_and_dump_pubmed_papers(KEYWORDS, output_filepath=output_filepath)
assert os.path.exists(output_filepath), "File was not created"

def test_email(self):
query = get_query_from_keywords_and_date(KEYWORDS, start_date="2020/07/20")
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns

query = get_query_from_keywords_and_date(KEYWORDS, end_date="2020/07/20")
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns

query = get_query_from_keywords_and_date(
KEYWORDS, start_date="2020/07/10", end_date="2020/07/20"
)
df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
assert "emails" in df.columns
14 changes: 9 additions & 5 deletions paperscraper/scholar/tests/test_scholar.py
@@ -1,8 +1,9 @@
import functools
import logging

import pandas as pd
import pytest
from scholarly._proxy_generator import MaxTriesExceededException
import functools

from paperscraper.scholar import (
get_and_dump_scholar_papers,
@@ -30,8 +31,7 @@ class TestScholar:
@handle_scholar_exception
def test_citations(self):
num = get_citations_from_title("GT4SD")
assert isinstance(num, int)
assert num > 0
assert isinstance(num, int) and num > 0

@handle_scholar_exception
def test_dump_search(self, tmpdir):
@@ -43,8 +43,7 @@ def test_dump_search(self, tmpdir):
@handle_scholar_exception
def test_basic_search(self):
results = get_scholar_papers("GT4SD")
assert len(results) > 0 # Ensure we get some results
assert isinstance(results, pd.DataFrame)
assert len(results) > 0 and isinstance(results, pd.DataFrame)
assert all(
[
x in results.columns
@@ -58,3 +57,8 @@ def test_basic_search(self):
]
]
)

@handle_scholar_exception
def test_bad_search(self):
results = get_scholar_papers("GT4SDfsdhfiobfpsdfbsdp")
assert len(results) == 0
3 changes: 3 additions & 0 deletions paperscraper/tests/test_dump.jsonl
@@ -0,0 +1,3 @@
{"title": "Population genomics of Saccharomyces cerevisiae human isolates: passengers, colonizers, invaders.", "doi": "10.1101/001891", "authors": "Carlotta De Filippo;Monica Di Paola;Irene Stefanini;Lisa Rizzetto;Luisa Bern\u00e1;Matteo Ramazzotti;Leonardo Dapporto;Damariz Rivero;Ivo G Gut;Marta Gut;M\u00f3nica Bay\u00e9s;Jean-Luc Legras;Roberto Viola;Cristina Massi-Benedetti;Antonella De Luca;Luigina Romani;Paolo Lionetti;Duccio Cavalieri;", "abstract": "The quest for the ecological niches of Saccharomyces cerevisiae ranged from wineries to oaks and more recently to the gut of Crabro Wasps. Here we propose the role of the human gut in shaping S. cerevisiae evolution, presenting the genetic structure of a previously unknown population of yeasts, associated with Crohns disease, providing evidence for clonal expansion within humans gut. To understand the role of immune function in the human-yeast interaction we classified strains according to their immunomodulatory properties, discovering a set of genetically homogeneous isolates, capable of inducing anti-inflammatory signals via regulatory T cells proliferation, and on the contrary, a positive association between strain mosaicism and ability to elicit inflammatory, IL-17 driven, immune responses. The approach integrating genomics with immune phenotyping showed selection on genes involved in sporulation and cell wall remodeling as central for the evolution of S. cerevisiae Crohns strains from passengers to commensals to potential pathogens.", "date": "2014-01-17", "journal": ""}
{"title": "Estimating seed bank accumulation and dynamics in three obligate-seeder Proteaceae species", "doi": "10.1101/001867", "authors": "Meaghan E. Jenkins;David Morrison;Tony D. Auld;", "abstract": "The seed bank dynamics of the three co-occurring obligate-seeder (i.e. fire-sensitive) Proteaceae species, Banksia ericifolia, Banksia marginata and Petrophile pulchella, were examined at sites of varying time since the most recent fire (i.e. plant age) in the Sydney region. Significant variation among species was found in the number of cones produced, the position of the cones within the canopy, the percentage of barren cones produced (Banksia species only), the number of follicles/bracts produced per cone, and the number of seeds lost/released due to spontaneous fruit rupture. Thus, three different regeneration strategies were observed, highlighting the variation in reproductive strategies of co-occurring Proteaceae species. Ultimately, B. marginata potentially accumulated a seed bank of [~]3000 seeds per plant after 20 years, with [~]1500 seeds per plant for P. pulchella and [~]500 for B. ericifolia. Based on these data, B. marginata and B. ericifolia require a minimum fire-free period of 8-10 years, with 7-8 years for P. pulchella, to allow for an adequate seed bank to accumulate and thus ensure local persistence of these species in fire-prone habitats.", "date": "2014-01-17", "journal": ""}
{"title": "How and where to look for tRNAs in Metazoan mitochondrial genomes, and what you might find when you get there", "doi": "10.1101/001875", "authors": "David Morrison;", "abstract": "The ability to locate and annotate mitochondrial genes is an important practical issue, given the rapidly increasing number of mitogenomes appearing in the public databases. Unfortunately, tRNA genes in Metazoan mitochondria have proved to be problematic because they often vary in number (genes missing or duplicated) and also in the secondary structure of the transcribed tRNAs (T or D arms missing). I have performed a series of comparative analyses of the tRNA genes of a broad range of Metazoan mitogenomes in order to address this issue. I conclude that no single computer program is necessarily capable of finding all of the tRNA genes in any given mitogenome, and that use of both the ARWEN and DOGMA programs is sometimes necessary because they produce complementary false negatives. There are apparently a very large number of erroneous annotations in the databased mitogenome sequences, including missed genes, wrongly annotated locations, false complements, and inconsistent criteria for assigning the 5' and 3' boundaries; and I have listed many of these. The extent of overlap between genes is often greatly exaggerated due to inconsistent annotations, although notable overlaps involving tRNAs are apparently real. Finally, three novel hypotheses were examined and found to have support from the comparative analyses: (1) some organisms have mitogenomic locations that simultaneously code for multiple tRNAs; (2) some organisms have mitogenomic locations that simultaneously code for tRNAs and proteins (but not rRNAs); and (3) one group of nematodes has several genes that code for tRNAs lacking both the D and T arms.", "date": "2014-01-17", "journal": ""}
97 changes: 97 additions & 0 deletions paperscraper/tests/test_dump.py
@@ -0,0 +1,97 @@
import logging
import importlib
import os
import threading

import pytest

from paperscraper import dump_queries
from paperscraper.arxiv import get_and_dump_arxiv_papers
from paperscraper.get_dumps import biorxiv, chemrxiv, medrxiv
from paperscraper.load_dumps import QUERY_FN_DICT
import paperscraper.load_dumps as load_dumps_module


logging.disable(logging.INFO)

covid19 = ["COVID-19", "SARS-CoV-2"]
ai = ["Artificial intelligence", "Deep learning", "Machine learning"]
mi = ["Medical imaging"]


class TestDumper:

def test_dump_existence_initial(self):
# This test checks the initial state, should be run first if order matters
assert len(QUERY_FN_DICT) == 2, "Initial length of QUERY_FN_DICT should be 2"


@pytest.fixture
def setup_medrxiv(self):
return medrxiv

@pytest.fixture
def setup_biorxiv(self):
return lambda: biorxiv(max_retries=2)

@pytest.fixture
def setup_chemrxiv(self):
return chemrxiv

def run_function_with_timeout(self, func, timeout):
# Define the target function for the thread
def target():
func()

# Create a daemon thread that runs the target function
thread = threading.Thread(target=target)
thread.daemon = True # This makes the thread exit when the main thread exits
thread.start()
thread.join(
timeout=timeout
) # Wait for the specified time or until the function finishes
if thread.is_alive():
return True # Function is still running, which is our success condition
return False # Function has completed or failed within the timeout, which we don't expect

@pytest.mark.timeout(30)
def test_medrxiv(self, setup_medrxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_medrxiv, 15
), "medrxiv should still be running after 15 seconds"

@pytest.mark.timeout(30)
def test_biorxiv(self, setup_biorxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_biorxiv, 15
), "biorxiv should still be running after 15 seconds"

@pytest.mark.timeout(30)
def test_chemrxiv(self, setup_chemrxiv):
# Check that the function runs for at least 15 seconds
assert self.run_function_with_timeout(
setup_chemrxiv, 15
), "chemrxiv should still be running after 15 seconds"

def test_chemrxiv_date(self):
chemrxiv(begin_date="2024-06-01", end_date="2024-06-02")

def test_biorxiv_date(self):
biorxiv(begin_date="2024-06-01", end_date="2024-06-02")

def test_dumping(self):
queries = [[covid19, ai, mi]]
dump_queries(queries, "tmpdir")
assert os.path.exists("tmpdir/pubmed")

def test_arxiv_dumping(self):
query = [covid19, ai, mi]
get_and_dump_arxiv_papers(query, output_filepath="covid19_ai_imaging.jsonl")
assert os.path.exists("covid19_ai_imaging.jsonl")

def test_dump_existence(self):
importlib.reload(load_dumps_module)
from paperscraper.load_dumps import QUERY_FN_DICT
assert len(QUERY_FN_DICT) > 2, "Expected QUERY_FN_DICT to be updated by previous tests"
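
For reference, a minimal standalone sketch of the daemon-thread timeout pattern used by run_function_with_timeout above. The helper name, the slow_task stand-in, and the 2-second timeout are illustrative assumptions, not part of the commit.

import threading
import time


def still_running_after(func, timeout: float) -> bool:
    # Run func in a daemon thread so it cannot block interpreter exit.
    thread = threading.Thread(target=func, daemon=True)
    thread.start()
    thread.join(timeout=timeout)
    # True means func was still running when the timeout elapsed.
    return thread.is_alive()


def slow_task():
    # Illustrative stand-in for the medrxiv/biorxiv/chemrxiv dump functions.
    time.sleep(10)


assert still_running_after(slow_task, 2)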
48 changes: 0 additions & 48 deletions paperscraper/tests/test_dumpy.py

This file was deleted.

21 changes: 18 additions & 3 deletions paperscraper/tests/test_impactor.py
@@ -40,9 +40,10 @@ def test_impact_factor_filtering(self, impactor: Impactor):

def test_return_all_fields(self, impactor: Impactor):
results = impactor.search("nature chem", return_all=True)
assert all(
len(r) > 3 for r in results
) # Check if more than the basic fields are returned
for sorting in ["impact", "journal", "score"]:
assert all(
len(r) > 3 for r in results
) # Check if more than the basic fields are returned

def test_quantum_information_search(self, impactor):
expected_results = [
@@ -67,3 +68,17 @@ def test_quantum_information_search(self, impactor):
assert (
expected["score"] == actual["score"]
), f"Score does not match for {expected['journal']}"

def test_type_error(self, impactor: Impactor):
with pytest.raises(TypeError):
impactor.search(123, threshold=99) # query is not a str
with pytest.raises(TypeError):
impactor.search("Nature", threshold="99") # threshold is not an int

def test_value_error(self, impactor: Impactor):
with pytest.raises(ValueError):
impactor.search("Nature", threshold=-1)

def test_nlm_id(self, impactor: Impactor):
results = impactor.search("101528555", return_all=True)
assert len(results) > 0