-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
86f7bcb
commit dc681c5
Showing
17 changed files
with
333 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
"""Misc utils to download chemRxiv dump""" | ||
|
||
import json | ||
import logging | ||
import os | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import os | ||
import tempfile | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
|
||
from paperscraper.pubmed import get_and_dump_pubmed_papers, get_pubmed_papers | ||
from paperscraper.pubmed.utils import get_query_from_keywords_and_date | ||
|
||
# Keyword groups shared by every test in this module.
# NOTE(review): presumably the inner lists are alternative terms and the outer
# list combines the groups -- confirm against paperscraper's query helpers.
KEYWORDS = [["machine learning", "deep learning"], ["zoology"]]


class TestPubMed:
    """Tests for the PubMed helpers exposed by ``paperscraper.pubmed``.

    NOTE(review): nothing here is mocked, so these tests presumably query the
    live PubMed service and need network access -- confirm before running them
    in an offline CI job.
    """

    def test_get_and_dump_pubmed(self):
        """Dumping a keyword query creates the requested output file."""
        # A temporary directory keeps the dump out of the working tree and
        # removes it again when the test finishes.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_filepath = os.path.join(temp_dir, "tmp.jsonl")
            get_and_dump_pubmed_papers(KEYWORDS, output_filepath=output_filepath)
            assert os.path.exists(output_filepath), "File was not created"

    def test_email(self):
        """The ``emails`` field is returned for each kind of date restriction.

        The same query/fetch/assert sequence is run for a start date only, an
        end date only, and a closed date range.
        """
        date_ranges = (
            {"start_date": "2020/07/20"},
            {"end_date": "2020/07/20"},
            {"start_date": "2020/07/10", "end_date": "2020/07/20"},
        )
        for date_range in date_ranges:
            query = get_query_from_keywords_and_date(KEYWORDS, **date_range)
            df = get_pubmed_papers(query, fields=["emails", "title", "authors"])
            # Name the failing range so a failure points at the right query.
            assert (
                "emails" in df.columns
            ), f"'emails' column missing for date range {date_range}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"title": "Population genomics of Saccharomyces cerevisiae human isolates: passengers, colonizers, invaders.", "doi": "10.1101/001891", "authors": "Carlotta De Filippo;Monica Di Paola;Irene Stefanini;Lisa Rizzetto;Luisa Bern\u00e1;Matteo Ramazzotti;Leonardo Dapporto;Damariz Rivero;Ivo G Gut;Marta Gut;M\u00f3nica Bay\u00e9s;Jean-Luc Legras;Roberto Viola;Cristina Massi-Benedetti;Antonella De Luca;Luigina Romani;Paolo Lionetti;Duccio Cavalieri;", "abstract": "The quest for the ecological niches of Saccharomyces cerevisiae ranged from wineries to oaks and more recently to the gut of Crabro Wasps. Here we propose the role of the human gut in shaping S. cerevisiae evolution, presenting the genetic structure of a previously unknown population of yeasts, associated with Crohns disease, providing evidence for clonal expansion within humans gut. To understand the role of immune function in the human-yeast interaction we classified strains according to their immunomodulatory properties, discovering a set of genetically homogeneous isolates, capable of inducing anti-inflammatory signals via regulatory T cells proliferation, and on the contrary, a positive association between strain mosaicism and ability to elicit inflammatory, IL-17 driven, immune responses. The approach integrating genomics with immune phenotyping showed selection on genes involved in sporulation and cell wall remodeling as central for the evolution of S. cerevisiae Crohns strains from passengers to commensals to potential pathogens.", "date": "2014-01-17", "journal": ""} | ||
{"title": "Estimating seed bank accumulation and dynamics in three obligate-seeder Proteaceae species", "doi": "10.1101/001867", "authors": "Meaghan E. Jenkins;David Morrison;Tony D. Auld;", "abstract": "The seed bank dynamics of the three co-occurring obligate-seeder (i.e. fire-sensitive) Proteaceae species, Banksia ericifolia, Banksia marginata and Petrophile pulchella, were examined at sites of varying time since the most recent fire (i.e. plant age) in the Sydney region. Significant variation among species was found in the number of cones produced, the position of the cones within the canopy, the percentage of barren cones produced (Banksia species only), the number of follicles/bracts produced per cone, and the number of seeds lost/released due to spontaneous fruit rupture. Thus, three different regeneration strategies were observed, highlighting the variation in reproductive strategies of co-occurring Proteaceae species. Ultimately, B. marginata potentially accumulated a seed bank of [~]3000 seeds per plant after 20 years, with [~]1500 seeds per plant for P. pulchella and [~]500 for B. ericifolia. Based on these data, B. marginata and B. ericifolia require a minimum fire-free period of 8-10 years, with 7-8 years for P. pulchella, to allow for an adequate seed bank to accumulate and thus ensure local persistence of these species in fire-prone habitats.", "date": "2014-01-17", "journal": ""} | ||
{"title": "How and where to look for tRNAs in Metazoan mitochondrial genomes, and what you might find when you get there", "doi": "10.1101/001875", "authors": "David Morrison;", "abstract": "The ability to locate and annotate mitochondrial genes is an important practical issue, given the rapidly increasing number of mitogenomes appearing in the public databases. Unfortunately, tRNA genes in Metazoan mitochondria have proved to be problematic because they often vary in number (genes missing or duplicated) and also in the secondary structure of the transcribed tRNAs (T or D arms missing). I have performed a series of comparative analyses of the tRNA genes of a broad range of Metazoan mitogenomes in order to address this issue. I conclude that no single computer program is necessarily capable of finding all of the tRNA genes in any given mitogenome, and that use of both the ARWEN and DOGMA programs is sometimes necessary because they produce complementary false negatives. There are apparently a very large number of erroneous annotations in the databased mitogenome sequences, including missed genes, wrongly annotated locations, false complements, and inconsistent criteria for assigning the 5' and 3' boundaries; and I have listed many of these. The extent of overlap between genes is often greatly exaggerated due to inconsistent annotations, although notable overlaps involving tRNAs are apparently real. Finally, three novel hypotheses were examined and found to have support from the comparative analyses: (1) some organisms have mitogenomic locations that simultaneously code for multiple tRNAs; (2) some organisms have mitogenomic locations that simultaneously code for tRNAs and proteins (but not rRNAs); and (3) one group of nematodes has several genes that code for tRNAs lacking both the D and T arms.", "date": "2014-01-17", "journal": ""} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import logging | ||
import importlib | ||
import os | ||
import threading | ||
|
||
import pytest | ||
|
||
from paperscraper import dump_queries | ||
from paperscraper.arxiv import get_and_dump_arxiv_papers | ||
from paperscraper.get_dumps import biorxiv, chemrxiv, medrxiv | ||
from paperscraper.load_dumps import QUERY_FN_DICT | ||
import paperscraper.load_dumps as load_dumps_module | ||
|
||
|
||
# Suppress log records of severity INFO and below while the tests run.
logging.disable(logging.INFO)

# Keyword groups used to build the test queries.
covid19 = ["COVID-19", "SARS-CoV-2"]
ai = ["Artificial intelligence", "Deep learning", "Machine learning"]
mi = ["Medical imaging"]


class TestDumper:
    """Tests for the dump downloaders and for dumping query results to disk.

    NOTE(review): the first and last tests compare ``len(QUERY_FN_DICT)``
    before and after the download tests, so this class appears to depend on
    its tests running in definition order -- confirm the runner preserves it.
    """

    def test_dump_existence_initial(self):
        """Check the size of ``QUERY_FN_DICT`` before any dump test has run."""
        # This test checks the initial state, should be run first if order matters
        assert len(QUERY_FN_DICT) == 2, "Initial length of QUERY_FN_DICT should be 2"

    @pytest.fixture
    def setup_medrxiv(self):
        """Provide the ``medrxiv`` dump function itself (not its result)."""
        return medrxiv

    @pytest.fixture
    def setup_biorxiv(self):
        """Provide a zero-argument callable that runs ``biorxiv(max_retries=2)``."""
        return lambda: biorxiv(max_retries=2)

    @pytest.fixture
    def setup_chemrxiv(self):
        """Provide the ``chemrxiv`` dump function itself (not its result)."""
        return chemrxiv

    def run_function_with_timeout(self, func, timeout):
        """Run ``func`` in a background thread and report whether it outlives ``timeout``.

        Args:
            func: Zero-argument callable to execute.
            timeout: Number of seconds to wait for ``func``.

        Returns:
            ``True`` if ``func`` is still running once ``timeout`` seconds have
            passed (the success condition for the dump tests), ``False`` if it
            returned or raised within that time.
        """
        # Daemon thread: a download that is still running must not keep the
        # interpreter alive after the main thread exits.
        thread = threading.Thread(target=func, daemon=True)
        thread.start()
        # Wait for the specified time or until the function finishes.
        thread.join(timeout=timeout)
        return thread.is_alive()

    # NOTE(review): the ``timeout`` mark is presumably provided by the
    # pytest-timeout plugin -- confirm it is installed in the test environment.
    @pytest.mark.timeout(30)
    def test_medrxiv(self, setup_medrxiv):
        """The medRxiv download should still be in progress after 15 seconds."""
        assert self.run_function_with_timeout(
            setup_medrxiv, 15
        ), "medrxiv should still be running after 15 seconds"

    @pytest.mark.timeout(30)
    def test_biorxiv(self, setup_biorxiv):
        """The bioRxiv download should still be in progress after 15 seconds."""
        assert self.run_function_with_timeout(
            setup_biorxiv, 15
        ), "biorxiv should still be running after 15 seconds"

    @pytest.mark.timeout(30)
    def test_chemrxiv(self, setup_chemrxiv):
        """The chemRxiv download should still be in progress after 15 seconds."""
        assert self.run_function_with_timeout(
            setup_chemrxiv, 15
        ), "chemrxiv should still be running after 15 seconds"

    def test_chemrxiv_date(self):
        """Smoke test: ``chemrxiv`` accepts an explicit date window without raising."""
        chemrxiv(begin_date="2024-06-01", end_date="2024-06-02")

    def test_biorxiv_date(self):
        """Smoke test: ``biorxiv`` accepts an explicit date window without raising."""
        biorxiv(begin_date="2024-06-01", end_date="2024-06-02")

    def test_dumping(self):
        """Dumping a query batch creates a ``pubmed`` entry under the output directory."""
        queries = [[covid19, ai, mi]]
        # NOTE(review): "tmpdir" is created relative to the current working
        # directory and is not removed afterwards.
        dump_queries(queries, "tmpdir")
        assert os.path.exists("tmpdir/pubmed")

    def test_arxiv_dumping(self):
        """Dumping an arXiv query creates the requested output file."""
        query = [covid19, ai, mi]
        # NOTE(review): the output file is written to the current working
        # directory and is not removed afterwards.
        get_and_dump_arxiv_papers(query, output_filepath="covid19_ai_imaging.jsonl")
        assert os.path.exists("covid19_ai_imaging.jsonl")

    def test_dump_existence(self):
        """After the download tests, a reloaded ``QUERY_FN_DICT`` has more than two entries."""
        # Reload so the module-level dictionary is rebuilt, then read it from
        # the reloaded module rather than from the stale name imported at the
        # top of this file.
        reloaded = importlib.reload(load_dumps_module)
        assert (
            len(reloaded.QUERY_FN_DICT) > 2
        ), "Expected QUERY_FN_DICT to be updated by previous tests"
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.