diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 36ce5ce..8136354 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,5 +12,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip3 install -e . + pip install -r requirements.txt + pip install -e . python -c "import paperscraper" + - name: Run test suite + run: | + python -m pytest -sv paperscraper + diff --git a/README.md b/README.md index 5fa2e00..6a88d4c 100644 --- a/README.md +++ b/README.md @@ -149,14 +149,41 @@ get_citations_from_title(title) *NOTE*: The scholar endpoint does not require authentification but since it regularly prompts with captchas, it's difficult to apply large scale. -#### Journal impact factor +### Journal impact factor -You can also retrieve the impact factor for all journals indexed by citefactor: +You can also retrieve the impact factor for all journals: ```py -from paperscraper.journal_if import Impactor -i = Impactor() +>>>from paperscraper.impact import Impactor +>>>i = Impactor() +>>>i.search("Nat Comms", threshold=85, sort_by='impact') +[ + {'journal': 'Nature Communications', 'factor': 17.694, 'score': 94}, + {'journal': 'Natural Computing', 'factor': 1.504, 'score': 88} +] +``` +This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search +is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org). 
+```py +i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}] +i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}] +i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}] + +# Filter results by impact factor +i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20) +# [ +# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93}, +# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91}, +# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86}, +# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92} +# ] + +# Show all fields +i.search("quantum information", threshold=90, return_all=True) +# [ +# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92}, +# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91} +# ] ``` -Then, `i.journal_to_if` should give you a dictionary wit journal to IF mappings for >9000 journals as of 2014. 
### Plotting diff --git a/paperscraper/__init__.py b/paperscraper/__init__.py index 4d2c3f6..e4dd77e 100644 --- a/paperscraper/__init__.py +++ b/paperscraper/__init__.py @@ -1,6 +1,6 @@ """Initialize the module.""" __name__ = "paperscraper" -__version__ = "0.2.8" +__version__ = "0.2.9" import logging import os diff --git a/paperscraper/impact.py b/paperscraper/impact.py new file mode 100644 index 0000000..1eb5a53 --- /dev/null +++ b/paperscraper/impact.py @@ -0,0 +1,111 @@ +import logging +from typing import Any, Dict, List, Optional + +import pandas as pd +from impact_factor.core import Factor +from thefuzz import fuzz + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logging.disable(logging.INFO) + + +class Impactor: + def __init__(self): + """ + Initialize the Impactor class with an instance of the Factor class. + This allows access to the database of journal impact factors. + """ + self.fa = Factor() + self.all_journals = self.fa.search("%") + self.metadata = pd.DataFrame(self.all_journals, dtype=str) + logger.info(f"Loaded metadata for {len(self.metadata)} journals") + + def search( + self, + query: str, + threshold: int = 100, + sort_by: Optional[str] = None, + min_impact: float = 0.0, + max_impact: float = float("inf"), + return_all: bool = False, + ) -> List[Dict[str, Any]]: + """ + Search for journals matching the given query with an optional fuzziness + level and sorting. + + Args: + query: The journal name or abbreviation to search for. + threshold: The threshold for fuzzy matching. If set to 100, exact matching + is performed. If set below 100, fuzzy matching is used. Defaults to 100. + sort_by: Criterion for sorting results, one of 'impact', 'journal' and 'score'. + min_impact: Minimum impact factor for journals to be considered, defaults to 0. + max_impact: Maximum impact factor for journals to be considered, defaults to infinity. + return_all: If True, returns all columns of the DataFrame for each match. 
+ + Returns: + List[dict]: A list of dictionaries containing the journal information. + + """ + # Validation of parameters + if not isinstance(query, str) or not isinstance(threshold, int): + raise TypeError( + f"Query must be a str and threshold must be an int, not {type(query)} and {type(threshold)}" + ) + if threshold < 0 or threshold > 100: + raise ValueError( + f"Fuzziness threshold must be between 0 and 100, not {threshold}" + ) + + if str.isdigit(query) and threshold >= 100: + # When querying with NLM ID, exact matching does not work since impact_factor + # strips off leading zeros, so we use fuzzy matching instead + threshold = 99 + + # Define a function to calculate fuzziness score + def calculate_fuzziness_score(row): + return max(fuzz.partial_ratio(query, str(value)) for value in row.values) + + # Search with or without fuzzy matching + if threshold >= 100: + matched_df = self.metadata[ + self.metadata.apply( + lambda x: query.lower() in x.astype(str).str.lower().values, axis=1 + ) + ].copy() + # Exact matches get a default score of 100 + matched_df["score"] = 100 + else: + matched_df = self.metadata[ + self.metadata.apply( + lambda x: calculate_fuzziness_score(x) >= threshold, axis=1 + ) + ].copy() + matched_df["score"] = matched_df.apply(calculate_fuzziness_score, axis=1) + + matched_df["factor"] = pd.to_numeric(matched_df["factor"]) + # Sorting based on the specified criterion + if sort_by == "score": + matched_df = matched_df.sort_values(by="score", ascending=False) + elif sort_by == "journal": + matched_df = matched_df.sort_values(by="journal") + elif sort_by == "impact": + matched_df = matched_df.sort_values(by="factor", ascending=False) + + matched_df = matched_df[ + (matched_df["factor"] >= min_impact) & (matched_df["factor"] <= max_impact) + ] + + # Prepare the final result + results = [ + row.to_dict() + if return_all + else { + "journal": row["journal"], + "factor": row["factor"], + "score": row["score"], + } + for _, row in 
matched_df.iterrows() + ] + + return results diff --git a/paperscraper/journal_if.py b/paperscraper/journal_if.py deleted file mode 100644 index e80d954..0000000 --- a/paperscraper/journal_if.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Class to fetch the impact factor of all citefactor-indexed journals. -Limitation: Fetches the 2014 IFs. - -Adapted from: https://github.com/andrew-hill/impactor/blob/master/impactor.py -Available via MIT License. - -Adaptions: -- Converting code from Python2 to Python3. -- Fetching IFs from *all* journals not just from journals starting with "A". - -""" - -import logging -import pickle -import re -import string -from urllib.request import urlopen - -# http://www.crummy.com/software/BeautifulSoup/ -from bs4 import BeautifulSoup - - -class Impactor(object): - """ - Class to fetch the impact factor of all citefactor-indexed journals as of 2014. - """ - - BASE_URL_PREFIX = r"http://www.citefactor.org/journal-impact-factor-list-" - BASE_URL_SUFFIX = r".html" - URL_REGEX_PREFIX = r"http://www\.citefactor\.org/journal-impact-factor-list-" - URL_REGEX_SUFFIX = r"_?[A-Z]?\.html" - - def __init__(self, journal_db_file=None, year=2014): - logging.debug("journal_db_file={}, year={}".format(journal_db_file, year)) - - self.journal_data = None - self.journal_db_file = journal_db_file - self.matches = set() - self.year = year - - assert year in (2014,), "Can only handle 2014 at the moment." 
- self.base_url = self.BASE_URL_PREFIX + str(year) + self.BASE_URL_SUFFIX - self.url_regex = self.URL_REGEX_PREFIX + str(year) + self.URL_REGEX_SUFFIX - self.re = re.compile(self.url_regex) - self.load() - self.save() - self.create_if_dict() - - def match(self, search_terms): - # If no terms specified, show all entries - if search_terms is None or len(search_terms) == 0: - for j in self.journal_data.values(): - self.matches.add(j["ISSN"]) - # Otherwise do search - issn_re = re.compile(r"\d{4}-\d{4}") - for s in search_terms: - if issn_re.match(s): - self.matches.add(s) - else: - for j in self.journal_data.values(): - if j["JOURNAL"].lower().find(s.lower()) >= 0: - self.matches.add(j["ISSN"]) - - def load(self): - # Try to load from file - if self.journal_db_file is not None: - try: - with open(self.journal_db_file, "rb") as f: - self.journal_data = pickle.load(f) - logging.debug( - "loaded journals from {}".format(self.journal_db_file) - ) - except Exception: - pass - # If cannot load from file, load from URL - if self.journal_data is None: - logging.info("Fetching database from citefactor.org...") - self.journal_data = self.get_all_journal_data() - - def save(self): - if self.journal_db_file is not None: - try: - with open(self.journal_db_file, "wb") as f: - pickle.dump(self.journal_data, f, -1) - logging.debug("saved journals to {}".format(self.journal_db_file)) - except Exception: - pass - - def get_all_urls(self): - main_page_content = urlopen(self.base_url).read() - soup = BeautifulSoup(main_page_content) - soup.prettify() # necessary? - return [ - self.base_url, - ] + [anchor["href"] for anchor in soup.find_all("a", href=self.re)] - - def get_journal_table(self, url): - content = urlopen(url).read() - soup = BeautifulSoup(content) - soup.prettify() # necessary? 
- t = soup.table - caption_re = re.compile( - r"^Impact Factor " + str(self.year) - ) # works for Year==2015 only - while t is not None: - if ( - t.caption is None - or t.caption.string is None - or caption_re.match(t.caption.string) is None - ): - t = t.find_next() - continue - return t - - def get_table_headers(self, table): - return [str(x.string) for x in table.tr.find_all("td")] - - def get_journal_data(self, table): - headers = self.get_table_headers(table) - journals = dict() - for row in table.find_all("tr")[1:]: - cells = row.find_all("td") - j = dict(zip(headers, [str(x.string) for x in cells])) - # logging.debug('importing: {}'.format(j)) - journals[j["ISSN"]] = j - return journals - - def get_all_journal_data(self): - journals = dict() - for url in self.get_all_urls(): - - for page in string.ascii_uppercase: - page = "0-A" if page == "A" else page - url_page = url.split("2014")[0] + "2014_" + page + url.split("2014")[1] - table = self.get_journal_table(url_page) - journals.update(self.get_journal_data(table)) - logging.info( - "imported {} journal entries from citefactor.org".format(len(journals)) - ) - return journals - - def create_if_dict(self): - """ - Creates a dictionary with journal names as key (lowercase) and impact factors - as values. 
- """ - - stringparse = ( - lambda x: str(x).strip().lower().replace("\\", "_").replace(" ", "_") - ) - self.journal_to_if = dict( - (stringparse(value["JOURNAL"]), value["2013/2014"]) - for key, value in self.journal_data.items() - ) diff --git a/paperscraper/tests/__init__.py b/paperscraper/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/paperscraper/tests/test_impactor.py b/paperscraper/tests/test_impactor.py new file mode 100644 index 0000000..99f017b --- /dev/null +++ b/paperscraper/tests/test_impactor.py @@ -0,0 +1,68 @@ +import logging + +import pytest +from paperscraper.impact import Impactor + +logging.disable(logging.INFO) + + +class TestImpactor: + @pytest.fixture + def impactor(self): + return Impactor() + + def test_basic_search(self, impactor: Impactor): + results = impactor.search("Nat Comm", threshold=99, sort_by="score") + assert len(results) > 0 # Ensure we get some results + assert all( + "journal" in r and "factor" in r and "score" in r for r in results + ) # Basic fields are present + + def test_fuzzy_search(self, impactor: Impactor): + results = impactor.search("Nat Comm", threshold=99) + assert any( + r["journal"] == "Nature Communications" for r in results + ) # Check for a specific journal + + def test_sort_by_score(self, impactor: Impactor): + results = impactor.search("nature chem", threshold=80, sort_by="score") + scores = [r["score"] for r in results] + assert scores == sorted( + scores, reverse=True + ) # Ensure results are sorted by score + + def test_impact_factor_filtering(self, impactor: Impactor): + results = impactor.search("Quantum information", threshold=70, min_impact=8) + assert all( + 8 <= r["factor"] for r in results + ) # Check if all results have a factor >= 8 + + def test_return_all_fields(self, impactor: Impactor): + results = impactor.search("nature chem", return_all=True) + assert all( + len(r) > 3 for r in results + ) # Check if more than the basic fields are returned + + def 
test_quantum_information_search(self, impactor): + expected_results = [ + {"journal": "InfoMat", "factor": 24.798, "score": 71}, + {"journal": "Information Fusion", "factor": 17.564, "score": 71}, + {"journal": "npj Quantum Information", "factor": 10.758, "score": 95}, + ] + + results = impactor.search( + "Quantum information", threshold=70, sort_by="impact", min_impact=8 + ) + + # Ensure that the results match the expected results + assert len(results) == len(expected_results), "Number of results does not match" + for expected, actual in zip(expected_results, results): + assert ( + expected["journal"] == actual["journal"] + ), f"Journal name does not match for {expected['journal']}" + assert ( + abs(expected["factor"] - actual["factor"]) < 0.001 + ), f"Impact factor does not match for {expected['journal']}" + assert ( + expected["score"] == actual["score"] + ), f"Score does not match for {expected['journal']}" diff --git a/requirements.txt b/requirements.txt index fb1116c..57ff25f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,7 @@ scholarly==0.5.1 seaborn>=0.11.0 matplotlib>=3.3.2 matplotlib-venn>=0.11.5 -bs4>=0.0.1 \ No newline at end of file +bs4>=0.0.1 +impact-factor>=1.1.0 +thefuzz>=0.20.0 +pytest \ No newline at end of file