Skip to content

Commit

Permalink
Impact factor (#37)
Browse files Browse the repository at this point in the history
* feat: impact factor class

* test: impact test

* doc: readme

* wip: finish PR

* ci

* ci: update test

* Update README.md
  • Loading branch information
jannisborn authored Dec 24, 2023
1 parent db4f0c1 commit 0ff9218
Show file tree
Hide file tree
Showing 8 changed files with 222 additions and 163 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip3 install -e .
pip install -r requirements.txt
pip install -e .
python -c "import paperscraper"
- name: Run test suite
run: |
python -m pytest -sv paperscraper
37 changes: 32 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,41 @@ get_citations_from_title(title)
*NOTE*: The scholar endpoint does not require authentication, but since it regularly
prompts with captchas, it's difficult to apply at large scale.

#### Journal impact factor
### Journal impact factor

You can also retrieve the impact factor for all journals indexed by citefactor:
You can also retrieve the impact factor for all journals:
```py
from paperscraper.journal_if import Impactor
i = Impactor()
>>>from paperscraper.impact import Impactor
>>>i = Impactor()
>>>i.search("Nat Comms", threshold=85, sort_by='impact')
[
{'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
{'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
]
```
This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
```py
i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]

# Filter results by impact factor
i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
# [
# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
# ]

# Show all fields
i.search("quantum information", threshold=90, return_all=True)
# [
# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
# ]
```
Then, `i.journal_to_if` should give you a dictionary with journal-to-IF mappings for >9000 journals as of 2014.

### Plotting

Expand Down
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Initialize the module."""
__name__ = "paperscraper"
__version__ = "0.2.8"
__version__ = "0.2.9"

import logging
import os
Expand Down
111 changes: 111 additions & 0 deletions paperscraper/impact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import logging
from typing import Any, Dict, List, Optional

import pandas as pd
from impact_factor.core import Factor
from thefuzz import fuzz

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logging.disable(logging.INFO)


class Impactor:
    def __init__(self):
        """
        Initialize the Impactor class with an instance of the Factor class.

        Loads the full impact_factor database into a string-typed DataFrame
        (``self.metadata``) so that all subsequent searches are in-memory only.
        """
        self.fa = Factor()
        self.all_journals = self.fa.search("%")
        self.metadata = pd.DataFrame(self.all_journals, dtype=str)
        logger.info(f"Loaded metadata for {len(self.metadata)} journals")

    def search(
        self,
        query: str,
        threshold: int = 100,
        sort_by: Optional[str] = None,
        min_impact: float = 0.0,
        max_impact: float = float("inf"),
        return_all: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Search for journals matching the given query with an optional
        fuzziness level, impact-factor filtering and sorting.

        Args:
            query: The journal name, abbreviation, E-ISSN or NLM ID to search for.
            threshold: Threshold for fuzzy matching in [0, 100]. At 100,
                exact (case-insensitive cell equality) matching is performed;
                below 100, fuzzy partial-ratio matching is used. Defaults to 100.
            sort_by: Criterion for sorting results, one of 'impact', 'journal'
                and 'score'. Any other value keeps the database order.
            min_impact: Minimum impact factor for journals to be considered,
                defaults to 0.
            max_impact: Maximum impact factor for journals to be considered,
                defaults to infinity.
            return_all: If True, returns all columns of the DataFrame for
                each match instead of just journal/factor/score.

        Returns:
            List[dict]: A list of dictionaries containing the journal information.

        Raises:
            TypeError: If `query` is not a str or `threshold` is not an int.
            ValueError: If `threshold` lies outside [0, 100].
        """
        # Validation of parameters
        if not isinstance(query, str) or not isinstance(threshold, int):
            raise TypeError(
                f"Query must be a str and threshold must be an int, not {type(query)} and {type(threshold)}"
            )
        if threshold < 0 or threshold > 100:
            raise ValueError(
                f"Fuzziness threshold must be between 0 and 100, not {threshold}"
            )

        if query.isdigit() and threshold >= 100:
            # When querying with NLM ID, exact matching does not work since
            # impact_factor strips off leading zeros, so we use fuzzy matching
            # instead.
            threshold = 99

        def calculate_fuzziness_score(row) -> int:
            # Best partial-ratio of the query against any cell of this row.
            return max(fuzz.partial_ratio(query, str(value)) for value in row.values)

        # Search with or without fuzzy matching
        if threshold >= 100:
            matched_df = self.metadata[
                self.metadata.apply(
                    lambda x: query.lower() in x.astype(str).str.lower().values, axis=1
                )
            ].copy()
            # Exact matches get a default score of 100
            matched_df["score"] = 100
        else:
            matched_df = self.metadata[
                self.metadata.apply(
                    lambda x: calculate_fuzziness_score(x) >= threshold, axis=1
                )
            ].copy()
            matched_df["score"] = matched_df.apply(calculate_fuzziness_score, axis=1)

        # BUGFIX: convert `factor` to numeric BEFORE sorting/filtering. The
        # metadata DataFrame is string-typed, so sorting the raw column ordered
        # lexicographically (e.g. '9.657' above '17.694').
        matched_df["factor"] = pd.to_numeric(matched_df["factor"])
        matched_df = matched_df[
            (matched_df["factor"] >= min_impact) & (matched_df["factor"] <= max_impact)
        ]

        # Sorting based on the specified criterion
        if sort_by == "score":
            matched_df = matched_df.sort_values(by="score", ascending=False)
        elif sort_by == "journal":
            matched_df = matched_df.sort_values(by="journal")
        elif sort_by == "impact":
            matched_df = matched_df.sort_values(by="factor", ascending=False)

        # Prepare the final result
        results = [
            row.to_dict()
            if return_all
            else {
                "journal": row["journal"],
                "factor": row["factor"],
                "score": row["score"],
            }
            for _, row in matched_df.iterrows()
        ]

        return results
155 changes: 0 additions & 155 deletions paperscraper/journal_if.py

This file was deleted.

Empty file added paperscraper/tests/__init__.py
Empty file.
Loading

0 comments on commit 0ff9218

Please sign in to comment.