Skip to content

Commit

Permalink
Impact factor (#37)
Browse files Browse the repository at this point in the history
* feat: impact factor class

* test: impact test

* doc: readme

* wip: finish PR

* ci

* ci: update test

* Update README.md
  • Loading branch information
jannisborn authored Dec 24, 2023
1 parent db4f0c1 commit 0ff9218
Show file tree
Hide file tree
Showing 8 changed files with 222 additions and 163 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip3 install -e .
pip install -r requirements.txt
pip install -e .
python -c "import paperscraper"
- name: Run test suite
run: |
python -m pytest -sv paperscraper
37 changes: 32 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,41 @@ get_citations_from_title(title)
*NOTE*: The scholar endpoint does not require authentication, but since it regularly
prompts with captchas, it's difficult to apply at large scale.

#### Journal impact factor
### Journal impact factor

You can also retrieve the impact factor for all journals indexed by citefactor:
You can also retrieve the impact factor for all journals:
```py
from paperscraper.journal_if import Impactor
i = Impactor()
>>>from paperscraper.impact import Impactor
>>>i = Impactor()
>>>i.search("Nat Comms", threshold=85, sort_by='impact')
[
{'journal': 'Nature Communications', 'factor': 17.694, 'score': 94},
{'journal': 'Natural Computing', 'factor': 1.504, 'score': 88}
]
```
This performs a fuzzy search with a threshold of 85. `threshold` defaults to 100 in which case an exact search
is performed. You can also search by journal abbreviation, [E-ISSN](https://portal.issn.org) or [NLM ID](https://portal.issn.org).
```py
i.search("Nat Rev Earth Environ") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
i.search("101771060") # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]
i.search('2662-138X') # [{'journal': 'Nature Reviews Earth & Environment', 'factor': 37.214, 'score': 100}]

# Filter results by impact factor
i.search("Neural network", threshold=85, min_impact=1.5, max_impact=20)
# [
# {'journal': 'IEEE Transactions on Neural Networks and Learning Systems', 'factor': 14.255, 'score': 93},
# {'journal': 'NEURAL NETWORKS', 'factor': 9.657, 'score': 91},
# {'journal': 'WORK-A Journal of Prevention Assessment & Rehabilitation', 'factor': 1.803, 'score': 86},
# {'journal': 'NETWORK-COMPUTATION IN NEURAL SYSTEMS', 'factor': 1.5, 'score': 92}
# ]

# Show all fields
i.search("quantum information", threshold=90, return_all=True)
# [
# {'factor': 10.758, 'jcr': 'Q1', 'journal_abbr': 'npj Quantum Inf', 'eissn': '2056-6387', 'journal': 'npj Quantum Information', 'nlm_id': '101722857', 'issn': '', 'score': 92},
# {'factor': 1.577, 'jcr': 'Q3', 'journal_abbr': 'Nation', 'eissn': '0027-8378', 'journal': 'NATION', 'nlm_id': '9877123', 'issn': '0027-8378', 'score': 91}
# ]
```
Then, `i.journal_to_if` should give you a dictionary with journal-to-IF mappings for >9000 journals as of 2014.

### Plotting

Expand Down
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Initialize the module."""
__name__ = "paperscraper"
__version__ = "0.2.8"
__version__ = "0.2.9"

import logging
import os
Expand Down
111 changes: 111 additions & 0 deletions paperscraper/impact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import logging
from typing import Any, Dict, List, Optional

import pandas as pd
from impact_factor.core import Factor
from thefuzz import fuzz

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logging.disable(logging.INFO)


class Impactor:
    def __init__(self):
        """
        Initialize the Impactor class with an instance of the Factor class.

        Loads the full impact_factor database into a string-typed DataFrame
        (``self.metadata``) so that all subsequent searches are in-memory only.
        """
        self.fa = Factor()
        self.all_journals = self.fa.search("%")
        self.metadata = pd.DataFrame(self.all_journals, dtype=str)
        logger.info(f"Loaded metadata for {len(self.metadata)} journals")

    def search(
        self,
        query: str,
        threshold: int = 100,
        sort_by: Optional[str] = None,
        min_impact: float = 0.0,
        max_impact: float = float("inf"),
        return_all: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Search for journals matching the given query with an optional
        fuzziness level, impact-factor filtering and sorting.

        Args:
            query: The journal name, abbreviation, E-ISSN or NLM ID to search for.
            threshold: Threshold for fuzzy matching in [0, 100]. At 100,
                exact (case-insensitive cell equality) matching is performed;
                below 100, fuzzy partial-ratio matching is used. Defaults to 100.
            sort_by: Criterion for sorting results, one of 'impact', 'journal'
                and 'score'. Any other value keeps the database order.
            min_impact: Minimum impact factor for journals to be considered,
                defaults to 0.
            max_impact: Maximum impact factor for journals to be considered,
                defaults to infinity.
            return_all: If True, returns all columns of the DataFrame for
                each match instead of just journal/factor/score.

        Returns:
            List[dict]: A list of dictionaries containing the journal information.

        Raises:
            TypeError: If `query` is not a str or `threshold` is not an int.
            ValueError: If `threshold` lies outside [0, 100].
        """
        # Validation of parameters
        if not isinstance(query, str) or not isinstance(threshold, int):
            raise TypeError(
                f"Query must be a str and threshold must be an int, not {type(query)} and {type(threshold)}"
            )
        if threshold < 0 or threshold > 100:
            raise ValueError(
                f"Fuzziness threshold must be between 0 and 100, not {threshold}"
            )

        if query.isdigit() and threshold >= 100:
            # When querying with NLM ID, exact matching does not work since
            # impact_factor strips off leading zeros, so we use fuzzy matching
            # instead.
            threshold = 99

        def calculate_fuzziness_score(row) -> int:
            # Best partial-ratio of the query against any cell of this row.
            return max(fuzz.partial_ratio(query, str(value)) for value in row.values)

        # Search with or without fuzzy matching
        if threshold >= 100:
            matched_df = self.metadata[
                self.metadata.apply(
                    lambda x: query.lower() in x.astype(str).str.lower().values, axis=1
                )
            ].copy()
            # Exact matches get a default score of 100
            matched_df["score"] = 100
        else:
            matched_df = self.metadata[
                self.metadata.apply(
                    lambda x: calculate_fuzziness_score(x) >= threshold, axis=1
                )
            ].copy()
            matched_df["score"] = matched_df.apply(calculate_fuzziness_score, axis=1)

        # BUGFIX: convert `factor` to numeric BEFORE sorting/filtering. The
        # metadata DataFrame is string-typed, so sorting the raw column ordered
        # lexicographically (e.g. '9.657' above '17.694').
        matched_df["factor"] = pd.to_numeric(matched_df["factor"])
        matched_df = matched_df[
            (matched_df["factor"] >= min_impact) & (matched_df["factor"] <= max_impact)
        ]

        # Sorting based on the specified criterion
        if sort_by == "score":
            matched_df = matched_df.sort_values(by="score", ascending=False)
        elif sort_by == "journal":
            matched_df = matched_df.sort_values(by="journal")
        elif sort_by == "impact":
            matched_df = matched_df.sort_values(by="factor", ascending=False)

        # Prepare the final result
        results = [
            row.to_dict()
            if return_all
            else {
                "journal": row["journal"],
                "factor": row["factor"],
                "score": row["score"],
            }
            for _, row in matched_df.iterrows()
        ]

        return results
155 changes: 0 additions & 155 deletions paperscraper/journal_if.py

This file was deleted.

Empty file added paperscraper/tests/__init__.py
Empty file.
Loading

0 comments on commit 0ff9218

Please sign in to comment.