# ---------------------------------------------------------------------------
# paperscraper/citations/__init__.py
# ---------------------------------------------------------------------------
from ..scholar import get_citations_from_title
from .self_references import self_references, self_references_paper

# ---------------------------------------------------------------------------
# paperscraper/citations/core.py
# (the visible hunk starts at line 2 of this file; line 1 is not shown)
# ---------------------------------------------------------------------------
import logging
import re
import sys
from typing import Dict, Iterable, Literal, Union

import httpx
from semanticscholar import SemanticScholar

from ..utils import optional_async
from .entity import Paper, Researcher
from .utils import check_overlap, doi_pattern

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Entity kinds understood by `SelfLinkClient`. Kept as an explicit tuple plus a
# spelled-out Literal so that static type checkers can read the alias.
MODES = ("paper", "author")
ModeType = Literal["paper", "author"]


class SelfLinkClient:
    """
    Thin wrapper that builds a `Paper` or a `Researcher` for one entity and
    forwards the self-citation / self-reference analyses to it.

    Attributes:
        mode: The mode the client was created with (`"paper"` or `"author"`).
        object: The wrapped `Paper` (mode `"paper"`) or `Researcher`
            (mode `"author"`) instance.
    """

    def __init__(self, entity: str, mode: ModeType = "paper") -> None:
        """
        Create the wrapped entity object.

        Args:
            entity: Identifier of the paper or author; passed unchanged to the
                `Paper` / `Researcher` constructor.
            mode: Either `"paper"` or `"author"`. Defaults to `"paper"`.

        Raises:
            ValueError: If `mode` is not one of `MODES`.
        """
        # Validate first so that `self.object` is always set on success.
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode}, pick from {MODES}")
        self.mode = mode

        # NOTE(review): the `Paper` / `Researcher` constructors may expect a
        # second `mode` argument -- TODO confirm against entity/paper.py and
        # entity/researcher.py before relying on these calls.
        if mode == "paper":
            self.object = Paper(entity)
        elif mode == "author":
            self.object = Researcher(entity)

    def extract_self_citations(self) -> None:
        """Run the self-citation analysis of the wrapped object."""
        self.object.self_citations()

    def extract_self_references(self) -> None:
        """Run the self-reference analysis of the wrapped object."""
        self.object.self_references()

    def extract(self) -> None:
        """Run the self-citation analysis, then the self-reference analysis."""
        self.extract_self_citations()
        self.extract_self_references()

    def get_result(self):
        """Return whatever the wrapped object's `get_result()` returns."""
        return self.object.get_result()


# ---------------------------------------------------------------------------
# paperscraper/citations/entity/__init__.py
# ---------------------------------------------------------------------------
from .paper import Paper
from .researcher import Researcher

# ---------------------------------------------------------------------------
# paperscraper/citations/entity/core.py
# ---------------------------------------------------------------------------
from abc import abstractmethod
from typing import Dict

from pydantic import BaseModel


class EntityResult(BaseModel):
    """Base result model holding citation/reference counts and self-link data."""

    num_citations: int
    num_references: int
    # keys are authors or papers and values are absolute self links
    self_citations: Dict[str, int] = {}
    self_references: Dict[str, int] = {}
    # aggregated results
    self_citation_ratio: float = 0
    self_reference_ratio: float = 0
# ---------------------------------------------------------------------------
# paperscraper/citations/entity/core.py (continued)
# ---------------------------------------------------------------------------
from abc import ABC, abstractmethod


class Entity(ABC):
    """
    An abstract entity class with a set of utilities shared by the objects that perform
    self-linking analyses, such as Paper and Researcher.

    Deriving from `ABC` makes the `@abstractmethod` markers effective: a subclass
    that does not override all three methods cannot be instantiated.
    """

    @abstractmethod
    def self_references(self):
        """
        Has to be implemented by the child class. Performs a self-reference analysis
        for the object.
        """
        ...

    @abstractmethod
    def self_citations(self):
        """
        Has to be implemented by the child class. Performs a self-citation analysis
        for the object.
        """
        ...

    @abstractmethod
    def get_result(self):
        """
        Has to be implemented by the child class. Provides the result of the analysis.
        """
        ...


# ---------------------------------------------------------------------------
# paperscraper/citations/entity/paper.py
# ---------------------------------------------------------------------------
from typing import List, Optional

from ..self_references import self_references_paper
from .core import Entity, EntityResult


class PaperResult(EntityResult):
    """Result model for a single paper: `EntityResult` plus title, DOI and authors."""

    title: str
    doi: str
    authors: List[str]
    # TODO: the ratios will be averaged across all authors


class Paper(Entity):
    """
    A paper whose self-references and self-citations are to be analysed.

    All methods are placeholders for now: their bodies are empty and they
    return `None`.
    """

    title: str
    doi: str
    authors: List[str]

    def __init__(self, input: str, mode: Optional[str] = None):
        """
        Args:
            input: Identifier of the paper. Not used yet.
            mode: How `input` should be interpreted. Not used yet; it defaults
                to `None` so that the paper can be built from `input` alone.
        """
        # TODO: parse `input` (the original note here was left unfinished:
        # "Determine whether").
        ...

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.

        Not implemented yet.
        """
        ...

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.

        Not implemented yet.
        """
        ...

    def get_result(self) -> PaperResult:
        """
        Provides the result of the analysis.

        Not implemented yet; currently returns `None`.
        """
        ...
# ---------------------------------------------------------------------------
# paperscraper/citations/entity/researcher.py
# ---------------------------------------------------------------------------
from typing import Literal, Optional

from semanticscholar import SemanticScholar

from .core import Entity, EntityResult


class ResearcherResult(EntityResult):
    """Result model for a researcher: `EntityResult` plus name and identifiers."""

    name: str
    ssid: int
    orcid: Optional[str] = None
    # TODO: the ratios will be averaged across all papers for that author


# Ways in which the `input` of `Researcher` can be interpreted. Kept as an
# explicit tuple plus a spelled-out Literal so type checkers can read the alias.
MODES = ("name", "orcid", "ssid")
ModeType = Literal["name", "orcid", "ssid"]

# Module-level Semantic Scholar client shared by all `Researcher` instances.
sch = SemanticScholar()


class Researcher(Entity):
    """
    A researcher whose self-references and self-citations are to be analysed.

    Only the `ssid` mode performs a lookup so far; the analysis methods are
    placeholders with empty bodies that return `None`.
    """

    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType):
        """
        Args:
            input: Identifier of the researcher, interpreted according to `mode`.
            mode: One of `"name"`, `"orcid"` or `"ssid"`.

        Raises:
            ValueError: If `mode` is not one of `MODES`.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode}, choose from {MODES}.")

        if mode == "ssid":
            author = sch.get_author(input)
            # Keep what the lookup returned instead of discarding it.
            self.name = author.name
            # Assumes Semantic Scholar author IDs are numeric, matching the
            # `ssid: int` annotation -- TODO confirm.
            self.ssid = int(input)
        # TODO: the `name` and `orcid` modes are accepted but not resolved yet.

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.

        Not implemented yet.
        """
        ...

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.

        Not implemented yet.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis.

        Not implemented yet; currently returns `None`.
        """
        ...
# ---------------------------------------------------------------------------
# paperscraper/citations/self_citations.py
# (added by this change set as an empty file)
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# paperscraper/citations/self_references.py
# ---------------------------------------------------------------------------
import asyncio
import logging
import re
import sys
from typing import Dict, Iterable, List, Literal, Union

import httpx
from semanticscholar import SemanticScholar

from ..utils import optional_async
from .utils import check_overlap, doi_pattern

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Accepted values of `mode` in `self_references`. Kept as an explicit tuple plus
# a spelled-out Literal so that static type checkers can read the alias.
MODES = ("doi", "name", "orcid", "ssid")
ModeType = Literal["doi", "name", "orcid", "ssid"]


@optional_async
async def self_references(
    inputs: Union[str, Iterable[str]],
    mode: ModeType = "doi",
    relative: bool = True,
    verbose: bool = False,
) -> Dict[str, Union[Dict[str, Union[float, int]], List[Union[float, int]]]]:
    """
    Analyze self-references for papers (by DOI) or for researchers.

    Args:
        inputs: A single or a list of strings to analyze self-references for.
            How they are interpreted depends on `mode`.
        mode:
            Either `doi`, `name`, `orcid` or `ssid`. Defaults to `doi`.
            - doi: Digital object identifier of a paper to measure self references.
            - name: Name of a researcher (not implemented yet).
            - orcid: ORCID ID of a researcher (not implemented yet).
            - ssid: Semantic Scholar ID of a researcher to measure self references
                across all of their papers that have a DOI.
        relative: If True, returns self-references as percentages; otherwise, as
            raw counts. Defaults to True.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        In `doi` mode, a dictionary whose keys are the input strings and whose
        values map each author of the paper to their self-references.
        In `ssid` mode, a dictionary whose keys are author names and whose
        values are lists with one entry per analysed paper of that author.

    Raises:
        ValueError: If `mode` is unknown, or if (in `doi` mode) an input does
            not contain exactly one DOI.
        NotImplementedError: If `mode` is `name` or `orcid`.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    if mode not in MODES:
        raise ValueError(f"Unknown mode {mode}, pick from {MODES}")
    if mode in ("name", "orcid"):
        # Fail loudly instead of silently returning an empty result.
        raise NotImplementedError(
            f"Analyzing self-references in mode {mode} is not yet implemented."
        )

    results: Dict[
        str, Union[Dict[str, Union[float, int]], List[Union[float, int]]]
    ] = {}

    if mode == "doi":
        # Validate every input before creating any coroutine, so that a bad
        # input cannot leave already-created coroutines un-awaited.
        extracted = []
        for should_be_doi in inputs:
            dois = re.findall(doi_pattern, should_be_doi, re.IGNORECASE)
            if len(dois) != 1:
                raise ValueError(
                    f"For {should_be_doi} {len(dois)} DOIs were extracted. Please check your input."
                )
            extracted.append((should_be_doi, dois[0]))

        completed_tasks = await asyncio.gather(
            *[
                self_references_paper(doi, verbose=verbose, relative=relative)
                for _, doi in extracted
            ]
        )
        for (sample, _), task_result in zip(extracted, completed_tasks):
            results[sample] = task_result

    else:  # mode == "ssid"
        sch = SemanticScholar()
        for should_be_ssid in inputs:
            # TODO: Error handling
            author = sch.get_author(should_be_ssid)
            # TODO: Support other IDs than DOI
            # `externalIds` is guarded because it may be missing or null.
            dois = [
                external_ids["DOI"]
                for external_ids in (
                    paper._data.get("externalIds") or {}
                    for paper in (author.papers or [])
                )
                if "DOI" in external_ids
            ]
            # TODO: Skip over erratum / corrigendum
            # Tasks are collected per author so that results of different
            # inputs never get mixed and no coroutine is awaited twice.
            completed_tasks = await asyncio.gather(
                *[
                    self_references_paper(doi, verbose=verbose, relative=relative)
                    for doi in dois
                ]
            )
            per_paper: List[Union[float, int]] = []
            for doi, task_result in zip(dois, completed_tasks):
                if author.name in task_result:
                    per_paper.append(task_result[author.name])
                else:
                    # The paper lists the author under a different spelling.
                    logger.warning(
                        f"Author {author.name} not found among the authors of {doi}"
                    )
            results[author.name] = per_paper
            # TODO: Consider returning this as JSON/DF

    # TODO: Post-hoc aggregation for SS-ID

    return results


@optional_async
async def self_references_paper(
    doi: str,
    relative: bool = True,
    verbose: bool = False,
) -> Dict[str, Union[float, int]]:
    """
    Analyze self-references for a single DOI.

    Args:
        doi: The DOI to analyze.
        relative: If True, returns self-references as percentages (rounded to two
            decimals); otherwise, as raw counts. Defaults to True.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary mapping authors to their self-references. If the paper has
        no references, a warning is logged and every author maps to 0.

    Raises:
        httpx.HTTPStatusError: If the Semantic Scholar request returns an error
            status (via `response.raise_for_status()`).
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            params={"fields": "title,authors,references.authors"},
        )
        response.raise_for_status()
        paper = response.json()

    authors: Dict[str, int] = {a["name"]: 0 for a in paper["authors"]}
    references = paper["references"]
    if not references:
        logger.warning(f"Could not find citations from Semantic Scholar for {doi}")
        return authors

    for ref in references:
        # Skip missing author lists and empty names; they cannot match anyone.
        ref_authors = {a["name"] for a in (ref.get("authors") or []) if a.get("name")}
        for author in authors:
            # TODO: Make sure to expand names given as J. Doe to John Doe
            if any(check_overlap(author, ra) for ra in ref_authors):
                authors[author] += 1
    # Non-zero here because the empty case returned early above.
    total = len(references)

    if verbose:
        logger.info(f"Self references in \"{paper['title']}\"")
        logger.info(f" N = {total}")
        for author, self_cites in authors.items():
            logger.info(f" {author}: {100*(self_cites/total):.2f}% self-references")

    if relative:
        for author, self_cites in authors.items():
            authors[author] = round(100 * self_cites / total, 2)

    return authors


# ---------------------------------------------------------------------------
# paperscraper/citations/tests/test_self_references.py
# ---------------------------------------------------------------------------
# This change set removes `test_not_implemented_error`, which expected
# `self_references("John Jumper")` to raise NotImplementedError.

# ---------------------------------------------------------------------------
# requirements.txt
# ---------------------------------------------------------------------------
# This change set adds the dependency `semanticscholar>=0.8.4`.