Skip to content

Commit

Permalink
feat: Scaffold for oo-self linking
Browse files Browse the repository at this point in the history
  • Loading branch information
jannisborn committed Dec 7, 2024
1 parent c546e80 commit 7be71a4
Show file tree
Hide file tree
Showing 10 changed files with 326 additions and 104 deletions.
2 changes: 1 addition & 1 deletion paperscraper/citations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from ..scholar import get_citations_from_title
from .core import self_references, self_references_paper
from .self_references import self_references, self_references_paper
117 changes: 19 additions & 98 deletions paperscraper/citations/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,118 +2,39 @@
import logging
import re
import sys
from typing import Dict, Iterable, Union
from typing import Dict, Iterable, Literal, Union

import httpx
from semanticscholar import SemanticScholar

from ..utils import optional_async
from .entity import Paper, Researcher
from .utils import check_overlap, doi_pattern

# Emit records of level INFO and above to stdout. This runs at import time
# and configures the root logger.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Raise the threshold of the "httpx" logger to WARNING.
logging.getLogger("httpx").setLevel(logging.WARNING)

# Supported analysis modes. The tuple and the Literal alias are written out
# separately: an assignment expression inside a subscript is a SyntaxError
# before Python 3.10, and static type checkers only accept literal values as
# parameters of Literal.
MODES = ("paper", "author")
ModeType = Literal["paper", "author"]

@optional_async
async def self_references(
    inputs: Union[str, Iterable[str]],
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Dict[str, Union[float, int]]]:
    """
    Analyze self-references for a DOI or a list of DOIs.

    Args:
        inputs: A single DOI or an iterable of DOIs.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary where the keys are DOIs and the values are dictionaries mapping
        authors to their self-citations.

    Raises:
        NotImplementedError: If the input does not match a DOI format.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    results: Dict[str, Dict[str, Union[float, int]]] = {}

    # Pairs of (original input string, pending per-paper analysis).
    tasks = []

    for sample in inputs:
        dois = re.findall(doi_pattern, sample, re.IGNORECASE)
        if len(dois) == 1:
            # This is a DOI
            tasks.append(
                (
                    sample,
                    self_references_paper(dois[0], verbose=verbose, relative=relative),
                )
            )
        elif len(dois) == 0:
            # TODO: Check that it is a proper name or an ORCID ID
            raise NotImplementedError(
                "Analyzing self-references of whole authors is not yet implemented."
            )
        # Inputs matching more than one DOI fall through and are not analyzed.
    completed_tasks = await asyncio.gather(*[task[1] for task in tasks])
    for sample, task_result in zip(tasks, completed_tasks):
        results[sample[0]] = task_result

    return results


@optional_async
async def self_references_paper(
    doi: str,
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Union[float, int]]:
    """
    Analyze self-references for a single DOI.

    Args:
        doi: The DOI to analyze.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary mapping authors to their self-citations.

    Raises:
        ValueError: If no references are found for the given DOI.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            params={"fields": "title,authors,references.authors"},
        )
        response.raise_for_status()
        paper = response.json()

    if not paper["references"]:
        raise ValueError("Could not find citations from Semantic Scholar")

    # One counter per author of the analyzed paper.
    authors: Dict[str, int] = {a["name"]: 0 for a in paper["authors"]}

    for ref in paper["references"]:
        ref_authors = {a["name"] for a in ref["authors"]}
        for author in authors:
            # A reference counts at most once per author, however many of
            # the reference's author names overlap.
            if any(check_overlap(author, ra) for ra in ref_authors):
                authors[author] += 1
    total = len(paper["references"])

    if verbose:
        logger.info(f"Self references in \"{paper['title']}\"")
        logger.info(f" N = {len(paper['references'])}")
        for author, self_cites in authors.items():
            logger.info(f" {author}: {100*(self_cites/total):.2f}% self-references")

    if relative:
        for author, self_cites in authors.items():
            authors[author] = round(100 * self_cites / total, 2)

    return authors


class SelfLinkClient:
    """
    Thin front-end that builds a Paper or a Researcher for the given entity
    and forwards the self-citation / self-reference analysis calls to it.
    """

    def __init__(self, entity: str, mode: ModeType = "paper") -> None:
        """
        Args:
            entity: Identifier handed to the underlying Paper or Researcher.
            mode: "paper" to wrap a Paper, "author" to wrap a Researcher.
                Defaults to "paper".

        Raises:
            ValueError: If `mode` is neither "paper" nor "author".
        """
        # Store the mode on the instance before branching on it.
        self.mode = mode
        # NOTE(review): the entity constructors also accept a `mode` argument
        # that is not forwarded here -- confirm what should be passed.
        if mode == "paper":
            self.object = Paper(entity)
        elif mode == "author":
            self.object = Researcher(entity)
        else:
            # Fail early instead of leaving `self.object` undefined.
            raise ValueError(f"Unknown mode {mode}, choose from {MODES}.")

    def extract_self_citations(self):
        """Run the self-citation analysis on the wrapped object."""
        self.object.self_citations()

    def extract_self_references(self):
        """Run the self-reference analysis on the wrapped object."""
        self.object.self_references()

    def extract(self):
        """Run the self-citation analysis, then the self-reference analysis."""
        self.extract_self_citations()
        self.extract_self_references()

    def get_result(self):
        """Return whatever the wrapped object's `get_result` returns."""
        return self.object.get_result()
2 changes: 2 additions & 0 deletions paperscraper/citations/entity/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .paper import Paper
from .researcher import Researcher
45 changes: 45 additions & 0 deletions paperscraper/citations/entity/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from abc import ABC, abstractmethod
from typing import Dict

from pydantic import BaseModel


class EntityResult(BaseModel):
    """
    Base result record of a self-linking analysis: total citation and
    reference counts, per-key self-link counts and two aggregated ratios.
    """

    num_citations: int
    num_references: int
    # keys are authors or papers and values are absolute self links
    self_citations: Dict[str, int] = {}
    self_references: Dict[str, int] = {}
    # aggregated results; float literals so an unset ratio is a float,
    # matching the declared type
    self_citation_ratio: float = 0.0
    self_reference_ratio: float = 0.0


class Entity(ABC):
    """
    An abstract entity class with a set of utilities shared by the objects that perform
    self-linking analyses, such as Paper and Researcher.

    Deriving from ABC makes the `@abstractmethod` markers effective: a subclass
    can only be instantiated once it implements all three methods.
    """

    @abstractmethod
    def self_references(self):
        """
        Has to be implemented by the child class. Performs a self-referencing analysis
        for the object.
        """
        ...

    @abstractmethod
    def self_citations(self):
        """
        Has to be implemented by the child class. Performs a self-citation analysis
        for the object.
        """
        ...

    @abstractmethod
    def get_result(self):
        """
        Has to be implemented by the child class. Provides the result of the analysis.
        """
        ...
39 changes: 39 additions & 0 deletions paperscraper/citations/entity/paper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List

from ..self_references import self_references_paper
from .core import Entity, EntityResult


class PaperResult(EntityResult):
    # Result record for one paper: the fields inherited from EntityResult
    # plus the paper's title, DOI and list of author names.
    # TODO: the ratios will be averaged across all authors
    title: str
    doi: str
    authors: List[str]


class Paper(Entity):
    """
    Entity for a single paper. All analysis methods are still placeholders.
    """

    title: str
    doi: str
    authors: List[str]

    def __init__(self, input: str, mode=None):
        """
        Args:
            input: Identifier of the paper.
            mode: Optional; defaults to None so the paper can be built from
                the identifier alone.
                NOTE(review): the meaning of `mode` is not defined yet -- confirm.
        """
        # TODO: determine how `input` should be interpreted (not implemented yet).
        ...

    def self_references(self):
        """
        Extracts the self references of a paper, for each author.
        """
        ...

    def self_citations(self):
        """
        Extracts the self citations of a paper, for each author.
        """
        ...

    def get_result(self) -> PaperResult:
        """
        Provides the result of the analysis.
        """
        ...
48 changes: 48 additions & 0 deletions paperscraper/citations/entity/researcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Literal, Optional

from semanticscholar import SemanticScholar

from .core import Entity, EntityResult


class ResearcherResult(EntityResult):
    # Result record for one researcher: the fields inherited from
    # EntityResult plus name, ssid and an optional ORCID.
    # TODO: the ratios will be averaged across all papers for that author
    name: str
    ssid: int
    orcid: Optional[str] = None


# Supported ways of identifying a researcher. The tuple and the Literal alias
# are written out separately: an assignment expression inside a subscript is a
# SyntaxError before Python 3.10, and static type checkers only accept literal
# values as parameters of Literal.
MODES = ("name", "orcid", "ssid")
ModeType = Literal["name", "orcid", "ssid"]

# Module-level SemanticScholar client, created once when this module is imported.
sch = SemanticScholar()


class Researcher(Entity):
    """
    Entity for a single researcher. Only mode validation and the "ssid"
    lookup exist so far; the analysis methods are placeholders.
    """

    name: str
    ssid: int
    orcid: Optional[str] = None

    def __init__(self, input: str, mode: ModeType):
        """
        Args:
            input: Identifier of the researcher, interpreted according to `mode`.
            mode: One of the values in MODES.

        Raises:
            ValueError: If `mode` is not one of the values in MODES.
        """
        if mode not in MODES:
            raise ValueError(f"Unknown mode {mode}, choose from {MODES}.")

        if mode == "ssid":
            # NOTE(review): the fetched author is not stored or used yet, and
            # the other modes perform no lookup -- scaffold to be completed.
            author = sch.get_author(input)

    def self_references(self):
        """
        Sifts through all papers of a researcher and extracts the self references.
        """
        ...

    def self_citations(self):
        """
        Sifts through all papers of a researcher and finds how often they are self-cited.
        """
        ...

    def get_result(self) -> ResearcherResult:
        """
        Provides the result of the analysis.
        """
        ...
Empty file.
Loading

0 comments on commit 7be71a4

Please sign in to comment.