diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..e012ac2 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,35 @@ +name: website + +# build the documentation whenever there are new commits on main +on: + push: + branches: + - main + # Alternative: only build for tags. + # tags: + # - '*' + +# security: restrict permissions for CI jobs. +permissions: + contents: read + +jobs: + # Build the documentation and upload the static HTML files as an artifact. + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + # ADJUST THIS: install all dependencies (including pdoc) + - run: pip install -e . + - run: pip install pdoc + # ADJUST THIS: build your documentation into docs/. + # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here. + - run: pdoc src/sciterra -d google --math -o ./docs + + - uses: actions/upload-pages-artifact@v1 + with: + path: docs/ diff --git a/docs/sciterra.html b/docs/sciterra.html index 66cfe88..217c9fc 100644 --- a/docs/sciterra.html +++ b/docs/sciterra.html @@ -79,20 +79,26 @@

build

-

Software library to support data-driven analyses of scientific literature

+

Software library to support data-driven analyses of scientific literature.

-

Inspired heavily by Zach Hafen's cc library.

+

This library is a reimplementation of Zach Hafen's cc library.

-
1"""
-2.. include:: ../../README.md
-3"""
-4
-5__docformat__ = "google"
+                        
 1"""
+ 2.. include:: ../../README.md
+ 3"""
+ 4
+ 5__docformat__ = "google"
+ 6
+ 7from .mapping.atlas import Atlas
+ 8from .mapping.cartography import Cartographer
+ 9from .mapping.publication import (
+10    Publication,
+11)  # publication should probably be moved out of mapping.
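For orientation, the three classes re-exported in the listing above can now be imported from the package root. A minimal sketch; only the import line is grounded in this diff, and how the classes are constructed is not shown here:

    # The package-level re-exports added in sciterra/__init__.py.
    from sciterra import Atlas, Cartographer, Publication

    print(Atlas, Cartographer, Publication)  # importable from the top level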
 
diff --git a/docs/sciterra/librarians.html b/docs/sciterra/librarians.html index 538d15a..126e652 100644 --- a/docs/sciterra/librarians.html +++ b/docs/sciterra/librarians.html @@ -71,9 +71,19 @@

sciterra.librarians

- - - + + + + +
1from .librarian import Librarian
+2from .adslibrarian import ADSLibrarian
+3from .s2librarian import SemanticScholarLibrarian
+4
+5"""Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arxiv does not have one. While there is a useful pip-installable package for querying the arxiv api for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arxiv API's limit for a single query is 300,000 results.
+6"""
+
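To make the limitation described in the docstring concrete, here is a hedged sketch using the pip-installable arxiv package it links to: abstracts and submission dates are retrievable, but the returned objects expose no references or citations, so the literature graph cannot be traversed. The attribute names below follow the arxiv package's documented Result object and should be checked against the installed version:

    import arxiv

    search = arxiv.Search(query="semantic embeddings of scientific literature", max_results=5)
    for result in arxiv.Client().results(search):
        print(result.published, result.title)   # submission date and title
        print(result.summary[:200])             # abstract text is available...
        # ...but there is no reference or citation list on `result`, so we cannot
        # expand outward to cited or citing papers from here.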
+ + + + + + + + +
+
+

+sciterra.mapping.topography

+ +

Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

+
+ + + + + +
  1"""Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas."""
+  2
+  3import inspect
+  4import numpy as np
+  5
+  6
+  7########################################################################
+  8# Density metrics
+  9########################################################################
+ 10
+ 11
+ 12def smoothing_length_metric(
+ 13    idx: int,
+ 14    cospsi_matrix: np.ndarray,
+ 15    valid_indices: np.ndarray,
+ 16    kernel_size: int = 16,
+ 17):
+ 18    """Proxy for the density of a publication defined as the minimum
+ 19    arc length that encloses kernel_size other publications.
+ 20
+ 21    Args:
+ 22        idx: the index of the vector to calculate the measurement for.
+ 23
+ 24        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
+ 25
+ 26        valid_indices: Indices of the other publications used when calculating the measurements.
+ 27
+ 28        kernel_size: number of K nearest neighbors to calculate the measurement on.
+ 29
+ 30    Returns:
+ 31        h: float representing arc length containing `kernel_size` other publications. (Assumes normalized to a radius of 1.)
+ 32    """
+ 33
+ 34    # We can't have the kernel larger than the number of valid publications
+ 35    if kernel_size > len(valid_indices):
+ 36        return np.nan
+ 37
+ 38    # Get 1D array of similarity scores to idx vector
+ 39    try:
+ 40        cospsi = cospsi_matrix[idx][valid_indices]
+ 41    except IndexError:
+ 42        breakpoint()
+ 43
+ 44    # Get cosine distance to the least similar vector
+ 45    # np.sort orders from least to greatest similarity, so reverse after
+ 46    cospsi_max = np.sort(cospsi)[::-1][kernel_size - 1]
+ 47
+ 48    # Compute arclength to furthest vector
+ 49    return np.arccos(cospsi_max)
+ 50
+ 51
+ 52def density_metric(
+ 53    idx: int,
+ 54    cospsi_matrix: np.ndarray,
+ 55    valid_indices: np.ndarray,
+ 56    kernel_size: int = 16,
+ 57):
+ 58    """Estimate the density of a publication by calculating the
+ 59    smoothing length that encloses kernel_size other publications.
+ 60
+ 61    Args:
+ 62        idx: the index of the vector to calculate the measurement for.
+ 63
+ 64        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
+ 65
+ 66        valid_indices: Indices of the other publications used when calculating the measurements.
+ 67
+ 68        kernel_size: number of K nearest neighbors to calculate the measurement on.
+ 69
+ 70    Returns:
+ 71        density: a float representing `kernel_size` divided by arc length containing `kernel_size` other publications.
+ 72    """
+ 73
+ 74    h = smoothing_length_metric(idx, cospsi_matrix, valid_indices, kernel_size)
+ 75    density = kernel_size / h
+ 76
+ 77    return density
+ 78
+ 79
+ 80########################################################################
+ 81# Asymmetry metrics
+ 82########################################################################
+ 83
+ 84
+ 85def edginess_metric(
+ 86    idx: int,
+ 87    cospsi_matrix: np.ndarray,
+ 88    valid_indices: np.ndarray,
+ 89    publication_indices: np.ndarray,
+ 90    embeddings: np.ndarray,
+ 91    kernel_size: int = 16,
+ 92) -> float:
+ 93    """Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.
+ 94
+ 95    Args:
+ 96        idx: the index of the vector to calculate the measurement for.
+ 97
+ 98        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.
+ 99
+100        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+101
+102        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection
+103
+104        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection
+105
+106        kernel_size: number of K nearest neighbors to calculate the measurement on.
+107
+108    Returns:
+109        a float representing the normalized magnitude of the asymmetry metric.
+110
+111    """
+112    return (
+113        kernel_constant_asymmetry_metric(
+114            idx,
+115            cospsi_matrix,
+116            valid_indices,
+117            publication_indices,
+118            embeddings,
+119            kernel_size=kernel_size,
+120        )
+121        / kernel_size
+122    )
+123
+124
+125def kernel_constant_asymmetry_metric(
+126    idx: int,
+127    cospsi_matrix: np.ndarray,
+128    valid_indices: np.ndarray,
+129    publication_indices: np.ndarray,
+130    embeddings: np.ndarray,
+131    kernel_size: int = 16,
+132) -> float:
+133    """Estimate the asymmetry of a publication by calculating the difference
+134    between that publication's projection and the other publications within
+135    the kernel.
+136
+137    Args:
+138        idx: an int representing the index of the vector to calculate the measurement for.
+139
+140        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.
+141
+142        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+143
+144        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection
+145
+146        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection
+147
+148        kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on.
+149
+150    Returns:
+151        mag: a float representing the magnitude of the asymmetry metric.
+152    """
+153
+154    # We can't have the kernel larger than the number of valid publications
+155    if kernel_size > len(valid_indices):
+156        return np.nan
+157
+158    # Input
+159    cospsi = cospsi_matrix[idx][valid_indices]
+160    sorted_inds = np.argsort(cospsi)[::-1][:kernel_size]
+161    other_inds = publication_indices[valid_indices][sorted_inds]
+162    embedding = embeddings[idx]
+163    other_embeddings = embeddings[other_inds]
+164
+165    # Differences
+166    diff = embedding - other_embeddings
+167    diff_mag = np.linalg.norm(diff, axis=1)
+168    result = (diff / diff_mag[:, np.newaxis]).sum(axis=0)
+169    mag = np.linalg.norm(result)
+170
+171    return mag
+
+ + +
+
+ +
+ + def + smoothing_length_metric( idx: int, cospsi_matrix: numpy.ndarray, valid_indices: numpy.ndarray, kernel_size: int = 16): + + + +
+ +
13def smoothing_length_metric(
+14    idx: int,
+15    cospsi_matrix: np.ndarray,
+16    valid_indices: np.ndarray,
+17    kernel_size: int = 16,
+18):
+19    """Proxy for the density of a publication defined as the minimum
+20    arc length that encloses kernel_size other publications.
+21
+22    Args:
+23        idx: the index of the vector to calculate the measurement for.
+24
+25        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
+26
+27        valid_indices: Indices of the other publications used when calculating the measurements.
+28
+29        kernel_size: number of K nearest neighbors to calculate the measurement on.
+30
+31    Returns:
+32        h: float representing arc length containing `kernel_size` other publications. (Assumes normalized to a radius of 1.)
+33    """
+34
+35    # We can't have the kernel larger than the number of valid publications
+36    if kernel_size > len(valid_indices):
+37        return np.nan
+38
+39    # Get 1D array of similarity scores to idx vector
+40    try:
+41        cospsi = cospsi_matrix[idx][valid_indices]
+42    except IndexError:
+43        breakpoint()
+44
+45    # Get cosine distance to the least similar vector
+46    # np.sort orders from least to greatest similarity, so reverse after
+47    cospsi_max = np.sort(cospsi)[::-1][kernel_size - 1]
+48
+49    # Compute arclength to furthest vector
+50    return np.arccos(cospsi_max)
+
+ + +

Proxy for the density of a publication defined as the minimum +arc length that encloses kernel_size other publications.

+ +
Arguments:
+ +
    +
  • idx: the index of the vector to calculate the measurement for.
  • +
  • cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
  • +
  • valid_indices: Indices of the other publications used when calculating the measurements.
  • +
  • kernel_size: number of K nearest neighbors to calculate the measurement on.
  • +
+ +
Returns:
+ +
+

h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

+
+
+ + +
+
+ +
+ + def + density_metric( idx: int, cospsi_matrix: numpy.ndarray, valid_indices: numpy.ndarray, kernel_size: int = 16): + + + +
+ +
53def density_metric(
+54    idx: int,
+55    cospsi_matrix: np.ndarray,
+56    valid_indices: np.ndarray,
+57    kernel_size: int = 16,
+58):
+59    """Estimate the density of a publication by calculating the
+60    smoothing length that encloses kernel_size other publications.
+61
+62    Args:
+63        idx: the index of the vector to calculate the measurement for.
+64
+65        cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
+66
+67        valid_indices: Indices of the other publications used when calculating the measurements.
+68
+69        kernel_size: number of K nearest neighbors to calculate the measurement on.
+70
+71    Returns:
+72        density: a float representing `kernel_size` divided by arc length containing `kernel_size` other publications.
+73    """
+74
+75    h = smoothing_length_metric(idx, cospsi_matrix, valid_indices, kernel_size)
+76    density = kernel_size / h
+77
+78    return density
+
+ + +

Estimate the density of a publication by calculating the +smoothing length that encloses kernel_size other publications.

+ +
Arguments:
+ +
    +
  • idx: the index of the vector to calculate the measurement for.
  • +
  • cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
  • +
  • valid_indices: Indices of the other publications used when calculating the measurements.
  • +
  • kernel_size: number of K nearest neighbors to calculate the measurement on.
  • +
+ +
Returns:
+ +
+

density: a float representing kernel_size divided by arc length containing kernel_size other publications.

+
+
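A hedged usage sketch of the two density functions above, using synthetic unit vectors in place of publication embeddings (the import path follows the module name on this page; everything else is illustrative):

    import numpy as np
    from sciterra.mapping.topography import density_metric, smoothing_length_metric

    # Toy stand-ins for publication embeddings: 50 unit vectors in 8 dimensions.
    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(50, 8))
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

    cospsi_matrix = embeddings @ embeddings.T  # pairwise cosine similarities
    idx = 0
    valid_indices = np.delete(np.arange(len(embeddings)), idx)  # the *other* publications

    h = smoothing_length_metric(idx, cospsi_matrix, valid_indices, kernel_size=16)
    rho = density_metric(idx, cospsi_matrix, valid_indices, kernel_size=16)
    assert np.isclose(rho, 16 / h)  # density is kernel_size divided by the smoothing length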
+ + +
+
+ +
+ + def + edginess_metric( idx: int, cospsi_matrix: numpy.ndarray, valid_indices: numpy.ndarray, publication_indices: numpy.ndarray, embeddings: numpy.ndarray, kernel_size: int = 16) -> float: + + + +
+ +
 86def edginess_metric(
+ 87    idx: int,
+ 88    cospsi_matrix: np.ndarray,
+ 89    valid_indices: np.ndarray,
+ 90    publication_indices: np.ndarray,
+ 91    embeddings: np.ndarray,
+ 92    kernel_size: int = 16,
+ 93) -> float:
+ 94    """Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.
+ 95
+ 96    Args:
+ 97        idx: the index of the vector to calculate the measurement for.
+ 98
+ 99        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.
+100
+101        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+102
+103        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection
+104
+105        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection
+106
+107        kernel_size: number of K nearest neighbors to calculate the measurement on.
+108
+109    Returns:
+110        a float representing the normalized magnitude of the asymmetry metric.
+111
+112    """
+113    return (
+114        kernel_constant_asymmetry_metric(
+115            idx,
+116            cospsi_matrix,
+117            valid_indices,
+118            publication_indices,
+119            embeddings,
+120            kernel_size=kernel_size,
+121        )
+122        / kernel_size
+123    )
+
+ + +

Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

+ +
Arguments:
+ +
    +
  • idx: the index of the vector to calculate the measurement for.
  • +
  • cospsi_matrix: an np.ndarray of shape (num_pubs, num_pubs) representing pairwise cosine similarity scores for publication embeddings.
  • +
  • valid_indices: an np.ndarray of shape (num_valid_pubs) representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
  • +
  • publication_indices: an np.ndarray of shape (num_pubs,) representing indices of all publications in the atlas projection
  • +
  • embeddings: an np.ndarray of shape (num_pubs, embedding_dim) vectors for all publications in the atlas projection
  • +
  • kernel_size: number of K nearest neighbors to calculate the measurement on.
  • +
+ +
Returns:
+ +
+

a float representing the normalized magnitude of the asymmetry metric.

+
+
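A similar hedged sketch for the asymmetry metric above; apart from the documented signature, all names and data are illustrative:

    import numpy as np
    from sciterra.mapping.topography import edginess_metric

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(50, 8))
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

    cospsi_matrix = embeddings @ embeddings.T
    publication_indices = np.arange(len(embeddings))     # every publication in the projection
    idx = 0
    valid_indices = np.delete(publication_indices, idx)  # exclude idx itself from its kernel

    score = edginess_metric(
        idx, cospsi_matrix, valid_indices, publication_indices, embeddings, kernel_size=16
    )
    # score = kernel_constant_asymmetry_metric / kernel_size, so it lies in [0, 1];
    # values near 1 suggest the publication sits at the edge of its local neighborhood.
    print(score)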
+ + +
+
+ +
+ + def + kernel_constant_asymmetry_metric( idx: int, cospsi_matrix: numpy.ndarray, valid_indices: numpy.ndarray, publication_indices: numpy.ndarray, embeddings: numpy.ndarray, kernel_size: int = 16) -> float: + + + +
+ +
126def kernel_constant_asymmetry_metric(
+127    idx: int,
+128    cospsi_matrix: np.ndarray,
+129    valid_indices: np.ndarray,
+130    publication_indices: np.ndarray,
+131    embeddings: np.ndarray,
+132    kernel_size: int = 16,
+133) -> float:
+134    """Estimate the asymmetry of a publication by calculating the difference
+135    between that publication's projection and the other publications within
+136    the kernel.
+137
+138    Args:
+139        idx: an int representing the index of the vector to calculate the measurement for.
+140
+141        cospsi_matrix: an np.ndarray of shape `(num_pubs, num_pubs)` representing pairwise cosine similarity scores for publication embeddings.
+142
+143        valid_indices: an np.ndarray of shape `(num_valid_pubs)` representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
+144
+145        publication_indices: an np.ndarray of shape `(num_pubs,)` representing indices of all publications in the atlas projection
+146
+147        embeddings: an np.ndarray of shape `(num_pubs, embedding_dim)` vectors for all publications in the atlas projection
+148
+149        kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on.
+150
+151    Returns:
+152        mag: a float representing the magnitude of the asymmetry metric.
+153    """
+154
+155    # We can't have the kernel larger than the number of valid publications
+156    if kernel_size > len(valid_indices):
+157        return np.nan
+158
+159    # Input
+160    cospsi = cospsi_matrix[idx][valid_indices]
+161    sorted_inds = np.argsort(cospsi)[::-1][:kernel_size]
+162    other_inds = publication_indices[valid_indices][sorted_inds]
+163    embedding = embeddings[idx]
+164    other_embeddings = embeddings[other_inds]
+165
+166    # Differences
+167    diff = embedding - other_embeddings
+168    diff_mag = np.linalg.norm(diff, axis=1)
+169    result = (diff / diff_mag[:, np.newaxis]).sum(axis=0)
+170    mag = np.linalg.norm(result)
+171
+172    return mag
+
+ + +

Estimate the asymmetry of a publication by calculating the difference +between that publication's projection and the other publications within +the kernel.

+ +
Arguments:
+ +
    +
  • idx: an int representing the index of the vector to calculate the measurement for.
  • +
  • cospsi_matrix: an np.ndarray of shape (num_pubs, num_pubs) representing pairwise cosine similarity scores for publication embeddings.
  • +
  • valid_indices: an np.ndarray of shape (num_valid_pubs) representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
  • +
  • publication_indices: an np.ndarray of shape (num_pubs,) representing indices of all publications in the atlas projection
  • +
  • embeddings: an np.ndarray of shape (num_pubs, embedding_dim) vectors for all publications in the atlas projection
  • +
  • kernel_size: an int representing the number of K nearest neighbors to calculate the measurement on.
  • +
+ +
Returns:
+ +
+

mag: a float representing the magnitude of the asymmetry metric.

+
+
+ + +
+
+ + \ No newline at end of file diff --git a/docs/sciterra/misc/utils.html b/docs/sciterra/misc/utils.html index 70aed76..86d6d4a 100644 --- a/docs/sciterra/misc/utils.html +++ b/docs/sciterra/misc/utils.html @@ -67,6 +67,12 @@

API Documentation

  • read_pickle
  • +
  • + get_verbose +
  • +
  • + custom_formatwarning +
  • @@ -199,7 +205,7 @@

    107 return _keep_trying 108 109 -110def chunk_ids(ids: list[str], call_size=2000): +110def chunk_ids(ids: list[str], call_size): 111 """Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.""" 112 # Break into chunks 113 assert ( # TODO: this seems like an irrelevant copypasta since we use SearchQuery @@ -225,6 +231,18 @@

    133 with open(fn, "rb") as f: 134 data = pickle.load(f) 135 return data +136 +137 +138# various helper functions +139 +140 +141def get_verbose(kwargs: dict): +142 return kwargs["verbose"] if "verbose" in kwargs else False +143 +144 +145def custom_formatwarning(msg, *args, **kwargs): +146 # ignore everything except the message +147 return str(msg) + "\n"

    @@ -385,13 +403,13 @@

    Example Usage:
    def - chunk_ids(ids: list[str], call_size=2000): + chunk_ids(ids: list[str], call_size):
    -
    111def chunk_ids(ids: list[str], call_size=2000):
    +            
    111def chunk_ids(ids: list[str], call_size):
     112    """Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate."""
     113    # Break into chunks
     114    assert (  # TODO: this seems like an irrelevant copypasta since we use SearchQuery
    @@ -453,6 +471,45 @@ 
    Example Usage:
    + +
    + +
    + + def + get_verbose(kwargs: dict): + + + +
    + +
    142def get_verbose(kwargs: dict):
    +143    return kwargs["verbose"] if "verbose" in kwargs else False
    +
    + + + + +
    +
    + +
    + + def + custom_formatwarning(msg, *args, **kwargs): + + + +
    + +
    146def custom_formatwarning(msg, *args, **kwargs):
    +147    # ignore everything except the message
    +148    return str(msg) + "\n"
    +
    + + + +
    + + + + + + +
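A hedged sketch of how the two helpers above might be used together; the noisy_step function and the warning text are invented for illustration:

    import warnings

    from sciterra.misc.utils import custom_formatwarning, get_verbose

    # Print warnings as bare messages, as custom_formatwarning above intends.
    warnings.formatwarning = custom_formatwarning

    def noisy_step(**kwargs):
        # get_verbose falls back to False when "verbose" is absent from kwargs.
        if get_verbose(kwargs):
            warnings.warn("verbose mode: extra diagnostics enabled")

    noisy_step()              # silent
    noisy_step(verbose=True)  # prints only the message text, no file/line prefix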
    +
    +

    +sciterra.vectorization.preprocessing

    + +

    Simple preprocessing of scientific abstracts prior to vectorization.

    +
    + + + + + +
     1"""Simple preprocessing of scientific abstracts prior to vectorization."""
    + 2
    + 3import spacy
    + 4
    + 5nlp = spacy.load("en_core_web_sm")
    + 6
    + 7# Another off the shelf simple tokenizer
    + 8from gensim.utils import simple_preprocess
    + 9
    +10
    +11def custom_preprocess(
    +12    document: str,
    +13    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
    +14) -> list[str]:
    +15    """Get all of the lemmas of the words in a document, filtering by POS.
    +16
    +17    Args:
    +18        document: a multi-sentence string
    +19
    +20        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    +21
    +22    Returns:
    +23        a list of the lemmatized, filtered words in the document
    +24
+25    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
    +26
    +27    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
    +28    """
    +29    return [
    +30        token.lemma_
    +31        for sent in nlp(document).sents
    +32        for token in sent
    +33        if token.pos_ in allowed_pos_tags
    +34    ]
    +
    + + +
    +
    +
    + nlp = +<spacy.lang.en.English object> + + +
    + + + + +
    +
    + +
    + + def + custom_preprocess( document: str, allowed_pos_tags: set = {'VERB', 'NOUN', 'ADJ'}) -> list[str]: + + + +
    + +
    12def custom_preprocess(
    +13    document: str,
    +14    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
    +15) -> list[str]:
    +16    """Get all of the lemmas of the words in a document, filtering by POS.
    +17
    +18    Args:
    +19        document: a multi-sentence string
    +20
    +21        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    +22
    +23    Returns:
    +24        a list of the lemmatized, filtered words in the document
    +25
+26    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
    +27
    +28    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
    +29    """
    +30    return [
    +31        token.lemma_
    +32        for sent in nlp(document).sents
    +33        for token in sent
    +34        if token.pos_ in allowed_pos_tags
    +35    ]
    +
    + + +

    Get all of the lemmas of the words in a document, filtering by POS.

    + +
    Arguments:
    + +
      +
    • document: a multi-sentence string
    • +
    • allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    • +
    + +
    Returns:
    + +
    +

    a list of the lemmatized, filtered words in the document

    +
    + +

    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

    + +

    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.

    +
    + + +
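A hedged usage sketch of custom_preprocess; the sample abstract and the indicated output are illustrative, the exact lemmas depend on spaCy's tagging, and the en_core_web_sm model must be downloaded before the module can be imported:

    from sciterra.vectorization.preprocessing import custom_preprocess

    abstract = "We measure the rotation curves of nearby galaxies and infer dark matter halos."
    tokens = custom_preprocess(abstract)
    # Roughly ['measure', 'rotation', 'curve', 'nearby', 'galaxy', 'infer', 'dark', 'matter', 'halo']:
    # lemmas of tokens tagged NOUN, VERB, or ADJ; everything else is dropped.
    print(tokens)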
    +
    + + \ No newline at end of file diff --git a/docs/sciterra/vectorization/projection.html b/docs/sciterra/vectorization/projection.html index 9aa33d3..862cae8 100644 --- a/docs/sciterra/vectorization/projection.html +++ b/docs/sciterra/vectorization/projection.html @@ -74,7 +74,7 @@

    API Documentation

    identifiers_to_embeddings
  • - identifier_to_embedding + identifiers_to_indices
  • @@ -82,6 +82,9 @@

    API Documentation

  • merge
  • +
  • + get_empty_projection +
  • @@ -103,97 +106,108 @@

    -
     1import numpy as np
    - 2
    - 3
    - 4class Projection:
    - 5    """Basic wrapper for document embeddings and helper methods."""
    - 6
    - 7    def __init__(
    - 8        self,
    - 9        identifier_to_index: dict[str, int],
    -10        index_to_identifier: tuple[str],
    -11        embeddings: np.ndarray,
    -12    ) -> None:
    -13        """Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.
    -14
    -15        Args:
    -16            identifiers_to_indices: a map from Publication identifiers to indices in the embedding matrix.
    -17
    -18            indices_to_identifiers: a map from embedding indices to Publication identifiers.
    -19
    -20            embeddings: ndarray of document embeddings of shape `(num_pubs, embedding_dim)`
    -21        """
    -22        self.identifier_to_index = identifier_to_index
    -23        self.index_to_identifier = index_to_identifier
    -24        self.embeddings = embeddings
    -25
    -26    def indices_to_identifiers(self, indices) -> list[str]:
    -27        """Retrieve the identifiers for a list of embedding matrix indices."""
    -28        return [self.index_to_identifier[index] for index in indices]
    -29
    -30    def identifiers_to_embeddings(self, identifiers: list[str]) -> np.ndarray:
    -31        """Retrieve the document embeddings for a list of identifiers."""
    -32        return [self.identifier_to_embedding(identifier) for identifier in identifiers]
    -33
    -34    def identifier_to_embedding(self, identifier: str) -> np.ndarray:
    -35        """Retrieve the document embedding of a Publication."""
    -36        return self.embeddings[self.identifier_to_index[identifier]]
    -37
    -38    def __len__(self) -> int:
    -39        return len(self.identifier_to_index)
    -40
    -41    def __eq__(self, __value: object) -> bool:
    -42        return (
    -43            np.array_equal(self.embeddings, __value.embeddings)
    -44            and self.identifier_to_index == __value.identifier_to_index
    -45            and self.index_to_identifier == __value.index_to_identifier
    -46        )
    -47
    -48
    -49######################################################################
    -50# Merge projections
    -51######################################################################
    -52
    -53
    -54def merge(proj_a: Projection, proj_b: Projection) -> Projection:
    -55    """Return the result of merging projection `proj_a` with projection `proj_b`.
    -56
    -57    NOTE: This is not a symmetric operation: it adds all embedding data contained in proj_a that is missing from proj_b. This means that the resulting projection can only be greater or equal in size to proj_a.
    -58    """
    -59    if proj_b is None or not len(proj_b):
    -60        return proj_a
    -61
    -62    # Get the data in the new projection missing from the old
    -63    indices_missing = []
    -64    identifiers_missing = []
    -65    for id, idx in proj_b.identifier_to_index.items():
    -66        if proj_a is None or id not in proj_a.identifier_to_index:
    -67            indices_missing.append(idx)
    -68            identifiers_missing.append(id)
    -69
    -70    # Get just the missing embeddings
    -71    embeddings_missing = np.array(
    -72        [
    -73            embedding
    -74            for idx, embedding in enumerate(proj_b.embeddings)
    -75            if idx in set(indices_missing)
    -76        ]
    -77    )
    -78
    -79    # Concatenate index mapping and embeddings
    -80    idx_to_ids_new = identifiers_missing
    -81    embeddings_new = embeddings_missing
    -82    if proj_a is not None:
    -83        idx_to_ids_new = list(proj_a.index_to_identifier) + idx_to_ids_new
    -84        embeddings_new = np.concatenate(proj_a.embeddings, embeddings_new)
    -85
    -86    # Return a new projection
    -87    return Projection(
    -88        identifier_to_index={id: idx for idx, id in enumerate(idx_to_ids_new)},
    -89        index_to_identifier=tuple(idx_to_ids_new),
    -90        embeddings=embeddings_new,
    -91    )
    +                        
      1import numpy as np
    +  2
    +  3
    +  4class Projection:
    +  5    """Basic wrapper for document embeddings and helper methods."""
    +  6
    +  7    def __init__(
    +  8        self,
    +  9        identifier_to_index: dict[str, int],
    + 10        index_to_identifier: tuple[str],
    + 11        embeddings: np.ndarray,
    + 12    ) -> None:
    + 13        """Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.
    + 14
    + 15        Args:
    + 16            identifier_to_index: a dict mapping Publication identifiers to indices in the embedding matrix.
    + 17
    + 18            index_to_identifier: a tuple mapping embedding indices to Publication identifiers.
    + 19
    + 20            embeddings: ndarray of document embeddings of shape `(num_pubs, embedding_dim)`
    + 21        """
    + 22        self.identifier_to_index = identifier_to_index
    + 23        self.index_to_identifier = index_to_identifier
    + 24        self.embeddings = embeddings
    + 25
    + 26    def indices_to_identifiers(self, indices) -> list[str]:
    + 27        """Retrieve the identifiers for a list of embedding matrix indices."""
    + 28        return [self.index_to_identifier[index] for index in indices]
    + 29
    + 30    def identifiers_to_embeddings(self, identifiers: list[str]) -> np.ndarray:
    + 31        """Retrieve the document embeddings for a list of identifiers."""
    + 32        return self.embeddings[self.identifiers_to_indices(identifiers)]
    + 33
    + 34    def identifiers_to_indices(self, identifiers: list[str]) -> np.ndarray:
    + 35        """Retrieve the embedding indices for a list of identifiers."""
    + 36        return np.array(
    + 37            [self.identifier_to_index[identifier] for identifier in identifiers]
    + 38        )
    + 39
    + 40    def __len__(self) -> int:
    + 41        return len(self.identifier_to_index)
    + 42
    + 43    def __eq__(self, __value: object) -> bool:
    + 44        return (
    + 45            np.array_equal(self.embeddings, __value.embeddings)
    + 46            and self.identifier_to_index == __value.identifier_to_index
    + 47            and self.index_to_identifier == __value.index_to_identifier
    + 48        )
    + 49
    + 50
    + 51######################################################################
    + 52# Merge projections
    + 53######################################################################
    + 54
    + 55
    + 56def merge(proj_a: Projection, proj_b: Projection) -> Projection:
    + 57    """Return the result of merging projection `proj_a` with projection `proj_b`.
    + 58
    + 59    This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater or equal in size to proj_a.
    + 60    """
    + 61    if proj_b is None or not len(proj_b):
    + 62        return proj_a
    + 63
    + 64    # Get the data in the new projection missing from the old
    + 65    indices_missing = []
    + 66    identifiers_missing = []
    + 67    for id, idx in proj_b.identifier_to_index.items():
    + 68        if proj_a is None or id not in proj_a.identifier_to_index:
    + 69            indices_missing.append(idx)
    + 70            identifiers_missing.append(id)
    + 71
    + 72    # Get just the missing embeddings
    + 73    embeddings_missing = np.array(
    + 74        [
    + 75            embedding
    + 76            for idx, embedding in enumerate(proj_b.embeddings)
    + 77            if idx in set(indices_missing)
    + 78        ]
    + 79    )
    + 80
    + 81    # Concatenate index mapping and embeddings
    + 82    idx_to_ids_new = identifiers_missing
    + 83    embeddings_new = embeddings_missing
    + 84    if proj_a is not None:
    + 85        idx_to_ids_new = list(proj_a.index_to_identifier) + idx_to_ids_new
    + 86        embeddings_new = np.concatenate([proj_a.embeddings, embeddings_new])
    + 87
    + 88    # Return a new projection
    + 89    return Projection(
    + 90        identifier_to_index={id: idx for idx, id in enumerate(idx_to_ids_new)},
    + 91        index_to_identifier=tuple(idx_to_ids_new),
    + 92        embeddings=embeddings_new,
    + 93    )
    + 94
    + 95
    + 96def get_empty_projection() -> Projection:
    + 97    """Construct a Projection with no data (but is not None)."""
    + 98    return Projection(
    + 99        identifier_to_index={},
    +100        index_to_identifier=(),
    +101        embeddings=np.array([[]]),  # 2D
    +102    )
     
    @@ -221,9 +235,9 @@

    14 """Construct a Projection object, a bidirectional mapping from identifiers to document embeddings. 15 16 Args: -17 identifiers_to_indices: a map from Publication identifiers to indices in the embedding matrix. +17 identifier_to_index: a dict mapping Publication identifiers to indices in the embedding matrix. 18 -19 indices_to_identifiers: a map from embedding indices to Publication identifiers. +19 index_to_identifier: a tuple mapping embedding indices to Publication identifiers. 20 21 embeddings: ndarray of document embeddings of shape `(num_pubs, embedding_dim)` 22 """ @@ -237,21 +251,23 @@

    30 31 def identifiers_to_embeddings(self, identifiers: list[str]) -> np.ndarray: 32 """Retrieve the document embeddings for a list of identifiers.""" -33 return [self.identifier_to_embedding(identifier) for identifier in identifiers] +33 return self.embeddings[self.identifiers_to_indices(identifiers)] 34 -35 def identifier_to_embedding(self, identifier: str) -> np.ndarray: -36 """Retrieve the document embedding of a Publication.""" -37 return self.embeddings[self.identifier_to_index[identifier]] -38 -39 def __len__(self) -> int: -40 return len(self.identifier_to_index) -41 -42 def __eq__(self, __value: object) -> bool: -43 return ( -44 np.array_equal(self.embeddings, __value.embeddings) -45 and self.identifier_to_index == __value.identifier_to_index -46 and self.index_to_identifier == __value.index_to_identifier -47 ) +35 def identifiers_to_indices(self, identifiers: list[str]) -> np.ndarray: +36 """Retrieve the embedding indices for a list of identifiers.""" +37 return np.array( +38 [self.identifier_to_index[identifier] for identifier in identifiers] +39 ) +40 +41 def __len__(self) -> int: +42 return len(self.identifier_to_index) +43 +44 def __eq__(self, __value: object) -> bool: +45 return ( +46 np.array_equal(self.embeddings, __value.embeddings) +47 and self.identifier_to_index == __value.identifier_to_index +48 and self.index_to_identifier == __value.index_to_identifier +49 )

    @@ -278,9 +294,9 @@

    14 """Construct a Projection object, a bidirectional mapping from identifiers to document embeddings. 15 16 Args: -17 identifiers_to_indices: a map from Publication identifiers to indices in the embedding matrix. +17 identifier_to_index: a dict mapping Publication identifiers to indices in the embedding matrix. 18 -19 indices_to_identifiers: a map from embedding indices to Publication identifiers. +19 index_to_identifier: a tuple mapping embedding indices to Publication identifiers. 20 21 embeddings: ndarray of document embeddings of shape `(num_pubs, embedding_dim)` 22 """ @@ -295,8 +311,8 @@

    Arguments:
      -
    • identifiers_to_indices: a map from Publication identifiers to indices in the embedding matrix.
    • -
    • indices_to_identifiers: a map from embedding indices to Publication identifiers.
    • +
    • identifier_to_index: a dict mapping Publication identifiers to indices in the embedding matrix.
    • +
    • index_to_identifier: a tuple mapping embedding indices to Publication identifiers.
    • embeddings: ndarray of document embeddings of shape (num_pubs, embedding_dim)

    @@ -371,7 +387,7 @@
    Arguments:
    31    def identifiers_to_embeddings(self, identifiers: list[str]) -> np.ndarray:
     32        """Retrieve the document embeddings for a list of identifiers."""
    -33        return [self.identifier_to_embedding(identifier) for identifier in identifiers]
    +33        return self.embeddings[self.identifiers_to_indices(identifiers)]
     
    @@ -380,24 +396,26 @@
    Arguments:
    -
    - +
    +
    def - identifier_to_embedding(self, identifier: str) -> numpy.ndarray: + identifiers_to_indices(self, identifiers: list[str]) -> numpy.ndarray: - +
    - -
    35    def identifier_to_embedding(self, identifier: str) -> np.ndarray:
    -36        """Retrieve the document embedding of a Publication."""
    -37        return self.embeddings[self.identifier_to_index[identifier]]
    +    
    +            
    35    def identifiers_to_indices(self, identifiers: list[str]) -> np.ndarray:
    +36        """Retrieve the embedding indices for a list of identifiers."""
    +37        return np.array(
    +38            [self.identifier_to_index[identifier] for identifier in identifiers]
    +39        )
     
    -

    Retrieve the document embedding of a Publication.

    +

    Retrieve the embedding indices for a list of identifiers.

    @@ -414,50 +432,76 @@
    Arguments:
    -
    55def merge(proj_a: Projection, proj_b: Projection) -> Projection:
    -56    """Return the result of merging projection `proj_a` with projection `proj_b`.
    -57
    -58    NOTE: This is not a symmetric operation: it adds all embedding data contained in proj_a that is missing from proj_b. This means that the resulting projection can only be greater or equal in size to proj_a.
    -59    """
    -60    if proj_b is None or not len(proj_b):
    -61        return proj_a
    -62
    -63    # Get the data in the new projection missing from the old
    -64    indices_missing = []
    -65    identifiers_missing = []
    -66    for id, idx in proj_b.identifier_to_index.items():
    -67        if proj_a is None or id not in proj_a.identifier_to_index:
    -68            indices_missing.append(idx)
    -69            identifiers_missing.append(id)
    -70
    -71    # Get just the missing embeddings
    -72    embeddings_missing = np.array(
    -73        [
    -74            embedding
    -75            for idx, embedding in enumerate(proj_b.embeddings)
    -76            if idx in set(indices_missing)
    -77        ]
    -78    )
    -79
    -80    # Concatenate index mapping and embeddings
    -81    idx_to_ids_new = identifiers_missing
    -82    embeddings_new = embeddings_missing
    -83    if proj_a is not None:
    -84        idx_to_ids_new = list(proj_a.index_to_identifier) + idx_to_ids_new
    -85        embeddings_new = np.concatenate(proj_a.embeddings, embeddings_new)
    -86
    -87    # Return a new projection
    -88    return Projection(
    -89        identifier_to_index={id: idx for idx, id in enumerate(idx_to_ids_new)},
    -90        index_to_identifier=tuple(idx_to_ids_new),
    -91        embeddings=embeddings_new,
    -92    )
    +            
    57def merge(proj_a: Projection, proj_b: Projection) -> Projection:
    +58    """Return the result of merging projection `proj_a` with projection `proj_b`.
    +59
    +60    This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater or equal in size to proj_a.
    +61    """
    +62    if proj_b is None or not len(proj_b):
    +63        return proj_a
    +64
    +65    # Get the data in the new projection missing from the old
    +66    indices_missing = []
    +67    identifiers_missing = []
    +68    for id, idx in proj_b.identifier_to_index.items():
    +69        if proj_a is None or id not in proj_a.identifier_to_index:
    +70            indices_missing.append(idx)
    +71            identifiers_missing.append(id)
    +72
    +73    # Get just the missing embeddings
    +74    embeddings_missing = np.array(
    +75        [
    +76            embedding
    +77            for idx, embedding in enumerate(proj_b.embeddings)
    +78            if idx in set(indices_missing)
    +79        ]
    +80    )
    +81
    +82    # Concatenate index mapping and embeddings
    +83    idx_to_ids_new = identifiers_missing
    +84    embeddings_new = embeddings_missing
    +85    if proj_a is not None:
    +86        idx_to_ids_new = list(proj_a.index_to_identifier) + idx_to_ids_new
    +87        embeddings_new = np.concatenate([proj_a.embeddings, embeddings_new])
    +88
    +89    # Return a new projection
    +90    return Projection(
    +91        identifier_to_index={id: idx for idx, id in enumerate(idx_to_ids_new)},
    +92        index_to_identifier=tuple(idx_to_ids_new),
    +93        embeddings=embeddings_new,
    +94    )
     

    Return the result of merging projection proj_a with projection proj_b.

    -

    NOTE: This is not a symmetric operation: it adds all embedding data contained in proj_a that is missing from proj_b. This means that the resulting projection can only be greater or equal in size to proj_a.

    +

    This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater or equal in size to proj_a.

    +
    + + + +
    + +
    + + def + get_empty_projection() -> sciterra.vectorization.projection.Projection: + + + +
    + +
     97def get_empty_projection() -> Projection:
    + 98    """Construct a Projection with no data (but is not None)."""
    + 99    return Projection(
    +100        identifier_to_index={},
    +101        index_to_identifier=(),
    +102        embeddings=np.array([[]]),  # 2D
    +103    )
    +
    + + +

    Construct a Projection with no data (but is not None).
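A hedged end-to-end sketch of the Projection helpers documented on this page; the identifiers and embeddings are toy values:

    import numpy as np

    from sciterra.vectorization.projection import Projection, get_empty_projection, merge

    proj_a = Projection(
        identifier_to_index={"paper1": 0, "paper2": 1},
        index_to_identifier=("paper1", "paper2"),
        embeddings=np.eye(2, 4),  # 2 documents, embedding_dim = 4
    )
    proj_b = Projection(
        identifier_to_index={"paper2": 0, "paper3": 1},
        index_to_identifier=("paper2", "paper3"),
        embeddings=np.ones((2, 4)),
    )

    merged = merge(proj_a, proj_b)  # only "paper3" is new, so the result has 3 entries
    assert len(merged) == 3
    assert merged.identifiers_to_indices(["paper3"]).tolist() == [2]
    assert merged.identifiers_to_embeddings(["paper1", "paper3"]).shape == (2, 4)
    assert len(get_empty_projection()) == 0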

    diff --git a/docs/sciterra/vectorization/sbert.html b/docs/sciterra/vectorization/sbert.html new file mode 100644 index 0000000..e29fc20 --- /dev/null +++ b/docs/sciterra/vectorization/sbert.html @@ -0,0 +1,601 @@ + + + + + + + sciterra.vectorization.sbert API documentation + + + + + + + + + + + + + +
    +
    +

    +sciterra.vectorization.sbert

    + +

    We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight/fast version of one of the top-performing models.

    + + + +
    +

    sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models. + HF: https://huggingface.co/sentence-transformers

    +
    +
    + + + + + +
     1"""We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight/fast version of one the top-performing model.
    + 2
    + 3Links:
    + 4    sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.
    + 5    HF: https://huggingface.co/sentence-transformers
    + 6"""
    + 7
    + 8import torch
    + 9import numpy as np
    +10from .vectorizer import Vectorizer
    +11from tqdm import tqdm
    +12
    +13from sentence_transformers import SentenceTransformer
    +14
    +15MPS_DEVICE = torch.device("mps")
    +16
    +17# MODEL_PATH = "bert-base-nli-mean-tokens" # NOTE: while Liu and Xu (2020) use this model in a metascience context, huggingface recommends a more recent sentence transformer.
    +18MODEL_PATH = "all-MiniLM-L6-v2"  # All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. Listed as rank 50 on https://huggingface.co/spaces/mteb/leaderboard on 10/11/2023 with an average of 56; rank 1 achieved 64, bert-base-uncased achieved 34; GPT embedding ada-002 achieved 60.
    +19EMBEDDING_DIM = 384
    +20MAX_SEQ_LENGTH = 256
    +21
    +22
    +23class SBERTVectorizer(Vectorizer):
    +24    def __init__(self, device="cuda", **kwargs) -> None:
    +25        # Get the model
    +26        self.model = SentenceTransformer(MODEL_PATH)
    +27
    +28        # set device to GPU
    +29        if device == "mps":
    +30            self.device = MPS_DEVICE
    +31        elif device == "cuda":
    +32            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    +33
    +34        print(f"Using device: {self.device}.")
    +35        self.model.to(self.device)
    +36
    +37        # Put the model in "evaluation" mode
    +38        self.model.eval()
    +39        super().__init__()
    +40
    +41    def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray:
    +42        """Embed a list of documents (raw text) into SBERT vectors, by batching.
    +43
    +44        Args:
    +45            docs: the documents to embed.
    +46
    +47        Returns:
    +48            a numpy array of shape `(num_documents, 384)`
    +49        """
    +50
    +51        embeddings = []
    +52
    +53        pbar = tqdm(
    +54            total=len(docs),
    +55            desc="embedding documents",
    +56            leave=True,
    +57        )
    +58
    +59        for i in range(0, len(docs), batch_size):
    +60            batch = docs[i : min(len(docs), i + batch_size)]
    +61
    +62            # no need to convert anything or dig inside model for outputs
    +63            batched_embeddings = self.model.encode(batch)
    +64
    +65            # Collect batched embeddings
    +66            embeddings.extend(batched_embeddings)
    +67
    +68            pbar.update(batch_size)
    +69        pbar.close()
    +70
    +71        # We don't have to deal with OOV, so we always return full list of ids
    +72        return {
    +73            "embeddings": np.array(embeddings),
    +74            "indices": np.arange(len(embeddings)),
    +75        }
    +
    + + +
    +
    +
    + MPS_DEVICE = +device(type='mps') + + +
    + + + + +
    +
    +
    + MODEL_PATH = +'all-MiniLM-L6-v2' + + +
    + + + + +
    +
    +
    + EMBEDDING_DIM = +384 + + +
    + + + + +
    +
    +
    + MAX_SEQ_LENGTH = +256 + + +
    + + + + +
    +
    + +
    + + class + SBERTVectorizer(sciterra.vectorization.vectorizer.Vectorizer): + + + +
    + +
    24class SBERTVectorizer(Vectorizer):
    +25    def __init__(self, device="cuda", **kwargs) -> None:
    +26        # Get the model
    +27        self.model = SentenceTransformer(MODEL_PATH)
    +28
    +29        # set device to GPU
    +30        if device == "mps":
    +31            self.device = MPS_DEVICE
    +32        elif device == "cuda":
    +33            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    +34
    +35        print(f"Using device: {self.device}.")
    +36        self.model.to(self.device)
    +37
    +38        # Put the model in "evaluation" mode
    +39        self.model.eval()
    +40        super().__init__()
    +41
    +42    def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray:
    +43        """Embed a list of documents (raw text) into SBERT vectors, by batching.
    +44
    +45        Args:
    +46            docs: the documents to embed.
    +47
    +48        Returns:
    +49            a numpy array of shape `(num_documents, 384)`
    +50        """
    +51
    +52        embeddings = []
    +53
    +54        pbar = tqdm(
    +55            total=len(docs),
    +56            desc="embedding documents",
    +57            leave=True,
    +58        )
    +59
    +60        for i in range(0, len(docs), batch_size):
    +61            batch = docs[i : min(len(docs), i + batch_size)]
    +62
    +63            # no need to convert anything or dig inside model for outputs
    +64            batched_embeddings = self.model.encode(batch)
    +65
    +66            # Collect batched embeddings
    +67            embeddings.extend(batched_embeddings)
    +68
    +69            pbar.update(batch_size)
    +70        pbar.close()
    +71
    +72        # We don't have to deal with OOV, so we always return full list of ids
    +73        return {
    +74            "embeddings": np.array(embeddings),
    +75            "indices": np.arange(len(embeddings)),
    +76        }
    +
    + + + + +
    + +
    + + SBERTVectorizer(device='cuda', **kwargs) + + + +
    + +
    25    def __init__(self, device="cuda", **kwargs) -> None:
    +26        # Get the model
    +27        self.model = SentenceTransformer(MODEL_PATH)
    +28
    +29        # set device to GPU
    +30        if device == "mps":
    +31            self.device = MPS_DEVICE
    +32        elif device == "cuda":
    +33            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    +34
    +35        print(f"Using device: {self.device}.")
    +36        self.model.to(self.device)
    +37
    +38        # Put the model in "evaluation" mode
    +39        self.model.eval()
    +40        super().__init__()
    +
    + + + + +
    +
    +
    + model + + +
    + + + + +
    +
    + +
    + + def + embed_documents(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray: + + + +
    + +
    42    def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray:
    +43        """Embed a list of documents (raw text) into SBERT vectors, by batching.
    +44
    +45        Args:
    +46            docs: the documents to embed.
    +47
    +48        Returns:
    +49            a numpy array of shape `(num_documents, 384)`
    +50        """
    +51
    +52        embeddings = []
    +53
    +54        pbar = tqdm(
    +55            total=len(docs),
    +56            desc="embedding documents",
    +57            leave=True,
    +58        )
    +59
    +60        for i in range(0, len(docs), batch_size):
    +61            batch = docs[i : min(len(docs), i + batch_size)]
    +62
    +63            # no need to convert anything or dig inside model for outputs
    +64            batched_embeddings = self.model.encode(batch)
    +65
    +66            # Collect batched embeddings
    +67            embeddings.extend(batched_embeddings)
    +68
    +69            pbar.update(batch_size)
    +70        pbar.close()
    +71
    +72        # We don't have to deal with OOV, so we always return full list of ids
    +73        return {
    +74            "embeddings": np.array(embeddings),
    +75            "indices": np.arange(len(embeddings)),
    +76        }
    +
    + + +

    Embed a list of documents (raw text) into SBERT vectors, by batching.

    + +
    Arguments:
    + +
      +
    • docs: the documents to embed.
    • +
    + +
    Returns:
    + +
    +

    a numpy array of shape (num_documents, 384)

    +
    +
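A hedged usage sketch for SBERTVectorizer; it requires the sentence-transformers dependency and downloads the all-MiniLM-L6-v2 weights on first use. Note that, per the source listing above, embed_documents returns a dict of embeddings and indices even though its annotation and docstring still say np.ndarray:

    from sciterra.vectorization.sbert import EMBEDDING_DIM, SBERTVectorizer

    vectorizer = SBERTVectorizer(device="cuda")  # falls back to CPU if CUDA is unavailable
    result = vectorizer.embed_documents(
        ["An abstract about galaxy formation.", "An abstract about sentence embeddings."]
    )
    assert result["embeddings"].shape == (2, EMBEDDING_DIM)  # EMBEDDING_DIM == 384
    assert result["indices"].tolist() == [0, 1]              # no out-of-vocabulary drops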
    + + +
    +
    +
    + + \ No newline at end of file diff --git a/docs/sciterra/vectorization/scibert.html b/docs/sciterra/vectorization/scibert.html index 6f79168..43c8363 100644 --- a/docs/sciterra/vectorization/scibert.html +++ b/docs/sciterra/vectorization/scibert.html @@ -58,6 +58,9 @@

    API Documentation

  • MODEL_PATH
  • +
  • + EMBEDDING_DIM +
  • SciBERTVectorizer
      @@ -121,96 +124,104 @@ 12from tqdm import tqdm 13 14from transformers import BertTokenizerFast, AutoModelForSequenceClassification - 15 - 16MPS_DEVICE = torch.device("mps") - 17 - 18# the SciBERT pretrained model path from Allen AI repo - 19MODEL_PATH = "allenai/scibert_scivocab_uncased" + 15from transformers import logging + 16 + 17logging.set_verbosity(logging.ERROR) # Silence warnings about training SCIBERT + 18 + 19MPS_DEVICE = torch.device("mps") 20 - 21 - 22class SciBERTVectorizer(Vectorizer): - 23 def __init__(self, device="cuda") -> None: - 24 # Get tokenizer - 25 self.tokenizer = BertTokenizerFast.from_pretrained( - 26 MODEL_PATH, - 27 do_lower_case=True, - 28 model_max_length=512, # I shouldn't have to pass this but I do - 29 ) - 30 # Get the model - 31 self.model = AutoModelForSequenceClassification.from_pretrained( - 32 pretrained_model_name_or_path=MODEL_PATH, - 33 output_attentions=False, - 34 output_hidden_states=True, - 35 ) - 36 - 37 # set device to GPU - 38 if device == "mps": - 39 self.device = MPS_DEVICE - 40 elif device == "cuda": - 41 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - 42 - 43 print(f"Using device: {self.device}.") - 44 self.model.to(self.device) - 45 - 46 # Put the model in "evaluation" mode - 47 self.model.eval() - 48 super().__init__() - 49 - 50 def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray: - 51 """Embed a list of documents (raw text) into SciBERT vectors, by batching. - 52 - 53 Args: - 54 docs: the documents to embed. - 55 - 56 Returns: - 57 a numpy array of shape `(num_documents, 768)` - 58 """ - 59 - 60 embeddings = [] - 61 with tqdm( + 21# the SciBERT pretrained model path from Allen AI repo + 22MODEL_PATH = "allenai/scibert_scivocab_uncased" + 23EMBEDDING_DIM = 768 + 24 + 25 + 26class SciBERTVectorizer(Vectorizer): + 27 def __init__(self, device="cuda", **kwargs) -> None: + 28 # Get tokenizer + 29 # TODO: does this include the SCIVOCAB or BASEVOCAB? + 30 self.tokenizer = BertTokenizerFast.from_pretrained( + 31 MODEL_PATH, + 32 do_lower_case=True, + 33 model_max_length=512, # I shouldn't have to pass this but I do + 34 ) + 35 # Get the model + 36 self.model = AutoModelForSequenceClassification.from_pretrained( + 37 pretrained_model_name_or_path=MODEL_PATH, + 38 output_attentions=False, + 39 output_hidden_states=True, + 40 ) + 41 + 42 # set device to GPU + 43 if device == "mps": + 44 self.device = MPS_DEVICE + 45 elif device == "cuda": + 46 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + 47 + 48 print(f"Using device: {self.device}.") + 49 self.model.to(self.device) + 50 + 51 # Put the model in "evaluation" mode + 52 self.model.eval() + 53 super().__init__() + 54 + 55 def embed_documents(self, docs: list[str], batch_size: int = 64) -> dict[str, np.ndarray]: + 56 """Embed a list of documents (raw text) into SciBERT vectors, by batching. 
+ 57 """ + 58 + 59 embeddings = [] + 60 + 61 pbar = tqdm( 62 total=len(docs), 63 desc="embedding documents", - 64 ) as pbar: - 65 for i in range(0, len(docs), batch_size): - 66 batch = docs[i : min(len(docs), i + batch_size)] - 67 - 68 # Tokenize the batch - 69 encoded = self.tokenizer( - 70 batch, - 71 add_special_tokens=True, - 72 padding=True, # pad up to length of longest abstract - 73 truncation=True, # max length 512 chars, unfortunately - 74 return_tensors="pt", - 75 ) - 76 # each encoded item of shape [64, 512] - 77 assert encoded["input_ids"].size()[-1] <= 512 - 78 - 79 # Put data on GPU - 80 for k, v in encoded.items(): - 81 encoded[k] = v.to(self.device) - 82 - 83 # Run the text through SciBERT, and collect all of the hidden states produced - 84 # from all 12 layers. - 85 with torch.no_grad(): - 86 _, encoded_layers = self.model( # discard logits - 87 **encoded, - 88 return_dict=False, - 89 ) - 90 - 91 # Extract the embeddings - 92 # index last (13th) BERT layer before the classifier - 93 final_hidden_state = encoded_layers[12] # (batch_size, 256, 768) - 94 # index first token of sequence, [CLS], for our document embeddings - 95 batched_embeddings = final_hidden_state[:, 0, :] # (batch_size, 768) - 96 - 97 # Move to the CPU and convert to numpy ndarray - 98 batched_embeddings = batched_embeddings.detach().cpu().numpy() - 99 -100 # Collect batched embeddings -101 embeddings.extend(batched_embeddings) -102 pbar.update(batch_size) -103 -104 return np.array(embeddings) + 64 leave=True, + 65 ) + 66 + 67 for i in range(0, len(docs), batch_size): + 68 batch = docs[i : min(len(docs), i + batch_size)] + 69 + 70 # Tokenize the batch + 71 encoded = self.tokenizer( + 72 batch, + 73 add_special_tokens=True, + 74 padding=True, # pad up to length of longest abstract + 75 truncation=True, # max length 512 chars, unfortunately + 76 return_tensors="pt", + 77 ) + 78 # each encoded item of shape [64, 512] + 79 assert encoded["input_ids"].size()[-1] <= 512 + 80 + 81 # Put data on GPU + 82 for k, v in encoded.items(): + 83 encoded[k] = v.to(self.device) + 84 + 85 # Run the text through SciBERT, + 86 # collecting all of the hidden states produced from all 12 layers. + 87 with torch.no_grad(): + 88 _, encoded_layers = self.model( # discard logits + 89 **encoded, + 90 return_dict=False, + 91 ) + 92 + 93 # Extract the embeddings + 94 # index last (13th) BERT layer before the classifier + 95 final_hidden_state = encoded_layers[12] # [batch_size, 256, 768] + 96 # index first token of sequence, [CLS], for our document embeddings + 97 batched_embeddings = final_hidden_state[:, 0, :] # [batch_size, 768] + 98 + 99 # Move to the CPU and convert to numpy ndarray +100 batched_embeddings = batched_embeddings.detach().cpu().numpy() +101 +102 # Collect batched embeddings +103 embeddings.extend(batched_embeddings) +104 +105 pbar.update(batch_size) +106 pbar.close() +107 +108 # We don't have to deal with OOV, so we always return full list of ids +109 return { +110 "embeddings": np.array(embeddings), +111 "indices": np.arange(len(embeddings)), +112 }
  • @@ -238,6 +249,18 @@ + +
    +
    + EMBEDDING_DIM = +768 + + +
    + + + +
    @@ -250,89 +273,93 @@
    -
     23class SciBERTVectorizer(Vectorizer):
    - 24    def __init__(self, device="cuda") -> None:
    - 25        # Get tokenizer
    - 26        self.tokenizer = BertTokenizerFast.from_pretrained(
    - 27            MODEL_PATH,
    - 28            do_lower_case=True,
    - 29            model_max_length=512,  # I shouldn't have to pass this but I do
    - 30        )
    - 31        # Get the model
    - 32        self.model = AutoModelForSequenceClassification.from_pretrained(
    - 33            pretrained_model_name_or_path=MODEL_PATH,
    - 34            output_attentions=False,
    - 35            output_hidden_states=True,
    - 36        )
    - 37
    - 38        # set device to GPU
    - 39        if device == "mps":
    - 40            self.device = MPS_DEVICE
    - 41        elif device == "cuda":
    - 42            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    - 43
    - 44        print(f"Using device: {self.device}.")
    - 45        self.model.to(self.device)
    - 46
    - 47        # Put the model in "evaluation" mode
    - 48        self.model.eval()
    - 49        super().__init__()
    - 50
    - 51    def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray:
    - 52        """Embed a list of documents (raw text) into SciBERT vectors, by batching.
    - 53
    - 54        Args:
    - 55            docs: the documents to embed.
    - 56
    - 57        Returns:
    - 58            a numpy array of shape `(num_documents, 768)`
    - 59        """
    - 60
    - 61        embeddings = []
    - 62        with tqdm(
    +            
     27class SciBERTVectorizer(Vectorizer):
    + 28    def __init__(self, device="cuda", **kwargs) -> None:
    + 29        # Get tokenizer
    + 30        # TODO: does this include the SCIVOCAB or BASEVOCAB?
    + 31        self.tokenizer = BertTokenizerFast.from_pretrained(
    + 32            MODEL_PATH,
    + 33            do_lower_case=True,
    + 34            model_max_length=512,  # I shouldn't have to pass this but I do
    + 35        )
    + 36        # Get the model
    + 37        self.model = AutoModelForSequenceClassification.from_pretrained(
    + 38            pretrained_model_name_or_path=MODEL_PATH,
    + 39            output_attentions=False,
    + 40            output_hidden_states=True,
    + 41        )
    + 42
    + 43        # set device to GPU
    + 44        if device == "mps":
    + 45            self.device = MPS_DEVICE
    + 46        elif device == "cuda":
    + 47            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    + 48
    + 49        print(f"Using device: {self.device}.")
    + 50        self.model.to(self.device)
    + 51
    + 52        # Put the model in "evaluation" mode
    + 53        self.model.eval()
    + 54        super().__init__()
    + 55
    + 56    def embed_documents(self, docs: list[str], batch_size: int = 64) -> dict[str, np.ndarray]:
    + 57        """Embed a list of documents (raw text) into SciBERT vectors, by batching.
    + 58        """
    + 59
    + 60        embeddings = []
    + 61
    + 62        pbar = tqdm(
      63            total=len(docs),
      64            desc="embedding documents",
    - 65        ) as pbar:
    - 66            for i in range(0, len(docs), batch_size):
    - 67                batch = docs[i : min(len(docs), i + batch_size)]
    - 68
    - 69                # Tokenize the batch
    - 70                encoded = self.tokenizer(
    - 71                    batch,
    - 72                    add_special_tokens=True,
    - 73                    padding=True,  # pad up to length of longest abstract
    - 74                    truncation=True,  # max length 512 chars, unfortunately
    - 75                    return_tensors="pt",
    - 76                )
    - 77                # each encoded item of shape [64, 512]
    - 78                assert encoded["input_ids"].size()[-1] <= 512
    - 79
    - 80                # Put data on GPU
    - 81                for k, v in encoded.items():
    - 82                    encoded[k] = v.to(self.device)
    - 83
    - 84                # Run the text through SciBERT, and collect all of the hidden states produced
    - 85                # from all 12 layers.
    - 86                with torch.no_grad():
    - 87                    _, encoded_layers = self.model(  # discard logits
    - 88                        **encoded,
    - 89                        return_dict=False,
    - 90                    )
    - 91
    - 92                # Extract the embeddings
    - 93                # index last (13th) BERT layer before the classifier
    - 94                final_hidden_state = encoded_layers[12]  # (batch_size, 256, 768)
    - 95                # index first token of sequence, [CLS], for our document embeddings
    - 96                batched_embeddings = final_hidden_state[:, 0, :]  # (batch_size, 768)
    - 97
    - 98                # Move to the CPU and convert to numpy ndarray
    - 99                batched_embeddings = batched_embeddings.detach().cpu().numpy()
    -100
    -101                # Collect batched embeddings
    -102                embeddings.extend(batched_embeddings)
    -103                pbar.update(batch_size)
    -104
    -105        return np.array(embeddings)
    + 65            leave=True,
    + 66        )
    + 67
    + 68        for i in range(0, len(docs), batch_size):
    + 69            batch = docs[i : min(len(docs), i + batch_size)]
    + 70
    + 71            # Tokenize the batch
    + 72            encoded = self.tokenizer(
    + 73                batch,
    + 74                add_special_tokens=True,
    + 75                padding=True,  # pad up to length of longest abstract
    + 76                truncation=True,  # max length 512 chars, unfortunately
    + 77                return_tensors="pt",
    + 78            )
    + 79            # each encoded item of shape [64, 512]
    + 80            assert encoded["input_ids"].size()[-1] <= 512
    + 81
    + 82            # Put data on GPU
    + 83            for k, v in encoded.items():
    + 84                encoded[k] = v.to(self.device)
    + 85
    + 86            # Run the text through SciBERT,
    + 87            # collecting all of the hidden states produced from all 12 layers.
    + 88            with torch.no_grad():
    + 89                _, encoded_layers = self.model(  # discard logits
    + 90                    **encoded,
    + 91                    return_dict=False,
    + 92                )
    + 93
    + 94            # Extract the embeddings
    + 95            # index last (13th) BERT layer before the classifier
    + 96            final_hidden_state = encoded_layers[12]  # [batch_size, 256, 768]
    + 97            # index first token of sequence, [CLS], for our document embeddings
    + 98            batched_embeddings = final_hidden_state[:, 0, :]  # [batch_size, 768]
    + 99
    +100            # Move to the CPU and convert to numpy ndarray
    +101            batched_embeddings = batched_embeddings.detach().cpu().numpy()
    +102
    +103            # Collect batched embeddings
    +104            embeddings.extend(batched_embeddings)
    +105
    +106            pbar.update(batch_size)
    +107        pbar.close()
    +108
    +109        # We don't have to deal with OOV, so we always return full list of ids
    +110        return {
    +111            "embeddings": np.array(embeddings),
    +112            "indices": np.arange(len(embeddings)),
    +113        }
     
    @@ -342,38 +369,39 @@
    - SciBERTVectorizer(device='cuda') + SciBERTVectorizer(device='cuda', **kwargs)
    -
    24    def __init__(self, device="cuda") -> None:
    -25        # Get tokenizer
    -26        self.tokenizer = BertTokenizerFast.from_pretrained(
    -27            MODEL_PATH,
    -28            do_lower_case=True,
    -29            model_max_length=512,  # I shouldn't have to pass this but I do
    -30        )
    -31        # Get the model
    -32        self.model = AutoModelForSequenceClassification.from_pretrained(
    -33            pretrained_model_name_or_path=MODEL_PATH,
    -34            output_attentions=False,
    -35            output_hidden_states=True,
    -36        )
    -37
    -38        # set device to GPU
    -39        if device == "mps":
    -40            self.device = MPS_DEVICE
    -41        elif device == "cuda":
    -42            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    -43
    -44        print(f"Using device: {self.device}.")
    -45        self.model.to(self.device)
    -46
    -47        # Put the model in "evaluation" mode
    -48        self.model.eval()
    -49        super().__init__()
    +            
    28    def __init__(self, device="cuda", **kwargs) -> None:
    +29        # Get tokenizer
    +30        # TODO: does this include the SCIVOCAB or BASEVOCAB?
    +31        self.tokenizer = BertTokenizerFast.from_pretrained(
    +32            MODEL_PATH,
    +33            do_lower_case=True,
    +34            model_max_length=512,  # I shouldn't have to pass this but I do
    +35        )
    +36        # Get the model
    +37        self.model = AutoModelForSequenceClassification.from_pretrained(
    +38            pretrained_model_name_or_path=MODEL_PATH,
    +39            output_attentions=False,
    +40            output_hidden_states=True,
    +41        )
    +42
    +43        # set device to GPU
    +44        if device == "mps":
    +45            self.device = MPS_DEVICE
    +46        elif device == "cuda":
    +47            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    +48
    +49        print(f"Using device: {self.device}.")
    +50        self.model.to(self.device)
    +51
    +52        # Put the model in "evaluation" mode
    +53        self.model.eval()
    +54        super().__init__()
     
    @@ -407,83 +435,74 @@
    def - embed_documents(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray: + embed_documents(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:
    -
     51    def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray:
    - 52        """Embed a list of documents (raw text) into SciBERT vectors, by batching.
    - 53
    - 54        Args:
    - 55            docs: the documents to embed.
    - 56
    - 57        Returns:
    - 58            a numpy array of shape `(num_documents, 768)`
    - 59        """
    - 60
    - 61        embeddings = []
    - 62        with tqdm(
    +            
     56    def embed_documents(self, docs: list[str], batch_size: int = 64) -> dict[str, np.ndarray]:
    + 57        """Embed a list of documents (raw text) into SciBERT vectors, by batching.
    + 58        """
    + 59
    + 60        embeddings = []
    + 61
    + 62        pbar = tqdm(
      63            total=len(docs),
      64            desc="embedding documents",
    - 65        ) as pbar:
    - 66            for i in range(0, len(docs), batch_size):
    - 67                batch = docs[i : min(len(docs), i + batch_size)]
    - 68
    - 69                # Tokenize the batch
    - 70                encoded = self.tokenizer(
    - 71                    batch,
    - 72                    add_special_tokens=True,
    - 73                    padding=True,  # pad up to length of longest abstract
    - 74                    truncation=True,  # max length 512 chars, unfortunately
    - 75                    return_tensors="pt",
    - 76                )
    - 77                # each encoded item of shape [64, 512]
    - 78                assert encoded["input_ids"].size()[-1] <= 512
    - 79
    - 80                # Put data on GPU
    - 81                for k, v in encoded.items():
    - 82                    encoded[k] = v.to(self.device)
    - 83
    - 84                # Run the text through SciBERT, and collect all of the hidden states produced
    - 85                # from all 12 layers.
    - 86                with torch.no_grad():
    - 87                    _, encoded_layers = self.model(  # discard logits
    - 88                        **encoded,
    - 89                        return_dict=False,
    - 90                    )
    - 91
    - 92                # Extract the embeddings
    - 93                # index last (13th) BERT layer before the classifier
    - 94                final_hidden_state = encoded_layers[12]  # (batch_size, 256, 768)
    - 95                # index first token of sequence, [CLS], for our document embeddings
    - 96                batched_embeddings = final_hidden_state[:, 0, :]  # (batch_size, 768)
    - 97
    - 98                # Move to the CPU and convert to numpy ndarray
    - 99                batched_embeddings = batched_embeddings.detach().cpu().numpy()
    -100
    -101                # Collect batched embeddings
    -102                embeddings.extend(batched_embeddings)
    -103                pbar.update(batch_size)
    -104
    -105        return np.array(embeddings)
    + 65            leave=True,
    + 66        )
    + 67
    + 68        for i in range(0, len(docs), batch_size):
    + 69            batch = docs[i : min(len(docs), i + batch_size)]
    + 70
    + 71            # Tokenize the batch
    + 72            encoded = self.tokenizer(
    + 73                batch,
    + 74                add_special_tokens=True,
    + 75                padding=True,  # pad up to length of longest abstract
    + 76                truncation=True,  # max length 512 chars, unfortunately
    + 77                return_tensors="pt",
    + 78            )
    + 79            # each encoded item of shape [64, 512]
    + 80            assert encoded["input_ids"].size()[-1] <= 512
    + 81
    + 82            # Put data on GPU
    + 83            for k, v in encoded.items():
    + 84                encoded[k] = v.to(self.device)
    + 85
    + 86            # Run the text through SciBERT,
    + 87            # collecting all of the hidden states produced from all 12 layers.
    + 88            with torch.no_grad():
    + 89                _, encoded_layers = self.model(  # discard logits
    + 90                    **encoded,
    + 91                    return_dict=False,
    + 92                )
    + 93
    + 94            # Extract the embeddings
    + 95            # index last (13th) BERT layer before the classifier
    + 96            final_hidden_state = encoded_layers[12]  # [batch_size, 256, 768]
    + 97            # index first token of sequence, [CLS], for our document embeddings
    + 98            batched_embeddings = final_hidden_state[:, 0, :]  # [batch_size, 768]
    + 99
    +100            # Move to the CPU and convert to numpy ndarray
    +101            batched_embeddings = batched_embeddings.detach().cpu().numpy()
    +102
    +103            # Collect batched embeddings
    +104            embeddings.extend(batched_embeddings)
    +105
    +106            pbar.update(batch_size)
    +107        pbar.close()
    +108
    +109        # We don't have to deal with OOV, so we always return full list of ids
    +110        return {
    +111            "embeddings": np.array(embeddings),
    +112            "indices": np.arange(len(embeddings)),
    +113        }
     

    Embed a list of documents (raw text) into SciBERT vectors, by batching.

-    Arguments:
-      • docs: the documents to embed.
-    Returns:
-      a numpy array of shape (num_documents, 768)
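For readers updating call sites against this change, here is a minimal usage sketch of the new return value (not part of the generated docs). It assumes the module is importable as sciterra.vectorization.scibert, as the surrounding file paths suggest, and `abstracts` is a hypothetical list of raw document strings:

    from sciterra.vectorization.scibert import SciBERTVectorizer

    abstracts = ["..."]  # hypothetical raw documents, e.g. paper abstracts

    vectorizer = SciBERTVectorizer(device="cuda")
    result = vectorizer.embed_documents(abstracts, batch_size=64)

    embeddings = result["embeddings"]  # shape (num_documents, 768)
    indices = result["indices"]        # arange(num_documents); SciBERT never drops a document

Because SciBERT never drops a document, "indices" is simply the full range of positions; the field exists so that all vectorizers share the same return contract.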
    diff --git a/docs/sciterra/vectorization/vectorizer.html b/docs/sciterra/vectorization/vectorizer.html index 9cd9eb4..06c6a97 100644 --- a/docs/sciterra/vectorization/vectorizer.html +++ b/docs/sciterra/vectorization/vectorizer.html @@ -93,16 +93,23 @@

    7 def __init__(self) -> None: 8 pass 9 -10 def embed_documents(self, docs: list[str]) -> np.ndarray: +10 def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]: 11 """Embed a list of documents into document vectors. 12 13 Args: 14 docs: the documents to embed. 15 16 Returns: -17 a numpy array of shape `(num_documents, embedding_dim)` -18 """ -19 raise NotImplementedError +17 a dict of the form +18 { +19 "embeddings": a numpy array of shape `(num_successful, embedding_dim)`, containing the document embeddingss +20 +21 "indices": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained. +22 } +23 where `num_successful` is the number of documents in `docs` that were successfully embedded. +24 +25 """ +26 raise NotImplementedError

    @@ -122,16 +129,23 @@

    8 def __init__(self) -> None: 9 pass 10 -11 def embed_documents(self, docs: list[str]) -> np.ndarray: +11 def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]: 12 """Embed a list of documents into document vectors. 13 14 Args: 15 docs: the documents to embed. 16 17 Returns: -18 a numpy array of shape `(num_documents, embedding_dim)` -19 """ -20 raise NotImplementedError +18 a dict of the form +19 { +20 "embeddings": a numpy array of shape `(num_successful, embedding_dim)`, containing the document embeddingss +21 +22 "indices": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained. +23 } +24 where `num_successful` is the number of documents in `docs` that were successfully embedded. +25 +26 """ +27 raise NotImplementedError

    @@ -142,22 +156,29 @@

    def - embed_documents(self, docs: list[str]) -> numpy.ndarray: + embed_documents(self, docs: list[str]) -> dict[str, numpy.ndarray]:
    -
    11    def embed_documents(self, docs: list[str]) -> np.ndarray:
    +            
    11    def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]:
     12        """Embed a list of documents into document vectors.
     13
     14        Args:
     15            docs: the documents to embed.
     16
     17        Returns:
    -18            a numpy array of shape `(num_documents, embedding_dim)`
    -19        """
    -20        raise NotImplementedError
    +18            a dict of the form 
    +19            {
+20                "embeddings": a numpy array of shape `(num_successful, embedding_dim)`, containing the document embeddings
    +21
    +22                "indices": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.
    +23            }
    +24            where `num_successful` is the number of documents in `docs` that were successfully embedded.
    +25
    +26        """
    +27        raise NotImplementedError
     
    @@ -172,7 +193,15 @@
Arguments:
Returns:
-    a numpy array of shape (num_documents, embedding_dim)
+    a dict of the form
+    {
+        "embeddings": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings
+
+        "indices": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.
+    }
+    where num_successful is the number of documents in docs that were successfully embedded.
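Since some vectorizers can drop documents (see the word2vec case below), callers generally need to re-align embeddings with their inputs. A minimal, hypothetical helper illustrating the contract above; `align_embeddings` is not part of sciterra:

    import numpy as np

    def align_embeddings(docs: list[str], result: dict[str, np.ndarray]) -> list:
        # Pair each successfully embedded document with its vector.
        kept = [docs[i] for i in result["indices"]]
        return list(zip(kept, result["embeddings"]))

The same pairing works for any Vectorizer subclass, because "indices" always refers to positions in the original `docs` list.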

    diff --git a/docs/sciterra/vectorization/word2vec.html b/docs/sciterra/vectorization/word2vec.html new file mode 100644 index 0000000..f6d022b --- /dev/null +++ b/docs/sciterra/vectorization/word2vec.html @@ -0,0 +1,758 @@ + + + + + + + sciterra.vectorization.word2vec API documentation + + + + + + + + + + + + + +
    +
    +

    +sciterra.vectorization.word2vec

    + +

    We use a simple word2vec model that gets a document vector by averaging all words in the document.

    + +

    Since we are getting vectors for scientific documents, we must load a vocabulary to train the model from scratch. Therefore we define different subclasses for each scientific field, which may differ substantially by vocabulary.

    + +

    There exists a Doc2Vec module by gensim, but it seems that empirically Word2Vec + averaging can do just as well; furthermore, we're mainly interested in a simple baseline to compare with sophisticated embeddings.

    + + + +
    +

    gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#
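The field-specific subclasses mentioned above are not shown on this page; a hypothetical sketch of what one could look like, assuming the `Word2VecVectorizer`, `corpora_path`, and tokenizer defaults defined below (the corpus filename is made up and does not ship with sciterra):

    import os

    # Hypothetical: a vectorizer trained on a biology corpus instead of the
    # bundled astrophysics corpus. "bio_small.txt" is an illustrative filename.
    class BiologyWord2VecVectorizer(Word2VecVectorizer):
        def __init__(self, **kwargs) -> None:
            super().__init__(
                corpus_path=os.path.join(corpora_path, "bio_small.txt"),
                **kwargs,
            )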

    +
    +
    + + + + + +
      1"""We use a simple word2vec model that gets a document vector by averaging all words in the document.
    +  2
    +  3Since we are getting vectors for scientific documents, we must load a vocabulary to train the model from scratch. Therefore we define different subclasses for each scientific field, which may differ substantially by vocabulary.
    +  4
    +  5There exists a Doc2Vec module by gensim, but it seems that empirically Word2Vec + averaging can do just as well; furthermore, we're mainly interested in a simple baseline to compare with sophisticated embeddings.
    +  6
    +  7Links:
    +  8    gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#
    +  9"""
    + 10
    + 11import os
    + 12import time
    + 13
    + 14import numpy as np
    + 15
    + 16from .vectorizer import Vectorizer
    + 17from .preprocessing import custom_preprocess
    + 18from tqdm import tqdm
    + 19from typing import Callable
    + 20
    + 21from gensim.models import Word2Vec
    + 22
    + 23from multiprocessing import cpu_count
    + 24
    + 25
    + 26EMBEDDING_DIM = 300
    + 27
    + 28# Training data for vocabulary
    + 29
    + 30current_file_abs_path = os.path.dirname(os.path.abspath(__file__))
    + 31corpora_path = os.path.join(current_file_abs_path, "corpora")
    + 32ASTROPHYSICS_CORPUS = "astro_small.txt"
    + 33DEFAULT_CORPUS = os.path.join(corpora_path, ASTROPHYSICS_CORPUS)
    + 34
    + 35
    + 36class Word2VecVectorizer(Vectorizer):
    + 37    def __init__(
    + 38        self,
    + 39        corpus_path: str = DEFAULT_CORPUS,
    + 40        model_path: str = None,
    + 41        vector_size: int = EMBEDDING_DIM,
    + 42        window: int = 5,
    + 43        min_count: int = 2,
    + 44        workers: int = cpu_count(),
    + 45        epochs: int = 10,
    + 46        tokenizer: Callable[[str], list[str]] = custom_preprocess,
    + 47        **kwargs,
    + 48    ) -> None:
    + 49        """Construct a Word2Vec based document embedding model from a corpus."""
    + 50        super().__init__()
    + 51
    + 52        self.tokenizer = tokenizer
    + 53
    + 54        if (model_path is None) or (not os.path.exists(model_path)):
    + 55            start = time.time()
    + 56            # Assume the file is line-based, and one document per line
    + 57            print(
    + 58                f"Loading and tokenizing data from {corpus_path} for vocabulary and training..."
    + 59            )
    + 60            sentences = [self.tokenizer(line) for line in tqdm(open(corpus_path))]
    + 61
    + 62            print(f"Training Word2Vec model...")
    + 63            model = Word2Vec(
    + 64                sentences=sentences,
    + 65                vector_size=vector_size,
    + 66                window=window,
    + 67                min_count=min_count,
    + 68                workers=workers,
    + 69                epochs=epochs,
    + 70            )
    + 71            duration = time.time() - start
    + 72            print(f"Loaded corpus and trained model in {duration:.2f} seconds.")
    + 73        else:
    + 74            print(f"Loading saved Word2Vec model from {model_path}.")
    + 75            model = Word2Vec.load(model_path)
    + 76
    + 77        self.model = model
    + 78
    + 79        # We don't plan to train the model any further, so we call `init_sims` to make the model much more memory-efficient
    + 80        # If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory!
    + 81        self.model.init_sims(replace=True)
    + 82
    + 83        # write model to disk to load later and save time
    + 84        if model_path is not None:
    + 85            print(f"Saving Word2Vec model at {model_path}.")
    + 86            self.model.save(model_path)
    + 87
+ 88    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:
    + 89        """Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.
    + 90
    + 91        Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.
    + 92        """
    + 93
    + 94        # return np.array(
    + 95        #     [
    + 96        #         np.mean(
    + 97        #             [
    + 98        #                 self.model.wv[word]
    + 99        #                 for word in self.tokenizer(doc)
    +100        #                 if word in self.model.wv
    +101        #             ],  # shape `(300,)`
    +102        #             axis=0,
    +103        #         )
    +104        #         for doc in tqdm(
    +105        #             docs,
    +106        #             desc="embedding documents",
    +107        #             leave=True,
    +108        #         )
    +109        #     ]
    +110        # )
    +111        means = []
    +112        success_indices = []
    +113        for i, doc in tqdm(enumerate(docs), desc="embedding documents", leave=True):
    +114            mean = np.mean([
    +115                self.model.wv[word]
    +116                for word in self.tokenizer(doc)
    +117                if word in self.model.wv
    +118                ],  # shape `(300,)`
    +119                axis=0,
    +120            )
    +121            if not np.isnan(mean).any():
    +122                means.append(mean)
    +123                success_indices.append(i)
    +124
    +125        return {
    +126            "embeddings": np.array(means),
    +127            "indices": np.array(success_indices),
    +128        }
    +
    + + +
    +
    +
    + EMBEDDING_DIM = +300 + + +
    + + + + +
    +
    +
    + current_file_abs_path = +'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization' + + +
    + + + + +
    +
    +
    + corpora_path = +'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora' + + +
    + + + + +
    +
    +
    + ASTROPHYSICS_CORPUS = +'astro_small.txt' + + +
    + + + + +
    +
    +
    + DEFAULT_CORPUS = +'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt' + + +
    + + + + +
    +
    + +
    + + class + Word2VecVectorizer(sciterra.vectorization.vectorizer.Vectorizer): + + + +
    + +
     37class Word2VecVectorizer(Vectorizer):
    + 38    def __init__(
    + 39        self,
    + 40        corpus_path: str = DEFAULT_CORPUS,
    + 41        model_path: str = None,
    + 42        vector_size: int = EMBEDDING_DIM,
    + 43        window: int = 5,
    + 44        min_count: int = 2,
    + 45        workers: int = cpu_count(),
    + 46        epochs: int = 10,
    + 47        tokenizer: Callable[[str], list[str]] = custom_preprocess,
    + 48        **kwargs,
    + 49    ) -> None:
    + 50        """Construct a Word2Vec based document embedding model from a corpus."""
    + 51        super().__init__()
    + 52
    + 53        self.tokenizer = tokenizer
    + 54
    + 55        if (model_path is None) or (not os.path.exists(model_path)):
    + 56            start = time.time()
    + 57            # Assume the file is line-based, and one document per line
    + 58            print(
    + 59                f"Loading and tokenizing data from {corpus_path} for vocabulary and training..."
    + 60            )
    + 61            sentences = [self.tokenizer(line) for line in tqdm(open(corpus_path))]
    + 62
    + 63            print(f"Training Word2Vec model...")
    + 64            model = Word2Vec(
    + 65                sentences=sentences,
    + 66                vector_size=vector_size,
    + 67                window=window,
    + 68                min_count=min_count,
    + 69                workers=workers,
    + 70                epochs=epochs,
    + 71            )
    + 72            duration = time.time() - start
    + 73            print(f"Loaded corpus and trained model in {duration:.2f} seconds.")
    + 74        else:
    + 75            print(f"Loading saved Word2Vec model from {model_path}.")
    + 76            model = Word2Vec.load(model_path)
    + 77
    + 78        self.model = model
    + 79
    + 80        # We don't plan to train the model any further, so we call `init_sims` to make the model much more memory-efficient
    + 81        # If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory!
    + 82        self.model.init_sims(replace=True)
    + 83
    + 84        # write model to disk to load later and save time
    + 85        if model_path is not None:
    + 86            print(f"Saving Word2Vec model at {model_path}.")
    + 87            self.model.save(model_path)
    + 88
+ 89    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:
    + 90        """Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.
    + 91
    + 92        Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.
    + 93        """
    + 94
    + 95        # return np.array(
    + 96        #     [
    + 97        #         np.mean(
    + 98        #             [
    + 99        #                 self.model.wv[word]
    +100        #                 for word in self.tokenizer(doc)
    +101        #                 if word in self.model.wv
    +102        #             ],  # shape `(300,)`
    +103        #             axis=0,
    +104        #         )
    +105        #         for doc in tqdm(
    +106        #             docs,
    +107        #             desc="embedding documents",
    +108        #             leave=True,
    +109        #         )
    +110        #     ]
    +111        # )
    +112        means = []
    +113        success_indices = []
    +114        for i, doc in tqdm(enumerate(docs), desc="embedding documents", leave=True):
    +115            mean = np.mean([
    +116                self.model.wv[word]
    +117                for word in self.tokenizer(doc)
    +118                if word in self.model.wv
    +119                ],  # shape `(300,)`
    +120                axis=0,
    +121            )
    +122            if not np.isnan(mean).any():
    +123                means.append(mean)
    +124                success_indices.append(i)
    +125
    +126        return {
    +127            "embeddings": np.array(means),
    +128            "indices": np.array(success_indices),
    +129        }
    +
    + + + + +
    + +
    + + Word2VecVectorizer( corpus_path: str = '/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt', model_path: str = None, vector_size: int = 300, window: int = 5, min_count: int = 2, workers: int = 8, epochs: int = 10, tokenizer: Callable[[str], list[str]] = <function custom_preprocess>, **kwargs) + + + +
    + +
    38    def __init__(
    +39        self,
    +40        corpus_path: str = DEFAULT_CORPUS,
    +41        model_path: str = None,
    +42        vector_size: int = EMBEDDING_DIM,
    +43        window: int = 5,
    +44        min_count: int = 2,
    +45        workers: int = cpu_count(),
    +46        epochs: int = 10,
    +47        tokenizer: Callable[[str], list[str]] = custom_preprocess,
    +48        **kwargs,
    +49    ) -> None:
    +50        """Construct a Word2Vec based document embedding model from a corpus."""
    +51        super().__init__()
    +52
    +53        self.tokenizer = tokenizer
    +54
    +55        if (model_path is None) or (not os.path.exists(model_path)):
    +56            start = time.time()
    +57            # Assume the file is line-based, and one document per line
    +58            print(
    +59                f"Loading and tokenizing data from {corpus_path} for vocabulary and training..."
    +60            )
    +61            sentences = [self.tokenizer(line) for line in tqdm(open(corpus_path))]
    +62
    +63            print(f"Training Word2Vec model...")
    +64            model = Word2Vec(
    +65                sentences=sentences,
    +66                vector_size=vector_size,
    +67                window=window,
    +68                min_count=min_count,
    +69                workers=workers,
    +70                epochs=epochs,
    +71            )
    +72            duration = time.time() - start
    +73            print(f"Loaded corpus and trained model in {duration:.2f} seconds.")
    +74        else:
    +75            print(f"Loading saved Word2Vec model from {model_path}.")
    +76            model = Word2Vec.load(model_path)
    +77
    +78        self.model = model
    +79
    +80        # We don't plan to train the model any further, so we call `init_sims` to make the model much more memory-efficient
    +81        # If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory!
    +82        self.model.init_sims(replace=True)
    +83
    +84        # write model to disk to load later and save time
    +85        if model_path is not None:
    +86            print(f"Saving Word2Vec model at {model_path}.")
    +87            self.model.save(model_path)
    +
    + + +

    Construct a Word2Vec based document embedding model from a corpus.

    +
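A brief note on the constructor's caching behaviour shown above: when model_path is given, the first call trains on corpus_path and saves the model, and later calls load it from disk instead of retraining. The path below is hypothetical:

    # First run: trains on the default corpus, then saves to the given path.
    vectorizer = Word2VecVectorizer(model_path="w2v_astro.model")

    # Subsequent runs: loads the saved model instead of retraining.
    vectorizer = Word2VecVectorizer(model_path="w2v_astro.model")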
    + + +
    +
    +
    + tokenizer + + +
    + + + + +
    +
    +
    + model + + +
    + + + + +
    +
    + +
+ + def + embed_documents(self, docs: list[str], **kwargs) -> dict[str, numpy.ndarray]: + + +
    + +
 89    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:
    + 90        """Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.
    + 91
    + 92        Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.
    + 93        """
    + 94
    + 95        # return np.array(
    + 96        #     [
    + 97        #         np.mean(
    + 98        #             [
    + 99        #                 self.model.wv[word]
    +100        #                 for word in self.tokenizer(doc)
    +101        #                 if word in self.model.wv
    +102        #             ],  # shape `(300,)`
    +103        #             axis=0,
    +104        #         )
    +105        #         for doc in tqdm(
    +106        #             docs,
    +107        #             desc="embedding documents",
    +108        #             leave=True,
    +109        #         )
    +110        #     ]
    +111        # )
    +112        means = []
    +113        success_indices = []
    +114        for i, doc in tqdm(enumerate(docs), desc="embedding documents", leave=True):
    +115            mean = np.mean([
    +116                self.model.wv[word]
    +117                for word in self.tokenizer(doc)
    +118                if word in self.model.wv
    +119                ],  # shape `(300,)`
    +120                axis=0,
    +121            )
    +122            if not np.isnan(mean).any():
    +123                means.append(mean)
    +124                success_indices.append(i)
    +125
    +126        return {
    +127            "embeddings": np.array(means),
    +128            "indices": np.array(success_indices),
    +129        }
    +
    + + +

    Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

    + +

    Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.

    +
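To make the out-of-vocabulary behaviour concrete: a document whose tokens are all missing from the trained vocabulary produces a NaN mean and is dropped, which is why the returned "indices" matter here. A minimal sketch with made-up inputs:

    abstracts = [
        "dark matter halo profiles in dwarf galaxies",  # likely in-vocabulary for the astro corpus
        "qwxz zyxw",                                    # likely all out-of-vocabulary
    ]

    vectorizer = Word2VecVectorizer()  # trains on the bundled astrophysics corpus
    result = vectorizer.embed_documents(abstracts)

    result["embeddings"].shape  # (num_successful, 300)
    result["indices"]           # e.g. array([0]) if the second document was dropped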
    + + +
    +
    +
    + + \ No newline at end of file diff --git a/docs/search.js b/docs/search.js index 48e25bf..94683aa 100644 --- a/docs/search.js +++ b/docs/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();osciterra

    \n\n

    \"build\"

    \n\n

    Software library to support data-driven analyses of scientific literature

    \n\n

    Inspired heavily by Zach Hafen's cc library.

    \n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

    \n", "default_value": "2000"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

    \n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

    \n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

    \n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

    \n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

    Use the NASA ADS python package, which calls the ADS API to retrieve publications.

    \n\n
    Arguments:
    \n\n
      \n
    • bibcodes: the str ids required for querying. While it is possible to use one of EXTERNAL_IDS to query, if ADS returns a paper at all, it will return a bibcode, so it is preferred to use bibcodes.
    • \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: maximum number of papers to call API for in one query; if less than len(bibcodes), chunking will be performed.
    • \n
    • convert: whether to convert each resulting ADS Article to sciterra Publications (True by default).
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the list of publications (or Papers)

    \n
    \n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 2000,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

    Convert a ADS Article object to a sciterra.publication.Publication.

    \n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

    \n"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable unique identifier appropriate to the API.

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

    Call an API and retrieve the publications corresponding to str identifiers.

    \n\n
    Arguments:
    \n\n
      \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: (int): maximum number of papers to call API for in one query; if less than len(paper_ids), chunking will be performed.
    • \n
    \n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

    Convert an API-specific resulting publication data structure into a sciterra Publication object.

    \n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

    \n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

    \n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.STORE_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "STORE_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['abstract', 'externalIds', 'url', 'citations', 'references', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.ATTRS_TO_SAVE", "modulename": "sciterra.librarians.s2librarian", "qualname": "ATTRS_TO_SAVE", "kind": "variable", "doc": "

    \n", "default_value": "['paper', 'abstract', 'citations', 'references', 'bibcode', 'entry_date', 'notes', 'unofficial_flag', 'citation', 'stemmed_content_words']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

    \n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundExeception'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

    \n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

    \n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

    \n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

    Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

    \n\n
    Arguments:
    \n\n
      \n
    • paper_ids: the str ids required for querying. While it is possible to use one of EXTERNAL_IDS to query, if SemanticScholar returns a paper at all, it will return a paperId, so it is preferred to use paperIds.
    • \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: maximum number of papers to call API for in one query; if less than len(paper_ids), chunking will be performed.
    • \n
    • convert: whether to convert each resulting SemanticScholar Paper to sciterra Publications (True by default).
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the list of publications (or Papers)

    \n
    \n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

    Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

    \n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

    Custom function for calling the S2 API that doesn't fail on empty results.

    \n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

    Custom function for calling the S2 API that doesn't fail on empty results.

    \n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

    Classes for constructing maps of scientific literature.

    \n\n

    The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

    \n\n

    The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

    \n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

    Main container object for a large library of publications.

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None)"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

    \n", "annotation": ": sciterra.vectorization.projection.Projection"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

    Write the Atlas to a directory containing a CSV file of publications and a .npy file of embeddings.

    \n\n

    Write the Atlas to a directory containing a .pkl file of publications and a .pkl file of the projection.

    \n\n
    Arguments:
    \n\n
      \n
    • atlas_dirpath: path of directory to save files to.
    • \n
    • publications_fn: name of file to save publications to.
    • \n
    • projection_fn: name of file to save projection to.
    • \n
    • overwrite_publications: whether to overwrite an existing publications file.
    • \n
    • overwrite_projection: whether to overwrite an existing projection file.
    • \n
    \n", "signature": "(\tself,\tatlas_dirpath: str,\tpublications_fn: str = 'publications.pkl',\tprojection_fn: str = 'projection.pkl',\toverwrite_publications: bool = True,\toverwrite_projection: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

    Load an Atlas object from a directory containing publications and/or their projection.

    \n\n
    Arguments:
    \n\n
      \n
• atlas_dirpath: path of the directory to load publications and projection files from.
    • \n
    • publications_fn: name of file to load publications from.
    • \n
    • projection_fn: name of file to load projection from.
    • \n
    \n", "signature": "(\tcls,\tatlas_dirpath: str,\tpublications_fn: str = 'publications.pkl',\tprojection_fn: str = 'projection.pkl',\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

    Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

    A basic wrapper for obtaining and updating atlas projections.

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

    Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

    \n\n

    NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the resulting identifier associated with the resulting Publication object, (a paperId, a bibcode, etc.) Therefore, the purpose of using the bibtex_to_atlas method is primarily for initializing literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, you need to use get_publications.

    \n\n
    Arguments:
    \n\n
      \n
    • bibtex_fp: the filepath where the bibtex file is saved.
    • \n
    • args and kwargs are passed to get_publications.
    • \n
    \n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

    Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas containing publications to project to document embeddings
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the updated atlas containing all nonempty-abstract-containing publications and their projection

    \n
    \n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

    Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the atlas containing the region to expand
    • \n
    • center: (if given) center the search on this publication, preferentially searching related publications.
    • \n
    • n_pubs_max: maximum number of publications allowed in the expansion.
    • \n
    • n_sources_max: maximum number of publications (already in the atlas) to draw references and citations from.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    atl_expanded: the expanded atlas

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter", "kind": "function", "doc": "

    Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas containing publications to filter
    • \n
• attributes: the list of attributes to filter publications on: if any of these attributes is None for a publication, it is removed. For example, if attributes = [\"abstract\"], then every publication pub for which pub.abstract is None will be removed from the atlas, along with the corresponding data in the projection.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the filtered atlas

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tattributes: list = ['abstract', 'publication_date']) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

    The general container for data for any scientific publication, regardless of the API that was used to obtain it.

    \n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

    \n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

    \n\n
    Attributes:
    \n\n
      \n
    • identifier: The string id that uniquely identifies the publication, used for\n
        \n
      • storing in an Atlas
      • \n
      • querying an API
      • \n
    • \n
    • abstract: The string corresponding to the publication's abstract
    • \n
    • publication_date: A datetime representing the date of publication
    • \n
    • citation_count: An int corresponding to the number of citations received by the publication
    • \n
    \n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

    Construct a publication.

    \n\n
    Arguments:
    \n\n
      \n
    • data: to initialize attributes
    • \n
    \n", "signature": "(data: dict = {})"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

    \n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

    The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

    \n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

    \n", "signature": "(self, data) -> None:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

    Miscellaneous helper functions.

    \n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

    \n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

    Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allow trying n_attempts times.

    \n\n

    API_extension::get_data_via_api

    \n\n

    This decorator is general, except for the default allowed exception.

    \n\n
    Arguments:
    \n\n
      \n
    • n_attempts (int): Number of attempts before letting the exception happen.
    • \n
    • allowed_exceptions (tuple of class): Allowed exception class. Set to BaseException to keep trying regardless of exception.
    • \n
    • sleep_after_attempt (int): Number of seconds to wait before trying each additional attempt.
    • \n
    • verbose (bool): If True, be talkative.
    • \n
    \n\n
    Example Usage:
    \n\n
    \n
    \n

@keep_trying(n_attempts=4)
    def try_to_call_web_api():
        " do stuff "

    \n
    \n
    \n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

    Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

    \n", "signature": "(ids: list[str], call_size=2000):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

    \n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

    \n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

    Basic wrapper for document embeddings and helper methods.

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

    Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.

    \n\n
    Arguments:
    \n\n
      \n
    • identifiers_to_indices: a map from Publication identifiers to indices in the embedding matrix.
    • \n
    • indices_to_identifiers: a map from embedding indices to Publication identifiers.
    • \n
    • embeddings: ndarray of document embeddings of shape (num_pubs, embedding_dim)
    • \n
    \n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

    Retrieve the identifiers for a list of embedding matrix indices.

    \n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

    Retrieve the document embeddings for a list of identifiers.

    \n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_embedding", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_embedding", "kind": "function", "doc": "

    Retrieve the document embedding of a Publication.

    \n", "signature": "(self, identifier: str) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

    Return the result of merging projection proj_a with projection proj_b.

    \n\n

    NOTE: This is not a symmetric operation: it adds all embedding data contained in proj_a that is missing from proj_b. This means that the resulting projection can only be greater or equal in size to proj_a.

    \n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

    SciBERT is a BERT model trained on scientific text.

    \n\n
    Links:
    \n\n
    \n

Paper: https://aclanthology.org/D19-1371/
Github: https://github.com/allenai/scibert
HF: https://huggingface.co/allenai/scibert_scivocab_uncased

    \n
    \n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

    \n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

    \n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

    \n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

    \n", "signature": "(device='cuda')"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents (raw text) into SciBERT vectors, by batching.

    \n\n
    Arguments:
    \n\n
      \n
    • docs: the documents to embed.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a numpy array of shape (num_documents, 768)

    \n
    \n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

    Base class for vectorizing abstracts.

    \n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents into document vectors.

    \n\n
    Arguments:
    \n\n
      \n
    • docs: the documents to embed.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a numpy array of shape (num_documents, embedding_dim)

    \n
    \n", "signature": "(self, docs: list[str]) -> numpy.ndarray:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "sciterra", "modulename": "sciterra", "kind": "module", "doc": "

    sciterra

    \n\n

    \"build\"

    \n\n

    Software library to support data-driven analyses of scientific literature.

    \n\n

    This library is a reimplementation of Zach Hafen's cc library.

    \n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

    \n", "default_value": "50"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

    \n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

    \n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

    \n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

    \n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

    Use the NASA ADS python package, which calls the ADS API to retrieve publications.

    \n\n
    Arguments:
    \n\n
      \n
    • bibcodes: the str ids required for querying. While it is possible to use one of EXTERNAL_IDS to query, if ADS returns a paper at all, it will return a bibcode, so it is preferred to use bibcodes.
    • \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: maximum number of papers to call API for in one query; if less than len(bibcodes), chunking will be performed.
    • \n
    • convert: whether to convert each resulting ADS Article to sciterra Publications (True by default).
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the list of publications (or Papers)

    \n
    \n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 50,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

    Convert a ADS Article object to a sciterra.publication.Publication.

    \n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

    \n"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable unique identifier appropriate to the API.

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

    Call an API and retrieve the publications corresponding to str identifiers.

    \n\n
    Arguments:
    \n\n
      \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: (int): maximum number of papers to call API for in one query; if less than len(paper_ids), chunking will be performed.
    • \n
    \n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

    Convert an API-specific resulting publication data structure into a sciterra Publication object.

    \n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

    \n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

    \n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.STORE_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "STORE_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['abstract', 'externalIds', 'url', 'citations', 'references', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.ATTRS_TO_SAVE", "modulename": "sciterra.librarians.s2librarian", "qualname": "ATTRS_TO_SAVE", "kind": "variable", "doc": "

    \n", "default_value": "['paper', 'abstract', 'citations', 'references', 'bibcode', 'entry_date', 'notes', 'unofficial_flag', 'citation', 'stemmed_content_words']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

    \n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundException'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

    \n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

    \n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

    \n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

    Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

    \n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

    Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

    \n\n
    Arguments:
    \n\n
      \n
    • paper_ids: the str ids required for querying. While it is possible to use one of EXTERNAL_IDS to query, if SemanticScholar returns a paper at all, it will return a paperId, so it is preferred to use paperIds.
    • \n
    • n_attempts_per_query: Number of attempts to access the API per query. Useful when experiencing connection issues.
    • \n
    • call_size: maximum number of papers to call API for in one query; if less than len(paper_ids), chunking will be performed. Maximum that S2 allows is 500.
    • \n
    • convert: whether to convert each resulting SemanticScholar Paper to sciterra Publications (True by default).
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the list of publications (or Papers)

    \n
    \n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

    Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

    \n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

    Custom function for calling the S2 API that doesn't fail on empty results.

    \n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

    Custom function for calling the S2 API that doesn't fail on empty results.

    \n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

    Classes for constructing maps of scientific literature.

    \n\n

    The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

    \n\n

    The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

    \n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

    Main container object for a large library of publications.

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

    Data structure for storing publications.

    \n\n

    self.projection: the Projection object containing the embeddings of all publications and their mapping to str identifiers.

    \n\n

    self.bad_ids: a list of identifiers that have failed for some reason or other during an expansion, and will be excluded from subsequent expansions.

    \n\n

self.history: dict of the form {'pubs_per_update': list[list[str]], 'kernel_size': np.ndarray of ints of shape (num_pubs, last_update)}, where last_update <= the total number of expansions performed.

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None,\tbad_ids: set[str] = set(),\thistory: dict[str, typing.Any] = {})"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas.bad_ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.bad_ids", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas.history", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.history", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.atlas.Atlas.ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.ids", "kind": "function", "doc": "

    Get a list of all the publication identifiers in the Atlas.

    \n", "signature": "(self) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

    Write the Atlas to a directory containing a .pkl binary for each attribute.

    \n\n

    Warnings cannot be silenced.

    \n\n
    Arguments:
    \n\n
      \n
    • atlas_dirpath: path of directory to save files to.
    • \n
    \n", "signature": "(self, atlas_dirpath: str, overwrite: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

    Load an Atlas object from a directory containing the .pkl binary for each attribute.

    \n\n

    Warnings cannot be silenced.

    \n\n
    Arguments:
    \n\n
      \n
    • atlas_dirpath: directory where .pkl binaries will be read from
    • \n
    \n", "signature": "(cls, atlas_dirpath: str):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

    Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

    \n"}, {"fullname": "sciterra.mapping.cartography.batch_cospsi_matrix", "modulename": "sciterra.mapping.cartography", "qualname": "batch_cospsi_matrix", "kind": "function", "doc": "

    Batch-process a pairwise cosine similarity matrix between embeddings.

    \n\n

    In order to avoid memory errors (e.g. bus error, segfaults) resulting from too large arrays, we batch process the construction of the cospsi_matrix.

    \n\n
    Arguments:
    \n\n
      \n
• embeddings: a 2D numpy array of embeddings, of shape (num_pubs, embedding_dim)
    • \n
    \n\n
    Returns:
    \n\n
    \n

    cosine_similarities: a 2D numpy array representing the pairwise cosine similarity between each embedding

    \n
    \n", "signature": "(embeddings: numpy.ndarray) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

    A basic wrapper for obtaining and updating atlas projections.

    \n\n

self.librarian: the Librarian object used to query a bibliographic database API.
self.vectorizer: the Vectorizer object used to get a document embedding for each abstract.
self.pubs_per_update: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.
self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.pubs_per_update", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.pubs_per_update", "kind": "variable", "doc": "

    \n", "annotation": ": list[list[str]]"}, {"fullname": "sciterra.mapping.cartography.Cartographer.update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.update_history", "kind": "variable", "doc": "

    \n", "annotation": ": numpy.ndarray"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

    Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

    \n\n

NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the identifier associated with the resulting Publication object (a paperId, a bibcode, etc.). Therefore, the purpose of the bibtex_to_atlas method is primarily to initialize literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, use get_publications instead.

    \n\n
    Arguments:
    \n\n
      \n
    • bibtex_fp: the filepath where the bibtex file is saved.
    • \n
    • args and kwargs are passed to get_publications.
    • \n
    \n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

    Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas containing publications to project to document embeddings
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the updated atlas containing all nonempty-abstract-containing publications and their projection

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

    Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the atlas containing the region to expand
    • \n
    • center: (if given) center the search on this publication, preferentially searching related publications.
    • \n
    • n_pubs_max: maximum number of publications allowed in the expansion.
    • \n
    • n_sources_max: maximum number of publications (already in the atlas) to draw references and citations from.
    • \n
    • record_pubs_per_update: whether to track all the publications that exist in the resulting atlas to self.pubs_per_update. This should only be set to True when you need to later filter by degree of convergence of the atlas.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    atl_expanded: the expanded atlas

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t*args,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_attributes", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_attributes", "kind": "function", "doc": "

    Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas containing publications to filter
    • \n
• attributes: the list of attributes to filter publications on: if any of these attributes is None for a publication, it is removed. For example, if attributes = [\"abstract\"], then every publication pub for which pub.abstract is None will be removed from the atlas, along with the corresponding data in the projection.
    • \n
    • record_pubs_per_update: whether to track all the publications that exist in the resulting atlas to self.pubs_per_update. This should only be set to True when you need to later filter by degree of convergence of the atlas. This is an important parameter because self.filter is called in self.project, which typically is called after self.expand, where we pass in the same parameter.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    the filtered atlas

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tattributes: list = ['abstract', 'publication_date'],\trecord_pubs_per_update=False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_ids", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_ids", "kind": "function", "doc": "

    Update an atlas by dropping publications (and corresponding data in projection).

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas containing publications to filter
    • \n
• keep_ids: the list of publication ids to keep; all publications in atl not matching one of these ids will be removed.
    • \n
    • drop_ids: the list of publications to filter; all publications in atl matching one of these ids will be removed.
    • \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tkeep_ids: list[str] = None,\tdrop_ids: list[str] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.track", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.track", "kind": "function", "doc": "

    Overwrite the data associated with tracking degree of convergence of publications in an atlas over multiple expansions. N.B.: the atlas must be fully projected, or else converged_kernel_size will raise a KeyError.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas that will be updated by overwriting Atlas.history
    • \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.record_update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.record_update_history", "kind": "function", "doc": "

    Record when publications were added, by updating atl.update_history.

    \n\n

    atl.update_history is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

    \n\n
    Arguments:
    \n\n
      \n
    • pubs: a list of str ids corresponding to publications at the final update in the update history. By default None, and self.pubs_per_update[-1] will be used.
    • \n
    • pubs_per_update: a list of which publications existed at which iteration, with the index of the overall list corresponding to the iteration the publication was added. By default None, and self.pubs_per_update will be used.
    • \n
    \n\n
    Updates:
    \n\n
    \n

    self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

    \n
    \n\n
    Returns:
    \n\n
    \n

    None

    \n
    \n", "signature": "(\tself,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.converged_kernel_size", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.converged_kernel_size", "kind": "function", "doc": "

    Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: Atlas containing publications; for each publication we compute the largest converged kernel size at each update
    • \n
    \n\n
    Returns:
    \n\n
    \n

kernel_size: an array of ints of shape (num_pubs, max_update) representing the kernel size for converged kernels.
- The first column indicates the largest kernel size that hasn't changed since the beginning,
- The second column indicates the largest kernel size that hasn't changed since the first update,
- etc. for the nth column.

    \n
    \n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.measure_topography", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.measure_topography", "kind": "function", "doc": "

    Measure topographic properties of all publications relative to prior\npublications.

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas to measure
    • \n
    • publication_indices: an np.ndarray of ints representing the indices of publications in the Atlas projection to measure
    • \n
• metrics: A list of strings representing the metrics to use. Options are:
constant_asymmetry: The asymmetry of a publication $p_i$ w.r.t. the entire atlas $\{ p_j \,\forall j \in \{1, ..., k\} \}$, where $k$ is the length of the atlas

      \n\n

      $| \\sum_{j}^{k-1}( p_i - p_j ) |$

      \n\n

kernel_constant_asymmetry: The asymmetry of a publication w.r.t. its kernel, $\{ p_j \,\forall j \in \{1, ..., k\} \}$, where $k$ is kernel_size, i.e. the $k$ nearest neighbors.

      \n\n

density: the density of a publication's surrounding area, estimated by a mass / volume heuristic: k publications divided by the minimum arc length enclosing the furthest of those k publications.

      \n\n

$\frac{k}{\mathrm{smoothing\_length}(k)}$

      \n\n

      smoothing_length: The distance (in radians) to the farthest publication in the kernel, i.e. the kth nearest neighbor.

    • \n
    • min_prior_pubs: The minimum number of publications prior to the target publication for which to calculate the metric.
    • \n
    • kernel_size: the number of publications surrounding the publication for which to compute the topography metric, i.e. k nearest neighbors for k=kernel_size.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    estimates: an np.ndarray of shape (len(publication_indices), len(metrics)) representing the estimated topography metric values for each publication.

    \n
    \n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tids: list[str] = None,\tmetrics: list[str] = ['density'],\tmin_prior_pubs: int = 2,\tkernel_size=16,\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.iterate_expand", "modulename": "sciterra.mapping.cartography", "qualname": "iterate_expand", "kind": "function", "doc": "

    Build out an Atlas of publications, i.e. search for similar publications. This is done by iterating a sequence of [expand, save, project, save, track, save].

    \n\n
    Arguments:
    \n\n
      \n
    • atl: the Atlas to expand
    • \n
    • crt: the Cartographer to use
    • \n
    • atlas_dir: the directory where Atlas binaries will be saved/loaded from
    • \n
    • target_size: stop iterating when we reach this number of publications in the Atlas
    • \n
    • max_failed_expansions: stop iterating when we fail to add new publications after this many successive iterations. Default is 2.
    • \n
    • center: (if given) center the search on this publication, preferentially searching related publications.
    • \n
    • n_pubs_max: maximum number of publications allowed in the expansion.
    • \n
    • call_size: maximum number of papers to call API for in one query; if less than len(paper_ids), chunking will be performed.
    • \n
    • n_sources_max: maximum number of publications (already in the atlas) to draw references and citations from.
    • \n
    • record_pubs_per_update: whether to track all the publications that exist in the resulting atlas to self.pubs_per_update. This should only be set to True when you need to later filter by degree of convergence of the atlas.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    atl: the expanded Atlas

    \n
    \n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tcrt: sciterra.mapping.cartography.Cartographer,\tatlas_dir: str,\ttarget_size: int,\tmax_failed_expansions: int = 2,\tcenter: str = None,\tn_pubs_max: int = None,\tcall_size: int = None,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

    The general container for data for any scientific publication, regardless of the API that was used to obtain it.

    \n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

    \n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

    \n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

    \n\n
    Attributes:
    \n\n
      \n
    • identifier: The string id that uniquely identifies the publication, used for\n
        \n
      • storing in an Atlas
      • \n
      • querying an API
      • \n
    • \n
    • abstract: The string corresponding to the publication's abstract
    • \n
    • publication_date: A datetime representing the date of publication
    • \n
    • citation_count: An int corresponding to the number of citations received by the publication
    • \n
    \n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

    Construct a publication.

    \n\n
    Arguments:
    \n\n
      \n
    • data: to initialize attributes
    • \n
    \n", "signature": "(data: dict, **kwargs)"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

    \n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

    The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

    \n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

    \n", "signature": "(self, data, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography", "modulename": "sciterra.mapping.topography", "kind": "module", "doc": "

    Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

    \n"}, {"fullname": "sciterra.mapping.topography.smoothing_length_metric", "modulename": "sciterra.mapping.topography", "qualname": "smoothing_length_metric", "kind": "function", "doc": "

    Proxy for the density of a publication defined as the minimum\narc length that encloses kernel_size other publications.

    \n\n
    Arguments:
    \n\n
      \n
    • idx: the index of the vector to calculate the measurement for.
    • \n
    • cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
    • \n
    • valid_indices: Indices of the other publications used when calculating the measurements.
    • \n
    • kernel_size: the number of nearest neighbors (K) to calculate the measurement on.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

    \n
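    As an illustrative sketch (not part of the library's documented examples), the metric can be computed from any unit-normalized embedding matrix; the toy data below is made up:

        import numpy as np
        from sciterra.mapping.topography import smoothing_length_metric

        # Toy data: 100 random embeddings, normalized so dot products are cosine similarities.
        rng = np.random.default_rng(0)
        embeddings = rng.normal(size=(100, 8))
        embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
        cospsi = embeddings @ embeddings.T   # shape (100, 100)
        valid = np.arange(len(embeddings))   # here, treat every publication as valid

        h = smoothing_length_metric(idx=0, cospsi_matrix=cospsi, valid_indices=valid, kernel_size=16)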
    \n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.density_metric", "modulename": "sciterra.mapping.topography", "qualname": "density_metric", "kind": "function", "doc": "

    Estimate the density of a publication by calculating the\nsmoothing length that encloses kernel_size other publications.

    \n\n
    Arguments:
    \n\n
      \n
    • idx: the index of the vector to calculate the measurement for.
    • \n
    • cospsi_matrix: a 2D matrix of pairwise cosine similarity scores for publication embeddings.
    • \n
    • valid_indices: Indices of the other publications used when calculating the measurements.
    • \n
    • kernel_size: the number of nearest neighbors (K) to calculate the measurement on.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    density: a float representing kernel_size divided by arc length containing kernel_size other publications.

    \n
    \n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.edginess_metric", "modulename": "sciterra.mapping.topography", "qualname": "edginess_metric", "kind": "function", "doc": "

    Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

    \n\n
    Arguments:
    \n\n
      \n
    • idx: the index of the vector to calculate the measurement for.
    • \n
    • cospsi_matrix: an np.ndarray of shape (num_pubs, num_pubs) representing pairwise cosine similarity scores for publication embeddings.
    • \n
    • valid_indices: an np.ndarray of shape (num_valid_pubs) representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
    • \n
    • publication_indices: an np.ndarray of shape (num_pubs,) representing indices of all publications in the atlas projection
    • \n
    • embeddings: an np.ndarray of shape (num_pubs, embedding_dim) containing vectors for all publications in the atlas projection.
    • \n
    • kernel_size: the number of nearest neighbors (K) to calculate the measurement on.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a float representing the normalized magnitude of the asymmetry metric.

    \n
    \n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.kernel_constant_asymmetry_metric", "modulename": "sciterra.mapping.topography", "qualname": "kernel_constant_asymmetry_metric", "kind": "function", "doc": "

    Estimate the asymmetry of a publication by calculating the difference\nbetween that publication's projection and the other publications within\nthe kernel.

    \n\n
    Arguments:
    \n\n
      \n
    • idx: an int representing the index of the vector to calculate the measurement for.
    • \n
    • cospsi_matrix: an np.ndarray of shape (num_pubs, num_pubs) representing pairwise cosine similarity scores for publication embeddings.
    • \n
    • valid_indices: an np.ndarray of shape (num_valid_pubs) representing indices of the other publications used when calculating the measurements, i.e. num_valid_pubs <= num_pubs.
    • \n
    • publication_indices: an np.ndarray of shape (num_pubs,) representing indices of all publications in the atlas projection
    • \n
    • embeddings: an np.ndarray of shape (num_pubs, embedding_dim) containing vectors for all publications in the atlas projection.
    • \n
    • kernel_size: an int representing the number of nearest neighbors (K) to calculate the measurement on.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    mag: a float representing the magnitude of the asymmetry metric.

    \n
    \n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

    Miscellaneous helper functions.

    \n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

    \n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

    Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allows retrying up to n_attempts times.

    \n\n

    API_extension::get_data_via_api

    \n\n

    This decorator is general, except for the default allowed exceptions.

    \n\n
    Arguments:
    \n\n
      \n
    • n_attempts (int): Number of attempts before letting the exception happen.
    • \n
    • allowed_exceptions (tuple of class): Allowed exception classes. Set to BaseException to keep trying regardless of the exception raised.
    • \n
    • sleep_after_attempt (int): Number of seconds to wait before trying each additional attempt.
    • \n
    • verbose (bool): If True, be talkative.
    • \n
    \n\n
    Example Usage:
    \n\n
    \n
    \n

    @keep_trying( n_attempts=4 )\n def try_to_call_web_api():\n \" do stuff \"

    \n
    \n
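    A slightly fuller sketch, assuming a hypothetical fetch function (the URL parameter and the requests call are illustrative only):

        import requests
        from sciterra.misc.utils import keep_trying

        @keep_trying(n_attempts=4, sleep_after_attempt=2)
        def try_to_call_web_api(url: str) -> dict:
            # May raise ReadTimeout or ConnectionError, both retried by default.
            return requests.get(url, timeout=10).json()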
    \n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

    Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

    \n", "signature": "(ids: list[str], call_size):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

    \n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

    \n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.get_verbose", "modulename": "sciterra.misc.utils", "qualname": "get_verbose", "kind": "function", "doc": "

    \n", "signature": "(kwargs: dict):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.custom_formatwarning", "modulename": "sciterra.misc.utils", "qualname": "custom_formatwarning", "kind": "function", "doc": "

    \n", "signature": "(msg, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.vectorizers", "modulename": "sciterra.vectorization", "qualname": "vectorizers", "kind": "variable", "doc": "

    \n", "default_value": "{'SciBERT': <class 'sciterra.vectorization.scibert.SciBERTVectorizer'>, 'SBERT': <class 'sciterra.vectorization.sbert.SBERTVectorizer'>, 'Word2Vec': <class 'sciterra.vectorization.word2vec.Word2VecVectorizer'>}"}, {"fullname": "sciterra.vectorization.preprocessing", "modulename": "sciterra.vectorization.preprocessing", "kind": "module", "doc": "

    Simple preprocessing of scientific abstracts prior to vectorization.

    \n"}, {"fullname": "sciterra.vectorization.preprocessing.nlp", "modulename": "sciterra.vectorization.preprocessing", "qualname": "nlp", "kind": "variable", "doc": "

    \n", "default_value": "<spacy.lang.en.English object>"}, {"fullname": "sciterra.vectorization.preprocessing.custom_preprocess", "modulename": "sciterra.vectorization.preprocessing", "qualname": "custom_preprocess", "kind": "function", "doc": "

    Get all of the lemmas of the words in a document, filtering by POS.

    \n\n
    Arguments:
    \n\n
      \n
    • document: a multi-sentence string
    • \n
    • allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a list of the lemmatized, filtered words in the document

    \n
    \n\n

    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

    \n\n

    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.

    \n", "signature": "(\tdocument: str,\tallowed_pos_tags: set = {'VERB', 'NOUN', 'ADJ'}) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

    Basic wrapper for document embeddings and helper methods.

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

    Construct a Projection object, a bidirectional mapping between publication identifiers and document embeddings.

    \n\n
    Arguments:
    \n\n
      \n
    • identifier_to_index: a dict mapping Publication identifiers to indices in the embedding matrix.
    • \n
    • index_to_identifier: a tuple mapping embedding indices to Publication identifiers.
    • \n
    • embeddings: ndarray of document embeddings of shape (num_pubs, embedding_dim)
    • \n
    \n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

    Retrieve the identifiers for a list of embedding matrix indices.

    \n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

    Retrieve the document embeddings for a list of identifiers.

    \n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_indices", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_indices", "kind": "function", "doc": "

    Retrieve the embedding indices for a list of identifiers.

    \n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

    Return the result of merging projection proj_a with projection proj_b.

    \n\n

    This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater than or equal in size to proj_a.
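    A minimal sketch with made-up identifiers and a tiny embedding dimension:

        import numpy as np
        from sciterra.vectorization.projection import Projection, merge

        proj_a = Projection(
            identifier_to_index={"pub1": 0},
            index_to_identifier=("pub1",),
            embeddings=np.ones((1, 4)),
        )
        proj_b = Projection(
            identifier_to_index={"pub1": 0, "pub2": 1},
            index_to_identifier=("pub1", "pub2"),
            embeddings=np.ones((2, 4)),
        )
        merged = merge(proj_a, proj_b)   # contains embedding data for both pub1 and pub2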

    \n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.get_empty_projection", "modulename": "sciterra.vectorization.projection", "qualname": "get_empty_projection", "kind": "function", "doc": "

    Construct a Projection with no data (but which is not None).

    \n", "signature": "() -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.sbert", "modulename": "sciterra.vectorization.sbert", "kind": "module", "doc": "

    We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight, fast version of one of the top-performing models.

    \n\n
    Links:
    \n\n
    \n

    sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.\n HF: https://huggingface.co/sentence-transformers

    \n
    \n"}, {"fullname": "sciterra.vectorization.sbert.MPS_DEVICE", "modulename": "sciterra.vectorization.sbert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

    \n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.sbert.MODEL_PATH", "modulename": "sciterra.vectorization.sbert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

    \n", "default_value": "'all-MiniLM-L6-v2'"}, {"fullname": "sciterra.vectorization.sbert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.sbert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

    \n", "default_value": "384"}, {"fullname": "sciterra.vectorization.sbert.MAX_SEQ_LENGTH", "modulename": "sciterra.vectorization.sbert", "qualname": "MAX_SEQ_LENGTH", "kind": "variable", "doc": "

    \n", "default_value": "256"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer", "kind": "class", "doc": "

    \n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.__init__", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.__init__", "kind": "function", "doc": "

    \n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.model", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.model", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents (raw text) into SBERT vectors, by batching.

    \n\n
    Arguments:
    \n\n
      \n
    • docs: the documents to embed.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a dict containing \"embeddings\" (a numpy array of shape (num_documents, 384)), \"success_indices\", and \"fail_indices\"

    \n
    \n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

    SciBERT is a BERT model trained on scientific text.

    \n\n
    Links:
    \n\n
    \n

    Paper: https://aclanthology.org/D19-1371/\n Github: https://github.com/allenai/scibert\n HF: https://huggingface.co/allenai/scibert_scivocab_uncased

    \n
    \n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

    \n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

    \n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.scibert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

    \n", "default_value": "768"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

    \n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

    \n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents (raw text) into SciBERT vectors, by batching.

    \n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

    Base class for vectorizing abstracts.

    \n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents into document vectors.

    \n\n
    Arguments:
    \n\n
      \n
    • docs: the documents to embed.
    • \n
    \n\n
    Returns:
    \n\n
    \n

    a dict of the form \n {\n \"embeddings\": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings

    \n\n
    \"indices\": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.\n
    \n \n

    }\n where num_successful is the number of documents in docs that were successfully embedded, and the indices are with respect to the docs list passed.

    \n
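    A caller-side sketch of this contract (the abstract strings are placeholders; any concrete Vectorizer subclass, e.g. SBERTVectorizer, could be used):

        from sciterra.vectorization.sbert import SBERTVectorizer

        vectorizer = SBERTVectorizer()
        result = vectorizer.embed_documents(["first abstract ...", "second abstract ..."])
        embeddings = result["embeddings"]   # shape (num_successful, embedding_dim)
        ok = result["success_indices"]      # positions in the original docs list
        failed = result["fail_indices"]     # documents that could not be embedded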
    \n", "signature": "(self, docs: list[str]) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.word2vec", "modulename": "sciterra.vectorization.word2vec", "kind": "module", "doc": "

    We use a simple word2vec model that gets a document vector by averaging the vectors of all words in the document.

    \n\n

    Since we are getting vectors for scientific documents, we must load a vocabulary to train the model from scratch. Therefore we define different subclasses for each scientific field, which may differ substantially by vocabulary.

    \n\n

    There exists a Doc2Vec module by gensim, but it seems that empirically Word2Vec + averaging can do just as well; furthermore, we're mainly interested in a simple baseline to compare with more sophisticated embeddings.

    \n\n
    Links:
    \n\n
    \n

    gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#

    \n
    \n"}, {"fullname": "sciterra.vectorization.word2vec.EMBEDDING_DIM", "modulename": "sciterra.vectorization.word2vec", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

    \n", "default_value": "300"}, {"fullname": "sciterra.vectorization.word2vec.current_file_abs_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

    \n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.word2vec.corpora_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "corpora_path", "kind": "variable", "doc": "

    \n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.word2vec.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

    \n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

    \n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer", "kind": "class", "doc": "

    \n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.__init__", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.__init__", "kind": "function", "doc": "

    Construct a Word2Vec-based document embedding model from a corpus.

    \n", "signature": "(\tcorpus_path: str = '/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt',\tmodel_path: str = None,\tvector_size: int = 300,\twindow: int = 5,\tmin_count: int = 2,\tworkers: int = 8,\tepochs: int = 10,\ttokenizer: Callable[[str], list[str]] = <function custom_preprocess>,\t**kwargs)"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.tokenizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.tokenizer", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.model", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.model", "kind": "variable", "doc": "

    \n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.embed_documents", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.embed_documents", "kind": "function", "doc": "

    Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

    \n\n

    Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.

    \n", "signature": "(self, docs: list[str], **kwargs) -> numpy.ndarray:", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough. diff --git a/src/examples/scratch/outputs/atlas_s2-11-11-23_w2v-centered_hafenetal/astro_1.model b/src/examples/scratch/outputs/atlas_s2-11-11-23_w2v-centered_hafenetal/astro_1.model index ac817e7..deba504 100644 Binary files a/src/examples/scratch/outputs/atlas_s2-11-11-23_w2v-centered_hafenetal/astro_1.model and b/src/examples/scratch/outputs/atlas_s2-11-11-23_w2v-centered_hafenetal/astro_1.model differ diff --git a/src/sciterra/librarians/__init__.py b/src/sciterra/librarians/__init__.py index 1b2880f..5a6e5b4 100644 --- a/src/sciterra/librarians/__init__.py +++ b/src/sciterra/librarians/__init__.py @@ -2,5 +2,10 @@ from .adslibrarian import ADSLibrarian from .s2librarian import SemanticScholarLibrarian +librarians = { + "S2": SemanticScholarLibrarian, + "ADS": ADSLibrarian, +} + """Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arxiv does not have one. While there is a useful pip-installable package for querying the arxiv api for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arxiv API's limit for a single query is 300,000 results. """ diff --git a/src/sciterra/mapping/cartography.py b/src/sciterra/mapping/cartography.py index 0ce4cd4..13e01ca 100644 --- a/src/sciterra/mapping/cartography.py +++ b/src/sciterra/mapping/cartography.py @@ -164,6 +164,7 @@ def project(self, atl: Atlas, **kwargs) -> Atlas: id for id in atl_filtered.publications if id not in previously_embedded_ids ] + fail_ids = set() if embed_ids: if verbose: if atl_filtered.projection is not None: @@ -171,22 +172,40 @@ def project(self, atl: Atlas, **kwargs) -> Atlas: f"Found {len(atl_filtered.publications) - len(atl_filtered.projection)} publications not contained in Atlas projection." ) warnings.warn(f"Embedding {len(embed_ids)} total documents.") + # Embed documents - embeddings = self.vectorizer.embed_documents( + result = self.vectorizer.embed_documents( [atl_filtered[id].abstract for id in embed_ids] ) + embeddings = result["embeddings"] + success_indices = result["success_indices"] + fail_indices = result["fail_indices"] - if embeddings is None and verbose: - warnings.warn(f"Obtained no new publication embeddings.") + if fail_indices.tolist() and verbose: + warnings.warn( + f"Failed to get embeddings for all {len(embed_ids)} publications; only {len(embeddings)} will be added to the Atlas." 
+ ) - # create new projection - projection = Projection( - identifier_to_index={ - identifier: idx for idx, identifier in enumerate(embed_ids) - }, - index_to_identifier=tuple(embed_ids), - embeddings=embeddings, - ) + # successful_ids = [id for i, id in enumerate(embed_ids) if i in successful_indices] + embed_ids_array = np.array(embed_ids) + success_ids = embed_ids_array[success_indices] + try: + fail_ids = set(embed_ids_array[fail_indices]) + except IndexError: + breakpoint() + + # create new projection + projection = Projection( + identifier_to_index={ + identifier: idx for idx, identifier in enumerate(success_ids) + }, + index_to_identifier=tuple(success_ids), + embeddings=embeddings, + ) + + if not embed_ids or embed_ids is None and verbose: + warnings.warn(f"Obtained no new publication embeddings.") + projection = get_empty_projection() # merge existing projection with new projection merged_projection = merge(atl_filtered.projection, projection) @@ -197,11 +216,14 @@ def project(self, atl: Atlas, **kwargs) -> Atlas: for id, pub in atl_filtered.publications.items() if id in merged_projection.identifier_to_index } - assert not set(atl_filtered.ids()) - set(embedded_publications.keys()) + + # get new set of bad ids + bad_ids = atl_filtered.bad_ids.union(fail_ids) # Overwrite atlas data atl_filtered.publications = embedded_publications atl_filtered.projection = merged_projection + atl_filtered.bad_ids = bad_ids return atl_filtered ###################################################################### diff --git a/src/sciterra/vectorization/sbert.py b/src/sciterra/vectorization/sbert.py index 16bab3b..0e5ba0c 100644 --- a/src/sciterra/vectorization/sbert.py +++ b/src/sciterra/vectorization/sbert.py @@ -68,4 +68,9 @@ def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray: pbar.update(batch_size) pbar.close() - return np.array(embeddings) + # We don't have to deal with OOV, so we always return full list of ids + return { + "embeddings": np.array(embeddings), + "success_indices": np.arange(len(embeddings)), + "fail_indices": np.array([], dtype=int), + } diff --git a/src/sciterra/vectorization/scibert.py b/src/sciterra/vectorization/scibert.py index f032ee8..71bd3fb 100644 --- a/src/sciterra/vectorization/scibert.py +++ b/src/sciterra/vectorization/scibert.py @@ -52,15 +52,10 @@ def __init__(self, device="cuda", **kwargs) -> None: self.model.eval() super().__init__() - def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray: - """Embed a list of documents (raw text) into SciBERT vectors, by batching. - - Args: - docs: the documents to embed. 
- - Returns: - a numpy array of shape `(num_documents, 768)` - """ + def embed_documents( + self, docs: list[str], batch_size: int = 64 + ) -> dict[str, np.ndarray]: + """Embed a list of documents (raw text) into SciBERT vectors, by batching.""" embeddings = [] @@ -111,4 +106,9 @@ def embed_documents(self, docs: list[str], batch_size: int = 64) -> np.ndarray: pbar.update(batch_size) pbar.close() - return np.array(embeddings) + # We don't have to deal with OOV, so we always return full list of ids + return { + "embeddings": np.array(embeddings), + "success_indices": np.arange(len(embeddings)), + "fail_indices": np.array([], dtype=int), + } diff --git a/src/sciterra/vectorization/vectorizer.py b/src/sciterra/vectorization/vectorizer.py index b88fc24..34a82ae 100644 --- a/src/sciterra/vectorization/vectorizer.py +++ b/src/sciterra/vectorization/vectorizer.py @@ -7,13 +7,22 @@ class Vectorizer: def __init__(self) -> None: pass - def embed_documents(self, docs: list[str]) -> np.ndarray: + def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]: """Embed a list of documents into document vectors. Args: docs: the documents to embed. Returns: - a numpy array of shape `(num_documents, embedding_dim)` + a dict of the form + { + "embeddings": a numpy array of shape `(num_successful, embedding_dim)`, containing the document embeddingss + + "success_indices": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained. + + "fail_indices": a numpy array of shape `(len(docs) - num_successful,)`, containing the indices of all the documents for which document embeddings could not be obtained + } + where the indices are with respect to the `docs` list passed. + """ raise NotImplementedError diff --git a/src/sciterra/vectorization/word2vec.py b/src/sciterra/vectorization/word2vec.py index a298bcc..dfef981 100644 --- a/src/sciterra/vectorization/word2vec.py +++ b/src/sciterra/vectorization/word2vec.py @@ -89,28 +89,45 @@ def embed_documents(self, docs: list[str], **kwargs) -> np.ndarray: """Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents. Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time. - - Args: - docs: the documents to embed. 
- - Returns: - a numpy array of shape `(num_documents, 300)` """ - return np.array( - [ - np.mean( - [ - self.model.wv[word] - for word in self.tokenizer(doc) - if word in self.model.wv - ], # shape `(300,)` - axis=0, - ) - for doc in tqdm( - docs, - desc="embedding documents", - leave=True, - ) - ] - ) + # return np.array( + # [ + # np.mean( + # [ + # self.model.wv[word] + # for word in self.tokenizer(doc) + # if word in self.model.wv + # ], # shape `(300,)` + # axis=0, + # ) + # for doc in tqdm( + # docs, + # desc="embedding documents", + # leave=True, + # ) + # ] + # ) + means = [] + success_indices = [] + failed_indices = [] + for i, doc in tqdm(enumerate(docs), desc="embedding documents", leave=True): + mean = np.mean( + [ + self.model.wv[word] + for word in self.tokenizer(doc) + if word in self.model.wv + ], # shape `(300,)` + axis=0, + ) + if np.isnan(mean).any(): + failed_indices.append(i) + else: + means.append(mean) + success_indices.append(i) + + return { + "embeddings": np.array(means), + "success_indices": np.array(success_indices), + "fail_indices": np.array(failed_indices), + } diff --git a/src/tests/data/models/word2vec_model_example.model b/src/tests/data/models/word2vec_model_example.model new file mode 100644 index 0000000..e971c61 Binary files /dev/null and b/src/tests/data/models/word2vec_model_example.model differ diff --git a/src/tests/test_cartography.py b/src/tests/test_cartography.py index 1e61581..17e90a9 100644 --- a/src/tests/test_cartography.py +++ b/src/tests/test_cartography.py @@ -10,7 +10,7 @@ from sciterra.mapping.cartography import Cartographer, iterate_expand from sciterra.librarians.s2librarian import SemanticScholarLibrarian from sciterra.mapping.publication import Publication -from sciterra.vectorization.scibert import SciBERTVectorizer +from sciterra.vectorization import SciBERTVectorizer, Word2VecVectorizer bib_dir = "src/tests/data/bib" single_pub_bibtex_fp = f"{bib_dir}/single_publication.bib" @@ -171,6 +171,38 @@ def test_dummy_projection(self): vector1 = projection.identifiers_to_embeddings(["id_9"]) assert np.array_equal(vector0, vector1) + def test_dummy_projection_partial(self): + crt = Cartographer(vectorizer=Word2VecVectorizer()) + + pubs = [ + Publication( + { + "identifier": f"id_{0}", + "abstract": "We use cosmological hydrodynamic simulations with stellar feedback from the FIRE (Feedback In Realistic Environments) project to study the physical nature of Lyman limit systems (LLSs) at z ≤ 1.", # everything here should be in the Word2Vec default vocab, since it trains on this abstract. 
+ "publication_date": datetime(2023, 1, 1), + } + ), + Publication( + { + "identifier": f"id_{1}", + "abstract": "outofvocabularyitem", # this should not + "publication_date": datetime(2023, 1, 1), + } + ), + Publication( + { + "identifier": f"id_{2}", + "abstract": "We use cosmological hydrodynamic simulations with stellar feedback from the FIRE (Feedback In Realistic Environments) project to study the physical nature of Lyman limit systems (LLSs) at z ≤ 1.", + "publication_date": datetime(2023, 1, 1), + } + ), + ] + atl = Atlas(pubs) + + atl_proj = crt.project(atl) + + assert len(atl_proj) == 2 + def test_single_projection(self, tmp_path): path = tmp_path / atlas_dir path.mkdir() diff --git a/src/tests/test_vectorization.py b/src/tests/test_vectorization.py index 17f107f..5772407 100644 --- a/src/tests/test_vectorization.py +++ b/src/tests/test_vectorization.py @@ -7,8 +7,8 @@ from sciterra.vectorization.vectorizer import Vectorizer from sciterra.vectorization import scibert, sbert, word2vec -astro_corpus_1 = "src/tests/data/corpora/astro_1.txt" -model_path_1 = "src/tests/data/models/word2vec_model_astro_1.model" +astro_corpus_1 = "src/tests/data/corpora/example.txt" +model_path_1 = "src/tests/data/models/word2vec_model_example.model" abstract_str = "We use cosmological hydrodynamic simulations with stellar feedback from the FIRE (Feedback In Realistic Environments) project to study the physical nature of Lyman limit systems (LLSs) at z ≤ 1. At these low redshifts, LLSs are closely associated with dense gas structures surrounding galaxies, such as galactic winds, dwarf satellites and cool inflows from the intergalactic medium. Our analysis is based on 14 zoom-in simulations covering the halo mass range Mh ≈ 109-1013 M at z = 0, which we convolve with the dark matter halo mass function to produce cosmological statistics. We find that the majority of cosmologically selected LLSs are associated with haloes in the mass range 1010 ≲ Mh ≲ 1012 M. The incidence and H I column density distribution of simulated absorbers with columns in the range 10^{16.2} ≤ N_{H I} ≤ 2× 10^{20} cm-2 are consistent with observations. High-velocity outflows (with radial velocity exceeding the halo circular velocity by a factor of ≳ 2) tend to have higher metallicities ([X/H] ∼ -0.5) while very low metallicity ([X/H] < -2) LLSs are typically associated with gas infalling from the intergalactic medium. However, most LLSs occupy an intermediate region in metallicity-radial velocity space, for which there is no clear trend between metallicity and radial kinematics. The overall simulated LLS metallicity distribution has a mean (standard deviation) [X/H] = -0.9 (0.4) and does not show significant evidence for bimodality, in contrast to recent observational studies, but consistent with LLSs arising from haloes with a broad range of masses and metallicities." 
# 252 tokens @@ -22,7 +22,9 @@ class TestSciBERTVectorizer: embedding_dim = scibert.EMBEDDING_DIM def test_single_vector(self): - embedding = TestSciBERTVectorizer.vectorizer.embed_documents([abstract_str]) + embedding = TestSciBERTVectorizer.vectorizer.embed_documents([abstract_str])[ + "embeddings" + ] # Check embedding is of correct type, shape, and has no nans assert isinstance(embedding, np.ndarray) @@ -32,14 +34,14 @@ def test_single_vector(self): def test_identity_of_embeddings(self): embeddings = TestSciBERTVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + )["embeddings"] # check identity assert np.all(embeddings[0] == embeddings[1]) def test_single_cosine_pair(self): embeddings = TestSciBERTVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + )["embeddings"] # Check that the cosine sim of doc w/ itself is 1 # n.b., see sklearn.metrics.pairwise.cosine_similarity @@ -57,7 +59,7 @@ def test_basic_cosine_matrix(self): [ TestSciBERTVectorizer.vectorizer.embed_documents( [abstract_str] * num_pubs - ).flatten() + )["embeddings"].flatten() ] ) cosine_matrix = cosine_distances(embeddings, embeddings) @@ -74,7 +76,9 @@ class TestSBERTVectorizer: embedding_dim = sbert.EMBEDDING_DIM def test_single_vector(self): - embedding = TestSBERTVectorizer.vectorizer.embed_documents([abstract_str]) + embedding = TestSBERTVectorizer.vectorizer.embed_documents([abstract_str])[ + "embeddings" + ] # Check embedding is of correct type, shape, and has no nans assert isinstance(embedding, np.ndarray) @@ -84,14 +88,14 @@ def test_single_vector(self): def test_identity_of_embeddings(self): embeddings = TestSBERTVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + )["embeddings"] # check identity assert np.all(embeddings[0] == embeddings[1]) def test_single_cosine_pair(self): embeddings = TestSBERTVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + )["embeddings"] # Check that the cosine sim of doc w/ itself is 1 # n.b., see sklearn.metrics.pairwise.cosine_similarity @@ -109,7 +113,7 @@ def test_basic_cosine_matrix(self): [ TestSBERTVectorizer.vectorizer.embed_documents( [abstract_str] * num_pubs - ).flatten() + )["embeddings"].flatten() ] ) cosine_matrix = cosine_distances(embeddings, embeddings) @@ -131,24 +135,36 @@ class TestWord2VecVectorizer: embedding_dim = word2vec.EMBEDDING_DIM def test_single_vector(self): - embedding = TestWord2VecVectorizer.vectorizer.embed_documents([abstract_str]) + embedding = TestWord2VecVectorizer.vectorizer.embed_documents([abstract_str])[ + "embeddings" + ] # Check embedding is of correct type, shape, and has no nans assert isinstance(embedding, np.ndarray) assert embedding.shape == (1, TestWord2VecVectorizer.embedding_dim) assert not np.isnan(embedding).any() + def test_failed_embedding(self): + result = TestWord2VecVectorizer.vectorizer.embed_documents( + ["outofvocabularyitem"] + ) + + # Check embedding is of correct type, shape, and has no nans + assert np.array_equal(result["fail_indices"], np.array([0])) + assert np.array_equal(result["success_indices"], np.array([])) + assert np.array_equal(result["embeddings"], np.array([])) + def test_identity_of_embeddings(self): embeddings = TestWord2VecVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + )["embeddings"] # check identity assert np.all(embeddings[0] == embeddings[1]) def test_single_cosine_pair(self): embeddings = TestWord2VecVectorizer.vectorizer.embed_documents( [abstract_str, abstract_str] - ) + 
)["embeddings"] # Check that the cosine sim of doc w/ itself is 1 # n.b., see sklearn.metrics.pairwise.cosine_similarity @@ -166,7 +182,7 @@ def test_basic_cosine_matrix(self): [ TestWord2VecVectorizer.vectorizer.embed_documents( [abstract_str] * num_pubs - ).flatten() + )["embeddings"].flatten() ] ) cosine_matrix = cosine_distances(embeddings, embeddings)