From ee5013852764a47ed5f82edc86843359fc1f3d73 Mon Sep 17 00:00:00 2001 From: Nathaniel Imel Date: Thu, 14 Mar 2024 21:48:57 -0700 Subject: [PATCH] minor doc gen experimentation --- docs/sciterra/vectorization/preprocessing.html | 2 +- docs/search.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sciterra/vectorization/preprocessing.html b/docs/sciterra/vectorization/preprocessing.html index 936bf14..a136214 100644 --- a/docs/sciterra/vectorization/preprocessing.html +++ b/docs/sciterra/vectorization/preprocessing.html @@ -217,7 +217,7 @@

- CustomPreprocessor( allowed_pos_tags: set = {'ADJ', 'NOUN', 'VERB'}, model='en_core_web_sm') + CustomPreprocessor( allowed_pos_tags: set = {'NOUN', 'VERB', 'ADJ'}, model='en_core_web_sm') diff --git a/docs/search.js b/docs/search.js index 7e29e0a..86b3b75 100644 --- a/docs/search.js +++ b/docs/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();osciterra: a python library for similarity-based scientometrics

\n\n

\"build\"

\n\n

Sciterra is a software library to support data-driven analyses of scientific literature, with a focus on unifying different bibliographic database APIs and document-embedding methods for systematic scientometrics research.

\n\n

Overview

\n\n

The main purpose of sciterra is to perform similarity-based retrieval of scientific publications for metascience/scientometrics research. While many existing services simplify the individual steps of this process, this software library exists to

\n\n
    \n
  1. Unify the different APIs and vector-based retrieval methods

  2. \n
  3. Support scientometrics analyses of citation dynamics, especially with respect to a vectorized 'landscape' of literature.

  4. \n
\n\n

Installing sciterra

\n\n

First, set up a virtual environment (e.g. via miniconda, conda create -n sciterra, and conda activate sciterra).

\n\n
    \n
  1. Install sciterra via git:

    \n\n

    python -m pip install 'sciterra @ git+https://github.com/nathimel/sciterra.git'

  2. \n
  3. Alternatively, download or clone this repository, navigate to the root folder, and install locally:

    \n\n

    pip install -e .

  4. \n
  5. Because sciterra is still in development, this is not yet recommended, but you can also install from PyPI via pip:

    \n\n

    pip install sciterra

  6. \n
\n\n

Usage

\n\n

Atlas

\n\n

The central object in sciterra is the Atlas. This is a basic data structure for containing scientific publications that are returned from calls to various bibliographic database APIs.

\n\n

An Atlas minimally requires a list of Publications.

\n\n

Publication

\n\n

A publication object is a minimal wrapper around publication data, and should have a string identifier. It is designed to standardize the basic metadata contained in the results from some bibliographic database API.

\n\n
\n
from sciterra import Atlas, Publication\n\natl = Atlas([Publication({"identifier": "id"})])\n
\n
\n\n

Alternatively, you can construct an Atlas by passing in a .bib file. The entries in this bibtex file will be parsed for unique identifiers (e.g., DOIs), sent in an API call, and returned as Publications, which then populate an Atlas.

\n\n
\n
atl = crt.bibtex_to_atlas(bibtex_filepath)\n
\n
\n\n

In the line of code above, the variable crt is an instance of a Cartographer object, which encapsulates the bookkeeping involved in querying a bibliographic database for publications.

\n\n

Cartographer

\n\n

The Cartographer class is so named because it interfaces with an Atlas to build out a library of publications. Since it does so via similarity-based retrieval, the resulting Atlas can be considered a 'region' of publications.

\n\n

To do this, a Cartographer needs two things: an API with which to interface, and a way of getting document embeddings. These are encapsulated by the Librarian and Vectorizer classes, respectively.

\n\n
\n
from sciterra import Cartographer\nfrom sciterra.librarians import SemanticScholarLibrarian # or ADSLibrarian\nfrom sciterra.vectorization import SciBERTVectorizer # among others\n\ncrt = Cartographer(\n    librarian=SemanticScholarLibrarian(),\n    vectorizer=SciBERTVectorizer(),\n)\n
\n
\n\n

Librarian

\n\n

Each Librarian subclass is designed to be a wrapper for an existing python API service, such as the ads package or the semanticscholar client library.

\n\n

A Librarian subclass also overrides two methods. The first is get_publications, which takes a list of identifiers, queries the API specific to that Librarian, and returns a list of Publications. Keyword arguments can be passed to specify the metadata that is kept for each publication (e.g. date, title, journal, authors). The second method is convert_publication, which defines how the result of an API call should be converted to a sciterra Publication object. A minimal sketch of such a subclass is shown below.

\n\n
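For concreteness, here is a hypothetical sketch of a new Librarian subclass (eliding bibtex_entry_identifier for brevity); ToyRecord and ToyLibrarian are invented for illustration, and real subclasses also handle batching, retries, and richer metadata:

import from sciterra as follows:

from sciterra.librarians.librarian import Librarian
from sciterra.mapping.publication import Publication

class ToyRecord:
    """Stand-in for an API-specific result (e.g. an ads Article or semanticscholar Paper)."""
    def __init__(self, identifier: str):
        self.id = identifier
        self.abstract = "An abstract."
        self.citations: list[str] = []
        self.references: list[str] = []

class ToyLibrarian(Librarian):
    """Hypothetical Librarian subclass, for illustration only."""

    def get_publications(self, identifiers: list[str], *args, **kwargs) -> list[Publication]:
        # A real subclass queries its API here; we fabricate records instead.
        records = [ToyRecord(identifier) for identifier in identifiers]
        return self.convert_publications(records)

    def convert_publication(self, record: ToyRecord, *args, **kwargs) -> Publication:
        # Map the API-specific record onto sciterra's standardized fields.
        return Publication(
            {
                "identifier": record.id,
                "abstract": record.abstract,
                "citations": record.citations,
                "references": record.references,
            }
        )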

Contributions to sciterra in the form of new Librarian subclasses are encouraged and appreciated.

\n\n

Vectorizer

\n\n

Vectorizer subclasses override one function, embed_documents, which takes a list of strings (each representing the text of a publication; currently, just its abstract) and returns an np.ndarray of embeddings.

\n\n

Under the hood, the project method of Cartographer, which is used during similarity-based retrieval, uses the vectorizer roughly as follows:

\n\n
\n
# Get abstracts\ndocs = [atlas[identifier].abstract for identifier in identifiers]\n\n# Embed abstracts\nresult = vectorizer.embed_documents(docs)\nembeddings = result["embeddings"]\n\n# depending on the vectorizer, sometimes not all embeddings can be obtained due to out-of-vocab issues\nsuccess_indices = result["success_indices"] # shape `(len(embeddings),)`\nfail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings),)`\n
\n
\n\n

Currently, sciterra has vectorizers using SciBERT, SBERT, GPT-2, Word2Vec, and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated; a toy sketch of the interface follows.

\n\n
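By analogy with the Librarian sketch above, here is a hypothetical toy Vectorizer that only illustrates the return contract shown in the snippet above; no real vectorizer embeds documents this way:

import numpy as np
from sciterra.vectorization.vectorizer import Vectorizer

class LengthVectorizer(Vectorizer):
    """Hypothetical toy vectorizer: embeds a document as (character count, word count)."""

    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:
        embeddings = np.array([[len(doc), len(doc.split())] for doc in docs], dtype=float)
        return {
            "embeddings": embeddings,
            "success_indices": np.arange(len(docs)),  # every document "succeeds" here
            "fail_indices": np.array([], dtype=int),  # no out-of-vocab failures for this toy
        }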

Putting it all together

\n\n

The main use case for all of these ingredients is to iteratively build out a region of publications. This is done using iterate_expand:

\n\n
\n
from sciterra.mapping.tracing import iterate_expand\n\n# Assuming the initial atlas contains just one publication\n(atl.center, ) = atl.publications.values()\n# build out an atlas to contain 10,000 publications, with increasing dissimilarity to the initial publication, saving progress in binary files to the directory named "atlas".\niterate_expand(\n    atl=atl,\n    crt=crt,\n    atlas_dir="atlas",\n    target_size=10000,\n    center=atl.center,\n)\n
\n
\n\n

This method has a number of keyword arguments that enable tracking the Atlas expansion, limiting the number of publications per expansion, setting how many times to retry a request when there are connection issues, etc.

\n\n

In practice, it may be helpful to use the sciterra.mapping.tracing.AtlasTracer data structure to reduce most of the loading/initialization boilerplate described above; a rough sketch is shown below. For a full example, see main.py.

\n\n
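A rough sketch of that pattern, based on the AtlasTracer signature documented below (the .bib path is a placeholder):

from sciterra.mapping.tracing import AtlasTracer

tracer = AtlasTracer(
    atlas_dir="atlas",                 # directory where Atlas binaries are saved
    atlas_center_bibtex="center.bib",  # placeholder: a .bib file containing the central publication
    librarian_name="S2",               # or "ADS"
    vectorizer_name="SciBERT",         # or "SBERT", "GPT2", "Word2Vec", "BOW"
)
tracer.expand_atlas(target_size=10000)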

Additional features

\n\n\n\n

Acknowledgments

\n\n

This software is a reimplementation of Zachary Hafen-Saavedra's library, cc.

\n\n

To cite sciterra, please use the following workshop paper:

\n\n
@inproceedings{Imel2023,\n author = {Imel, Nathaniel and Hafen, Zachary},\n title = {Citation-similarity relationships in astrophysics},\n booktitle = {AI for Scientific Discovery: From Theory to Practice Workshop (AI4Science @ NeurIPS)},\n year = {2023},\n url = {https://openreview.net/pdf?id=mISayy7DPI},\n}\n
\n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarians", "modulename": "sciterra.librarians", "qualname": "librarians", "kind": "variable", "doc": "

Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arXiv does not provide one. While there is a useful pip-installable package for querying the arXiv API for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arXiv API's limit for a single query is 300,000 results.

\n", "default_value": "{'S2': <class 'sciterra.librarians.s2librarian.SemanticScholarLibrarian'>, 'ADS': <class 'sciterra.librarians.adslibrarian.ADSLibrarian'>}"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier', 'arxiv_class']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

Use the NASA ADS python package, which calls the ADS API to retrieve publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 50,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

Convert an ADS Article object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable unique identifier appropriate to the API.

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

Call an API and retrieve the publications corresponding to str identifiers.

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

Convert an API-specific resulting publication data structure into a sciterra Publication object.

\n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

\n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'fieldsOfStudy', 's2FieldsOfStudy', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundException'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.__init__", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.__init__", "kind": "function", "doc": "

\n", "signature": "(api_key: str = None, api_key_fn: str = None)"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

Classes for constructing maps of scientific literature.

\n\n

The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

\n\n

The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

\n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

Main container object for a large library of publications.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

Data structure for storing publications.

\n\n

self.projection: the Projection object containing the embeddings of all publications and their mapping to str identifiers.

\n\n

self.bad_ids: a list of identifiers that have failed for some reason or other during an expansion, and will be excluded from subsequent expansions.

\n\n

self.history: dict of the form {'pubs_per_update': list[list[str]], 'kernel_size': np.ndarray of ints of shape (num_pubs, last_update)}, where last_update <= the total number of expansions performed.

\n\n

self.center: the core, central Publication identifier repeatedly passed to cartography.Cartographer.expand. Default is None, which means the Atlas has no internal record of the central publication.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None,\tbad_ids: set[str] = set(),\thistory: dict[str, typing.Any] = None,\tcenter: str = None)"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.bad_ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.bad_ids", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.history", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.history", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.center", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.center", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.ids", "kind": "variable", "doc": "

Get a list of all the publication identifiers in the Atlas.

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

Write the Atlas to a directory containing a .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(self, atlas_dirpath: str, overwrite: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

Load an Atlas object from a directory containing the .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(cls, atlas_dirpath: str):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

\n"}, {"fullname": "sciterra.mapping.cartography.batch_cospsi_matrix", "modulename": "sciterra.mapping.cartography", "qualname": "batch_cospsi_matrix", "kind": "function", "doc": "

Batch-process a pairwise cosine similarity matrix between embeddings.

\n\n

In order to avoid memory errors (e.g. bus errors, segfaults) resulting from arrays that are too large, we batch-process the construction of the cospsi_matrix.

\n\n
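As a rough sketch of the batching idea (the helper name and batch size below are illustrative, not part of the actual API):

import numpy as np

def batched_cosine_similarity(embeddings: np.ndarray, batch_size: int = 1000) -> np.ndarray:
    # Normalize once so each batched block is a plain matrix product.
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    n = len(normed)
    out = np.empty((n, n), dtype=normed.dtype)
    for start in range(0, n, batch_size):
        stop = min(start + batch_size, n)
        # Fill one block of rows at a time to avoid huge intermediate arrays.
        out[start:stop] = normed[start:stop] @ normed.T
    return out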
Arguments:
\n\n\n\n
Returns:
\n\n
\n

cosine_similarities: a 2D numpy array of shape (num_pubs, num_pubs) representing the pairwise cosine similarity between each embedding

\n
\n", "signature": "(embeddings: numpy.ndarray) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_attributes", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_attributes", "kind": "function", "doc": "

Return True if a publication has all attributes.

\n\n
Arguments:
\n\n\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tattributes: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_fields_of_study", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_fields_of_study", "kind": "function", "doc": "

Return True if any of pub.fields_of_study are in the passed fields_of_study.

\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tfields_of_study: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

A basic wrapper for obtaining and updating atlas projections.

\n\n

self.librarian: the Librarian object used to query a bibliographic database API.\nself.vectorizer: the Vectorizer object used to get a document embedding for each abstract.\nself.pubs_per_update: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.\nself.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

\n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.pubs_per_update", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.pubs_per_update", "kind": "variable", "doc": "

\n", "annotation": ": list[list[str]]"}, {"fullname": "sciterra.mapping.cartography.Cartographer.update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.update_history", "kind": "variable", "doc": "

\n", "annotation": ": numpy.ndarray"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

\n\n

NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the identifier associated with the resulting Publication object (a paperId, a bibcode, etc.). Therefore, the purpose of using the bibtex_to_atlas method is primarily for initializing literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, you need to use get_publications.

\n\n
Arguments:
\n\n\n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the updated atlas containing all publications with nonempty abstracts, along with their projection

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl_expanded: the expanded atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t*args,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_func", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_func", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the filtered atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\trequire_func: Callable[[sciterra.mapping.publication.Publication], bool] = <function Cartographer.<lambda>>,\trecord_pubs_per_update=False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_ids", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_ids", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection).

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tkeep_ids: list[str] = None,\tdrop_ids: list[str] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.track", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.track", "kind": "function", "doc": "

Overwrite the data associated with tracking degree of convergence of publications in an atlas over multiple expansions. N.B.: the atlas must be fully projected, or else converged_kernel_size will raise a KeyError. By default, this function will overwrite the atl.history with updated self.pubs_per_update, but not kernel_size, which requires computing the converged kernel size for every publication in the atlas.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the updated Atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tcalculate_convergence: bool = False,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.record_update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.record_update_history", "kind": "function", "doc": "

Record when publications were added, by updating atl.update_history.

\n\n

atl.update_history is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n\n
Arguments:
\n\n\n\n
Updates:
\n\n
\n

self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n
\n\n
Returns:
\n\n
\n

None

\n
\n", "signature": "(\tself,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.converged_kernel_size", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.converged_kernel_size", "kind": "function", "doc": "

Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

kernel_size: an array of ints of shape (num_pubs, max_update) representing the kernel size for converged kernels.\n - The first column indicates the largest kernel size that hasn't changed since the beginning,\n - The second column indicates the largest kernel size that hasn't changed since the first update,\n - etc. for the nth column.

\n
\n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.measure_topography", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.measure_topography", "kind": "function", "doc": "

Measure topographic properties of all publications relative to prior\npublications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

estimates: an np.ndarray of shape (len(publication_indices), len(metrics)) representing the estimated topography metric values for each publication.

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tids: list[str] = None,\tmetrics: list[str] = ['density'],\tmin_prior_pubs: int = 2,\tkernel_size=16,\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

The general container for data for any scientific publication, regardless of the API that was used to obtain it.

\n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

\n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

\n\n
Attributes:
\n\n\n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

Construct a publication.

\n\n
Arguments:
\n\n\n", "signature": "(data: dict, **kwargs)"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

\n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

\n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.fields_of_study", "modulename": "sciterra.mapping.publication", "qualname": "Publication.fields_of_study", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

\n", "signature": "(self, data, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography", "modulename": "sciterra.mapping.topography", "kind": "module", "doc": "

Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

\n"}, {"fullname": "sciterra.mapping.topography.smoothing_length_metric", "modulename": "sciterra.mapping.topography", "qualname": "smoothing_length_metric", "kind": "function", "doc": "

Proxy for the density of a publication defined as the minimum\narc length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.density_metric", "modulename": "sciterra.mapping.topography", "qualname": "density_metric", "kind": "function", "doc": "

Estimate the density of a publication by calculating the\nsmoothing length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

density: a float representing kernel_size divided by arc length containing kernel_size other publications.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.edginess_metric", "modulename": "sciterra.mapping.topography", "qualname": "edginess_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a float representing the normalized magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.kernel_constant_asymmetry_metric", "modulename": "sciterra.mapping.topography", "qualname": "kernel_constant_asymmetry_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference\nbetween that publication's projection and the other publications within\nthe kernel.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

mag: a float representing the magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing", "modulename": "sciterra.mapping.tracing", "kind": "module", "doc": "

Convenience functionality for organized expansions of an Atlas.

\n"}, {"fullname": "sciterra.mapping.tracing.iterate_expand", "modulename": "sciterra.mapping.tracing", "qualname": "iterate_expand", "kind": "function", "doc": "

Build out an Atlas of publications, i.e. search for similar publications. This is done by iterating a sequence of [expand, save, project, save, track, save]. The convergence criterion is:

\n\n

converged = len(atl) >= target_size or failures >= max_failed_expansions or convergence_func(atl)

\n\n
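For instance, a hedged sketch of a custom convergence_func, which just needs to map an Atlas to a bool (the 1% threshold below is arbitrary, for illustration only):

# Hypothetical criterion: stop expanding once fewer than 1% of the atlas ids are bad.
def few_bad_ids(atl) -> bool:
    return len(atl.bad_ids) < 0.01 * len(atl)

# passed as: iterate_expand(..., convergence_func=few_bad_ids)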
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the expanded Atlas

\n
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tcrt: sciterra.mapping.cartography.Cartographer,\tatlas_dir: str,\ttarget_size: int,\tmax_failed_expansions: int = 2,\tconvergence_func: Callable[[sciterra.mapping.atlas.Atlas], bool] = <function <lambda>>,\tcenter: str = None,\tn_pubs_max: int = None,\tcall_size: int = None,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**project_kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.search_converged_ids", "modulename": "sciterra.mapping.tracing", "qualname": "search_converged_ids", "kind": "function", "doc": "

Get all publication ids that did not change neighborhood identity during the addition of the last num_pubs_added publications to the atlas over previous Cartographer.expand calls.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

converged_pub_ids: a list of Publication identifiers corresponding to publications that have converged according to the criterion.

\n
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tnum_pubs_added: int,\tkernel_size: int = 16) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer", "kind": "class", "doc": "

Convenience data structure for bookkeeping expansions of an Atlas that reduces boilerplate and ensures an aligned update history between the Atlas and Cartographer.

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.__init__", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.__init__", "kind": "function", "doc": "

Convenience wrapper data structure for tracked expansions, by aligning the history of a Cartographer with an Atlas.

\n\n
Arguments:
\n\n\n", "signature": "(\tatlas_dir: str,\tatlas_center_bibtex: str,\tlibrarian_name: str,\tvectorizer_name: str,\tlibrarian_kwargs: dict = {},\tvectorizer_kwargs: dict = {})"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.cartographer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.cartographer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas_dir", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas_dir", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.expand_atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.expand_atlas", "kind": "function", "doc": "

Start or continue the expansion of the Atlas by calling iterate_expand with aligned Cartographer and Atlas, by default centered on atl.center.

\n\n
Arguments:
\n\n\n", "signature": "(self, target_size: int, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.misc.analysis", "modulename": "sciterra.misc.analysis", "kind": "module", "doc": "

Helper functions for analyzing data yielded by an atlas.

\n"}, {"fullname": "sciterra.misc.analysis.atlas_to_measurements", "modulename": "sciterra.misc.analysis", "qualname": "atlas_to_measurements", "kind": "function", "doc": "

Compute the density, edginess, and citations per year metrics for each publication in an atlas w.r.t. a vectorizer and convergence configurations, and return the results in a dataframe.

\n\n
Arguments:
\n\n\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer,\tcon_d: float,\tkernel_size=16,\tmetrics: list[str] = ['density', 'edginess'],\tfields_of_study=None,\tmax_year: int = 2023) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

Miscellaneous helper functions.

\n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

\n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allows trying n_attempts times.

\n\n

API_extension::get_data_via_api

\n\n

This decorator is general, except for the default allowed exceptions.

\n\n
Arguments:
\n\n\n\n
Example Usage:
\n\n
\n
\n

@keep_trying(n_attempts=4)\ndef try_to_call_web_api():\n    \"do stuff\"

\n
\n
\n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

\n", "signature": "(ids: list[str], call_size):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.get_verbose", "modulename": "sciterra.misc.utils", "qualname": "get_verbose", "kind": "function", "doc": "

\n", "signature": "(kwargs: dict):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.custom_formatwarning", "modulename": "sciterra.misc.utils", "qualname": "custom_formatwarning", "kind": "function", "doc": "

\n", "signature": "(msg, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.vectorizers", "modulename": "sciterra.vectorization", "qualname": "vectorizers", "kind": "variable", "doc": "

\n", "default_value": "{'GPT2': <class 'sciterra.vectorization.gpt2.GPT2Vectorizer'>, 'SciBERT': <class 'sciterra.vectorization.scibert.SciBERTVectorizer'>, 'SBERT': <class 'sciterra.vectorization.sbert.SBERTVectorizer'>, 'Word2Vec': <class 'sciterra.vectorization.word2vec.Word2VecVectorizer'>, 'BOW': <class 'sciterra.vectorization.bow.BOWVectorizer'>}"}, {"fullname": "sciterra.vectorization.bow", "modulename": "sciterra.vectorization.bow", "kind": "module", "doc": "

Bag of words document embedder. Unlike cc vectorization, we fix the dimension of the embeddings to be the same; this requires us to fix the vocabulary, so for consistency we do so via the same method as the Word2Vec vocabulary construction.
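As a hedged illustration of the fixed-dimension idea (the toy vocabulary below stands in for the one sciterra builds from the Word2Vec corpus):

from sklearn.feature_extraction.text import CountVectorizer

vocabulary = ["galaxy", "star", "supernova"]  # toy stand-in for the shared Word2Vec vocabulary
bow = CountVectorizer(vocabulary=vocabulary)
# Every document maps to a vector of length len(vocabulary), so dimensions always match.
embeddings = bow.transform(["a star and a galaxy"]).toarray()  # shape (1, 3)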

\n"}, {"fullname": "sciterra.vectorization.bow.current_file_abs_path", "modulename": "sciterra.vectorization.bow", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.bow.corpora_path", "modulename": "sciterra.vectorization.bow", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.bow.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.__init__", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.__init__", "kind": "function", "doc": "

Construct a bag-of-words document vectorizer.

\n", "signature": "(*args, **kwargs)"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.word2vec_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.word2vec_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.vocabulary", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.vocabulary", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embedding_dim", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embedding_dim", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.count_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.count_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embed_documents", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into bow document vectors using scikit-learn's CountVectorizer.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, len(self.vocabulary))

\n
\n", "signature": "(self, docs: list[str], **kwargs) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.gpt2", "modulename": "sciterra.vectorization.gpt2", "kind": "module", "doc": "

GPT-2 is a large causal language model created by OpenAI that achieved SOTA on many NLP tasks before its successors.

\n\n
Links:
\n\n
\n \n
\n"}, {"fullname": "sciterra.vectorization.gpt2.MPS_DEVICE", "modulename": "sciterra.vectorization.gpt2", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.gpt2.EMBEDDING_DIM", "modulename": "sciterra.vectorization.gpt2", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.gpt2.BATCH_SIZE", "modulename": "sciterra.vectorization.gpt2", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "8"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.__init__", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.tokenizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.model", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.embed_documents", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into GPT-2 vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, embedding_dim)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 8) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.preprocessing", "modulename": "sciterra.vectorization.preprocessing", "kind": "module", "doc": "

Simple preprocessing of scientific abstracts prior to vectorization.

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor", "kind": "class", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.__init__", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.__init__", "kind": "function", "doc": "

Initialize a custom tokenizer.

\n\n
Arguments:
\n\n\n", "signature": "(\tallowed_pos_tags: set = {'ADJ', 'NOUN', 'VERB'},\tmodel='en_core_web_sm')"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.nlp", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.nlp", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.allowed_pos_tags", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.allowed_pos_tags", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.custom_preprocess", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.custom_preprocess", "kind": "function", "doc": "

Get all of the lemmas of the words in a document, filtering by POS.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a list of the lemmatized, filtered words in the document

\n
\n\n

Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

\n\n

See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
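
\n\n

A minimal usage sketch (assumes the spaCy en_core_web_sm model is installed; the example sentence and output are illustrative):

\n\n

from sciterra.vectorization.preprocessing import CustomPreprocessor\n\npreprocessor = CustomPreprocessor()  # keeps ADJ/NOUN/VERB lemmas by default\ntokens = preprocessor.custom_preprocess(\"The galaxies are rotating.\")  # roughly ['galaxy', 'rotate']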

\n", "signature": "(self, document: str) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

Basic wrapper for document embeddings and helper methods.

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.

\n\n
Arguments:
\n\n\n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

Retrieve the identifiers for a list of embedding matrix indices.

\n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

Retrieve the document embeddings for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_indices", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_indices", "kind": "function", "doc": "

Retrieve the embedding indices for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

Return the result of merging projection proj_a with projection proj_b.

\n\n

This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater or equal in size to proj_a.
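
\n\n

A minimal usage sketch (proj_a and proj_b are assumed to be existing Projection objects):

\n\n

merged = merge(proj_a, proj_b)  # contains all of proj_a, plus any embeddings only in proj_b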

\n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.get_empty_projection", "modulename": "sciterra.vectorization.projection", "qualname": "get_empty_projection", "kind": "function", "doc": "

Construct a Projection with no data (but which is not None).

\n", "signature": "() -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.sbert", "modulename": "sciterra.vectorization.sbert", "kind": "module", "doc": "

We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight, fast version of one of the top-performing models.

\n\n
Links:
\n\n
\n

sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.\n HF: https://huggingface.co/sentence-transformers

\n
\n"}, {"fullname": "sciterra.vectorization.sbert.MPS_DEVICE", "modulename": "sciterra.vectorization.sbert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.sbert.MODEL_PATH", "modulename": "sciterra.vectorization.sbert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'all-MiniLM-L6-v2'"}, {"fullname": "sciterra.vectorization.sbert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.sbert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "384"}, {"fullname": "sciterra.vectorization.sbert.MAX_SEQ_LENGTH", "modulename": "sciterra.vectorization.sbert", "qualname": "MAX_SEQ_LENGTH", "kind": "variable", "doc": "

\n", "default_value": "256"}, {"fullname": "sciterra.vectorization.sbert.BATCH_SIZE", "modulename": "sciterra.vectorization.sbert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.__init__", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.model", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SBERT vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 384)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

SciBERT is a BERT model trained on scientific text.

\n\n
Links:
\n\n
\n

Paper: https://aclanthology.org/D19-1371/\n Github: https://github.com/allenai/scibert\n HF: https://huggingface.co/allenai/scibert_scivocab_uncased

\n
\n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.scibert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.scibert.BATCH_SIZE", "modulename": "sciterra.vectorization.scibert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SciBERT vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 768)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

Base class for vectorizing abstracts.

\n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents into document vectors.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a dict of the form\n {\n \"embeddings\": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings

\n\n
\"success_indices\": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.\n\n\"fail_indices\": a numpy array of shape `(len(docs) - num_successful,)`, containing the indices of all the documents for which document embeddings could not be obtained\n
\n \n

}\n where the indices are with respect to the docs list passed.

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.word2vec", "modulename": "sciterra.vectorization.word2vec", "kind": "module", "doc": "

We use a simple word2vec model that computes a document vector by averaging the vectors of all words in the document.

\n\n

Since we are getting vectors for scientific documents, we must load a vocabulary to train the model from scratch. Therefore we define different subclasses for each scientific field, which may differ substantially by vocabulary.

\n\n

There exists a Doc2Vec module in gensim, but empirically Word2Vec + averaging seems to do just as well; furthermore, we're mainly interested in a simple baseline to compare with more sophisticated embeddings.

\n\n
Links:
\n\n
\n

gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#

\n
\n"}, {"fullname": "sciterra.vectorization.word2vec.EMBEDDING_DIM", "modulename": "sciterra.vectorization.word2vec", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "300"}, {"fullname": "sciterra.vectorization.word2vec.current_file_abs_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.word2vec.corpora_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.word2vec.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.__init__", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.__init__", "kind": "function", "doc": "

Construct a Word2Vec based document embedding model from a corpus.
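
\n\n

A minimal construction sketch (the corpus path is an assumption; any plain-text corpus file works):

\n\n

from sciterra.vectorization.word2vec import Word2VecVectorizer\n\nvectorizer = Word2VecVectorizer(corpus_path=\"corpora/astro_small.txt\")  # hypothetical path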

\n", "signature": "(\tcorpus_path: str,\tmodel_path: str = None,\tvector_size: int = 300,\twindow: int = 5,\tmin_count: int = 2,\tworkers: int = 8,\tepochs: int = 10,\ttokenizer: Callable[[str], list[str]] = None,\t**kwargs)"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.model", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.embed_documents", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

\n\n

Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.

\n", "signature": "(self, docs: list[str], **kwargs) -> numpy.ndarray:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "sciterra", "modulename": "sciterra", "kind": "module", "doc": "

sciterra: a python library for similarity-based scientometrics

\n\n

Sciterra is a software library to support data-driven analyses of scientific literature, with a focus on unifying different bibliographic database APIs and document-embedding methods for systematic scientometrics research.

\n\n

Overview

\n\n

The main purpose of sciterra is to perform similarity-based retrieval of scientific publications for metascience/scientometrics research. While there are many services that can make the individual steps of this simple, this software library exists to

\n\n
    \n
  1. Unify the different APIs and vector-based retrieval methods

  2. Support scientometrics analyses of citation dynamics, especially with respect to a vectorized 'landscape' of literature.

\n\n

Installing sciterra

\n\n

First, set up a virtual environment (e.g. via miniconda, conda create -n sciterra, and conda activate sciterra).

\n\n
    \n
  1. Install sciterra via git:

    \n\n

    python -m pip install 'sciterra @ git+https://github.com/nathimel/sciterra.git'

  2. Alternatively, download or clone this repository, navigate to the root folder, and install locally:

    \n\n

    pip install -e .

  3. You can also install via pip from pypi, though this is not yet recommended because sciterra is still in development:

    \n\n

    pip install sciterra

\n\n

Usage

\n\n

Atlas

\n\n

The central object in sciterra is the Atlas. This is a basic data structure for containing scientific publications that are returned from calls to various bibliographic database APIs.

\n\n

An Atlas minimally requires a list of Publications.

\n\n

Publication

\n\n

A publication object is a minimal wrapper around publication data, and should have a string identifier. It is designed to standardize the basic metadata contained in the results from some bibliographic database API.

\n\n
\n
from sciterra import Atlas, Publication\n\natl = Atlas([Publication({"identifier": "id"})])\n
\n
\n\n

Alternatively, you can construct an Atlas by passing in a .bib file. The entries in this bibtex file will be parsed for unique identifiers (e.g., DOIs), sent in an API call, and returned as Publications, which then populate an Atlas.

\n\n
\n
atl = crt.bibtex_to_atlas(bibtex_filepath)\n
\n
\n\n

In the line of code above, the variable crt is an instance of a Cartographer object, which encapsulates the bookkeeping involved in querying a bibliographic database for publications.

\n\n

Cartographer

\n\n

The Cartographer class is so named because it interfaces with an Atlas to build out a library of publications. Since it does so via similarity-based retrieval, the resulting Atlas can be considered a 'region' of publications.

\n\n

To do this, a Cartographer needs two things: an API with which to interface, and a way of getting document embeddings. Both are encapsulated, respectively, by the Librarian and the Vectorizer classes.

\n\n
\n
from sciterra import Cartographer\nfrom sciterra.librarians import SemanticScholarLibrarian # or ADSLibrarian\nfrom sciterra.vectorization import SciBERTVectorizer # among others\n\ncrt = Cartographer(\n    librarian=SemanticScholarLibrarian(),\n    vectorizer=SciBERTVectorizer(),\n)\n
\n
\n\n

Librarian

\n\n

Each Librarian subclass is designed to be a wrapper for an existing python API service, such as the ads package or the semanticscholar client library.

\n\n

A Librarian subclass also overrides two methods. The first is get_publications, which takes a list of identifiers, queries the specific API for that Librarian, and returns a list of Publications. Keyword arguments can be passed to specify the metadata that is kept for each publication (date, title, journal, authors, etc.). The second method is convert_publication, which defines how the result of an API call should be converted to a sciterra Publication object.

\n\n
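
As a hedged sketch (the subclass name and my_api_query helper are hypothetical, not part of sciterra), a new Librarian might look like:

\n\n
\n
from sciterra.librarians.librarian import Librarian\nfrom sciterra.mapping.publication import Publication\n\nclass MyLibrarian(Librarian):  # hypothetical subclass\n    def get_publications(self, identifiers: list[str], *args, **kwargs) -> list[Publication]:\n        # my_api_query is a placeholder for a real API client call\n        results = [my_api_query(identifier) for identifier in identifiers]\n        return self.convert_publications(results)\n\n    def convert_publication(self, pub, *args, **kwargs) -> Publication:\n        return Publication({"identifier": pub.id, "abstract": pub.abstract})\n
\n
\n\n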

Contributions to sciterra in the form of new Librarian subclasses are encouraged and appreciated.

\n\n

Vectorizer

\n\n

Vectorizer subclasses override one function, embed_documents, which takes a list of strings, each representing the text of a publication (currently, just its abstract), and returns a dict whose "embeddings" key holds an np.ndarray of embeddings, as sketched below.

\n\n
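
For instance, here is a hedged sketch of a trivial Vectorizer subclass that returns the dict described by the base class (the random embeddings are purely illustrative):

\n\n
\n
import numpy as np\nfrom sciterra.vectorization.vectorizer import Vectorizer\n\nclass RandomVectorizer(Vectorizer):  # hypothetical subclass\n    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:\n        # every document "succeeds" and receives a random 8-dimensional embedding\n        return {\n            "embeddings": np.random.rand(len(docs), 8),\n            "success_indices": np.arange(len(docs)),\n            "fail_indices": np.array([], dtype=int),\n        }\n
\n
\n\n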

Under the hood, the project method of Cartographer, which is used during similarity-based retrieval, uses the vectorizer roughly as follows

\n\n
\n
# Get abstracts\ndocs = [atlas[identifier].abstract for identifier in identifiers]\n\n# Embed abstracts\nresult = vectorizer.embed_documents(docs)\nembeddings = result["embeddings"]\n\n# depending on the vectorizer, sometimes not all embeddings can be obtained due to out-of-vocab issues\nsuccess_indices = result["success_indices"] # shape `(len(embeddings),)`\nfail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings))``\n
\n
\n\n

Currently, sciterra has vectorizers using SciBERT, SBERT, GPT-2, Word2Vec, and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.

\n\n

Putting it all together

\n\n

The main use case for all of these ingredients is to iteratively build out a region of publications. This is done using iterate_expand:

\n\n
\n
from sciterra.mapping.tracing import iterate_expand\n\n# Assuming the initial atlas contains just one publication\n(atl.center, ) = atl.publications.values()\n# build out an atlas to contain 10,000 publications, with increasing dissimilarity to the initial publication, saving progress in binary files to the directory named "atlas".\niterate_expand(\n    atl=atl,\n    crt=crt,\n    atlas_dir="atlas",\n    target_size=10000,\n    center=atl.center,\n)\n
\n
\n\n

This method has a number of keyword arguments that enable tracking the Atlas expansion, limiting the number of publications per expansion, setting how many times to retry a query when there are connection issues, etc.

\n\n

In practice, it may be helpful to use the sciterra.mapping.tracing.AtlasTracer data structure to reduce most of the loading/initialization boilerplate described above. For an example, see main.py.

\n\n
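
A hedged sketch of that pattern (the file paths are assumptions; the librarian and vectorizer names come from the registries documented below):

\n\n
\n
from sciterra.mapping.tracing import AtlasTracer\n\ntracer = AtlasTracer(\n    atlas_dir="atlas",\n    atlas_center_bibtex="center.bib",  # assumed path to a .bib file containing the seed publication\n    librarian_name="S2",\n    vectorizer_name="SciBERT",\n)\ntracer.expand_atlas(target_size=10000)\n
\n
\n\n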

Additional features

\n\n\n\n

Acknowledgments

\n\n

This software is a reimplementation of Zachary Hafen-Saavedra's library, cc.

\n\n

To cite sciterra, please use the following workshop paper:

\n\n
@inproceedings{Imel2023,\n author = {Imel, Nathaniel and Hafen, Zachary},\n title = {Citation-similarity relationships in astrophysics},\n booktitle = {AI for Scientific Discovery: From Theory to Practice Workshop (AI4Science @ NeurIPS)},\n year = {2023},\n url = {https://openreview.net/pdf?id=mISayy7DPI},\n}\n
\n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarians", "modulename": "sciterra.librarians", "qualname": "librarians", "kind": "variable", "doc": "

Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arxiv does not have one. While there is a useful pip-installable package for querying the arxiv api for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arxiv API's limit for a single query is 300,000 results.

\n", "default_value": "{'S2': <class 'sciterra.librarians.s2librarian.SemanticScholarLibrarian'>, 'ADS': <class 'sciterra.librarians.adslibrarian.ADSLibrarian'>}"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier', 'arxiv_class']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

Use the NASA ADS python package, which calls the ADS API to retrieve publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 50,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

Convert an ADS Article object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable unique identifier appropriate to the API.

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

Call an API and retrieve the publications corresponding to str identifiers.

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

Convert an API-specific resulting publication data structure into a sciterra Publication object.

\n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

\n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'fieldsOfStudy', 's2FieldsOfStudy', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundException'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.__init__", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.__init__", "kind": "function", "doc": "

\n", "signature": "(api_key: str = None, api_key_fn: str = None)"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

Classes for constructing maps of scientific literature.

\n\n

The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

\n\n

The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

\n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

Main container object for a large library of publications.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

Data structure for storing publications.

\n\n

self.projection: the Projection object containing the embeddings of all publications and their mapping to str identifiers.

\n\n

self.bad_ids: a list of identifiers that have failed for some reason or other during an expansion, and will be excluded from subsequent expansions.

\n\n

self.history: dict of the form {'pubs_per_update': list[list[str]], 'kernel_size': np.ndarray of ints of shape (num_pubs, last_update) where last_update <= the total number of expansions performed.}

\n\n

self.center: the core, central Publication identifier repeatedly passed to cartography.Cartographer.expand. Default is None, which means the Atlas has no internal record of the central publication.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None,\tbad_ids: set[str] = set(),\thistory: dict[str, typing.Any] = None,\tcenter: str = None)"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.bad_ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.bad_ids", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.history", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.history", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.center", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.center", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.ids", "kind": "variable", "doc": "

Get a list of all the publication identifiers in the Atlas.

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

Write the Atlas to a directory containing a .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(self, atlas_dirpath: str, overwrite: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

Load an Atlas object from a directory containing the .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(cls, atlas_dirpath: str):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

\n"}, {"fullname": "sciterra.mapping.cartography.batch_cospsi_matrix", "modulename": "sciterra.mapping.cartography", "qualname": "batch_cospsi_matrix", "kind": "function", "doc": "

Batch-process a pairwise cosine similarity matrix between embeddings.

\n\n

In order to avoid memory errors (e.g. bus errors, segfaults) resulting from arrays that are too large, we batch-process the construction of the cospsi_matrix.

\n\n
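
A minimal usage sketch (the random array is a stand-in for real document embeddings):

\n\n

import numpy as np\n\nembeddings = np.random.rand(10000, 768)  # hypothetical embeddings\ncospsi = batch_cospsi_matrix(embeddings)  # shape (10000, 10000)

\n\n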
Arguments:
\n\n\n\n
Returns:
\n\n
\n

cosine_similarities: a 2D numpy array of shape (num_pubs, num_pubs) representing the pairwise cosine similarity between each embedding

\n
\n", "signature": "(embeddings: numpy.ndarray) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_attributes", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_attributes", "kind": "function", "doc": "

Return True if a publication has all of the given attributes.

\n\n
Arguments:
\n\n\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tattributes: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_fields_of_study", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_fields_of_study", "kind": "function", "doc": "

Return True if any of pub.fields_of_study are in the passed fields_of_study.

\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tfields_of_study: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

A basic wrapper for obtaining and updating atlas projections.

\n\n

self.librarian: the Librarian object used to query a bibliographic database API.\nself.vectorizer: the Vectorizer object used to get a document embedding for each abstract\nself.pubs_per_update: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.\nself.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

\n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.pubs_per_update", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.pubs_per_update", "kind": "variable", "doc": "

\n", "annotation": ": list[list[str]]"}, {"fullname": "sciterra.mapping.cartography.Cartographer.update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.update_history", "kind": "variable", "doc": "

\n", "annotation": ": numpy.ndarray"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

\n\n

NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the resulting identifier associated with the resulting Publication object, (a paperId, a bibcode, etc.) Therefore, the purpose of using the bibtex_to_atlas method is primarily for initializing literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, you need to use get_publications.

\n\n
Arguments:
\n\n\n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the updated atlas containing all publications with nonempty abstracts, together with their projection

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl_expanded: the expanded atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t*args,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_func", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_func", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the filtered atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\trequire_func: Callable[[sciterra.mapping.publication.Publication], bool] = <function Cartographer.<lambda>>,\trecord_pubs_per_update=False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_ids", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_ids", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection).

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tkeep_ids: list[str] = None,\tdrop_ids: list[str] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.track", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.track", "kind": "function", "doc": "

Overwrite the data associated with tracking degree of convergence of publications in an atlas over multiple expansions. N.B.: the atlas must be fully projected, or else converged_kernel_size will raise a KeyError. By default, this function will overwrite the atl.history with updated self.pubs_per_update, but not kernel_size, which requires computing the converged kernel size for every publication in the atlas.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the updated Atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tcalculate_convergence: bool = False,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.record_update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.record_update_history", "kind": "function", "doc": "

Record when publications were added, by updating atl.update_history.

\n\n

atl.update_history is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n\n
Arguments:
\n\n\n\n
Updates:
\n\n
\n

self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n
\n\n
Returns:
\n\n
\n

None

\n
\n", "signature": "(\tself,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.converged_kernel_size", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.converged_kernel_size", "kind": "function", "doc": "

Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

kernel_size: an array of ints of shape (num_pubs, max_update) representing the kernel size for converged kernels.\n - The first column indicates the largest kernel size that hasn't changed since the beginning,\n - The second column indicates the largest kernel size that hasn't changed since the first update,\n - etc. for the nth column.

\n
\n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.measure_topography", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.measure_topography", "kind": "function", "doc": "

Measure topographic properties of all publications relative to prior\npublications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

estimates: an np.ndarray of shape (len(publication_indices), len(metrics)) representing the estimated topography metric values for each publication.

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tids: list[str] = None,\tmetrics: list[str] = ['density'],\tmin_prior_pubs: int = 2,\tkernel_size=16,\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

The general container for data for any scientific publication, regardless of the API that was used to obtain it.

\n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

\n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

\n\n
Attributes:
\n\n\n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

Construct a publication.

\n\n
Arguments:
\n\n\n", "signature": "(data: dict, **kwargs)"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

\n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

\n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.fields_of_study", "modulename": "sciterra.mapping.publication", "qualname": "Publication.fields_of_study", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

\n", "signature": "(self, data, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography", "modulename": "sciterra.mapping.topography", "kind": "module", "doc": "

Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

\n"}, {"fullname": "sciterra.mapping.topography.smoothing_length_metric", "modulename": "sciterra.mapping.topography", "qualname": "smoothing_length_metric", "kind": "function", "doc": "

Proxy for the density of a publication defined as the minimum\narc length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.density_metric", "modulename": "sciterra.mapping.topography", "qualname": "density_metric", "kind": "function", "doc": "

Estimate the density of a publication by calculating the\nsmoothing length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

density: a float representing kernel_size divided by arc length containing kernel_size other publications.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.edginess_metric", "modulename": "sciterra.mapping.topography", "qualname": "edginess_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a float representing the normalized magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.kernel_constant_asymmetry_metric", "modulename": "sciterra.mapping.topography", "qualname": "kernel_constant_asymmetry_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference\nbetween that publication's projection and the other publications within\nthe kernel.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

mag: a float representing the magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing", "modulename": "sciterra.mapping.tracing", "kind": "module", "doc": "

Convenience functionality for organized expansions of an Atlas.

\n"}, {"fullname": "sciterra.mapping.tracing.iterate_expand", "modulename": "sciterra.mapping.tracing", "qualname": "iterate_expand", "kind": "function", "doc": "

Build out an Atlas of publications, i.e. search for similar publications. This is done by iterating a sequence of [expand, save, project, save, track, save]. The convergence criterion is:

\n\n

converged = len(atl) >= target_size or failures >= max_failed_expansions or convergence_func(atl)

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the expanded Atlas

\n
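A hypothetical driver call, assuming an Atlas atl and Cartographer crt already exist (argument names follow the signature below):

    from sciterra.mapping.tracing import iterate_expand

    atl = iterate_expand(
        atl,
        crt,
        atlas_dir='atlas/',                # hypothetical directory
        target_size=10_000,
        max_failed_expansions=2,
        convergence_func=lambda a: False,  # rely on target_size alone
    )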
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tcrt: sciterra.mapping.cartography.Cartographer,\tatlas_dir: str,\ttarget_size: int,\tmax_failed_expansions: int = 2,\tconvergence_func: Callable[[sciterra.mapping.atlas.Atlas], bool] = <function <lambda>>,\tcenter: str = None,\tn_pubs_max: int = None,\tcall_size: int = None,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**project_kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.search_converged_ids", "modulename": "sciterra.mapping.tracing", "qualname": "search_converged_ids", "kind": "function", "doc": "

Get all publication ids whose neighborhood identity did not change while the last num_pubs_added publications were added to the atlas during previous Cartographer.expand calls.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

converged_pub_ids: a list of Publication identifiers corresponding to publications that have converged according to the criterion.

\n
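For example (hypothetical numbers, assuming an expanded Atlas atl):

    from sciterra.mapping.tracing import search_converged_ids

    # ids of publications whose kernel_size nearest neighborhoods were stable
    # over the last 100 additions
    stable_ids = search_converged_ids(atl, num_pubs_added=100, kernel_size=16)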
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tnum_pubs_added: int,\tkernel_size: int = 16) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer", "kind": "class", "doc": "

Convenience data structure for bookkeeping expansions of an Atlas that reduces boilerplate and ensures an aligned update history between the Atlas and Cartographer.

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.__init__", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.__init__", "kind": "function", "doc": "

Convenience wrapper data structure for tracked expansions, aligning the history of a Cartographer with an Atlas.

\n\n
Arguments:
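A hypothetical setup: the paths are placeholders, librarian_name='S2' is an assumption about the registered librarian keys, and 'SciBERT' is a key from sciterra.vectorization.vectorizers:

    from sciterra.mapping.tracing import AtlasTracer

    tracer = AtlasTracer(
        atlas_dir='atlas/',
        atlas_center_bibtex='center.bib',
        librarian_name='S2',          # assumed librarian key
        vectorizer_name='SciBERT',
    )
    tracer.expand_atlas(target_size=10_000)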
\n\n\n", "signature": "(\tatlas_dir: str,\tatlas_center_bibtex: str,\tlibrarian_name: str,\tvectorizer_name: str,\tlibrarian_kwargs: dict = {},\tvectorizer_kwargs: dict = {})"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.cartographer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.cartographer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas_dir", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas_dir", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.expand_atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.expand_atlas", "kind": "function", "doc": "

Start or continue the expansion of the Atlas by calling iterate_expand with an aligned Cartographer and Atlas, centered by default on atl.center.

\n\n
Arguments:
\n\n\n", "signature": "(self, target_size: int, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.misc.analysis", "modulename": "sciterra.misc.analysis", "kind": "module", "doc": "

Helper functions for analyzing data yielded by an atlas.

\n"}, {"fullname": "sciterra.misc.analysis.atlas_to_measurements", "modulename": "sciterra.misc.analysis", "qualname": "atlas_to_measurements", "kind": "function", "doc": "

Compute the density, edginess, and citations-per-year metrics for each publication in an atlas w.r.t. a vectorizer and convergence configurations, and return the results in a dataframe.

\n\n
Arguments:
\n\n\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer,\tcon_d: float,\tkernel_size=16,\tmetrics: list[str] = ['density', 'edginess'],\tfields_of_study=None,\tmax_year: int = 2023) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

Miscellaneous helper functions.

\n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

\n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allows retrying up to n_attempts times.

\n\n

API_extension::get_data_via_api

\n\n

This decorator is general, except for the default allowed exceptions.

\n\n
Arguments:
\n\n\n\n
Example Usage:
\n\n
\n
\n

@keep_trying(n_attempts=4)\ndef try_to_call_web_api():\n \"do stuff\"

\n
\n
\n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

\n", "signature": "(ids: list[str], call_size):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.get_verbose", "modulename": "sciterra.misc.utils", "qualname": "get_verbose", "kind": "function", "doc": "

\n", "signature": "(kwargs: dict):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.custom_formatwarning", "modulename": "sciterra.misc.utils", "qualname": "custom_formatwarning", "kind": "function", "doc": "

\n", "signature": "(msg, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.vectorizers", "modulename": "sciterra.vectorization", "qualname": "vectorizers", "kind": "variable", "doc": "

\n", "default_value": "{'GPT2': <class 'sciterra.vectorization.gpt2.GPT2Vectorizer'>, 'SciBERT': <class 'sciterra.vectorization.scibert.SciBERTVectorizer'>, 'SBERT': <class 'sciterra.vectorization.sbert.SBERTVectorizer'>, 'Word2Vec': <class 'sciterra.vectorization.word2vec.Word2VecVectorizer'>, 'BOW': <class 'sciterra.vectorization.bow.BOWVectorizer'>}"}, {"fullname": "sciterra.vectorization.bow", "modulename": "sciterra.vectorization.bow", "kind": "module", "doc": "

Bag-of-words document embedder. Unlike the cc vectorization, we fix the embeddings to a common dimension; this requires fixing the vocabulary, which for consistency we construct via the same method as the Word2Vec vocabulary.

\n"}, {"fullname": "sciterra.vectorization.bow.current_file_abs_path", "modulename": "sciterra.vectorization.bow", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.bow.corpora_path", "modulename": "sciterra.vectorization.bow", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.bow.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.__init__", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.__init__", "kind": "function", "doc": "

Construct a bag-of-words document vectorizer.

\n", "signature": "(*args, **kwargs)"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.word2vec_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.word2vec_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.vocabulary", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.vocabulary", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embedding_dim", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embedding_dim", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.count_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.count_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embed_documents", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into bag-of-words document vectors using scikit-learn's CountVectorizer.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, len(self.vocabulary))

\n
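A minimal sketch of the fixed-vocabulary idea with scikit-learn's CountVectorizer (toy vocabulary, not the Word2Vec-derived one):

    from sklearn.feature_extraction.text import CountVectorizer

    vocab = ['galaxy', 'star', 'cluster']     # toy stand-in vocabulary
    cv = CountVectorizer(vocabulary=vocab)    # fixed vocab: no fitting needed
    X = cv.transform(['star cluster in a galaxy', 'star star'])
    print(X.toarray())                        # shape (num_documents, len(vocab))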
\n", "signature": "(self, docs: list[str], **kwargs) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.gpt2", "modulename": "sciterra.vectorization.gpt2", "kind": "module", "doc": "

GPT-2 is a large causal language model created by OpenAI that achieved SOTA on many NLP tasks before its successors.

\n\n
Links:
\n\n
\n \n
\n"}, {"fullname": "sciterra.vectorization.gpt2.MPS_DEVICE", "modulename": "sciterra.vectorization.gpt2", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.gpt2.EMBEDDING_DIM", "modulename": "sciterra.vectorization.gpt2", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.gpt2.BATCH_SIZE", "modulename": "sciterra.vectorization.gpt2", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "8"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.__init__", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.tokenizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.model", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.embed_documents", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into GPT-2 vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, embedding_dim)

\n
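A hedged sketch of one way to pool GPT-2 hidden states into document vectors (the pooling strategy here is an assumption, not necessarily the library's choice):

    import torch
    from transformers import GPT2Tokenizer, GPT2Model

    tok = GPT2Tokenizer.from_pretrained('gpt2')
    tok.pad_token = tok.eos_token                # GPT-2 ships without a pad token
    model = GPT2Model.from_pretrained('gpt2').eval()

    batch = tok(['an abstract', 'another abstract'],
                return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state    # (batch, seq, 768)
    mask = batch['attention_mask'].unsqueeze(-1)
    vectors = (hidden * mask).sum(1) / mask.sum(1)   # mean over real tokens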
\n", "signature": "(self, docs: list[str], batch_size: int = 8) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.preprocessing", "modulename": "sciterra.vectorization.preprocessing", "kind": "module", "doc": "

Simple preprocessing of scientific abstracts prior to vectorization.

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor", "kind": "class", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.__init__", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.__init__", "kind": "function", "doc": "

Initialize a custom tokenizer.

\n\n
Arguments:
\n\n\n", "signature": "(\tallowed_pos_tags: set = {'NOUN', 'VERB', 'ADJ'},\tmodel='en_core_web_sm')"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.nlp", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.nlp", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.allowed_pos_tags", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.allowed_pos_tags", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.custom_preprocess", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.custom_preprocess", "kind": "function", "doc": "

Get all of the lemmas of the words in a document, filtering by POS.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a list of the lemmatized, filtered words in the document

\n
\n\n

Given the domain-specificity, we choose to stem heuristically instead of performing full, linguistically precise lemmatization, which would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

\n\n

See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
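For instance, POS-filtered lemma extraction with spaCy looks roughly like this (assumes en_core_web_sm is installed):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    allowed = {'NOUN', 'VERB', 'ADJ'}
    doc = nlp('We observed rapidly rotating galaxies.')
    print([tok.lemma_ for tok in doc if tok.pos_ in allowed])
    # e.g. ['observe', 'rotate', 'galaxy']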

\n", "signature": "(self, document: str) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

Basic wrapper for document embeddings and helper methods.

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.

\n\n
Arguments:
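A toy two-document projection (384 is a hypothetical embedding dimension):

    import numpy as np
    from sciterra.vectorization.projection import Projection

    proj = Projection(
        identifier_to_index={'paperA': 0, 'paperB': 1},
        index_to_identifier=('paperA', 'paperB'),
        embeddings=np.zeros((2, 384)),
    )
    proj.identifiers_to_embeddings(['paperB'])   # -> array of shape (1, 384)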
\n\n\n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

Retrieve the identifiers for a list of embedding matrix indices.

\n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

Retrieve the document embeddings for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_indices", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_indices", "kind": "function", "doc": "

Retrieve the embedding indices for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

Return the result of merging projection proj_a with projection proj_b.

\n\n

This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater than or equal in size to proj_a.

\n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.get_empty_projection", "modulename": "sciterra.vectorization.projection", "qualname": "get_empty_projection", "kind": "function", "doc": "

Construct a Projection with no data (but which is not None).

\n", "signature": "() -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.sbert", "modulename": "sciterra.vectorization.sbert", "kind": "module", "doc": "

We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight, fast version of one of the top-performing models.

\n\n
Links:
\n\n
\n

sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.\n HF: https://huggingface.co/sentence-transformers

\n
\n"}, {"fullname": "sciterra.vectorization.sbert.MPS_DEVICE", "modulename": "sciterra.vectorization.sbert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.sbert.MODEL_PATH", "modulename": "sciterra.vectorization.sbert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'all-MiniLM-L6-v2'"}, {"fullname": "sciterra.vectorization.sbert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.sbert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "384"}, {"fullname": "sciterra.vectorization.sbert.MAX_SEQ_LENGTH", "modulename": "sciterra.vectorization.sbert", "qualname": "MAX_SEQ_LENGTH", "kind": "variable", "doc": "

\n", "default_value": "256"}, {"fullname": "sciterra.vectorization.sbert.BATCH_SIZE", "modulename": "sciterra.vectorization.sbert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.__init__", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.model", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SBERT vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 384)

\n
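The core of this is the sentence-transformers encode call; a minimal sketch using the MODEL_PATH above:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = model.encode(['an abstract', 'another abstract'], batch_size=64)
    print(vectors.shape)                          # (2, 384)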
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

SciBERT is a BERT model trained on scientific text.

\n\n
Links:
\n\n
\n

Paper: https://aclanthology.org/D19-1371/\n Github: https://github.com/allenai/scibert\n HF: https://huggingface.co/allenai/scibert_scivocab_uncased

\n
\n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.scibert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.scibert.BATCH_SIZE", "modulename": "sciterra.vectorization.scibert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SciBERT vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 768)

\n
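A sketch with Hugging Face transformers; pooling via the [CLS] token is an assumption, not necessarily the library's choice:

    import torch
    from transformers import AutoTokenizer, AutoModel

    tok = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased').eval()
    batch = tok(['an abstract'], return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        cls_vec = model(**batch).last_hidden_state[:, 0]   # (1, 768) [CLS] vector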
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

Base class for vectorizing abstracts.

\n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents into document vectors.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a dict of the form\n {\n \"embeddings\": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings

\n\n
\"success_indices\": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.\n\n\"fail_indices\": a numpy array of shape `(len(docs) - num_successful,)`, containing the indices of all the documents for which document embeddings could not be obtained\n
\n \n

}\n where the indices are with respect to the docs list passed.

\n
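A hypothetical consumer of this return structure, assuming a concrete Vectorizer instance vectorizer and a list of strings docs, realigning documents with their embeddings:

    result = vectorizer.embed_documents(docs)
    embeddings = result['embeddings']        # (num_successful, embedding_dim)
    ok = result['success_indices']
    kept_docs = [docs[i] for i in ok]        # kept_docs[j] matches embeddings[j]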
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.word2vec", "modulename": "sciterra.vectorization.word2vec", "kind": "module", "doc": "

We use a simple word2vec model that gets a document vector by averaging the vectors of all words in the document.

\n\n

Since we are getting vectors for scientific documents, we must train the model from scratch on a domain corpus in order to build an appropriate vocabulary. Therefore we define different subclasses for each scientific field, since fields may differ substantially in vocabulary.

\n\n

Gensim also provides a Doc2Vec module, but empirically Word2Vec plus averaging seems to do just as well; furthermore, we're mainly interested in a simple baseline to compare with more sophisticated embeddings.

\n\n
Links:
\n\n
\n

gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#

\n
\n"}, {"fullname": "sciterra.vectorization.word2vec.EMBEDDING_DIM", "modulename": "sciterra.vectorization.word2vec", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "300"}, {"fullname": "sciterra.vectorization.word2vec.current_file_abs_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.word2vec.corpora_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.word2vec.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.__init__", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.__init__", "kind": "function", "doc": "

Construct a Word2Vec based document embedding model from a corpus.

\n", "signature": "(\tcorpus_path: str,\tmodel_path: str = None,\tvector_size: int = 300,\twindow: int = 5,\tmin_count: int = 2,\tworkers: int = 8,\tepochs: int = 10,\ttokenizer: Callable[[str], list[str]] = None,\t**kwargs)"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.model", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.embed_documents", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

\n\n

Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.
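A gensim sketch of the per-document averaging (toy corpus; the real model trains on DEFAULT_CORPUS, and vector_size=300 matches EMBEDDING_DIM above):

    import numpy as np
    from gensim.models import Word2Vec

    w2v = Word2Vec(sentences=[['star', 'galaxy'], ['dark', 'matter']],
                   vector_size=300, min_count=1)
    tokens = [t for t in 'star galaxy matter'.split() if t in w2v.wv]
    doc_vec = np.mean([w2v.wv[t] for t in tokens], axis=0)   # shape (300,)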

\n", "signature": "(self, docs: list[str], **kwargs) -> numpy.ndarray:", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough.