From ee5013852764a47ed5f82edc86843359fc1f3d73 Mon Sep 17 00:00:00 2001 From: Nathaniel Imel Date: Thu, 14 Mar 2024 21:48:57 -0700 Subject: [PATCH] minor doc gen experimentation --- docs/sciterra/vectorization/preprocessing.html | 2 +- docs/search.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sciterra/vectorization/preprocessing.html b/docs/sciterra/vectorization/preprocessing.html index 936bf14..a136214 100644 --- a/docs/sciterra/vectorization/preprocessing.html +++ b/docs/sciterra/vectorization/preprocessing.html @@ -217,7 +217,7 @@

- CustomPreprocessor( allowed_pos_tags: set = {'ADJ', 'NOUN', 'VERB'}, model='en_core_web_sm') + CustomPreprocessor( allowed_pos_tags: set = {'NOUN', 'VERB', 'ADJ'}, model='en_core_web_sm') diff --git a/docs/search.js b/docs/search.js index 7e29e0a..86b3b75 100644 --- a/docs/search.js +++ b/docs/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();osciterra: a python library for similarity-based scientometrics

\n\n

\"build\"

\n\n

Sciterra is a software library to support data-driven analyses of scientific literature, with a focus on unifying different bibliographic database APIs and document-embedding methods for systematic scientometrics research.

\n\n

Overview

\n\n

The main purpose of sciterra is to perform similarity-based retrieval of scientific publications for metascience/scientometrics research. While many existing services simplify the individual steps of this process, this software library exists to

\n\n
    \n
  1. Unify the different APIs and vector-based retrieval methods

  2. \n
  3. Support scientometrics analyses of citation dynamics, especially with respect to a vectorized 'landscape' of literature.

  4. \n
\n\n

Installing sciterra

\n\n

First, set up a virtual environment (e.g. via miniconda, conda create -n sciterra, and conda activate sciterra).

\n\n
    \n
  1. Install sciterra via git:

    \n\n

    python -m pip install 'sciterra @ git+https://github.com/nathimel/sciterra.git'

  2. \n
  3. Alternatively, download or clone this repository, navigate to the root folder, and install locally:

    \n\n

    pip install -e .

  4. \n
  5. Because sciterra is still in development, this is not yet recommended, but you can also install from PyPI via pip:

    \n\n

    pip install sciterra

  6. \n
\n\n

Usage

\n\n

Atlas

\n\n

The central object in sciterra is the Atlas. This is a basic data structure for containing scientific publications that are returned from calls to various bibliographic database APIs.

\n\n

An Atlas minimally requires a list of Publications.

\n\n

Publication

\n\n

A publication object is a minimal wrapper around publication data, and should have a string identifier. It is designed to standardize the basic metadata contained in the results from some bibliographic database API.

\n\n
\n
from sciterra import Atlas, Publication\n\natl = Atlas([Publication({"identifier": "id"})])\n
\n
\n\n

Alternatively, you can construct an Atlas by passing in a .bib file. The entries in this bibtex file will be parsed for unique identifiers (e.g., DOIs), sent in an API call, and returned as Publications, which then populate an Atlas.

\n\n
\n
atl = crt.bibtex_to_atlas(bibtex_filepath)\n
\n
\n\n

In the line of code above, the variable crt is an instance of a Cartographer object, which encapsulates the bookkeeping involved in querying a bibliographic database for publications.

\n\n

Cartographer

\n\n

The Cartographer class is so named because it interfaces with an Atlas to build out a library of publications. Since it does so via similarity-based retrieval, the resulting Atlas can be considered a 'region' of publications.

\n\n

To do this, a Cartographer needs two things: an API with which to interface, and a way of getting document embeddings. These are encapsulated by the Librarian and Vectorizer classes, respectively.

\n\n
\n
from sciterra import Cartographer\nfrom sciterra.librarians import SemanticScholarLibrarian # or ADSLibrarian\nfrom sciterra.vectorization import SciBERTVectorizer # among others\n\ncrt = Cartographer(\n    librarian=SemanticScholarLibrarian(),\n    vectorizer=SciBERTVectorizer(),\n)\n
\n
\n\n

Librarian

\n\n

Each Librarian subclass is designed to be a wrapper for an existing python API service, such as the ads package or the semanticscholar client library.

\n\n

A Librarian subclass also overrides two methods. The first is get_publications, which takes a list of identifiers, queries the API specific to that Librarian, and returns a list of Publications. Keyword arguments can be passed to specify the metadata that is kept for each publication (e.g. date, title, journal, authors). The second method is convert_publication, which defines how the result of an API call should be converted to a sciterra Publication object. A minimal sketch of such a subclass is shown below.

\n\n
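For concreteness, here is a hypothetical sketch of a new Librarian subclass (eliding bibtex_entry_identifier for brevity); ToyRecord and ToyLibrarian are invented for illustration, and real subclasses also handle batching, retries, and richer metadata:

import from sciterra as follows:

from sciterra.librarians.librarian import Librarian
from sciterra.mapping.publication import Publication

class ToyRecord:
    """Stand-in for an API-specific result (e.g. an ads Article or semanticscholar Paper)."""
    def __init__(self, identifier: str):
        self.id = identifier
        self.abstract = "An abstract."
        self.citations: list[str] = []
        self.references: list[str] = []

class ToyLibrarian(Librarian):
    """Hypothetical Librarian subclass, for illustration only."""

    def get_publications(self, identifiers: list[str], *args, **kwargs) -> list[Publication]:
        # A real subclass queries its API here; we fabricate records instead.
        records = [ToyRecord(identifier) for identifier in identifiers]
        return self.convert_publications(records)

    def convert_publication(self, record: ToyRecord, *args, **kwargs) -> Publication:
        # Map the API-specific record onto sciterra's standardized fields.
        return Publication(
            {
                "identifier": record.id,
                "abstract": record.abstract,
                "citations": record.citations,
                "references": record.references,
            }
        )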

Contributions to sciterra in the form of new Librarian subclasses are encouraged and appreciated.

\n\n

Vectorizer

\n\n

Vectorizer subclasses override one function, embed_documents, which takes a list of strings (each representing the text of a publication; currently, just its abstract) and returns an np.ndarray of embeddings.

\n\n

Under the hood, the project method of Cartographer, which is used during similarity-based retrieval, uses the vectorizer roughly as follows:

\n\n
\n
# Get abstracts\ndocs = [atlas[identifier].abstract for identifier in identifiers]\n\n# Embed abstracts\nresult = vectorizer.embed_documents(docs)\nembeddings = result["embeddings"]\n\n# depending on the vectorizer, sometimes not all embeddings can be obtained due to out-of-vocab issues\nsuccess_indices = result["success_indices"] # shape `(len(embeddings),)`\nfail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings),)`\n
\n
\n\n

Currently, sciterra has vectorizers using SciBERT, SBERT, GPT-2, Word2Vec, and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated; a toy sketch of the interface follows.

\n\n
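By analogy with the Librarian sketch above, here is a hypothetical toy Vectorizer that only illustrates the return contract shown in the snippet above; no real vectorizer embeds documents this way:

import numpy as np
from sciterra.vectorization.vectorizer import Vectorizer

class LengthVectorizer(Vectorizer):
    """Hypothetical toy vectorizer: embeds a document as (character count, word count)."""

    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:
        embeddings = np.array([[len(doc), len(doc.split())] for doc in docs], dtype=float)
        return {
            "embeddings": embeddings,
            "success_indices": np.arange(len(docs)),  # every document "succeeds" here
            "fail_indices": np.array([], dtype=int),  # no out-of-vocab failures for this toy
        }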

Putting it all together

\n\n

The main use case for all of these ingredients is to iteratively build out a region of publications. This is done using iterate_expand:

\n\n
\n
from sciterra.mapping.tracing import iterate_expand\n\n# Assuming the initial atlas contains just one publication\n(atl.center, ) = atl.publications.values()\n# build out an atlas to contain 10,000 publications, with increasing dissimilarity to the initial publication, saving progress in binary files to the directory named "atlas".\niterate_expand(\n    atl=atl,\n    crt=crt,\n    atlas_dir="atlas",\n    target_size=10000,\n    center=atl.center,\n)\n
\n
\n\n

This method has a number of keyword arguments that enable tracking the Atlas expansion, limiting the number of publications per expansion, setting how many times to retry a request when there are connection issues, etc.

\n\n

In practice, it may be helpful to use the sciterra.mapping.tracing.AtlasTracer data structure to reduce most of the loading/initialization boilerplate described above; a rough sketch is shown below. For a full example, see main.py.

\n\n
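A rough sketch of that pattern, based on the AtlasTracer signature documented below (the .bib path is a placeholder):

from sciterra.mapping.tracing import AtlasTracer

tracer = AtlasTracer(
    atlas_dir="atlas",                 # directory where Atlas binaries are saved
    atlas_center_bibtex="center.bib",  # placeholder: a .bib file containing the central publication
    librarian_name="S2",               # or "ADS"
    vectorizer_name="SciBERT",         # or "SBERT", "GPT2", "Word2Vec", "BOW"
)
tracer.expand_atlas(target_size=10000)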

Additional features

\n\n\n\n

Acknowledgments

\n\n

This software is a reimplementation of Zachary Hafen-Saavedra's library, cc.

\n\n

To cite sciterra, please use the following workshop paper:

\n\n
@inproceedings{Imel2023,\n author = {Imel, Nathaniel and Hafen, Zachary},\n title = {Citation-similarity relationships in astrophysics},\n booktitle = {AI for Scientific Discovery: From Theory to Practice Workshop (AI4Science @ NeurIPS)},\n year = {2023},\n url = {https://openreview.net/pdf?id=mISayy7DPI},\n}\n
\n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarians", "modulename": "sciterra.librarians", "qualname": "librarians", "kind": "variable", "doc": "

Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arXiv does not provide one. While there is a useful pip-installable package for querying the arXiv API for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arXiv API's limit for a single query is 300,000 results.

\n", "default_value": "{'S2': <class 'sciterra.librarians.s2librarian.SemanticScholarLibrarian'>, 'ADS': <class 'sciterra.librarians.adslibrarian.ADSLibrarian'>}"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier', 'arxiv_class']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

Use the NASA ADS python package, which calls the ADS API to retrieve publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 50,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

Convert an ADS Article object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable unique identifier appropriate to the API.

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

Call an API and retrieve the publications corresponding to str identifiers.

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

Convert an API-specific resulting publication data structure into a sciterra Publication object.

\n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

\n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'fieldsOfStudy', 's2FieldsOfStudy', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundException'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.__init__", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.__init__", "kind": "function", "doc": "

\n", "signature": "(api_key: str = None, api_key_fn: str = None)"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

Classes for constructing maps of scientific literature.

\n\n

The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

\n\n

The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

\n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

Main container object for a large library of publications.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

Data structure for storing publications.

\n\n

self.projection: the Projection object containing the embeddings of all publications and their mapping to str identifiers.

\n\n

self.bad_ids: a list of identifiers that have failed for some reason or other during an expansion, and will be excluded from subsequent expansions.

\n\n

self.history: dict of the form {'pubs_per_update': list[list[str]], 'kernel_size': np.ndarray of ints of shape (num_pubs, last_update)}, where last_update <= the total number of expansions performed.

\n\n

self.center: the core, central Publication identifier repeatedly passed to cartography.Cartographer.expand. Default is None, which means the Atlas has no internal record of the central publication.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None,\tbad_ids: set[str] = set(),\thistory: dict[str, typing.Any] = None,\tcenter: str = None)"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.bad_ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.bad_ids", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.history", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.history", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.center", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.center", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.ids", "kind": "variable", "doc": "

Get a list of all the publication identifiers in the Atlas.

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

Write the Atlas to a directory containing a .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(self, atlas_dirpath: str, overwrite: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

Load an Atlas object from a directory containing the .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(cls, atlas_dirpath: str):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

\n"}, {"fullname": "sciterra.mapping.cartography.batch_cospsi_matrix", "modulename": "sciterra.mapping.cartography", "qualname": "batch_cospsi_matrix", "kind": "function", "doc": "

Batch-process a pairwise cosine similarity matrix between embeddings.

\n\n

In order to avoid memory errors (e.g. bus errors, segfaults) resulting from arrays that are too large, we batch-process the construction of the cospsi_matrix.

\n\n
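As a rough sketch of the batching idea (the helper name and batch size below are illustrative, not part of the actual API):

import numpy as np

def batched_cosine_similarity(embeddings: np.ndarray, batch_size: int = 1000) -> np.ndarray:
    # Normalize once so each batched block is a plain matrix product.
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    n = len(normed)
    out = np.empty((n, n), dtype=normed.dtype)
    for start in range(0, n, batch_size):
        stop = min(start + batch_size, n)
        # Fill one block of rows at a time to avoid huge intermediate arrays.
        out[start:stop] = normed[start:stop] @ normed.T
    return out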
Arguments:
\n\n\n\n
Returns:
\n\n
\n

cosine_similarities: a 2D numpy array of shape (num_pubs, num_pubs) representing the pairwise cosine similarity between each embedding

\n
\n", "signature": "(embeddings: numpy.ndarray) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_attributes", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_attributes", "kind": "function", "doc": "

Return True if a publication has all attributes.

\n\n
Arguments:
\n\n\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tattributes: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_fields_of_study", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_fields_of_study", "kind": "function", "doc": "

Return True if any of pub.fields_of_study are in the passed fields_of_study.

\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tfields_of_study: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

A basic wrapper for obtaining and updating atlas projections.

\n\n

self.librarian: the Librarian object used to query a bibliographic database API.\nself.vectorizer: the Vectorizer object used to get a document embedding for each abstract.\nself.pubs_per_update: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.\nself.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

\n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.pubs_per_update", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.pubs_per_update", "kind": "variable", "doc": "

\n", "annotation": ": list[list[str]]"}, {"fullname": "sciterra.mapping.cartography.Cartographer.update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.update_history", "kind": "variable", "doc": "

\n", "annotation": ": numpy.ndarray"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

\n\n

NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the identifier associated with the resulting Publication object (a paperId, a bibcode, etc.). Therefore, the purpose of using the bibtex_to_atlas method is primarily for initializing literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, you need to use get_publications.

\n\n
Arguments:
\n\n\n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the updated atlas containing all publications with nonempty abstracts, along with their projection

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl_expanded: the expanded atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t*args,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_func", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_func", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the filtered atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\trequire_func: Callable[[sciterra.mapping.publication.Publication], bool] = <function Cartographer.<lambda>>,\trecord_pubs_per_update=False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_ids", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_ids", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection).

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tkeep_ids: list[str] = None,\tdrop_ids: list[str] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.track", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.track", "kind": "function", "doc": "

Overwrite the data associated with tracking degree of convergence of publications in an atlas over multiple expansions. N.B.: the atlas must be fully projected, or else converged_kernel_size will raise a KeyError. By default, this function will overwrite the atl.history with updated self.pubs_per_update, but not kernel_size, which requires computing the converged kernel size for every publication in the atlas.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the updated Atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tcalculate_convergence: bool = False,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.record_update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.record_update_history", "kind": "function", "doc": "

Record when publications were added, by updating atl.update_history.

\n\n

atl.update_history is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n\n
Arguments:
\n\n\n\n
Updates:
\n\n
\n

self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n
\n\n
Returns:
\n\n
\n

None

\n
\n", "signature": "(\tself,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.converged_kernel_size", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.converged_kernel_size", "kind": "function", "doc": "

Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

kernel_size: an array of ints of shape (num_pubs, max_update) representing the kernel size for converged kernels.\n - The first column indicates the largest kernel size that hasn't changed since the beginning,\n - The second column indicates the largest kernel size that hasn't changed since the first update,\n - etc. for the nth column.

\n
\n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.measure_topography", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.measure_topography", "kind": "function", "doc": "

Measure topographic properties of all publications relative to prior\npublications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

estimates: an np.ndarray of shape (len(publication_indices), len(metrics)) representing the estimated topography metric values for each publication.

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tids: list[str] = None,\tmetrics: list[str] = ['density'],\tmin_prior_pubs: int = 2,\tkernel_size=16,\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

The general container for data for any scientific publication, regardless of the API that was used to obtain it.

\n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

\n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

\n\n
Attributes:
\n\n\n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

Construct a publication.

\n\n
Arguments:
\n\n\n", "signature": "(data: dict, **kwargs)"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

\n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

\n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.fields_of_study", "modulename": "sciterra.mapping.publication", "qualname": "Publication.fields_of_study", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

\n", "signature": "(self, data, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography", "modulename": "sciterra.mapping.topography", "kind": "module", "doc": "

Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

\n"}, {"fullname": "sciterra.mapping.topography.smoothing_length_metric", "modulename": "sciterra.mapping.topography", "qualname": "smoothing_length_metric", "kind": "function", "doc": "

Proxy for the density of a publication defined as the minimum\narc length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.density_metric", "modulename": "sciterra.mapping.topography", "qualname": "density_metric", "kind": "function", "doc": "

Estimate the density of a publication by calculating the\nsmoothing length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

density: a float representing kernel_size divided by arc length containing kernel_size other publications.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.edginess_metric", "modulename": "sciterra.mapping.topography", "qualname": "edginess_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a float representing the normalized magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.kernel_constant_asymmetry_metric", "modulename": "sciterra.mapping.topography", "qualname": "kernel_constant_asymmetry_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference\nbetween that publication's projection and the other publications within\nthe kernel.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

mag: a float representing the magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing", "modulename": "sciterra.mapping.tracing", "kind": "module", "doc": "

Convenience functionality for organized expansions of an Atlas.

\n"}, {"fullname": "sciterra.mapping.tracing.iterate_expand", "modulename": "sciterra.mapping.tracing", "qualname": "iterate_expand", "kind": "function", "doc": "

Build out an Atlas of publications, i.e. search for similar publications. This is done by iterating a sequence of [expand, save, project, save, track, save]. The convergence criterion is:

\n\n

converged = len(atl) >= target_size or failures >= max_failed_expansions or convergence_func(atl)

\n\n
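For instance, a hedged sketch of a custom convergence_func, which just needs to map an Atlas to a bool (the 1% threshold below is arbitrary, for illustration only):

# Hypothetical criterion: stop expanding once fewer than 1% of the atlas ids are bad.
def few_bad_ids(atl) -> bool:
    return len(atl.bad_ids) < 0.01 * len(atl)

# passed as: iterate_expand(..., convergence_func=few_bad_ids)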
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the expanded Atlas

\n
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tcrt: sciterra.mapping.cartography.Cartographer,\tatlas_dir: str,\ttarget_size: int,\tmax_failed_expansions: int = 2,\tconvergence_func: Callable[[sciterra.mapping.atlas.Atlas], bool] = <function <lambda>>,\tcenter: str = None,\tn_pubs_max: int = None,\tcall_size: int = None,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**project_kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.search_converged_ids", "modulename": "sciterra.mapping.tracing", "qualname": "search_converged_ids", "kind": "function", "doc": "

Get all publication ids that did not change neighborhood identity during the addition of the last num_pubs_added publications to the atlas over previous Cartographer.expand calls.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

converged_pub_ids: a list of Publication identifiers corresponding to publications that have converged according to the criterion.

\n
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tnum_pubs_added: int,\tkernel_size: int = 16) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer", "kind": "class", "doc": "

Convenience data structure for bookkeeping expansions of an Atlas that reduces boilerplate and ensures an aligned update history between the Atlas and Cartographer.

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.__init__", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.__init__", "kind": "function", "doc": "

Convenience wrapper data structure for tracked expansions, by aligning the history of a Cartographer with an Atlas.

\n\n
Arguments:
\n\n\n", "signature": "(\tatlas_dir: str,\tatlas_center_bibtex: str,\tlibrarian_name: str,\tvectorizer_name: str,\tlibrarian_kwargs: dict = {},\tvectorizer_kwargs: dict = {})"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.cartographer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.cartographer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas_dir", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas_dir", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.expand_atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.expand_atlas", "kind": "function", "doc": "

Start or continue the expansion of the Atlas by calling iterate_expand with aligned Cartographer and Atlas, by default centered on atl.center.

\n\n
Arguments:
\n\n\n", "signature": "(self, target_size: int, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.misc.analysis", "modulename": "sciterra.misc.analysis", "kind": "module", "doc": "

Helper functions for analyzing data yielded by an atlas.

\n"}, {"fullname": "sciterra.misc.analysis.atlas_to_measurements", "modulename": "sciterra.misc.analysis", "qualname": "atlas_to_measurements", "kind": "function", "doc": "

Compute the density, edginess, and citations per year metrics for each publication in an atlas w.r.t. a vectorizer and convergence configurations, and return the results in a dataframe.

\n\n
Arguments:
\n\n\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer,\tcon_d: float,\tkernel_size=16,\tmetrics: list[str] = ['density', 'edginess'],\tfields_of_study=None,\tmax_year: int = 2023) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

Miscellaneous helper functions.

\n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

\n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allows trying n_attempts times.

\n\n

API_extension::get_data_via_api

\n\n

This decorator is general, except for the default allowed exceptions.

\n\n
Arguments:
\n\n\n\n
Example Usage:
\n\n
\n
\n

@keep_trying(n_attempts=4)\ndef try_to_call_web_api():\n    \"do stuff\"

\n
\n
\n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

\n", "signature": "(ids: list[str], call_size):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.get_verbose", "modulename": "sciterra.misc.utils", "qualname": "get_verbose", "kind": "function", "doc": "

\n", "signature": "(kwargs: dict):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.custom_formatwarning", "modulename": "sciterra.misc.utils", "qualname": "custom_formatwarning", "kind": "function", "doc": "

\n", "signature": "(msg, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.vectorizers", "modulename": "sciterra.vectorization", "qualname": "vectorizers", "kind": "variable", "doc": "

\n", "default_value": "{'GPT2': <class 'sciterra.vectorization.gpt2.GPT2Vectorizer'>, 'SciBERT': <class 'sciterra.vectorization.scibert.SciBERTVectorizer'>, 'SBERT': <class 'sciterra.vectorization.sbert.SBERTVectorizer'>, 'Word2Vec': <class 'sciterra.vectorization.word2vec.Word2VecVectorizer'>, 'BOW': <class 'sciterra.vectorization.bow.BOWVectorizer'>}"}, {"fullname": "sciterra.vectorization.bow", "modulename": "sciterra.vectorization.bow", "kind": "module", "doc": "

Bag of words document embedder. Unlike cc vectorization, we fix the dimension of the embeddings to be the same; this requires us to fix the vocabulary, so for consistency we do so via the same method as the Word2Vec vocabulary construction.
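As a hedged illustration of the fixed-dimension idea (the toy vocabulary below stands in for the one sciterra builds from the Word2Vec corpus):

from sklearn.feature_extraction.text import CountVectorizer

vocabulary = ["galaxy", "star", "supernova"]  # toy stand-in for the shared Word2Vec vocabulary
bow = CountVectorizer(vocabulary=vocabulary)
# Every document maps to a vector of length len(vocabulary), so dimensions always match.
embeddings = bow.transform(["a star and a galaxy"]).toarray()  # shape (1, 3)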

\n"}, {"fullname": "sciterra.vectorization.bow.current_file_abs_path", "modulename": "sciterra.vectorization.bow", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.bow.corpora_path", "modulename": "sciterra.vectorization.bow", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.bow.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.__init__", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.__init__", "kind": "function", "doc": "

Construct a bag-of-words document vectorizer.

\n", "signature": "(*args, **kwargs)"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.word2vec_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.word2vec_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.vocabulary", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.vocabulary", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embedding_dim", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embedding_dim", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.count_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.count_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embed_documents", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into bow document vectors using scikit-learn's CountVectorizer.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, len(self.vocabulary))

\n
\n", "signature": "(self, docs: list[str], **kwargs) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.gpt2", "modulename": "sciterra.vectorization.gpt2", "kind": "module", "doc": "

GPT-2 is a large causal language model created by OpenAI that achieved SOTA on many NLP tasks before its successors.

\n\n
Links:
\n\n
\n \n
\n"}, {"fullname": "sciterra.vectorization.gpt2.MPS_DEVICE", "modulename": "sciterra.vectorization.gpt2", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.gpt2.EMBEDDING_DIM", "modulename": "sciterra.vectorization.gpt2", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.gpt2.BATCH_SIZE", "modulename": "sciterra.vectorization.gpt2", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "8"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.__init__", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.tokenizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.model", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.embed_documents", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into GPT-2 vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, embedding_dim)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 8) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.preprocessing", "modulename": "sciterra.vectorization.preprocessing", "kind": "module", "doc": "

Simple preprocessing of scientific abstracts prior to vectorization.

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor", "kind": "class", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.__init__", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.__init__", "kind": "function", "doc": "

Initialize a custom tokenizer.

\n\n
Arguments:
\n\n\n", "signature": "(\tallowed_pos_tags: set = {'ADJ', 'NOUN', 'VERB'},\tmodel='en_core_web_sm')"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.nlp", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.nlp", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.allowed_pos_tags", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.allowed_pos_tags", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.custom_preprocess", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.custom_preprocess", "kind": "function", "doc": "

Get all of the lemmas of the words in a document, filtering by POS.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a list of the lemmatized, filtered words in the document

\n
\n\n

Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

\n\n

See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
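
\n\n

A minimal usage sketch (assumes the spaCy en_core_web_sm model is installed; the example sentence and output are illustrative):

\n\n

from sciterra.vectorization.preprocessing import CustomPreprocessor\n\npreprocessor = CustomPreprocessor()  # keeps ADJ/NOUN/VERB lemmas by default\ntokens = preprocessor.custom_preprocess(\"The galaxies are rotating.\")  # roughly ['galaxy', 'rotate']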

\n", "signature": "(self, document: str) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

Basic wrapper for document embeddings and helper methods.

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.

\n\n
Arguments:
\n\n\n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

Retrieve the identifiers for a list of embedding matrix indices.

\n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

Retrieve the document embeddings for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_indices", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_indices", "kind": "function", "doc": "

Retrieve the embedding indices for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

Return the result of merging projection proj_a with projection proj_b.

\n\n

This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater or equal in size to proj_a.
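
\n\n

A minimal usage sketch (proj_a and proj_b are assumed to be existing Projection objects):

\n\n

merged = merge(proj_a, proj_b)  # contains all of proj_a, plus any embeddings only in proj_b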

\n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.get_empty_projection", "modulename": "sciterra.vectorization.projection", "qualname": "get_empty_projection", "kind": "function", "doc": "

Construct a Projection with no data (but which is not None).

\n", "signature": "() -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.sbert", "modulename": "sciterra.vectorization.sbert", "kind": "module", "doc": "

We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight, fast version of one of the top-performing models.

\n\n
Links:
\n\n
\n

sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.\n HF: https://huggingface.co/sentence-transformers

\n
\n"}, {"fullname": "sciterra.vectorization.sbert.MPS_DEVICE", "modulename": "sciterra.vectorization.sbert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.sbert.MODEL_PATH", "modulename": "sciterra.vectorization.sbert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'all-MiniLM-L6-v2'"}, {"fullname": "sciterra.vectorization.sbert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.sbert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "384"}, {"fullname": "sciterra.vectorization.sbert.MAX_SEQ_LENGTH", "modulename": "sciterra.vectorization.sbert", "qualname": "MAX_SEQ_LENGTH", "kind": "variable", "doc": "

\n", "default_value": "256"}, {"fullname": "sciterra.vectorization.sbert.BATCH_SIZE", "modulename": "sciterra.vectorization.sbert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.__init__", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.model", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SBERT vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 384)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

SciBERT is a BERT model trained on scientific text.

\n\n
Links:
\n\n
\n

Paper: https://aclanthology.org/D19-1371/\n Github: https://github.com/allenai/scibert\n HF: https://huggingface.co/allenai/scibert_scivocab_uncased

\n
\n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.scibert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.scibert.BATCH_SIZE", "modulename": "sciterra.vectorization.scibert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SciBERT vectors, by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 768)

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

Base class for vectorizing abstracts.

\n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents into document vectors.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a dict of the form\n {\n \"embeddings\": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings

\n\n
\"success_indices\": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.\n\n\"fail_indices\": a numpy array of shape `(len(docs) - num_successful,)`, containing the indices of all the documents for which document embeddings could not be obtained\n
\n \n

}\n where the indices are with respect to the docs list passed.

\n
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.word2vec", "modulename": "sciterra.vectorization.word2vec", "kind": "module", "doc": "

We use a simple word2vec model that computes a document vector by averaging the vectors of all words in the document.

\n\n

Since we are getting vectors for scientific documents, we must load a vocabulary to train the model from scratch. Therefore we define different subclasses for each scientific field, which may differ substantially by vocabulary.

\n\n

There exists a Doc2Vec module in gensim, but empirically Word2Vec + averaging seems to do just as well; furthermore, we're mainly interested in a simple baseline to compare with more sophisticated embeddings.

\n\n
Links:
\n\n
\n

gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#

\n
\n"}, {"fullname": "sciterra.vectorization.word2vec.EMBEDDING_DIM", "modulename": "sciterra.vectorization.word2vec", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "300"}, {"fullname": "sciterra.vectorization.word2vec.current_file_abs_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.word2vec.corpora_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.word2vec.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.__init__", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.__init__", "kind": "function", "doc": "

Construct a Word2Vec based document embedding model from a corpus.
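
\n\n

A minimal construction sketch (the corpus path is an assumption; any plain-text corpus file works):

\n\n

from sciterra.vectorization.word2vec import Word2VecVectorizer\n\nvectorizer = Word2VecVectorizer(corpus_path=\"corpora/astro_small.txt\")  # hypothetical path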

\n", "signature": "(\tcorpus_path: str,\tmodel_path: str = None,\tvector_size: int = 300,\twindow: int = 5,\tmin_count: int = 2,\tworkers: int = 8,\tepochs: int = 10,\ttokenizer: Callable[[str], list[str]] = None,\t**kwargs)"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.model", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.embed_documents", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

\n\n

Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.

\n", "signature": "(self, docs: list[str], **kwargs) -> numpy.ndarray:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "sciterra", "modulename": "sciterra", "kind": "module", "doc": "

sciterra: a python library for similarity-based scientometrics

\n\n

Sciterra is a software library to support data-driven analyses of scientific literature, with a focus on unifying different bibliographic database APIs and document-embedding methods for systematic scientometrics research.

\n\n

Overview

\n\n

The main purpose of sciterra is to perform similarity-based retrieval of scientific publications for metascience/scientometrics research. While there are many services that can make the individual steps of this simple, this software library exists to

\n\n
    \n
  1. Unify the different APIs and vector-based retrieval methods

  2. Support scientometrics analyses of citation dynamics, especially with respect to a vectorized 'landscape' of literature.

\n\n

Installing sciterra

\n\n

First, set up a virtual environment (e.g. via miniconda, conda create -n sciterra, and conda activate sciterra).

\n\n
    \n
  1. Install sciterra via git:

    \n\n

    python -m pip install 'sciterra @ git+https://github.com/nathimel/sciterra.git'

  2. Alternatively, download or clone this repository, navigate to the root folder, and install locally:

    \n\n

    pip install -e .

  3. You can also install via pip from pypi, though this is not yet recommended because sciterra is still in development:

    \n\n

    pip install sciterra

\n\n

Usage

\n\n

Atlas

\n\n

The central object in sciterra is the Atlas. This is a basic data structure for containing scientific publications that are returned from calls to various bibliographic database APIs.

\n\n

An Atlas minimally requires a list of Publications.

\n\n

Publication

\n\n

A publication object is a minimal wrapper around publication data, and should have a string identifier. It is designed to standardize the basic metadata contained in the results from some bibliographic database API.

\n\n
\n
from sciterra import Atlas, Publication\n\natl = Atlas([Publication({"identifier": "id"})])\n
\n
\n\n

Alternatively, you can construct an Atlas by passing in a .bib file. The entries in this bibtex file will be parsed for unique identifiers (e.g., DOIs), sent in an API call, and returned as Publications, which then populate an Atlas.

\n\n
\n
atl = crt.bibtex_to_atlas(bibtex_filepath)\n
\n
\n\n

In the line of code above, the variable crt is an instance of a Cartographer object, which encapsulates the bookkeeping involved in querying a bibliographic database for publications.

\n\n

Cartographer

\n\n

The Cartographer class is so named because it interfaces with an Atlas to build out a library of publications. Since it does so via similarity-based retrieval, the resulting Atlas can be considered a 'region' of publications.

\n\n

To do this, a Cartographer needs two things: an API with which to interface, and a way of getting document embeddings. Both are encapsulated, respectively, by the Librarian and the Vectorizer classes.

\n\n
\n
from sciterra import Cartographer\nfrom sciterra.librarians import SemanticScholarLibrarian # or ADSLibrarian\nfrom sciterra.vectorization import SciBERTVectorizer # among others\n\ncrt = Cartographer(\n    librarian=SemanticScholarLibrarian(),\n    vectorizer=SciBERTVectorizer(),\n)\n
\n
\n\n

Librarian

\n\n

Each Librarian subclass is designed to be a wrapper for an existing python API service, such as the ads package or the semanticscholar client library.

\n\n

A Librarian subclass also overrides two methods. The first is get_publications, which takes a list of identifiers, queries the specific API for that Librarian, and returns a list of Publications. Keyword arguments can be passed to specify the metadata that is kept for each publication (date, title, journal, authors, etc.). The second method is convert_publication, which defines how the result of an API call should be converted to a sciterra Publication object.

\n\n
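
As a hedged sketch (the subclass name and my_api_query helper are hypothetical, not part of sciterra), a new Librarian might look like:

\n\n
\n
from sciterra.librarians.librarian import Librarian\nfrom sciterra.mapping.publication import Publication\n\nclass MyLibrarian(Librarian):  # hypothetical subclass\n    def get_publications(self, identifiers: list[str], *args, **kwargs) -> list[Publication]:\n        # my_api_query is a placeholder for a real API client call\n        results = [my_api_query(identifier) for identifier in identifiers]\n        return self.convert_publications(results)\n\n    def convert_publication(self, pub, *args, **kwargs) -> Publication:\n        return Publication({"identifier": pub.id, "abstract": pub.abstract})\n
\n
\n\n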

Contributions to sciterra in the form of new Librarian subclasses are encouraged and appreciated.

\n\n

Vectorizer

\n\n

Vectorizer subclasses override one function, embed_documents, which takes a list of strings, each representing the text of a publication (currently, just its abstract), and returns a dict whose "embeddings" key holds an np.ndarray of embeddings, as sketched below.

\n\n
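
For instance, here is a hedged sketch of a trivial Vectorizer subclass that returns the dict described by the base class (the random embeddings are purely illustrative):

\n\n
\n
import numpy as np\nfrom sciterra.vectorization.vectorizer import Vectorizer\n\nclass RandomVectorizer(Vectorizer):  # hypothetical subclass\n    def embed_documents(self, docs: list[str], **kwargs) -> dict[str, np.ndarray]:\n        # every document "succeeds" and receives a random 8-dimensional embedding\n        return {\n            "embeddings": np.random.rand(len(docs), 8),\n            "success_indices": np.arange(len(docs)),\n            "fail_indices": np.array([], dtype=int),\n        }\n
\n
\n\n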

Under the hood, the project method of Cartographer, which is used during similarity-based retrieval, uses the vectorizer roughly as follows

\n\n
\n
# Get abstracts\ndocs = [atlas[identifier].abstract for identifier in identifiers]\n\n# Embed abstracts\nresult = vectorizer.embed_documents(docs)\nembeddings = result["embeddings"]\n\n# depending on the vectorizer, sometimes not all embeddings can be obtained due to out-of-vocab issues\nsuccess_indices = result["success_indices"] # shape `(len(embeddings),)`\nfail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings))``\n
\n
\n\n

Currently, sciterra has vectorizers using SciBERT, SBERT, GPT-2, Word2Vec, and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.

\n\n

Putting it all together

\n\n

The main use case for all of these ingredients is to iteratively build out a region of publications. This is done using iterate_expand:

\n\n
\n
from sciterra.mapping.tracing import iterate_expand\n\n# Assuming the initial atlas contains just one publication\n(atl.center, ) = atl.publications.values()\n# build out an atlas to contain 10,000 publications, with increasing dissimilarity to the initial publication, saving progress in binary files to the directory named "atlas".\niterate_expand(\n    atl=atl,\n    crt=crt,\n    atlas_dir="atlas",\n    target_size=10000,\n    center=atl.center,\n)\n
\n
\n\n

This method has a number of keyword arguments that enable tracking the Atlas expansion, limiting the number of publications per expansion, setting how many times to retry a query when there are connection issues, etc.

\n\n

In practice, it may be helpful to use the sciterra.mapping.tracing.AtlasTracer data structure to reduce most of the loading/initialization boilerplate described above. For an example, see main.py.

\n\n
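
A hedged sketch of that pattern (the file paths are assumptions; the librarian and vectorizer names come from the registries documented below):

\n\n
\n
from sciterra.mapping.tracing import AtlasTracer\n\ntracer = AtlasTracer(\n    atlas_dir="atlas",\n    atlas_center_bibtex="center.bib",  # assumed path to a .bib file containing the seed publication\n    librarian_name="S2",\n    vectorizer_name="SciBERT",\n)\ntracer.expand_atlas(target_size=10000)\n
\n
\n\n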

Additional features

\n\n\n\n

Acknowledgments

\n\n

This software is a reimplementation of Zachary Hafen-Saavedra's library, cc.

\n\n

To cite sciterra, please use the following workshop paper:

\n\n
@inproceedings{Imel2023,\n author = {Imel, Nathaniel and Hafen, Zachary},\n title = {Citation-similarity relationships in astrophysics},\n booktitle = {AI for Scientific Discovery: From Theory to Practice Workshop (AI4Science @ NeurIPS)},\n year = {2023},\n url = {https://openreview.net/pdf?id=mISayy7DPI},\n}\n
\n"}, {"fullname": "sciterra.librarians", "modulename": "sciterra.librarians", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarians", "modulename": "sciterra.librarians", "qualname": "librarians", "kind": "variable", "doc": "

Why is there not an ArxivLibrarian? For now, we are restricting to APIs that allow us to traverse literature graphs, and arxiv does not have one. While there is a useful pip-installable package for querying the arxiv api for papers, https://pypi.org/project/arxiv/, the returned object does not have information on references and citations. However, it may still be possible to obtain a large sample of publications with abstracts and submission dates (though no citation counts), because the arxiv API's limit for a single query is 300,000 results.

\n", "default_value": "{'S2': <class 'sciterra.librarians.s2librarian.SemanticScholarLibrarian'>, 'ADS': <class 'sciterra.librarians.adslibrarian.ADSLibrarian'>}"}, {"fullname": "sciterra.librarians.adslibrarian", "modulename": "sciterra.librarians.adslibrarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.adslibrarian.CALL_SIZE", "modulename": "sciterra.librarians.adslibrarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.adslibrarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.adslibrarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.adslibrarian.QUERY_FIELDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['bibcode', 'abstract', 'title', 'entry_date', 'pubdate', 'year', 'citation_count', 'citation', 'reference', 'identifier', 'arxiv_class']"}, {"fullname": "sciterra.librarians.adslibrarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'ads.exceptions.APIResponseError'>,)"}, {"fullname": "sciterra.librarians.adslibrarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.adslibrarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'arXiv', 'bibcode']"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying ADS (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.get_publications", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.get_publications", "kind": "function", "doc": "

Use the NASA ADS python package, which calls the ADS API to retrieve publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tbibcodes: list[str],\t*args,\tcall_size: int = 50,\tn_attempts_per_query: int = 10,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.adslibrarian.ADSLibrarian.convert_publication", "modulename": "sciterra.librarians.adslibrarian", "qualname": "ADSLibrarian.convert_publication", "kind": "function", "doc": "

Convert an ADS Article object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tarticle: ads.search.Article,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian", "modulename": "sciterra.librarians.librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.librarian.Librarian", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.librarians.librarian.Librarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable unique identifier appropriate to the API.

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.get_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.get_publications", "kind": "function", "doc": "

Call an API and retrieve the publications corresponding to str identifiers.

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tidentifiers: list[str],\t*args,\tcall_size: int = None,\tn_attempts_per_query: int = None,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publication", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publication", "kind": "function", "doc": "

Convert an API-specific resulting publication data structure into a sciterra Publication object.

\n", "signature": "(self, pub: Any, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.librarians.librarian.Librarian.convert_publications", "modulename": "sciterra.librarians.librarian", "qualname": "Librarian.convert_publications", "kind": "function", "doc": "

Convert a list of API-specific results to sciterra Publications, possibly using multiprocessing.

\n", "signature": "(\tself,\tpapers: list,\t*args,\tmultiprocess: bool = True,\tnum_processes=6,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian", "modulename": "sciterra.librarians.s2librarian", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.QUERY_FIELDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "QUERY_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['year', 'abstract', 'title', 'externalIds', 'citationCount', 'fieldsOfStudy', 's2FieldsOfStudy', 'url', 'citations.externalIds', 'citations.url', 'references.externalIds', 'references.url', 'citationStyles', 'publicationDate']"}, {"fullname": "sciterra.librarians.s2librarian.EXTERNAL_IDS", "modulename": "sciterra.librarians.s2librarian", "qualname": "EXTERNAL_IDS", "kind": "variable", "doc": "

\n", "default_value": "['DOI', 'ArXiv', 'CorpusId', 'MAG', 'ACL', 'PubMed', 'Medline', 'PubMedCentral', 'DBLP', 'URL']"}, {"fullname": "sciterra.librarians.s2librarian.ALLOWED_EXCEPTIONS", "modulename": "sciterra.librarians.s2librarian", "qualname": "ALLOWED_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'Exception'>, <class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>, <class 'semanticscholar.SemanticScholarException.ObjectNotFoundException'>)"}, {"fullname": "sciterra.librarians.s2librarian.CALL_SIZE", "modulename": "sciterra.librarians.s2librarian", "qualname": "CALL_SIZE", "kind": "variable", "doc": "

\n", "default_value": "10"}, {"fullname": "sciterra.librarians.s2librarian.NUM_ATTEMPTS_PER_QUERY", "modulename": "sciterra.librarians.s2librarian", "qualname": "NUM_ATTEMPTS_PER_QUERY", "kind": "variable", "doc": "

\n", "default_value": "50"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.librarians.librarian.Librarian"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.__init__", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.__init__", "kind": "function", "doc": "

\n", "signature": "(api_key: str = None, api_key_fn: str = None)"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.sch", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.sch", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.bibtex_entry_identifier", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.bibtex_entry_identifier", "kind": "function", "doc": "

Parse a bibtex entry for a usable identifier for querying SemanticScholar (see EXTERNAL_IDS).

\n", "signature": "(self, bibtex_entry: dict) -> str:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_publications", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_publications", "kind": "function", "doc": "

Use the (unofficial) S2 python package, which calls the Semantic Scholar API to retrieve publications from the S2AG.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the list of publications (or Papers)

\n
\n", "signature": "(\tself,\tpaper_ids: list[str],\t*args,\tcall_size: int = 10,\tn_attempts_per_query: int = 50,\tconvert: bool = True,\t**kwargs) -> list[sciterra.mapping.publication.Publication]:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.convert_publication", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.convert_publication", "kind": "function", "doc": "

Convert a SemanticScholar Paper object to a sciterra.publication.Publication.

\n", "signature": "(\tself,\tpaper: semanticscholar.Paper.Paper,\t*args,\t**kwargs) -> sciterra.mapping.publication.Publication:", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_papers", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_papers", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_ids: list[str], fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.librarians.s2librarian.SemanticScholarLibrarian.get_paper", "modulename": "sciterra.librarians.s2librarian", "qualname": "SemanticScholarLibrarian.get_paper", "kind": "function", "doc": "

Custom function for calling the S2 API that doesn't fail on empty results.

\n", "signature": "(self, paper_id: str, fields: list[str]):", "funcdef": "def"}, {"fullname": "sciterra.mapping", "modulename": "sciterra.mapping", "kind": "module", "doc": "

Classes for constructing maps of scientific literature.

\n\n

The sciterra.mapping.atlas submodule contains the basic data structure, the Atlas.

\n\n

The sciterra.mapping.cartography submodule contains functionality for manipulating an Atlas.

\n"}, {"fullname": "sciterra.mapping.atlas", "modulename": "sciterra.mapping.atlas", "kind": "module", "doc": "

Main container object for a large library of publications.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas", "kind": "class", "doc": "

Data structure for storing publications.

\n\n

self.projection: the Projection object containing the embeddings of all publications and their mapping to str identifiers.

\n\n

self.bad_ids: a list of identifiers that have failed for some reason or other during an expansion, and will be excluded from subsequent expansions.

\n\n

self.history: dict of the form {'pubs_per_update': list[list[str]], 'kernel_size': np.ndarray of ints of shape (num_pubs, last_update) where last_update <= the total number of expansions performed.}

\n\n

self.center: the core, central Publication identifier repeatedly passed to cartography.Cartographer.expand. Default is None, which means the Atlas has no internal record of the central publication.

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.__init__", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpublications: list[sciterra.mapping.publication.Publication],\tprojection: sciterra.vectorization.projection.Projection = None,\tbad_ids: set[str] = set(),\thistory: dict[str, typing.Any] = None,\tcenter: str = None)"}, {"fullname": "sciterra.mapping.atlas.Atlas.publications", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.publications", "kind": "variable", "doc": "

\n", "annotation": ": dict[str, sciterra.mapping.publication.Publication]"}, {"fullname": "sciterra.mapping.atlas.Atlas.projection", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.projection", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.bad_ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.bad_ids", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.history", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.history", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.center", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.center", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.atlas.Atlas.ids", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.ids", "kind": "variable", "doc": "

Get a list of all the publication identifiers in the Atlas.

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.atlas.Atlas.save", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.save", "kind": "function", "doc": "

Write the Atlas to a directory containing a .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(self, atlas_dirpath: str, overwrite: bool = True) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.atlas.Atlas.load", "modulename": "sciterra.mapping.atlas", "qualname": "Atlas.load", "kind": "function", "doc": "

Load an Atlas object from a directory containing the .pkl binary for each attribute.

\n\n

Warnings cannot be silenced.

\n\n
Arguments:
\n\n\n", "signature": "(cls, atlas_dirpath: str):", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography", "modulename": "sciterra.mapping.cartography", "kind": "module", "doc": "

Functions for manipulating an atlas based on the document embeddings of the abstracts of its publications.

\n"}, {"fullname": "sciterra.mapping.cartography.batch_cospsi_matrix", "modulename": "sciterra.mapping.cartography", "qualname": "batch_cospsi_matrix", "kind": "function", "doc": "

Batch-process a pairwise cosine similarity matrix between embeddings.

\n\n

In order to avoid memory errors (e.g. bus errors, segfaults) resulting from arrays that are too large, we batch-process the construction of the cospsi_matrix.

\n\n
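
A minimal usage sketch (the random array is a stand-in for real document embeddings):

\n\n

import numpy as np\n\nembeddings = np.random.rand(10000, 768)  # hypothetical embeddings\ncospsi = batch_cospsi_matrix(embeddings)  # shape (10000, 10000)

\n\n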
Arguments:
\n\n\n\n
Returns:
\n\n
\n

cosine_similarities: a 2D numpy array of shape (num_pubs, num_pubs) representing the pairwise cosine similarity between each embedding

\n
\n", "signature": "(embeddings: numpy.ndarray) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_attributes", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_attributes", "kind": "function", "doc": "

Return True if a publication has all of the given attributes.

\n\n
Arguments:
\n\n\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tattributes: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.pub_has_fields_of_study", "modulename": "sciterra.mapping.cartography", "qualname": "pub_has_fields_of_study", "kind": "function", "doc": "

Return True if any of pub.fields_of_study are in the passed fields_of_study.

\n", "signature": "(\tpub: sciterra.mapping.publication.Publication,\tfields_of_study: list[str]) -> bool:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer", "kind": "class", "doc": "

A basic wrapper for obtaining and updating atlas projections.

\n\n

self.librarian: the Librarian object used to query a bibliographic database API.\nself.vectorizer: the Vectorizer object used to get a document embedding for each abstract\nself.pubs_per_update: a list of lists of publication str ids, representing the publications that exist at each time step / expansion update.\nself.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.__init__", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.__init__", "kind": "function", "doc": "

\n", "signature": "(\tlibrarian: sciterra.librarians.librarian.Librarian = None,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer = None)"}, {"fullname": "sciterra.mapping.cartography.Cartographer.librarian", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.librarian", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.vectorizer", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.cartography.Cartographer.pubs_per_update", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.pubs_per_update", "kind": "variable", "doc": "

\n", "annotation": ": list[list[str]]"}, {"fullname": "sciterra.mapping.cartography.Cartographer.update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.update_history", "kind": "variable", "doc": "

\n", "annotation": ": numpy.ndarray"}, {"fullname": "sciterra.mapping.cartography.Cartographer.bibtex_to_atlas", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.bibtex_to_atlas", "kind": "function", "doc": "

Convert a bibtex file to an atlas, by parsing each entry for an identifier, and querying an API for publications using self.librarian.

\n\n

NOTE: the identifiers in the corresponding atlas will be API-specific ids; there is no relationship between the parsed id used to query papers (e.g. 'DOI:XYZ' in the case of SemanticScholar) and the resulting identifier associated with the resulting Publication object, (a paperId, a bibcode, etc.) Therefore, the purpose of using the bibtex_to_atlas method is primarily for initializing literature exploration in a human-readable way. If you want to obtain as many publications as identifiers supplied, you need to use get_publications.

\n\n
Arguments:
\n\n\n", "signature": "(self, bibtex_fp: str, *args, **kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.project", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.project", "kind": "function", "doc": "

Update an atlas with its projection, i.e. the document embeddings for all publications using self.vectorizer, removing publications with no abstracts.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the updated atlas containing all publications with nonempty abstracts, together with their projection

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.expand", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.expand", "kind": "function", "doc": "

Expand an atlas by retrieving a list of publications resulting from traversal of the citation network.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl_expanded: the expanded atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\t*args,\tcenter: str = None,\tn_pubs_max: int = 4000,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_func", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_func", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection) when certain fields are empty.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

the filtered atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\trequire_func: Callable[[sciterra.mapping.publication.Publication], bool] = <function Cartographer.<lambda>>,\trecord_pubs_per_update=False,\t**kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.filter_by_ids", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.filter_by_ids", "kind": "function", "doc": "

Update an atlas by dropping publications (and corresponding data in projection).

\n\n
Arguments:
\n\n\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tkeep_ids: list[str] = None,\tdrop_ids: list[str] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.track", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.track", "kind": "function", "doc": "

Overwrite the data associated with tracking degree of convergence of publications in an atlas over multiple expansions. N.B.: the atlas must be fully projected, or else converged_kernel_size will raise a KeyError. By default, this function will overwrite the atl.history with updated self.pubs_per_update, but not kernel_size, which requires computing the converged kernel size for every publication in the atlas.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the updated Atlas

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tcalculate_convergence: bool = False,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.record_update_history", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.record_update_history", "kind": "function", "doc": "

Record when publications were added, by updating atl.update_history.

\n\n

atl.update_history is a np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n\n
Arguments:
\n\n\n\n
Updates:
\n\n
\n

self.update_history: an np.array of ints representing when publications were added. A value of -2 indicates no record of being added.

\n
\n\n
Returns:
\n\n
\n

None

\n
\n", "signature": "(\tself,\tpubs: list[str] = None,\tpubs_per_update: list[list[str]] = None) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.converged_kernel_size", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.converged_kernel_size", "kind": "function", "doc": "

Calculate the largest size of the kernel that's converged (at differing levels of convergence) for each publication in a sample at each update.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

kernel_size: an array of ints of shape (num_pubs, max_update) representing the kernel size for converged kernels.\n - The first column indicates the largest kernel size that hasn't changed since the beginning,\n - The second column indicates the largest kernel size that hasn't changed since the first update,\n - etc. for the nth column.

\n
\n", "signature": "(self, atl: sciterra.mapping.atlas.Atlas) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.mapping.cartography.Cartographer.measure_topography", "modulename": "sciterra.mapping.cartography", "qualname": "Cartographer.measure_topography", "kind": "function", "doc": "

Measure topographic properties of all publications relative to prior\npublications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

estimates: an np.ndarray of shape (len(publication_indices), len(metrics)) representing the estimated topography metric values for each publication.

\n
\n", "signature": "(\tself,\tatl: sciterra.mapping.atlas.Atlas,\tids: list[str] = None,\tmetrics: list[str] = ['density'],\tmin_prior_pubs: int = 2,\tkernel_size=16,\t**kwargs):", "funcdef": "def"}, {"fullname": "sciterra.mapping.publication", "modulename": "sciterra.mapping.publication", "kind": "module", "doc": "

The general container for data for any scientific publication, regardless of the API that was used to obtain it.

\n"}, {"fullname": "sciterra.mapping.publication.FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['identifier', 'abstract', 'publication_date', 'citation_count', 'citations', 'references']"}, {"fullname": "sciterra.mapping.publication.ADDITIONAL_FIELDS", "modulename": "sciterra.mapping.publication", "qualname": "ADDITIONAL_FIELDS", "kind": "variable", "doc": "

\n", "default_value": "['doi', 'url', 'title', 'issn']"}, {"fullname": "sciterra.mapping.publication.Publication", "modulename": "sciterra.mapping.publication", "qualname": "Publication", "kind": "class", "doc": "

The Publication is a standardized container for a scientific publication's retrieved data.

\n\n

In general, all data-cleaning should be done prior to constructing a Publication, in order to keep the class minimal.

\n\n
Attributes:
\n\n\n"}, {"fullname": "sciterra.mapping.publication.Publication.__init__", "modulename": "sciterra.mapping.publication", "qualname": "Publication.__init__", "kind": "function", "doc": "

Construct a publication.

\n\n
Arguments:
\n\n\n", "signature": "(data: dict, **kwargs)"}, {"fullname": "sciterra.mapping.publication.Publication.identifier", "modulename": "sciterra.mapping.publication", "qualname": "Publication.identifier", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.abstract", "modulename": "sciterra.mapping.publication", "qualname": "Publication.abstract", "kind": "variable", "doc": "

\n", "annotation": ": str"}, {"fullname": "sciterra.mapping.publication.Publication.publication_date", "modulename": "sciterra.mapping.publication", "qualname": "Publication.publication_date", "kind": "variable", "doc": "

\n", "annotation": ": datetime.date"}, {"fullname": "sciterra.mapping.publication.Publication.citations", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citations", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.references", "modulename": "sciterra.mapping.publication", "qualname": "Publication.references", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.citation_count", "modulename": "sciterra.mapping.publication", "qualname": "Publication.citation_count", "kind": "variable", "doc": "

The citation_count can be different from the length of citations, since the number of citations listed for a paper might be different from the number of (valid) citing papers indexed on the relevant API.

\n", "annotation": ": int"}, {"fullname": "sciterra.mapping.publication.Publication.fields_of_study", "modulename": "sciterra.mapping.publication", "qualname": "Publication.fields_of_study", "kind": "variable", "doc": "

\n", "annotation": ": list[str]"}, {"fullname": "sciterra.mapping.publication.Publication.init_attributes", "modulename": "sciterra.mapping.publication", "qualname": "Publication.init_attributes", "kind": "function", "doc": "

\n", "signature": "(self, data, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography", "modulename": "sciterra.mapping.topography", "kind": "module", "doc": "

Functions for measuring topographic properties of (the semantic feature space of publications inside) an Atlas.

\n"}, {"fullname": "sciterra.mapping.topography.smoothing_length_metric", "modulename": "sciterra.mapping.topography", "qualname": "smoothing_length_metric", "kind": "function", "doc": "

Proxy for the density of a publication defined as the minimum\narc length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

h: float representing arc length containing kernel_size other publications. (Assumes normalized to a radius of 1.)

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.density_metric", "modulename": "sciterra.mapping.topography", "qualname": "density_metric", "kind": "function", "doc": "

Estimate the density of a publication by calculating the\nsmoothing length that encloses kernel_size other publications.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

density: a float representing kernel_size divided by arc length containing kernel_size other publications.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tkernel_size: int = 16):", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.edginess_metric", "modulename": "sciterra.mapping.topography", "qualname": "edginess_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference between that publication's projection and the other publications within the kernel. Normalized to between 0 and 1.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a float representing the normalized magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.topography.kernel_constant_asymmetry_metric", "modulename": "sciterra.mapping.topography", "qualname": "kernel_constant_asymmetry_metric", "kind": "function", "doc": "

Estimate the asymmetry of a publication by calculating the difference\nbetween that publication's projection and the other publications within\nthe kernel.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

mag: a float representing the magnitude of the asymmetry metric.

\n
\n", "signature": "(\tidx: int,\tcospsi_matrix: numpy.ndarray,\tvalid_indices: numpy.ndarray,\tpublication_indices: numpy.ndarray,\tembeddings: numpy.ndarray,\tkernel_size: int = 16) -> float:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing", "modulename": "sciterra.mapping.tracing", "kind": "module", "doc": "

Convenience functionality for organized expansions of an Atlas.

\n"}, {"fullname": "sciterra.mapping.tracing.iterate_expand", "modulename": "sciterra.mapping.tracing", "qualname": "iterate_expand", "kind": "function", "doc": "

Build out an Atlas of publications, i.e. search for similar publications. This is done by iterating a sequence of [expand, save, project, save, track, save]. The convergence criterion is:

\n\n

converged = len(atl) >= target_size or failures >= max_failed_expansions or convergence_func(atl)

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

atl: the expanded Atlas

\n
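A hypothetical driver call, assuming an Atlas atl and Cartographer crt already exist (argument names follow the signature below):

    from sciterra.mapping.tracing import iterate_expand

    atl = iterate_expand(
        atl,
        crt,
        atlas_dir='atlas/',                # hypothetical directory
        target_size=10_000,
        max_failed_expansions=2,
        convergence_func=lambda a: False,  # rely on target_size alone
    )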
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tcrt: sciterra.mapping.cartography.Cartographer,\tatlas_dir: str,\ttarget_size: int,\tmax_failed_expansions: int = 2,\tconvergence_func: Callable[[sciterra.mapping.atlas.Atlas], bool] = <function <lambda>>,\tcenter: str = None,\tn_pubs_max: int = None,\tcall_size: int = None,\tn_sources_max: int = None,\trecord_pubs_per_update: bool = False,\t**project_kwargs) -> sciterra.mapping.atlas.Atlas:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.search_converged_ids", "modulename": "sciterra.mapping.tracing", "qualname": "search_converged_ids", "kind": "function", "doc": "

Get all publication ids whose neighborhood identity did not change while the last num_pubs_added publications were added to the atlas during previous Cartographer.expand calls.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

converged_pub_ids: a list of Publication identifiers corresponding to publications that have converged according to the criterion.

\n
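For example (hypothetical numbers, assuming an expanded Atlas atl):

    from sciterra.mapping.tracing import search_converged_ids

    # ids of publications whose kernel_size nearest neighborhoods were stable
    # over the last 100 additions
    stable_ids = search_converged_ids(atl, num_pubs_added=100, kernel_size=16)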
\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tnum_pubs_added: int,\tkernel_size: int = 16) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer", "kind": "class", "doc": "

Convenience data structure for bookkeeping expansions of an Atlas that reduces boilerplate and ensures an aligned update history between the Atlas and Cartographer.

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.__init__", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.__init__", "kind": "function", "doc": "

Convenience wrapper data structure for tracked expansions, aligning the history of a Cartographer with an Atlas.

\n\n
Arguments:
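A hypothetical setup: the paths are placeholders, librarian_name='S2' is an assumption about the registered librarian keys, and 'SciBERT' is a key from sciterra.vectorization.vectorizers:

    from sciterra.mapping.tracing import AtlasTracer

    tracer = AtlasTracer(
        atlas_dir='atlas/',
        atlas_center_bibtex='center.bib',
        librarian_name='S2',          # assumed librarian key
        vectorizer_name='SciBERT',
    )
    tracer.expand_atlas(target_size=10_000)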
\n\n\n", "signature": "(\tatlas_dir: str,\tatlas_center_bibtex: str,\tlibrarian_name: str,\tvectorizer_name: str,\tlibrarian_kwargs: dict = {},\tvectorizer_kwargs: dict = {})"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.cartographer", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.cartographer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas_dir", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas_dir", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.atlas", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.mapping.tracing.AtlasTracer.expand_atlas", "modulename": "sciterra.mapping.tracing", "qualname": "AtlasTracer.expand_atlas", "kind": "function", "doc": "

Start or continue the expansion of the Atlas by calling iterate_expand with an aligned Cartographer and Atlas, centered by default on atl.center.

\n\n
Arguments:
\n\n\n", "signature": "(self, target_size: int, **kwargs) -> None:", "funcdef": "def"}, {"fullname": "sciterra.misc", "modulename": "sciterra.misc", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.misc.analysis", "modulename": "sciterra.misc.analysis", "kind": "module", "doc": "

Helper functions for analyzing data yielded by an atlas.

\n"}, {"fullname": "sciterra.misc.analysis.atlas_to_measurements", "modulename": "sciterra.misc.analysis", "qualname": "atlas_to_measurements", "kind": "function", "doc": "

Compute the density, edginess, and citations-per-year metrics for each publication in an atlas w.r.t. a vectorizer and convergence configurations, and return the results in a dataframe.

\n\n
Arguments:
\n\n\n", "signature": "(\tatl: sciterra.mapping.atlas.Atlas,\tvectorizer: sciterra.vectorization.vectorizer.Vectorizer,\tcon_d: float,\tkernel_size=16,\tmetrics: list[str] = ['density', 'edginess'],\tfields_of_study=None,\tmax_year: int = 2023) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils", "modulename": "sciterra.misc.utils", "kind": "module", "doc": "

Miscellaneous helper functions.

\n"}, {"fullname": "sciterra.misc.utils.standardize_month", "modulename": "sciterra.misc.utils", "qualname": "standardize_month", "kind": "function", "doc": "

\n", "signature": "(month: str) -> str:", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.keep_trying", "modulename": "sciterra.misc.utils", "qualname": "keep_trying", "kind": "function", "doc": "

Sometimes we receive server errors. We don't want that to disrupt the entire process, so this decorator allows retrying up to n_attempts times.

\n\n

API_extension::get_data_via_api

\n\n

This decorator is general, except for the default allowed exceptions.

\n\n
Arguments:
\n\n\n\n
Example Usage:
\n\n
\n
\n

@keep_trying(n_attempts=4)\ndef try_to_call_web_api():\n \"do stuff\"

\n
\n
\n", "signature": "(\tn_attempts=5,\tallowed_exceptions=(<class 'requests.exceptions.ReadTimeout'>, <class 'requests.exceptions.ConnectionError'>),\tverbose=True,\tsleep_after_attempt=1):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.chunk_ids", "modulename": "sciterra.misc.utils", "qualname": "chunk_ids", "kind": "function", "doc": "

Helper function to chunk bibcodes or paperIds into smaller sublists if appropriate.

\n", "signature": "(ids: list[str], call_size):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.write_pickle", "modulename": "sciterra.misc.utils", "qualname": "write_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str, data):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.read_pickle", "modulename": "sciterra.misc.utils", "qualname": "read_pickle", "kind": "function", "doc": "

\n", "signature": "(fn: str):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.get_verbose", "modulename": "sciterra.misc.utils", "qualname": "get_verbose", "kind": "function", "doc": "

\n", "signature": "(kwargs: dict):", "funcdef": "def"}, {"fullname": "sciterra.misc.utils.custom_formatwarning", "modulename": "sciterra.misc.utils", "qualname": "custom_formatwarning", "kind": "function", "doc": "

\n", "signature": "(msg, *args, **kwargs):", "funcdef": "def"}, {"fullname": "sciterra.vectorization", "modulename": "sciterra.vectorization", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.vectorizers", "modulename": "sciterra.vectorization", "qualname": "vectorizers", "kind": "variable", "doc": "

\n", "default_value": "{'GPT2': <class 'sciterra.vectorization.gpt2.GPT2Vectorizer'>, 'SciBERT': <class 'sciterra.vectorization.scibert.SciBERTVectorizer'>, 'SBERT': <class 'sciterra.vectorization.sbert.SBERTVectorizer'>, 'Word2Vec': <class 'sciterra.vectorization.word2vec.Word2VecVectorizer'>, 'BOW': <class 'sciterra.vectorization.bow.BOWVectorizer'>}"}, {"fullname": "sciterra.vectorization.bow", "modulename": "sciterra.vectorization.bow", "kind": "module", "doc": "

Bag-of-words document embedder. Unlike the cc vectorization, we fix the embeddings to a common dimension; this requires fixing the vocabulary, which for consistency we construct via the same method as the Word2Vec vocabulary.

\n"}, {"fullname": "sciterra.vectorization.bow.current_file_abs_path", "modulename": "sciterra.vectorization.bow", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.bow.corpora_path", "modulename": "sciterra.vectorization.bow", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.bow.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.bow", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.__init__", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.__init__", "kind": "function", "doc": "

Construct a bag-of-words document vectorizer.

\n", "signature": "(*args, **kwargs)"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.word2vec_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.word2vec_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.vocabulary", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.vocabulary", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embedding_dim", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embedding_dim", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.count_vectorizer", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.count_vectorizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.bow.BOWVectorizer.embed_documents", "modulename": "sciterra.vectorization.bow", "qualname": "BOWVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into bag-of-words document vectors using scikit-learn's CountVectorizer.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, len(self.vocabulary))

\n
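A minimal sketch of the fixed-vocabulary idea with scikit-learn's CountVectorizer (toy vocabulary, not the Word2Vec-derived one):

    from sklearn.feature_extraction.text import CountVectorizer

    vocab = ['galaxy', 'star', 'cluster']     # toy stand-in vocabulary
    cv = CountVectorizer(vocabulary=vocab)    # fixed vocab: no fitting needed
    X = cv.transform(['star cluster in a galaxy', 'star star'])
    print(X.toarray())                        # shape (num_documents, len(vocab))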
\n", "signature": "(self, docs: list[str], **kwargs) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.gpt2", "modulename": "sciterra.vectorization.gpt2", "kind": "module", "doc": "

GPT-2 is a large causal language model created by OpenAI that achieved SOTA on many NLP tasks before its successors.

\n\n
Links:
\n\n
\n \n
\n"}, {"fullname": "sciterra.vectorization.gpt2.MPS_DEVICE", "modulename": "sciterra.vectorization.gpt2", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.gpt2.EMBEDDING_DIM", "modulename": "sciterra.vectorization.gpt2", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.gpt2.BATCH_SIZE", "modulename": "sciterra.vectorization.gpt2", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "8"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.__init__", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.tokenizer", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.model", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.gpt2.GPT2Vectorizer.embed_documents", "modulename": "sciterra.vectorization.gpt2", "qualname": "GPT2Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into GPT-2 vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, embedding_dim)

\n
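A hedged sketch of one way to pool GPT-2 hidden states into document vectors (the pooling strategy here is an assumption, not necessarily the library's choice):

    import torch
    from transformers import GPT2Tokenizer, GPT2Model

    tok = GPT2Tokenizer.from_pretrained('gpt2')
    tok.pad_token = tok.eos_token                # GPT-2 ships without a pad token
    model = GPT2Model.from_pretrained('gpt2').eval()

    batch = tok(['an abstract', 'another abstract'],
                return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state    # (batch, seq, 768)
    mask = batch['attention_mask'].unsqueeze(-1)
    vectors = (hidden * mask).sum(1) / mask.sum(1)   # mean over real tokens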
\n", "signature": "(self, docs: list[str], batch_size: int = 8) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.preprocessing", "modulename": "sciterra.vectorization.preprocessing", "kind": "module", "doc": "

Simple preprocessing of scientific abstracts prior to vectorization.

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor", "kind": "class", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.__init__", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.__init__", "kind": "function", "doc": "

Initialize a custom tokenizer.

\n\n
Arguments:
\n\n\n", "signature": "(\tallowed_pos_tags: set = {'NOUN', 'VERB', 'ADJ'},\tmodel='en_core_web_sm')"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.nlp", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.nlp", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.allowed_pos_tags", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.allowed_pos_tags", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.preprocessing.CustomPreprocessor.custom_preprocess", "modulename": "sciterra.vectorization.preprocessing", "qualname": "CustomPreprocessor.custom_preprocess", "kind": "function", "doc": "

Get all of the lemmas of the words in a document, filtering by POS.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a list of the lemmatized, filtered words in the document

\n
\n\n

Given the domain-specificity, we choose to stem heuristically instead of performing full, linguistically precise lemmatization, which would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.

\n\n

See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
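For instance, POS-filtered lemma extraction with spaCy looks roughly like this (assumes en_core_web_sm is installed):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    allowed = {'NOUN', 'VERB', 'ADJ'}
    doc = nlp('We observed rapidly rotating galaxies.')
    print([tok.lemma_ for tok in doc if tok.pos_ in allowed])
    # e.g. ['observe', 'rotate', 'galaxy']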

\n", "signature": "(self, document: str) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection", "modulename": "sciterra.vectorization.projection", "kind": "module", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection", "modulename": "sciterra.vectorization.projection", "qualname": "Projection", "kind": "class", "doc": "

Basic wrapper for document embeddings and helper methods.

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.__init__", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.__init__", "kind": "function", "doc": "

Construct a Projection object, a bidirectional mapping from identifiers to document embeddings.

\n\n
Arguments:
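A toy two-document projection (384 is a hypothetical embedding dimension):

    import numpy as np
    from sciterra.vectorization.projection import Projection

    proj = Projection(
        identifier_to_index={'paperA': 0, 'paperB': 1},
        index_to_identifier=('paperA', 'paperB'),
        embeddings=np.zeros((2, 384)),
    )
    proj.identifiers_to_embeddings(['paperB'])   # -> array of shape (1, 384)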
\n\n\n", "signature": "(\tidentifier_to_index: dict[str, int],\tindex_to_identifier: tuple[str],\tembeddings: numpy.ndarray)"}, {"fullname": "sciterra.vectorization.projection.Projection.identifier_to_index", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifier_to_index", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.index_to_identifier", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.index_to_identifier", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.embeddings", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.projection.Projection.indices_to_identifiers", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.indices_to_identifiers", "kind": "function", "doc": "

Retrieve the identifiers for a list of embedding matrix indices.

\n", "signature": "(self, indices) -> list[str]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_embeddings", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_embeddings", "kind": "function", "doc": "

Retrieve the document embeddings for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.Projection.identifiers_to_indices", "modulename": "sciterra.vectorization.projection", "qualname": "Projection.identifiers_to_indices", "kind": "function", "doc": "

Retrieve the embedding indices for a list of identifiers.

\n", "signature": "(self, identifiers: list[str]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.merge", "modulename": "sciterra.vectorization.projection", "qualname": "merge", "kind": "function", "doc": "

Return the result of merging projection proj_a with projection proj_b.

\n\n

This adds to proj_a all embedding data contained in proj_b that is missing from proj_a. This means that the resulting projection can only be greater than or equal in size to proj_a.

\n", "signature": "(\tproj_a: sciterra.vectorization.projection.Projection,\tproj_b: sciterra.vectorization.projection.Projection) -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.projection.get_empty_projection", "modulename": "sciterra.vectorization.projection", "qualname": "get_empty_projection", "kind": "function", "doc": "

Construct a Projection with no data (but which is not None).

\n", "signature": "() -> sciterra.vectorization.projection.Projection:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.sbert", "modulename": "sciterra.vectorization.sbert", "kind": "module", "doc": "

We use the acronym SBERT as a catch-all for BERT-based sentence transformers. In particular, we use a lightweight, fast version of one of the top-performing models.

\n\n
Links:
\n\n
\n

sbert: https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models.\n HF: https://huggingface.co/sentence-transformers

\n
\n"}, {"fullname": "sciterra.vectorization.sbert.MPS_DEVICE", "modulename": "sciterra.vectorization.sbert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.sbert.MODEL_PATH", "modulename": "sciterra.vectorization.sbert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'all-MiniLM-L6-v2'"}, {"fullname": "sciterra.vectorization.sbert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.sbert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "384"}, {"fullname": "sciterra.vectorization.sbert.MAX_SEQ_LENGTH", "modulename": "sciterra.vectorization.sbert", "qualname": "MAX_SEQ_LENGTH", "kind": "variable", "doc": "

\n", "default_value": "256"}, {"fullname": "sciterra.vectorization.sbert.BATCH_SIZE", "modulename": "sciterra.vectorization.sbert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.__init__", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.model", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.sbert.SBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.sbert", "qualname": "SBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SBERT vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 384)

\n
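The core of this is the sentence-transformers encode call; a minimal sketch using the MODEL_PATH above:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = model.encode(['an abstract', 'another abstract'], batch_size=64)
    print(vectors.shape)                          # (2, 384)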
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.scibert", "modulename": "sciterra.vectorization.scibert", "kind": "module", "doc": "

SciBERT is a BERT model trained on scientific text.

\n\n
Links:
\n\n
\n

Paper: https://aclanthology.org/D19-1371/\n Github: https://github.com/allenai/scibert\n HF: https://huggingface.co/allenai/scibert_scivocab_uncased

\n
\n"}, {"fullname": "sciterra.vectorization.scibert.MPS_DEVICE", "modulename": "sciterra.vectorization.scibert", "qualname": "MPS_DEVICE", "kind": "variable", "doc": "

\n", "default_value": "device(type='mps')"}, {"fullname": "sciterra.vectorization.scibert.MODEL_PATH", "modulename": "sciterra.vectorization.scibert", "qualname": "MODEL_PATH", "kind": "variable", "doc": "

\n", "default_value": "'allenai/scibert_scivocab_uncased'"}, {"fullname": "sciterra.vectorization.scibert.EMBEDDING_DIM", "modulename": "sciterra.vectorization.scibert", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "768"}, {"fullname": "sciterra.vectorization.scibert.BATCH_SIZE", "modulename": "sciterra.vectorization.scibert", "qualname": "BATCH_SIZE", "kind": "variable", "doc": "

\n", "default_value": "64"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.__init__", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.__init__", "kind": "function", "doc": "

\n", "signature": "(device='cuda', **kwargs)"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.tokenizer", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.tokenizer", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.model", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.scibert.SciBERTVectorizer.embed_documents", "modulename": "sciterra.vectorization.scibert", "qualname": "SciBERTVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into SciBERT vectors by batching.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a numpy array of shape (num_documents, 768)

\n
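A sketch with Hugging Face transformers; pooling via the [CLS] token is an assumption, not necessarily the library's choice:

    import torch
    from transformers import AutoTokenizer, AutoModel

    tok = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased').eval()
    batch = tok(['an abstract'], return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        cls_vec = model(**batch).last_hidden_state[:, 0]   # (1, 768) [CLS] vector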
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.vectorizer", "modulename": "sciterra.vectorization.vectorizer", "kind": "module", "doc": "

Base class for vectorizing abstracts.

\n"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "abc.ABC"}, {"fullname": "sciterra.vectorization.vectorizer.Vectorizer.embed_documents", "modulename": "sciterra.vectorization.vectorizer", "qualname": "Vectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents into document vectors.

\n\n
Arguments:
\n\n\n\n
Returns:
\n\n
\n

a dict of the form\n {\n \"embeddings\": a numpy array of shape (num_successful, embedding_dim), containing the document embeddings

\n\n
\"success_indices\": a numpy array of shape `(num_successful,)`, containing the indices of all the documents for which document embeddings were successfully obtained.\n\n\"fail_indices\": a numpy array of shape `(len(docs) - num_successful,)`, containing the indices of all the documents for which document embeddings could not be obtained\n
\n \n

}\n where the indices are with respect to the docs list passed.

\n
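A hypothetical consumer of this return structure, assuming a concrete Vectorizer instance vectorizer and a list of strings docs, realigning documents with their embeddings:

    result = vectorizer.embed_documents(docs)
    embeddings = result['embeddings']        # (num_successful, embedding_dim)
    ok = result['success_indices']
    kept_docs = [docs[i] for i in ok]        # kept_docs[j] matches embeddings[j]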
\n", "signature": "(self, docs: list[str], batch_size: int = 64) -> dict[str, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "sciterra.vectorization.word2vec", "modulename": "sciterra.vectorization.word2vec", "kind": "module", "doc": "

We use a simple word2vec model that gets a document vector by averaging the vectors of all words in the document.

\n\n

Since we are getting vectors for scientific documents, we must train the model from scratch on a domain corpus in order to build an appropriate vocabulary. Therefore we define different subclasses for each scientific field, since fields may differ substantially in vocabulary.

\n\n

Gensim also provides a Doc2Vec module, but empirically Word2Vec plus averaging seems to do just as well; furthermore, we're mainly interested in a simple baseline to compare with more sophisticated embeddings.

\n\n
Links:
\n\n
\n

gensim: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#

\n
\n"}, {"fullname": "sciterra.vectorization.word2vec.EMBEDDING_DIM", "modulename": "sciterra.vectorization.word2vec", "qualname": "EMBEDDING_DIM", "kind": "variable", "doc": "

\n", "default_value": "300"}, {"fullname": "sciterra.vectorization.word2vec.current_file_abs_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "current_file_abs_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization'"}, {"fullname": "sciterra.vectorization.word2vec.corpora_path", "modulename": "sciterra.vectorization.word2vec", "qualname": "corpora_path", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora'"}, {"fullname": "sciterra.vectorization.word2vec.ASTROPHYSICS_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "ASTROPHYSICS_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.DEFAULT_CORPUS", "modulename": "sciterra.vectorization.word2vec", "qualname": "DEFAULT_CORPUS", "kind": "variable", "doc": "

\n", "default_value": "'/Users/nathanielimel/uci/projects/sciterra/src/sciterra/vectorization/corpora/astro_small.txt'"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer", "kind": "class", "doc": "

Helper class that provides a standard way to create an ABC using\ninheritance.

\n", "bases": "sciterra.vectorization.vectorizer.Vectorizer"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.__init__", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.__init__", "kind": "function", "doc": "

Construct a Word2Vec based document embedding model from a corpus.

\n", "signature": "(\tcorpus_path: str,\tmodel_path: str = None,\tvector_size: int = 300,\twindow: int = 5,\tmin_count: int = 2,\tworkers: int = 8,\tepochs: int = 10,\ttokenizer: Callable[[str], list[str]] = None,\t**kwargs)"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.model", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.model", "kind": "variable", "doc": "

\n"}, {"fullname": "sciterra.vectorization.word2vec.Word2VecVectorizer.embed_documents", "modulename": "sciterra.vectorization.word2vec", "qualname": "Word2VecVectorizer.embed_documents", "kind": "function", "doc": "

Embed a list of documents (raw text) into word2vec document vectors by averaging the word vectors in each of the documents.

\n\n

Since there's no speedup via batching like there is in pytorch models, we iterate one document at a time.
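A gensim sketch of the per-document averaging (toy corpus; the real model trains on DEFAULT_CORPUS, and vector_size=300 matches EMBEDDING_DIM above):

    import numpy as np
    from gensim.models import Word2Vec

    w2v = Word2Vec(sentences=[['star', 'galaxy'], ['dark', 'matter']],
                   vector_size=300, min_count=1)
    tokens = [t for t in 'star galaxy matter'.split() if t in w2v.wv]
    doc_vec = np.mean([w2v.wv[t] for t in tokens], axis=0)   # shape (300,)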

\n", "signature": "(self, docs: list[str], **kwargs) -> numpy.ndarray:", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough.