Skip to content

Commit

Permalink
batch process cosine sim for large arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
Nathaniel Imel authored and Nathaniel Imel committed Oct 30, 2023
1 parent 96e2b3c commit a646bf9
Show file tree
Hide file tree
Showing 6 changed files with 28,836 additions and 306 deletions.
20 changes: 16 additions & 4 deletions src/examples/scratch/data_from_cc.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,21 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'cc'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/nathanielimel/uci/projects/sciterra/src/examples/scratch/data_from_cc.ipynb Cell 2\u001b[0m line \u001b[0;36m8\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/nathanielimel/uci/projects/sciterra/src/examples/scratch/data_from_cc.ipynb#W1sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mcollections\u001b[39;00m \u001b[39mimport\u001b[39;00m Counter\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/nathanielimel/uci/projects/sciterra/src/examples/scratch/data_from_cc.ipynb#W1sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39msciterra\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/nathanielimel/uci/projects/sciterra/src/examples/scratch/data_from_cc.ipynb#W1sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mcc\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'cc'"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
Expand All @@ -32,7 +44,7 @@
"outputs": [],
"source": [
"# sciterra_atlas_dir = \"outputs/atlas_from_cc_region_0/\"\n",
"sciterra_atlas_dir = \"outputs/atlas_from_cc_region_8/\""
"sciterra_atlas_dir = \"outputs/atlas_from_cc_region_7/\""
]
},
{
Expand Down Expand Up @@ -388,7 +400,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.3"
},
"orig_nbformat": 4
},
Expand Down
43 changes: 43 additions & 0 deletions src/examples/scratch/run_topography.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import numpy as np
import pandas as pd
import plotnine as pn

from datetime import datetime, date

from sciterra.mapping.atlas import Atlas
from sciterra.mapping.cartography import Cartographer
from sciterra.vectorization.scibert import SciBERTVectorizer

def _citations_per_year(publication, reference_year, upper_bound):
    """Return the yearly citation rate of *publication*, or None to drop it.

    The rate is ``citation_count / (reference_year - publication year)``.
    None is returned (so that ``DataFrame.dropna`` later removes the row) when:

    * the citation count is missing,
    * the publication date is missing (NOTE(review): whether sciterra ever
      yields a publication without a date is not visible here; the guard just
      avoids an AttributeError if it does),
    * the publication year is not strictly before *reference_year* (this also
      avoids a division by zero),
    * the resulting rate is not strictly between 0 and *upper_bound*.
    """
    published = publication.publication_date
    count = publication.citation_count
    if published is None or count is None:
        return None

    years_elapsed = reference_year - published.year
    if years_elapsed <= 0:
        return None

    rate = count / years_elapsed
    return rate if 0.0 < rate < upper_bound else None


def main(
    atlas_dir="outputs/atlas_from_cc_region_8/",
    output_csv="sciterra_data_from_cc_region_8.csv",
    *,
    device="mps",
    reference_year=2023,
    max_citations_per_year=100.0,
):
    """Measure the topography of a saved atlas and write it to CSV.

    Loads the atlas stored in *atlas_dir*, measures "density" and "edginess"
    for it, attaches a citations-per-year column, drops every row that
    contains a missing value and writes the result to *output_csv*.

    All defaults reproduce the original hard-coded run, so ``main()`` with no
    arguments behaves as before.

    Args:
        atlas_dir: Directory passed to ``Atlas.load``. Another atlas used
            during development was
            ``"outputs/atlas_s2-7-29-23_centered_imeletal"``.
        output_csv: Path of the CSV file to write (no index column).
        device: Device string handed to ``SciBERTVectorizer``.
        reference_year: Year against which publication age is computed.
            Publications from this year or later get no citation rate.
        max_citations_per_year: Exclusive upper bound on the citation rate;
            publications at or above it are dropped.
    """
    atl = Atlas.load(atlas_dir)
    print(len(atl))

    vectorizer = SciBERTVectorizer(device=device)
    crt = Cartographer(vectorizer=vectorizer)

    metrics = ["density", "edginess"]
    measurements = crt.measure_topography(atl, metrics=metrics)

    # One entry per identifier in the projection. Assigning this list as a
    # column below requires it to have as many entries as `measurements` has
    # rows -- presumably both follow the projection's index order; TODO confirm.
    citations_per_year = [
        _citations_per_year(
            atl[identifier], reference_year, max_citations_per_year
        )
        for identifier in atl.projection.index_to_identifier
    ]

    df = pd.DataFrame(measurements, columns=list(metrics))
    df["citations_per_year"] = citations_per_year
    # Removes rows where any column is missing, including the None entries
    # produced above. NOTE(review): the original author observed NaNs still
    # showing up downstream of this call; the cause is not visible here.
    df.dropna(inplace=True)
    df.to_csv(output_csv, index=False)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading

0 comments on commit a646bf9

Please sign in to comment.