From e1d17b32a6b44f339e6fbf0cc715a7531f3899bf Mon Sep 17 00:00:00 2001
From: Nathaniel Imel
Date: Sun, 12 Nov 2023 18:26:57 -0800
Subject: [PATCH] define preprocessor as class so docs workflow doesn't need
 to download spacy model

---
 src/sciterra/vectorization/preprocessing.py  |  76 +++++++++++-------
 src/sciterra/vectorization/word2vec.py       |  10 ++-
 .../data/models/word2vec_model_example.model | Bin 71304 -> 72579 bytes
 src/tests/test_vectorization.py              |   2 -
 4 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/src/sciterra/vectorization/preprocessing.py b/src/sciterra/vectorization/preprocessing.py
index f211c6a..bb0869f 100644
--- a/src/sciterra/vectorization/preprocessing.py
+++ b/src/sciterra/vectorization/preprocessing.py
@@ -2,37 +2,53 @@
 import spacy
 
-model = "en_core_web_sm"
-try:
-    nlp = spacy.load(model)
-except OSError:
-    raise OSError(f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!")
-
 # Another off the shelf simple tokenizer
 from gensim.utils import simple_preprocess
 
 
-def custom_preprocess(
-    document: str,
-    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
-) -> list[str]:
-    """Get all of the lemmas of the words in a document, filtering by POS.
-
-    Args:
-        document: a multi-sentence string
-
-        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
-
-    Returns:
-        a list of the lemmatized, filtered words in the document
-
-    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming
-
-    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
-    """
-    return [
-        token.lemma_
-        for sent in nlp(document).sents
-        for token in sent
-        if token.pos_ in allowed_pos_tags
-    ]
+class CustomPreprocessor:
+    def __init__(
+        self,
+        allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
+        model: str = "en_core_web_sm",
+    ) -> None:
+        """Initialize a custom tokenizer.
+
+        Args:
+            allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
+
+            model: the name of the spacy language model to load, assuming it is already downloaded.
+        """
+        try:
+            nlp = spacy.load(model)
+        except OSError:
+            raise OSError(
+                f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!"
+            )
+
+        self.nlp = nlp
+        self.allowed_pos_tags = allowed_pos_tags
+
+    def custom_preprocess(
+        self,
+        document: str,
+    ) -> list[str]:
+        """Get all of the lemmas of the words in a document, filtering by POS.
+
+        Args:
+            document: a multi-sentence string
+
+        Returns:
+            a list of the lemmatized, filtered words in the document
+
+        Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
+
+        See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
+        """
+        return [
+            token.lemma_
+            for sent in self.nlp(document).sents
+            for token in sent
+            if token.pos_ in self.allowed_pos_tags
+        ]
diff --git a/src/sciterra/vectorization/word2vec.py b/src/sciterra/vectorization/word2vec.py
index 23fa0b6..c92d0db 100644
--- a/src/sciterra/vectorization/word2vec.py
+++ b/src/sciterra/vectorization/word2vec.py
@@ -14,7 +14,7 @@
 import numpy as np
 
 from .vectorizer import Vectorizer
-from .preprocessing import custom_preprocess
+from .preprocessing import CustomPreprocessor
 
 from tqdm import tqdm
 from typing import Callable
@@ -43,13 +43,17 @@ def __init__(
         min_count: int = 2,
         workers: int = cpu_count(),
         epochs: int = 10,
-        tokenizer: Callable[[str], list[str]] = custom_preprocess,
+        tokenizer: Callable[[str], list[str]] = None,
         **kwargs,
     ) -> None:
         """Construct a Word2Vec based document embedding model from a corpus."""
         super().__init__()
 
-        self.tokenizer = tokenizer
+        if tokenizer is None:
+            preprocessor = CustomPreprocessor()
+            self.tokenizer = preprocessor.custom_preprocess
+        else:
+            self.tokenizer = tokenizer
 
         if (model_path is None) or (not os.path.exists(model_path)):
             start = time.time()
diff --git a/src/tests/data/models/word2vec_model_example.model b/src/tests/data/models/word2vec_model_example.model
index e971c6104b7da668b13fcd1b0f28d4de5174c295..f82d8be579b11bef4c4a588a0879f6bc9eded235 100644
GIT binary patch
delta 309
zcmeBJ%hJ4^g|&fYs*m(WR*yN8f6s{EH#E00v9vNW(K9eJGB-1tEHG^!W65O2S*DJq
zj0_AtQbq
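
A minimal usage sketch of the new CustomPreprocessor follows, for reference. It assumes the sciterra package is importable from the src layout above and that the spacy model has already been downloaded; the sample document string is purely illustrative.

    from sciterra.vectorization.preprocessing import CustomPreprocessor

    # The spacy model is loaded at construction time, so it must already be
    # available, e.g. via: python3 -m spacy download en_core_web_sm
    preprocessor = CustomPreprocessor()  # defaults: {"NOUN", "VERB", "ADJ"} tags, "en_core_web_sm"

    # Lemmas of the nouns, verbs, and adjectives in the document.
    tokens = preprocessor.custom_preprocess(
        "Dark matter halos grow by accreting smaller structures."
    )

With this change, the Word2Vec vectorizer only constructs a CustomPreprocessor, and therefore only loads the spacy model, when no tokenizer is passed to its constructor, which is what lets the docs workflow skip downloading the model.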