
Commit

define preprocessor as class so docs workflow doesn't need to download spacy model
Nathaniel Imel authored and Nathaniel Imel committed Nov 13, 2023
1 parent 2cd3735 commit e1d17b3
Showing 4 changed files with 51 additions and 35 deletions.
76 changes: 46 additions & 30 deletions src/sciterra/vectorization/preprocessing.py
@@ -2,37 +2,53 @@

import spacy

model = "en_core_web_sm"
try:
    nlp = spacy.load(model)
except OSError:
    raise OSError(f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!")

# Another off the shelf simple tokenizer
from gensim.utils import simple_preprocess


def custom_preprocess(
    document: str,
    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
) -> list[str]:
    """Get all of the lemmas of the words in a document, filtering by POS.
    Args:
        document: a multi-sentence string
        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    Returns:
        a list of the lemmatized, filtered words in the document
    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
    """
    return [
        token.lemma_
        for sent in nlp(document).sents
        for token in sent
        if token.pos_ in allowed_pos_tags
    ]
class CustomPreprocessor:
    def __init__(
        self,
        allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
        model="en_core_web_sm",
    ) -> None:
        """Initialize a custom tokenizer.
        Args:
            allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
            model: the name of the spacy language model to load, assuming it is already downloaded.
        """
        try:
            nlp = spacy.load(model)
        except OSError:
            raise OSError(
                f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!"
            )

        self.nlp = nlp
        self.allowed_pos_tags = allowed_pos_tags

    def custom_preprocess(
        self,
        document: str,
    ) -> list[str]:
        """Get all of the lemmas of the words in a document, filtering by POS.
        Args:
            document: a multi-sentence string
        Returns:
            a list of the lemmatized, filtered words in the document
        Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
        See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
        """
        return [
            token.lemma_
            for sent in self.nlp(document).sents
            for token in sent
            if token.pos_ in self.allowed_pos_tags
        ]
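
With the preprocessor wrapped in a class, importing sciterra.vectorization.preprocessing no longer calls spacy.load at module import time; the model is only loaded when a CustomPreprocessor is instantiated, which is what lets the docs build skip downloading it. A minimal usage sketch (the example document and the shown output are illustrative, and it still assumes en_core_web_sm has been downloaded locally):

from sciterra.vectorization.preprocessing import CustomPreprocessor

# The spacy model is loaded here, at construction time, not at import time.
preprocessor = CustomPreprocessor()  # defaults: NOUN/VERB/ADJ tags, en_core_web_sm

tokens = preprocessor.custom_preprocess(
    "Galaxies form stars. Star formation depends on gas density."
)
# roughly: ['galaxy', 'form', 'star', 'star', 'formation', 'depend', 'gas', 'density']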
8 changes: 5 additions & 3 deletions src/sciterra/vectorization/word2vec.py
Expand Up @@ -14,7 +14,7 @@
import numpy as np

from .vectorizer import Vectorizer
from .preprocessing import custom_preprocess
from .preprocessing import CustomPreprocessor
from tqdm import tqdm
from typing import Callable

@@ -43,13 +43,15 @@ def __init__(
        min_count: int = 2,
        workers: int = cpu_count(),
        epochs: int = 10,
        tokenizer: Callable[[str], list[str]] = custom_preprocess,
        tokenizer: Callable[[str], list[str]] = None,
        **kwargs,
    ) -> None:
        """Construct a Word2Vec based document embedding model from a corpus."""
        super().__init__()

        self.tokenizer = tokenizer
        if tokenizer is None:
            preprocessor = CustomPreprocessor()
            self.tokenizer = preprocessor.custom_preprocess

        if (model_path is None) or (not os.path.exists(model_path)):
            start = time.time()
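
The word2vec change applies the same idea: the tokenizer parameter now defaults to None instead of a module-level function, and the spacy-backed preprocessor is only constructed inside __init__ when the caller does not supply a tokenizer. A standalone sketch of that fallback pattern (the helper name here is illustrative, not part of the vectorizer's API):

from typing import Callable, Optional

from sciterra.vectorization.preprocessing import CustomPreprocessor


def resolve_tokenizer(
    tokenizer: Optional[Callable[[str], list[str]]] = None,
) -> Callable[[str], list[str]]:
    # Mirrors the constructor logic above: fall back to CustomPreprocessor,
    # and hence load the spacy model, only when no tokenizer is supplied.
    if tokenizer is None:
        preprocessor = CustomPreprocessor()
        return preprocessor.custom_preprocess
    return tokenizer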
Binary file modified src/tests/data/models/word2vec_model_example.model
Binary file not shown.
2 changes: 0 additions & 2 deletions src/tests/test_vectorization.py
@@ -175,8 +175,6 @@ def test_basic_cosine_matrix(self):
        # like pair above, but pretending that we have more than 2 publications.

        num_pubs = 300
        # n.b., 1000 typically takes 83.75s with mps; Colab cuda takes just 29s
        # github actions just uses cpu, so maybe don't waste time stress testing

        embeddings = np.array(
            [

