From e1d17b32a6b44f339e6fbf0cc715a7531f3899bf Mon Sep 17 00:00:00 2001
From: Nathaniel Imel
Date: Sun, 12 Nov 2023 18:26:57 -0800
Subject: [PATCH] define preprocessor as class so docs workflow doesn't need
 to download spacy model

---
 src/sciterra/vectorization/preprocessing.py  |  76 +++++++++++-------
 src/sciterra/vectorization/word2vec.py       |  10 ++-
 .../data/models/word2vec_model_example.model | Bin 71304 -> 72579 bytes
 src/tests/test_vectorization.py              |   2 -
 4 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/src/sciterra/vectorization/preprocessing.py b/src/sciterra/vectorization/preprocessing.py
index f211c6a..bb0869f 100644
--- a/src/sciterra/vectorization/preprocessing.py
+++ b/src/sciterra/vectorization/preprocessing.py
@@ -2,37 +2,53 @@
 import spacy
 
-model = "en_core_web_sm"
-try:
-    nlp = spacy.load(model)
-except OSError:
-    raise OSError(f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!")
-
 # Another off the shelf simple tokenizer
 from gensim.utils import simple_preprocess
 
 
-def custom_preprocess(
-    document: str,
-    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
-) -> list[str]:
-    """Get all of the lemmas of the words in a document, filtering by POS.
-
-    Args:
-        document: a multi-sentence string
-
-        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
-
-    Returns:
-        a list of the lemmatized, filtered words in the document
-
-    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming
-
-    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
-    """
-    return [
-        token.lemma_
-        for sent in nlp(document).sents
-        for token in sent
-        if token.pos_ in allowed_pos_tags
-    ]
+class CustomPreprocessor:
+    def __init__(
+        self,
+        allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
+        model: str = "en_core_web_sm",
+    ) -> None:
+        """Initialize a custom tokenizer.
+
+        Args:
+            allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
+
+            model: the name of the spacy language model to load, assuming it is already downloaded.
+        """
+        try:
+            nlp = spacy.load(model)
+        except OSError:
+            raise OSError(
+                f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!"
+            )
+
+        self.nlp = nlp
+        self.allowed_pos_tags = allowed_pos_tags
+
+    def custom_preprocess(
+        self,
+        document: str,
+    ) -> list[str]:
+        """Get all of the lemmas of the words in a document, filtering by POS.
+
+        Args:
+            document: a multi-sentence string
+
+        Returns:
+            a list of the lemmatized, filtered words in the document
+
+        Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
+
+        See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
+        """
+        return [
+            token.lemma_
+            for sent in self.nlp(document).sents
+            for token in sent
+            if token.pos_ in self.allowed_pos_tags
+        ]
diff --git a/src/sciterra/vectorization/word2vec.py b/src/sciterra/vectorization/word2vec.py
index 23fa0b6..c92d0db 100644
--- a/src/sciterra/vectorization/word2vec.py
+++ b/src/sciterra/vectorization/word2vec.py
@@ -14,7 +14,7 @@
 import numpy as np
 
 from .vectorizer import Vectorizer
-from .preprocessing import custom_preprocess
+from .preprocessing import CustomPreprocessor
 
 from tqdm import tqdm
 from typing import Callable
@@ -43,13 +43,17 @@ def __init__(
         min_count: int = 2,
         workers: int = cpu_count(),
         epochs: int = 10,
-        tokenizer: Callable[[str], list[str]] = custom_preprocess,
+        tokenizer: Callable[[str], list[str]] = None,
         **kwargs,
     ) -> None:
         """Construct a Word2Vec based document embedding model from a corpus."""
         super().__init__()
 
-        self.tokenizer = tokenizer
+        if tokenizer is None:
+            preprocessor = CustomPreprocessor()
+            self.tokenizer = preprocessor.custom_preprocess
+        else:
+            self.tokenizer = tokenizer
 
         if (model_path is None) or (not os.path.exists(model_path)):
             start = time.time()
diff --git a/src/tests/data/models/word2vec_model_example.model b/src/tests/data/models/word2vec_model_example.model
index e971c6104b7da668b13fcd1b0f28d4de5174c295..f82d8be579b11bef4c4a588a0879f6bc9eded235 100644
GIT binary patch
delta 309
zcmeBJ%hJ4^g|&fYs*m(WR*yN8f6s{EH#E00v9vNW(K9eJGB-1tEHG^!W65O2S*DJq
zj0_AtQbq
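
A minimal usage sketch of the new CustomPreprocessor follows, for reference. It assumes the sciterra package is importable from the src layout above and that the spacy model has already been downloaded; the sample document string is purely illustrative.

    from sciterra.vectorization.preprocessing import CustomPreprocessor

    # The spacy model is loaded at construction time, so it must already be
    # available, e.g. via: python3 -m spacy download en_core_web_sm
    preprocessor = CustomPreprocessor()  # defaults: {"NOUN", "VERB", "ADJ"} tags, "en_core_web_sm"

    # Lemmas of the nouns, verbs, and adjectives in the document.
    tokens = preprocessor.custom_preprocess(
        "Dark matter halos grow by accreting smaller structures."
    )

With this change, the Word2Vec vectorizer only constructs a CustomPreprocessor, and therefore only loads the spacy model, when no tokenizer is passed to its constructor, which is what lets the docs workflow skip downloading the model.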