
Commit

define preprocessor as class so docs workflow doesn't need to download spacy model
Nathaniel Imel authored and Nathaniel Imel committed Nov 13, 2023
1 parent 2cd3735 commit e1d17b3
Showing 4 changed files with 51 additions and 35 deletions.
76 changes: 46 additions & 30 deletions src/sciterra/vectorization/preprocessing.py
@@ -2,37 +2,53 @@

import spacy

model = "en_core_web_sm"
try:
    nlp = spacy.load(model)
except OSError:
    raise OSError(f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!")

# Another off the shelf simple tokenizer
from gensim.utils import simple_preprocess


def custom_preprocess(
    document: str,
    allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
) -> list[str]:
    """Get all of the lemmas of the words in a document, filtering by POS.
    Args:
        document: a multi-sentence string
        allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
    Returns:
        a list of the lemmatized, filtered words in the document
    Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
    See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
    """
    return [
        token.lemma_
        for sent in nlp(document).sents
        for token in sent
        if token.pos_ in allowed_pos_tags
    ]
class CustomPreprocessor:
    def __init__(
        self,
        allowed_pos_tags: set = {"NOUN", "VERB", "ADJ"},
        model="en_core_web_sm",
    ) -> None:
        """Initialize a custom tokenizer.
        Args:
            allowed_pos_tags: keep and lemmatize words that are tagged as one of these POS categories.
            model: the name of the spacy language model to load, assuming it is already downloaded.
        """
        try:
            nlp = spacy.load(model)
        except OSError:
            raise OSError(
                f"Can't find model '{model}'; make sure you have run 'python3 -m spacy download {model}'!"
            )

        self.nlp = nlp
        self.allowed_pos_tags = allowed_pos_tags

    def custom_preprocess(
        self,
        document: str,
    ) -> list[str]:
        """Get all of the lemmas of the words in a document, filtering by POS.
        Args:
            document: a multi-sentence string
        Returns:
            a list of the lemmatized, filtered words in the document
        Given the domain-specificity, we choose to heuristically stem instead of performing full, linguistically precise lemmatization that would require detailed vocabulary rules. That said, the nltk WordNet lemmatizer doesn't immediately seem to do better than basic stemming.
        See https://github.com/zhafen/cc/blob/master/cc/utils.py#L173.
        """
        return [
            token.lemma_
            for sent in self.nlp(document).sents
            for token in sent
            if token.pos_ in self.allowed_pos_tags
        ]
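
With the preprocessor wrapped in a class, importing sciterra.vectorization.preprocessing no longer calls spacy.load at module import time; the model is only loaded when a CustomPreprocessor is instantiated, which is what lets the docs build skip downloading it. A minimal usage sketch (the example document and the shown output are illustrative, and it still assumes en_core_web_sm has been downloaded locally):

from sciterra.vectorization.preprocessing import CustomPreprocessor

# The spacy model is loaded here, at construction time, not at import time.
preprocessor = CustomPreprocessor()  # defaults: NOUN/VERB/ADJ tags, en_core_web_sm

tokens = preprocessor.custom_preprocess(
    "Galaxies form stars. Star formation depends on gas density."
)
# roughly: ['galaxy', 'form', 'star', 'star', 'formation', 'depend', 'gas', 'density']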
8 changes: 5 additions & 3 deletions src/sciterra/vectorization/word2vec.py
Expand Up @@ -14,7 +14,7 @@
import numpy as np

from .vectorizer import Vectorizer
from .preprocessing import custom_preprocess
from .preprocessing import CustomPreprocessor
from tqdm import tqdm
from typing import Callable

@@ -43,13 +43,15 @@ def __init__(
        min_count: int = 2,
        workers: int = cpu_count(),
        epochs: int = 10,
        tokenizer: Callable[[str], list[str]] = custom_preprocess,
        tokenizer: Callable[[str], list[str]] = None,
        **kwargs,
    ) -> None:
        """Construct a Word2Vec based document embedding model from a corpus."""
        super().__init__()

        self.tokenizer = tokenizer
        if tokenizer is None:
            preprocessor = CustomPreprocessor()
            self.tokenizer = preprocessor.custom_preprocess

        if (model_path is None) or (not os.path.exists(model_path)):
            start = time.time()
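
The word2vec change applies the same idea: the tokenizer parameter now defaults to None instead of a module-level function, and the spacy-backed preprocessor is only constructed inside __init__ when the caller does not supply a tokenizer. A standalone sketch of that fallback pattern (the helper name here is illustrative, not part of the vectorizer's API):

from typing import Callable, Optional

from sciterra.vectorization.preprocessing import CustomPreprocessor


def resolve_tokenizer(
    tokenizer: Optional[Callable[[str], list[str]]] = None,
) -> Callable[[str], list[str]]:
    # Mirrors the constructor logic above: fall back to CustomPreprocessor,
    # and hence load the spacy model, only when no tokenizer is supplied.
    if tokenizer is None:
        preprocessor = CustomPreprocessor()
        return preprocessor.custom_preprocess
    return tokenizer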
Binary file modified src/tests/data/models/word2vec_model_example.model
Binary file not shown.
2 changes: 0 additions & 2 deletions src/tests/test_vectorization.py
@@ -175,8 +175,6 @@ def test_basic_cosine_matrix(self):
        # like pair above, but pretending that we have more than 2 publications.

        num_pubs = 300
        # n.b., 1000 typically takes 83.75s with mps; Colab cuda takes just 29s
        # github actions just uses cpu, so maybe don't waste time stress testing

        embeddings = np.array(
            [

