add librarian kwargs, including s2_api_key, and batch_size as a kwarg to expansion trace
Nathaniel Imel authored and Nathaniel Imel committed Feb 6, 2024
1 parent 9f6654b commit c082257
Showing 8 changed files with 79 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -98,7 +98,7 @@ success_indices = result["success_indices"] # shape `(len(embeddings),)`
fail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings),)`
```

Currently, sciterra has vectorizers using [SciBERT](https://aclanthology.org/D19-1371/), [SBERT](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models), [Word2Vec](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#), and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.
Currently, sciterra has vectorizers using [SciBERT](https://aclanthology.org/D19-1371/), [SBERT](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models), [GPT-2](https://huggingface.co/docs/transformers/en/model_doc/gpt2), [Word2Vec](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#), and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.

### Putting it all together

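For concreteness, a minimal sketch of embedding a few abstracts with one of these vectorizers; the import path and the `device` argument are inferred from `src/sciterra/vectorization/gpt2.py` and the tests below, and the abstracts are placeholders:

```python
from sciterra.vectorization import gpt2

# Placeholder documents; in practice these are publication abstracts.
abstracts = ["First abstract ...", "Second abstract ..."]

vectorizer = gpt2.GPT2Vectorizer(device="cpu")  # default device is "cuda"
result = vectorizer.embed_documents(abstracts, batch_size=8)

embeddings = result["embeddings"]            # shape (num_embedded, gpt2.EMBEDDING_DIM)
success_indices = result["success_indices"]  # indices of docs that embedded
fail_indices = result["fail_indices"]        # indices of docs that failed
```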
14 changes: 12 additions & 2 deletions src/sciterra/librarians/s2librarian.py
@@ -69,8 +69,18 @@


class SemanticScholarLibrarian(Librarian):
def __init__(self) -> None:
self.sch = SemanticScholar()
def __init__(
self,
api_key: str = None,
api_key_fn: str = None,
) -> None:
if api_key_fn is not None:
print(f"Reading private api key from {api_key_fn}.")
# Parse api_key_fn for 40-ch private key
with open(api_key_fn, "r") as f:
api_key = f.read()

self.sch = SemanticScholar(api_key=api_key)
super().__init__()

def bibtex_entry_identifier(self, bibtex_entry: dict) -> str:
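A short usage sketch for the new constructor arguments; the import path is inferred from the file location and the key file path is hypothetical:

```python
from sciterra.librarians.s2librarian import SemanticScholarLibrarian

# Pass the Semantic Scholar API key directly (placeholder value) ...
librarian = SemanticScholarLibrarian(api_key="YOUR_S2_API_KEY")

# ... or point to a local file holding the key; the constructor reads the
# file contents and forwards them to the SemanticScholar client.
librarian = SemanticScholarLibrarian(api_key_fn="/path/to/s2_api_key.txt")
```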
3 changes: 2 additions & 1 deletion src/sciterra/mapping/cartography.py
@@ -203,7 +203,8 @@ def project(self, atl: Atlas, **kwargs) -> Atlas:

# Embed documents
result = self.vectorizer.embed_documents(
[atl_filtered[id].abstract for id in embed_ids]
[atl_filtered[id].abstract for id in embed_ids],
batch_size=kwargs["batch_size"] if "batch_size" in kwargs else None,
)
embeddings = result["embeddings"]
success_indices = result["success_indices"]
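A hedged sketch of the calling side, with hypothetical `crt` (a `Cartographer`) and `atl` (an `Atlas`): any `batch_size` keyword passed to `project` is now forwarded to the vectorizer's `embed_documents`.

```python
# Hypothetical objects: `crt` is a Cartographer, `atl` an Atlas.
# batch_size is forwarded to vectorizer.embed_documents(); if it is
# omitted, None is passed through instead.
atl_projected = crt.project(atl, batch_size=32)
```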
8 changes: 7 additions & 1 deletion src/sciterra/mapping/tracing.py
@@ -114,6 +114,7 @@ def __init__(
atlas_dir: str,
atlas_center_bibtex: str,
librarian_name: str,
librarian_kwargs: dict,
vectorizer_name: str,
vectorizer_kwargs: dict = None,
) -> None:
@@ -126,6 +127,8 @@ def __init__(
librarian_name: a str name of a librarian, one of `librarians.librarians.keys()`, e.g. 'S2' or 'ADS'.
librarian_kwargs: keyword args propagated to a Librarian initialization; if values are `None` they will be omitted
vectorizer_name: a str name of a vectorizer, one of `vectorization.vectorizers.keys()`, e.g. 'BOW' or 'SciBERT'.
vectorizer_kwargs: keyword args propagated to a Vectorizer initialization; if values are `None` they will be omitted
@@ -136,14 +139,17 @@ def __init__(

# Get librarian
librarian = librarians[librarian_name]
l_kwargs = {k: v for k, v in librarian_kwargs.items() if v is not None}

# Get vectorizer
vectorizer = vectorizers[vectorizer_name]
# Get vectorizer kwargs if they are not null in config
v_kwargs = {k: v for k, v in vectorizer_kwargs.items() if v is not None}

self.cartographer = Cartographer(
librarian=librarian(),
librarian=librarian(
**l_kwargs,
),
vectorizer=vectorizer(
**v_kwargs,
),
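A standalone illustration of the `None`-filtering above, with hypothetical values: keys whose value is `None` (e.g. unset config fields) are simply dropped before the librarian and vectorizer are constructed.

```python
librarian_kwargs = {"api_key": None, "api_key_fn": "/path/to/s2_api_key.txt"}
l_kwargs = {k: v for k, v in librarian_kwargs.items() if v is not None}
# l_kwargs == {"api_key_fn": "/path/to/s2_api_key.txt"}

vectorizer_kwargs = {"device": "cuda", "model_path": None}  # hypothetical keys
v_kwargs = {k: v for k, v in vectorizer_kwargs.items() if v is not None}
# v_kwargs == {"device": "cuda"}
```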
35 changes: 23 additions & 12 deletions src/sciterra/vectorization/gpt2.py
@@ -21,10 +21,9 @@
# This is the hidden dimension size
EMBEDDING_DIM = 768

class GPT2Vectorizer(Vectorizer):

class GPT2Vectorizer(Vectorizer):
def __init__(self, device="cuda", **kwargs) -> None:

# Get tokenizer
self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

@@ -74,13 +73,12 @@ def embed_documents(

# A note on padding from Thomas Wolf: https://github.com/huggingface/transformers/issues/808#issuecomment-522932583


# Tokenize the batch
encoded = self.tokenizer(
batch,
add_special_tokens=True,
padding=True, # pad up to length of longest abstract
# truncation=True, # max length 1024 tokens
truncation=True, # max length 1024 tokens
return_tensors="pt",
)
# each encoded item of shape [64, 1024]
@@ -99,18 +97,31 @@
)

# Get the embeddings of each final token in the batch
# Get the full last hidden state,

# Get the full last hidden state,
# shape [batch_size, sequence_length, hidden_size=768]
last_hidden_state = outputs.last_hidden_state

# Get the varying sequence lengths,
# Get the varying sequence lengths,
# shape [batch_size,]
sequence_lengths = torch.tensor([torch.nonzero(token_ids.eq(50256))[0].item() + 1 if token_ids.eq(50256).any() else len(token_ids) for token_ids in input_ids])
sequence_lengths = torch.tensor(
[
torch.nonzero(token_ids.eq(self.tokenizer.pad_token_id))[0].item()
+ 1
if token_ids.eq(self.tokenizer.pad_token_id).any()
else len(token_ids)
for token_ids in input_ids
]
)

# Get embeddings of each final token,
# Get embeddings of each final token,
# shape [batch_size, hidden_size]
last_hidden_states = torch.stack([last_hidden_state[i, sequence_lengths[i] - 1, :] for i in range(len(sequence_lengths))])
last_hidden_states = torch.stack(
[
last_hidden_state[i, sequence_lengths[i] - 1, :]
for i in range(len(sequence_lengths))
]
)

# Move to the CPU and convert to numpy ndarray
batched_embeddings = last_hidden_states.detach().cpu().numpy()
@@ -119,11 +130,11 @@ def embed_documents(
embeddings.extend(batched_embeddings)

pbar.update(batch_size)
pbar.close()
pbar.close()

# We don't deal with OOV, so we always return full list of ids
return {
"embeddings": np.array(embeddings),
"success_indices": np.arange(len(embeddings), dtype=int),
"fail_indices": np.array([], dtype=int),
}
}
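On the switch from the hard-coded id 50256 to `self.tokenizer.pad_token_id`: GPT-2's tokenizer ships without a pad token, so padding only works if one is assigned somewhere (presumably in the constructor, which this hunk does not show). A common setup, consistent with the old hard-coded id, is:

```python
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# GPT-2 defines no pad token; reusing the end-of-text token is the usual
# convention, and its id (50256) matches the value the old code hard-coded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

assert tokenizer.pad_token_id == 50256
```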
6 changes: 5 additions & 1 deletion src/sciterra/vectorization/vectorizer.py
@@ -7,12 +7,16 @@ class Vectorizer:
def __init__(self) -> None:
pass

def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]:
def embed_documents(
self, docs: list[str], batch_size: int = 64
) -> dict[str, np.ndarray]:
"""Embed a list of documents into document vectors.
Args:
docs: the documents to embed.
batch_size: the batch size to use.
Returns:
a dict of the form
{
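A toy subclass illustrating the updated signature and the return contract (illustrative only, not part of this commit; the import path is inferred from the file layout):

```python
import numpy as np

from sciterra.vectorization.vectorizer import Vectorizer


class DummyVectorizer(Vectorizer):
    """Toy vectorizer: embeds each document as its length, repeated 8 times."""

    def embed_documents(
        self, docs: list[str], batch_size: int = 64
    ) -> dict[str, np.ndarray]:
        embeddings = []
        for start in range(0, len(docs), batch_size):
            batch = docs[start : start + batch_size]
            embeddings.extend(np.full(8, float(len(doc))) for doc in batch)
        return {
            "embeddings": np.array(embeddings),
            "success_indices": np.arange(len(embeddings), dtype=int),
            "fail_indices": np.array([], dtype=int),
        }
```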
Binary file modified src/tests/data/models/word2vec_model_example.model
57 changes: 29 additions & 28 deletions src/tests/test_vectorization.py
@@ -41,10 +41,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestSciBERTVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -53,7 +53,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestSciBERTVectorizer.vectorizer.embed_documents(
@@ -63,7 +63,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -112,10 +112,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestSBERTVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -124,7 +124,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestSBERTVectorizer.vectorizer.embed_documents(
@@ -134,7 +134,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -158,8 +158,9 @@ def test_basic_cosine_matrix(self):
# GPT-2
##############################################################################


class TestGPT2Vectorizer:
vectorizer = gpt2.GPT2Vectorizer() # don't pass 'mps' for CI tests
vectorizer = gpt2.GPT2Vectorizer() # don't pass 'mps' for CI tests
embedding_dim = gpt2.EMBEDDING_DIM

def test_single_vector(self):
@@ -170,22 +171,22 @@ def test_single_vector(self):
# Check embedding is of correct type, shape, and has no nans
assert isinstance(embedding, np.ndarray)
assert embedding.shape == (1, TestGPT2Vectorizer.embedding_dim)
assert not np.isnan(embedding).any()
assert not np.isnan(embedding).any()

def test_identity_of_embeddings(self):
embeddings = TestGPT2Vectorizer.vectorizer.embed_documents(
[abstract_str, abstract_str]
)["embeddings"]
# check identity
assert np.all(embeddings[0] == embeddings[1])
assert np.all(embeddings[0] == embeddings[1])

def test_diff_seq_len_batch(self):
embeddings = TestGPT2Vectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

Expand All @@ -204,7 +205,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -221,7 +222,7 @@ def test_basic_cosine_matrix(self):
]
)
cosine_matrix = cosine_distances(embeddings, embeddings)
assert np.all(cosine_matrix == 0)
assert np.all(cosine_matrix == 0)


##############################################################################
@@ -268,10 +269,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -280,7 +281,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
@@ -290,7 +291,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -352,10 +353,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestBOWVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -364,7 +365,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
@@ -374,7 +375,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
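On the switch from `assert sim == 1.0` to `assert np.isclose(sim, 1.0)`: the cosine similarity of a float32 vector with itself is not guaranteed to round-trip to exactly 1.0, so exact equality is brittle. A small self-contained illustration:

```python
import numpy as np
from scipy.spatial.distance import cosine

v = np.random.rand(768).astype(np.float32)
sim = float(1 - cosine(v, v))

assert np.isclose(sim, 1.0)  # robust to rounding error
# `sim == 1.0` can fail by a few ULPs, depending on the vector.
```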
