add librarian kwargs, including s2_api_key, and batch_size as a kwarg to expansion trace
Nathaniel Imel authored and Nathaniel Imel committed Feb 6, 2024
1 parent 9f6654b commit c082257
Showing 8 changed files with 79 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -98,7 +98,7 @@ success_indices = result["success_indices"] # shape `(len(embeddings),)`
fail_indices = result["fail_indices"] # shape `(len(docs) - len(embeddings),)`
```

Currently, sciterra has vectorizers using [SciBERT](https://aclanthology.org/D19-1371/), [SBERT](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models), [Word2Vec](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#), and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.
Currently, sciterra has vectorizers using [SciBERT](https://aclanthology.org/D19-1371/), [SBERT](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models), [GPT-2](https://huggingface.co/docs/transformers/en/model_doc/gpt2), [Word2Vec](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#), and a simple bag-of-words (BOW) vectorizer that uses the same vocabulary as the Word2Vec vectorizer. Contributions to sciterra in the form of new Vectorizer subclasses are also encouraged and appreciated.

### Putting it all together

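For concreteness, a minimal sketch of embedding a few abstracts with one of these vectorizers; the import path and the `device` argument are inferred from `src/sciterra/vectorization/gpt2.py` and the tests below, and the abstracts are placeholders:

```python
from sciterra.vectorization import gpt2

# Placeholder documents; in practice these are publication abstracts.
abstracts = ["First abstract ...", "Second abstract ..."]

vectorizer = gpt2.GPT2Vectorizer(device="cpu")  # default device is "cuda"
result = vectorizer.embed_documents(abstracts, batch_size=8)

embeddings = result["embeddings"]            # shape (num_embedded, gpt2.EMBEDDING_DIM)
success_indices = result["success_indices"]  # indices of docs that embedded
fail_indices = result["fail_indices"]        # indices of docs that failed
```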
14 changes: 12 additions & 2 deletions src/sciterra/librarians/s2librarian.py
@@ -69,8 +69,18 @@


class SemanticScholarLibrarian(Librarian):
def __init__(self) -> None:
self.sch = SemanticScholar()
def __init__(
self,
api_key: str = None,
api_key_fn: str = None,
) -> None:
if api_key_fn is not None:
print(f"Reading private api key from {api_key_fn}.")
# Parse api_key_fn for 40-ch private key
with open(api_key_fn, "r") as f:
api_key = f.read()

self.sch = SemanticScholar(api_key=api_key)
super().__init__()

def bibtex_entry_identifier(self, bibtex_entry: dict) -> str:
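A short usage sketch for the new constructor arguments; the import path is inferred from the file location and the key file path is hypothetical:

```python
from sciterra.librarians.s2librarian import SemanticScholarLibrarian

# Pass the Semantic Scholar API key directly (placeholder value) ...
librarian = SemanticScholarLibrarian(api_key="YOUR_S2_API_KEY")

# ... or point to a local file holding the key; the constructor reads the
# file contents and forwards them to the SemanticScholar client.
librarian = SemanticScholarLibrarian(api_key_fn="/path/to/s2_api_key.txt")
```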
3 changes: 2 additions & 1 deletion src/sciterra/mapping/cartography.py
@@ -203,7 +203,8 @@ def project(self, atl: Atlas, **kwargs) -> Atlas:

# Embed documents
result = self.vectorizer.embed_documents(
[atl_filtered[id].abstract for id in embed_ids]
[atl_filtered[id].abstract for id in embed_ids],
batch_size=kwargs["batch_size"] if "batch_size" in kwargs else None,
)
embeddings = result["embeddings"]
success_indices = result["success_indices"]
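A hedged sketch of the calling side, with hypothetical `crt` (a `Cartographer`) and `atl` (an `Atlas`): any `batch_size` keyword passed to `project` is now forwarded to the vectorizer's `embed_documents`.

```python
# Hypothetical objects: `crt` is a Cartographer, `atl` an Atlas.
# batch_size is forwarded to vectorizer.embed_documents(); if it is
# omitted, None is passed through instead.
atl_projected = crt.project(atl, batch_size=32)
```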
8 changes: 7 additions & 1 deletion src/sciterra/mapping/tracing.py
@@ -114,6 +114,7 @@ def __init__(
atlas_dir: str,
atlas_center_bibtex: str,
librarian_name: str,
librarian_kwargs: dict,
vectorizer_name: str,
vectorizer_kwargs: dict = None,
) -> None:
@@ -126,6 +127,8 @@ def __init__(
librarian_name: a str name of a librarian, one of `librarians.librarians.keys()`, e.g. 'S2' or 'ADS'.
librarian_kwargs: keyword args propagated to a Librarian initialization; if values are `None` they will be omitted
vectorizer_name: a str name of a vectorizer, one of `vectorization.vectorizers.keys()`, e.g. 'BOW' or 'SciBERT'.
vectorizer_kwargs: keyword args propagated to a Vectorizer initialization; if values are `None` they will be omitted
@@ -136,14 +139,17 @@ def __init__(

# Get librarian
librarian = librarians[librarian_name]
l_kwargs = {k: v for k, v in librarian_kwargs.items() if v is not None}

# Get vectorizer
vectorizer = vectorizers[vectorizer_name]
# Get vectorizer kwargs if they are not null in config
v_kwargs = {k: v for k, v in vectorizer_kwargs.items() if v is not None}

self.cartographer = Cartographer(
librarian=librarian(),
librarian=librarian(
**l_kwargs,
),
vectorizer=vectorizer(
**v_kwargs,
),
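A standalone illustration of the `None`-filtering above, with hypothetical values: keys whose value is `None` (e.g. unset config fields) are simply dropped before the librarian and vectorizer are constructed.

```python
librarian_kwargs = {"api_key": None, "api_key_fn": "/path/to/s2_api_key.txt"}
l_kwargs = {k: v for k, v in librarian_kwargs.items() if v is not None}
# l_kwargs == {"api_key_fn": "/path/to/s2_api_key.txt"}

vectorizer_kwargs = {"device": "cuda", "model_path": None}  # hypothetical keys
v_kwargs = {k: v for k, v in vectorizer_kwargs.items() if v is not None}
# v_kwargs == {"device": "cuda"}
```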
35 changes: 23 additions & 12 deletions src/sciterra/vectorization/gpt2.py
@@ -21,10 +21,9 @@
# This is the hidden dimension size
EMBEDDING_DIM = 768

class GPT2Vectorizer(Vectorizer):

class GPT2Vectorizer(Vectorizer):
def __init__(self, device="cuda", **kwargs) -> None:

# Get tokenizer
self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

@@ -74,13 +73,12 @@ def embed_documents(

# A note on padding from Thomas Wolf: https://github.com/huggingface/transformers/issues/808#issuecomment-522932583


# Tokenize the batch
encoded = self.tokenizer(
batch,
add_special_tokens=True,
padding=True, # pad up to length of longest abstract
# truncation=True, # max length 1024 tokens
truncation=True, # max length 1024 tokens
return_tensors="pt",
)
# each encoded item of shape [64, 1024]
@@ -99,18 +97,31 @@
)

# Get the embeddings of each final token in the batch
# Get the full last hidden state,

# Get the full last hidden state,
# shape [batch_size, sequence_length, hidden_size=768]
last_hidden_state = outputs.last_hidden_state

# Get the varying sequence lengths,
# Get the varying sequence lengths,
# shape [batch_size,]
sequence_lengths = torch.tensor([torch.nonzero(token_ids.eq(50256))[0].item() + 1 if token_ids.eq(50256).any() else len(token_ids) for token_ids in input_ids])
sequence_lengths = torch.tensor(
[
torch.nonzero(token_ids.eq(self.tokenizer.pad_token_id))[0].item()
+ 1
if token_ids.eq(self.tokenizer.pad_token_id).any()
else len(token_ids)
for token_ids in input_ids
]
)

# Get embeddings of each final token,
# Get embeddings of each final token,
# shape [batch_size, hidden_size]
last_hidden_states = torch.stack([last_hidden_state[i, sequence_lengths[i] - 1, :] for i in range(len(sequence_lengths))])
last_hidden_states = torch.stack(
[
last_hidden_state[i, sequence_lengths[i] - 1, :]
for i in range(len(sequence_lengths))
]
)

# Move to the CPU and convert to numpy ndarray
batched_embeddings = last_hidden_states.detach().cpu().numpy()
@@ -119,11 +130,11 @@ def embed_documents(
embeddings.extend(batched_embeddings)

pbar.update(batch_size)
pbar.close()
pbar.close()

# We don't deal with OOV, so we always return full list of ids
return {
"embeddings": np.array(embeddings),
"success_indices": np.arange(len(embeddings), dtype=int),
"fail_indices": np.array([], dtype=int),
}
}
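On the switch from the hard-coded id 50256 to `self.tokenizer.pad_token_id`: GPT-2's tokenizer ships without a pad token, so padding only works if one is assigned somewhere (presumably in the constructor, which this hunk does not show). A common setup, consistent with the old hard-coded id, is:

```python
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# GPT-2 defines no pad token; reusing the end-of-text token is the usual
# convention, and its id (50256) matches the value the old code hard-coded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

assert tokenizer.pad_token_id == 50256
```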
6 changes: 5 additions & 1 deletion src/sciterra/vectorization/vectorizer.py
@@ -7,12 +7,16 @@ class Vectorizer:
def __init__(self) -> None:
pass

def embed_documents(self, docs: list[str]) -> dict[str, np.ndarray]:
def embed_documents(
self, docs: list[str], batch_size: int = 64
) -> dict[str, np.ndarray]:
"""Embed a list of documents into document vectors.
Args:
docs: the documents to embed.
batch_size: the batch size to use.
Returns:
a dict of the form
{
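A toy subclass illustrating the updated signature and the return contract (illustrative only, not part of this commit; the import path is inferred from the file layout):

```python
import numpy as np

from sciterra.vectorization.vectorizer import Vectorizer


class DummyVectorizer(Vectorizer):
    """Toy vectorizer: embeds each document as its length, repeated 8 times."""

    def embed_documents(
        self, docs: list[str], batch_size: int = 64
    ) -> dict[str, np.ndarray]:
        embeddings = []
        for start in range(0, len(docs), batch_size):
            batch = docs[start : start + batch_size]
            embeddings.extend(np.full(8, float(len(doc))) for doc in batch)
        return {
            "embeddings": np.array(embeddings),
            "success_indices": np.arange(len(embeddings), dtype=int),
            "fail_indices": np.array([], dtype=int),
        }
```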
Binary file modified src/tests/data/models/word2vec_model_example.model
57 changes: 29 additions & 28 deletions src/tests/test_vectorization.py
@@ -41,10 +41,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestSciBERTVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -53,7 +53,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestSciBERTVectorizer.vectorizer.embed_documents(
@@ -63,7 +63,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -112,10 +112,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestSBERTVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -124,7 +124,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestSBERTVectorizer.vectorizer.embed_documents(
@@ -134,7 +134,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -158,8 +158,9 @@ def test_basic_cosine_matrix(self):
# GPT-2
##############################################################################


class TestGPT2Vectorizer:
vectorizer = gpt2.GPT2Vectorizer() # don't pass 'mps' for CI tests
vectorizer = gpt2.GPT2Vectorizer() # don't pass 'mps' for CI tests
embedding_dim = gpt2.EMBEDDING_DIM

def test_single_vector(self):
@@ -170,22 +171,22 @@ def test_single_vector(self):
# Check embedding is of correct type, shape, and has no nans
assert isinstance(embedding, np.ndarray)
assert embedding.shape == (1, TestGPT2Vectorizer.embedding_dim)
assert not np.isnan(embedding).any()
assert not np.isnan(embedding).any()

def test_identity_of_embeddings(self):
embeddings = TestGPT2Vectorizer.vectorizer.embed_documents(
[abstract_str, abstract_str]
)["embeddings"]
# check identity
assert np.all(embeddings[0] == embeddings[1])
assert np.all(embeddings[0] == embeddings[1])

def test_diff_seq_len_batch(self):
embeddings = TestGPT2Vectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

Expand All @@ -204,7 +205,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -221,7 +222,7 @@ def test_basic_cosine_matrix(self):
]
)
cosine_matrix = cosine_distances(embeddings, embeddings)
assert np.all(cosine_matrix == 0)
assert np.all(cosine_matrix == 0)


##############################################################################
@@ -268,10 +269,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -280,7 +281,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
@@ -290,7 +291,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
@@ -352,10 +353,10 @@ def test_identity_of_embeddings(self):
def test_diff_seq_len_batch(self):
embeddings = TestBOWVectorizer.vectorizer.embed_documents(
[
abstract_str,
abstract_str,
abstract_str,
abstract_str,
abstract_str[:194],
abstract_str[:194],
abstract_str[:194],
]
)["embeddings"]

@@ -364,7 +365,7 @@ def test_diff_seq_len_batch(self):
assert np.array_equal(embeddings[2], embeddings[3])

# Check that second and third embeddings are not the same
assert not np.array_equal(embeddings[1], embeddings[2])
assert not np.array_equal(embeddings[1], embeddings[2])

def test_single_cosine_pair(self):
embeddings = TestWord2VecVectorizer.vectorizer.embed_documents(
@@ -374,7 +375,7 @@ def test_single_cosine_pair(self):
# Check that the cosine sim of doc w/ itself is 1
# n.b., see sklearn.metrics.pairwise.cosine_similarity
sim = float(1 - cosine(embeddings[0], embeddings[1]))
assert sim == 1.0
assert np.isclose(sim, 1.0)

def test_basic_cosine_matrix(self):
# like pair above, but pretending that we have more than 2 publications.
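On the switch from `assert sim == 1.0` to `assert np.isclose(sim, 1.0)`: the cosine similarity of a float32 vector with itself is not guaranteed to round-trip to exactly 1.0, so exact equality is brittle. A small self-contained illustration:

```python
import numpy as np
from scipy.spatial.distance import cosine

v = np.random.rand(768).astype(np.float32)
sim = float(1 - cosine(v, v))

assert np.isclose(sim, 1.0)  # robust to rounding error
# `sim == 1.0` can fail by a few ULPs, depending on the vector.
```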
