fix (#103)

SciPhi-AI · Oct 25, 2023 · a4f9e67 · a4f9e67
1 parent 8b1de19
commit a4f9e67
Showing 1 changed file with 93 additions and 21 deletions.
diff --git a/sciphi/scripts/make_embeddings.py b/sciphi/scripts/make_embeddings.py
@@ -1,6 +1,7 @@
 import gzip
 import json
 import logging
+import os
 from typing import Dict, Generator, List, Tuple, Union
 
 import fire
@@ -34,6 +35,28 @@ def stream_jsonl(
             yield entry
 
 
+def initialize_memmap(
+    file_name: str, initial_estimate: int, embedding_dim: int
+) -> None:
+    """
+    Create the float32 memmap and its ``.shape`` sidecar file when needed.
+
+    Nothing is done when both the memmap and its shape file already exist.
+    If either one is missing, both are (re)created so they always agree.
+
+    Args:
+    - file_name (str): Path to the memmap file; the shape is stored next to
+      it in ``f"{file_name}.shape"`` as ``"rows,cols"``.
+    - initial_estimate (int): Number of rows to allocate up front.
+    - embedding_dim (int): Number of float32 values per row.
+    """
+    shape_file_path = f"{file_name}.shape"
+
+    # A shape file left over from an earlier run (whose memmap was deleted)
+    # must not suppress initialization, so both files have to be present.
+    if os.path.exists(shape_file_path) and os.path.exists(file_name):
+        return
+
+    # Allocate the memmap first so that a failure here does not leave behind
+    # a shape file describing a memmap that was never created.
+    memmap_array = np.memmap(
+        file_name,
+        dtype="float32",
+        mode="w+",
+        shape=(initial_estimate, embedding_dim),
+    )
+    del memmap_array
+
+    # Record the allocated shape for later readers of the raw file
+    with open(shape_file_path, "w") as shape_file:
+        shape_file.write(f"{initial_estimate},{embedding_dim}")
+
+
 def reconstitute_sentences_into_chunks(
     df: pd.DataFrame,
     ids_to_titles: dict,
@@ -91,31 +114,56 @@ def reconstitute_sentences_into_chunks(
     return chunks
 
 
-def save_embeddings_to_memmap(embeddings: np.ndarray, file_name: str) -> None:
+def ensure_memmap_capacity(
+    file_name: str, required_size: int, embedding_dim: int
+) -> None:
     """
-    Save embeddings in a memory-mapped format and also save the shape of the embeddings.
-
-    Args:
-    - embeddings (np.ndarray): Numpy array containing embeddings.
-    - file_name (str): Path to the output file.
+    Ensure that the memmap has sufficient capacity, and if not, double its size.
+
+    The backing file is extended in place, so rows that were already written
+    keep their contents. The ``.shape`` file is rewritten with the new shape.
+
+    Args:
+    - file_name (str): Path to the float32 memmap file.
+    - required_size (int): Minimum number of rows the memmap must hold.
+    - embedding_dim (int): Number of float32 values per row.
+
+    Raises:
+    - ValueError: If the memmap must grow and embedding_dim differs from the
+      width recorded in the shape file.
     """
-    shape = embeddings.shape
-
-    # Save the shape information to a separate file
-    with open(f"{file_name}.shape", "w") as shape_file:
-        shape_file.write(",".join(map(str, shape)))
-
-    # Create a new memory-mapped array with the required shape and dtype
+    shape = _load_shape_from_file(file_name)
+    if shape[0] >= required_size:
+        return
+
+    # Rows are stored contiguously, so growing in place only keeps the old
+    # rows valid when the row width is unchanged.
+    if shape[1] != embedding_dim:
+        raise ValueError(
+            f"Embedding dimension {embedding_dim} does not match the "
+            f"existing memmap width {shape[1]} in {file_name}."
+        )
+
+    # Double the size if needed
+    new_size = max(2 * shape[0], required_size)
+    new_shape = (new_size, embedding_dim)
+
+    # Extend the backing file instead of reopening it with mode="w+":
+    # "w+" overwrites the file, which would discard every stored row
+    # before it could be copied into the larger array.
+    row_bytes = embedding_dim * np.dtype("float32").itemsize
+    with open(file_name, "r+b") as memmap_file:
+        memmap_file.truncate(new_size * row_bytes)
+
+    # Save new shape to the shape file
+    with open(f"{file_name}.shape", "w") as shape_file:
+        shape_file.write(",".join(map(str, new_shape)))
+
+
+def save_embeddings_to_memmap(
+    embeddings: np.ndarray, file_name: str, start_idx: int
+) -> int:
+    """
+    Write a batch of embeddings into the memmap, starting at row start_idx.
+
+    The memmap is grown first when the batch would not fit.
+
+    Args:
+    - embeddings (np.ndarray): 2-D array of embeddings, one row per chunk.
+    - file_name (str): Path to the float32 memmap file.
+    - start_idx (int): Row index at which the batch is written.
+
+    Returns:
+    - int: The row index just past the written batch, i.e. the start_idx to
+      use for the next batch.
+    """
+    end_idx = start_idx + embeddings.shape[0]
+
+    # Ensure there's enough capacity
+    ensure_memmap_capacity(file_name, end_idx, embeddings.shape[1])
+
+    # Load the shape of existing memmap
+    shape = _load_shape_from_file(file_name)
+    # The file is created and grown as float32, so it is always opened as
+    # float32; the assignment below casts the batch if its dtype differs.
     memmap_array = np.memmap(
-        file_name, dtype=embeddings.dtype, mode="w+", shape=shape
+        file_name, dtype="float32", mode="r+", shape=shape
     )
 
-    # Copy the embeddings data into the memory-mapped array
-    memmap_array[:] = embeddings[:]
+    # Write the new embeddings starting from start_idx
+    memmap_array[start_idx:end_idx] = embeddings[:]
+
+    # Push the written rows to disk before the mapping is released
+    memmap_array.flush()
 
-    # Flush memory changes to disk and close the memmap array
-    memmap_array.flush()
-    del memmap_array
+    # Return the new index after writing
+    return end_idx
 
 
 def save_metadata_to_file(metadata: list, file_name: str) -> None:
@@ -177,6 +225,7 @@ def main(
     model_name: str = "BAAI/bge-base-en",
     chunk_size: int = 512,
     batch_size: int = 128,
+    initial_memmap_size: int = 1_000_000,
     embeddings_output_path: str = "embeddings.bin",
     metadata_output_path: str = "metadata.json.gz",
 ):
@@ -191,12 +240,27 @@ def main(
     - embeddings_output_path (str, optional): Path to save embeddings. Default is 'embeddings.gz'.
     - metadata_output_path (str, optional): Path to save metadata. Default is 'metadata.json.gz'.
     """
+    if os.path.exists(embeddings_output_path) or os.path.exists(
+        metadata_output_path
+    ):
+        raise Exception(
+            f"Embeddings or metadata file already exists at {embeddings_output_path} or {metadata_output_path}. Please delete these files and try again."
+        )
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(model_name).to(device)
     document_ids: list = []
     documents: list = []
     ids_to_titles: dict = {}
 
+    embedding_dim = (
+        model.get_sentence_embedding_dimension()
+    )  # get embedding dimension from the model
+    initialize_memmap(
+        embeddings_output_path, initial_memmap_size, embedding_dim
+    )  # initialize if necessary
+
+    current_idx = 0  # current index in the memmap
     for entry in stream_jsonl(file_path):
         document_ids.append(entry["page_id"])
         documents.append(entry["text"])
@@ -211,10 +275,18 @@ def main(
                 normalize_embeddings=True,
                 show_progress_bar=False,
             )
+
             # Save embeddings and metadata
-            save_embeddings_to_memmap(chunk_embedding, embeddings_output_path)
+            current_idx = save_embeddings_to_memmap(
+                chunk_embedding, embeddings_output_path, current_idx
+            )
+
             metadata_list = [
-                {"doc_id": entry[0], "title": entry[1], "text_chunk": entry[2]}
+                {
+                    "doc_id": entry[0],
+                    "title": entry[1],
+                    "text_chunk": entry[2],
+                }
                 for entry in chunks
             ]
             save_metadata_to_file(metadata_list, metadata_output_path)