Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
fix (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
emrgnt-cmplxty authored Oct 25, 2023
1 parent 8b1de19 commit a4f9e67
Showing 1 changed file with 93 additions and 21 deletions.
114 changes: 93 additions & 21 deletions sciphi/scripts/make_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import gzip
import json
import logging
import os
from typing import Dict, Generator, List, Tuple, Union

import fire
Expand Down Expand Up @@ -34,6 +35,28 @@ def stream_jsonl(
yield entry


def initialize_memmap(
    file_name: str, initial_estimate: int, embedding_dim: int
) -> None:
    """
    Initialize memmap and its corresponding shape file if they do not exist.

    The presence of ``{file_name}.shape`` is treated as the marker for
    "already initialized": when it exists, nothing is created or modified.

    Args:
    - file_name (str): Path of the memmap backing file.
    - initial_estimate (int): Number of rows to pre-allocate.
    - embedding_dim (int): Number of float32 values stored per row.

    Raises:
    - ValueError: If initial_estimate or embedding_dim is not positive.
    """
    shape_file_path = f"{file_name}.shape"
    if os.path.exists(shape_file_path):
        return

    if initial_estimate <= 0 or embedding_dim <= 0:
        raise ValueError(
            "initial_estimate and embedding_dim must be positive, got "
            f"{initial_estimate} and {embedding_dim}."
        )

    # Create the backing file first. The shape file is the "initialized"
    # marker, so it must only appear once the memmap really exists; otherwise
    # a failed allocation would make every later call skip initialization.
    memmap_array = np.memmap(
        file_name,
        dtype="float32",
        mode="w+",
        shape=(initial_estimate, embedding_dim),
    )
    memmap_array.flush()
    del memmap_array

    # Record the allocated shape next to the data file.
    with open(shape_file_path, "w") as shape_file:
        shape_file.write(f"{initial_estimate},{embedding_dim}")


def reconstitute_sentences_into_chunks(
df: pd.DataFrame,
ids_to_titles: dict,
Expand Down Expand Up @@ -91,31 +114,56 @@ def reconstitute_sentences_into_chunks(
return chunks


def save_embeddings_to_memmap(embeddings: np.ndarray, file_name: str) -> None:
def ensure_memmap_capacity(
    file_name: str, required_size: int, embedding_dim: int
) -> None:
    """
    Ensure that the memmap has sufficient capacity, and if not, double its size.

    The backing file is grown in place, so rows that were already written are
    preserved. Re-opening the file with ``mode="w+"`` must be avoided here:
    that mode overwrites the existing file, which would discard every
    embedding stored so far.

    Args:
    - file_name (str): Path of the memmap backing file; its shape is read from
      and written to ``{file_name}.shape``.
    - required_size (int): Minimum number of rows the memmap must hold.
    - embedding_dim (int): Number of float32 values stored per row.

    Raises:
    - ValueError: If growth is needed and the stored row width differs from
      embedding_dim.
    """
    # Presumably (rows, embedding_dim), as written to the shape file.
    shape = _load_shape_from_file(file_name)
    current_rows = shape[0]
    if current_rows >= required_size:
        return

    # Growing with a different row width would silently scramble row
    # boundaries, so refuse instead.
    if tuple(shape[1:]) != (embedding_dim,):
        raise ValueError(
            f"Cannot grow memmap {file_name}: stored shape {tuple(shape)} "
            f"is incompatible with embedding_dim={embedding_dim}."
        )

    # Double the size if needed (or jump straight to the requirement when
    # doubling is not enough).
    new_rows = max(2 * current_rows, required_size)
    new_shape = (new_rows, embedding_dim)

    # Extend the file in place; existing bytes are untouched.
    new_num_bytes = new_rows * embedding_dim * np.dtype("float32").itemsize
    with open(file_name, "r+b") as raw_file:
        raw_file.truncate(new_num_bytes)

    # Save new shape to the shape file only after the data file has grown.
    with open(f"{file_name}.shape", "w") as shape_file:
        shape_file.write(",".join(map(str, new_shape)))


def save_embeddings_to_memmap(
    embeddings: np.ndarray, file_name: str, start_idx: int
) -> int:
    """
    Write a batch of embeddings into the memmap starting at row start_idx.

    Args:
    - embeddings (np.ndarray): 2-D array of embeddings, one row per item.
    - file_name (str): Path of the memmap backing file.
    - start_idx (int): Row index at which the first embedding is written.

    Returns:
    - int: The row index just past the last written row, i.e. the start_idx
      to use for the next batch.

    Raises:
    - ValueError: If start_idx is negative.
    """
    if start_idx < 0:
        # A negative index would silently write relative to the end.
        raise ValueError(f"start_idx must be non-negative, got {start_idx}.")

    num_rows, embedding_dim = embeddings.shape[0], embeddings.shape[1]
    end_idx = start_idx + num_rows

    # Ensure there's enough capacity
    ensure_memmap_capacity(file_name, end_idx, embedding_dim)

    # Load the shape of existing memmap
    shape = _load_shape_from_file(file_name)

    # The backing file is created and grown as float32, so it must always be
    # opened as float32. Using the dtype of the incoming batch would
    # misinterpret the file layout for any non-float32 input; NumPy casts the
    # values on assignment instead.
    memmap_array = np.memmap(
        file_name, dtype="float32", mode="r+", shape=shape
    )

    # Write the new embeddings starting from start_idx
    memmap_array[start_idx:end_idx] = embeddings

    # Flush memory changes to disk and close the memmap array
    memmap_array.flush()
    del memmap_array

    # Return the new index after writing
    return end_idx


def save_metadata_to_file(metadata: list, file_name: str) -> None:
Expand Down Expand Up @@ -177,6 +225,7 @@ def main(
model_name: str = "BAAI/bge-base-en",
chunk_size: int = 512,
batch_size: int = 128,
initial_memmap_size: int = 1_000_000,
embeddings_output_path: str = "embeddings.bin",
metadata_output_path: str = "metadata.json.gz",
):
Expand All @@ -191,12 +240,27 @@ def main(
- embeddings_output_path (str, optional): Path to save embeddings. Default is 'embeddings.bin'.
- metadata_output_path (str, optional): Path to save metadata. Default is 'metadata.json.gz'.
"""
if os.path.exists(embeddings_output_path) or os.path.exists(
metadata_output_path
):
raise Exception(
f"Embeddings or metadata file already exists at {embeddings_output_path} or {metadata_output_path}. Please delete these files and try again."
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name).to(device)
document_ids: list = []
documents: list = []
ids_to_titles: dict = {}

embedding_dim = (
model.get_sentence_embedding_dimension()
) # get embedding dimension from the model
initialize_memmap(
embeddings_output_path, initial_memmap_size, embedding_dim
) # initialize if necessary

current_idx = 0 # current index in the memmap
for entry in stream_jsonl(file_path):
document_ids.append(entry["page_id"])
documents.append(entry["text"])
Expand All @@ -211,10 +275,18 @@ def main(
normalize_embeddings=True,
show_progress_bar=False,
)

# Save embeddings and metadata
save_embeddings_to_memmap(chunk_embedding, embeddings_output_path)
current_idx = save_embeddings_to_memmap(
chunk_embedding, embeddings_output_path, current_idx
)

metadata_list = [
{"doc_id": entry[0], "title": entry[1], "text_chunk": entry[2]}
{
"doc_id": entry[0],
"title": entry[1],
"text_chunk": entry[2],
}
for entry in chunks
]
save_metadata_to_file(metadata_list, metadata_output_path)
Expand Down

0 comments on commit a4f9e67

Please sign in to comment.