From 213e0b0c75d37e509fd9a5ae8fb688e34299de4b Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 3 Sep 2023 12:55:48 -0700 Subject: [PATCH] embed-db delete-collection command and .delete() method, closes #219 --- docs/embeddings/cli.md | 10 ++++++++++ docs/embeddings/python-api.md | 1 + docs/help.md | 19 +++++++++++++++++-- llm/cli.py | 27 +++++++++++++++++++++++++++ llm/embeddings.py | 8 ++++++++ tests/conftest.py | 8 ++++++++ tests/test_embed.py | 17 +++++++++-------- tests/test_embed_cli.py | 19 +++++++++++++++++++ 8 files changed, 99 insertions(+), 10 deletions(-) diff --git a/docs/embeddings/cli.md b/docs/embeddings/cli.md index 4656b18f..7d41a4db 100644 --- a/docs/embeddings/cli.md +++ b/docs/embeddings/cli.md @@ -167,3 +167,13 @@ Add `-d/--database` to specify a different database file: ```bash llm embed-db collections -d my-embeddings.db ``` +## llm embed-db delete-collection + +To delete a collection from the database, run this: +```bash +llm embed-db delete-collection collection-name +``` +Pass `-d` to specify a different database file: +```bash +llm embed-db delete-collection collection-name -d my-embeddings.db +``` \ No newline at end of file diff --git a/docs/embeddings/python-api.md b/docs/embeddings/python-api.md index 82f2f39a..3f3e6f23 100644 --- a/docs/embeddings/python-api.md +++ b/docs/embeddings/python-api.md @@ -104,6 +104,7 @@ A collection instance has the following properties and methods: - `similar(query: str, number: int=10)` - returns a list of entries that are most similar to the embedding of the given query string - `similar_by_id(id: str, number: int=10)` - returns a list of entries that are most similar to the embedding of the item with the given ID - `similar_by_vector(vector: List[float], number: int=10, skip_id: str=None)` - returns a list of entries that are most similar to the given embedding vector, optionally skipping the entry with the given ID +- `delete()` - deletes the collection and its embeddings from the database (embeddings-python-similar)= ## Retrieving similar items diff --git a/docs/help.md b/docs/help.md index e3276161..e27b1a63 100644 --- a/docs/help.md +++ b/docs/help.md @@ -463,8 +463,9 @@ Options: --help Show this message and exit. Commands: - collections Output the path to the embeddings database - path Output the path to the embeddings database + collections Output the path to the embeddings database + delete-collection Delete the specified collection + path Output the path to the embeddings database ``` #### llm embed-db path --help ``` @@ -486,6 +487,20 @@ Options: --json Output as JSON --help Show this message and exit. ``` +#### llm embed-db delete-collection --help +``` +Usage: llm embed-db delete-collection [OPTIONS] COLLECTION + + Delete the specified collection + + Example usage: + + llm embed-db delete-collection my-collection + +Options: + -d, --database FILE Path to embeddings database + --help Show this message and exit. +``` ### llm openai --help ``` Usage: llm openai [OPTIONS] COMMAND [ARGS]... diff --git a/llm/cli.py b/llm/cli.py index 8f9e9bef..fa164884 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1122,6 +1122,33 @@ def embed_db_collections(database, json_): ) +@embed_db.command(name="delete-collection") +@click.argument("collection") +@click.option( + "-d", + "--database", + type=click.Path(file_okay=True, allow_dash=False, dir_okay=False, writable=True), + envvar="LLM_EMBEDDINGS_DB", + help="Path to embeddings database", +) +def embed_db_delete_collection(collection, database): + """ + Delete the specified collection + + Example usage: + + \b + llm embed-db delete-collection my-collection + """ + database = database or (user_dir() / "embeddings.db") + db = sqlite_utils.Database(str(database)) + try: + collection_obj = Collection(collection, db, create=False) + except Collection.DoesNotExist: + raise click.ClickException("Collection does not exist") + collection_obj.delete() + + def template_dir(): path = user_dir() / "templates" path.mkdir(parents=True, exist_ok=True) diff --git a/llm/embeddings.py b/llm/embeddings.py index 15747391..3c0575f1 100644 --- a/llm/embeddings.py +++ b/llm/embeddings.py @@ -282,6 +282,14 @@ def similar(self, text: str, number: int = 10) -> List[Entry]: comparison_vector = self.model().embed(text) return self.similar_by_vector(comparison_vector, number) + def delete(self): + """ + Delete the collection and its embeddings from the database + """ + with self.db.conn: + self.db.execute("delete from embeddings where collection_id = ?", [self.id]) + self.db.execute("delete from collections where id = ?", [self.id]) + @staticmethod def content_hash(text: str) -> bytes: "Hash content for deduplication. Override to change hashing behavior." diff --git a/tests/conftest.py b/tests/conftest.py index 1b8443ec..fea8c67d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -94,3 +94,11 @@ def mocked_localai(requests_mock): }, headers={"Content-Type": "application/json"}, ) + + +@pytest.fixture +def collection(): + collection = llm.Collection("test", model_id="embed-demo") + collection.embed(1, "hello world") + collection.embed(2, "goodbye world") + return collection diff --git a/tests/test_embed.py b/tests/test_embed.py index eedf5d9c..f6b6edca 100644 --- a/tests/test_embed.py +++ b/tests/test_embed.py @@ -6,14 +6,6 @@ from unittest.mock import ANY -@pytest.fixture -def collection(): - collection = llm.Collection("test", model_id="embed-demo") - collection.embed(1, "hello world") - collection.embed(2, "goodbye world") - return collection - - def test_demo_plugin(): model = llm.get_embedding_model("embed-demo") assert model.embed("hello world") == [5, 5] + [0] * 14 @@ -118,3 +110,12 @@ def test_embed_multi(with_metadata): else: assert len(rows_with_metadata) == 0 assert len(rows_with_content) == 1000 + + +def test_collection_delete(collection): + db = collection.db + assert db["embeddings"].count == 2 + assert db["collections"].count == 1 + collection.delete() + assert db["embeddings"].count == 0 + assert db["collections"].count == 0 diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py index b05694be..4de05638 100644 --- a/tests/test_embed_cli.py +++ b/tests/test_embed_cli.py @@ -139,6 +139,25 @@ def test_embed_store(user_path, metadata, metadata_error): else: assert result2.output == "items: embed-demo\n 1 embedding\n" + # And test deleting it too + result = runner.invoke(cli, ["embed-db", "delete-collection", "items"]) + assert result.exit_code == 0 + assert db["collections"].count == 0 + assert db["embeddings"].count == 0 + + +def test_collection_delete_errors(user_path): + db = sqlite_utils.Database(str(user_path / "embeddings.db")) + collection = Collection("items", db, model_id="embed-demo") + collection.embed("1", "hello") + assert db["collections"].count == 1 + assert db["embeddings"].count == 1 + runner = CliRunner() + result = runner.invoke(cli, ["embed-db", "delete-collection", "does-not-exist"]) + assert result.exit_code == 1 + assert "Collection does not exist" in result.output + assert db["collections"].count == 1 + @pytest.mark.parametrize( "args,expected_error",