From 73a90431083c8b6f21ab107ddbd06edf2e9f520f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 2 Sep 2023 20:40:33 -0700 Subject: [PATCH] Store updated timestamp on embeddings, closes #211 --- llm/embeddings.py | 3 +++ llm/embeddings_migrations.py | 12 ++++++++++++ tests/test_embed.py | 6 +++++- tests/test_embed_cli.py | 2 ++ tests/test_migrate.py | 1 + 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llm/embeddings.py b/llm/embeddings.py index 2ea8e1c8..f2dd5a8a 100644 --- a/llm/embeddings.py +++ b/llm/embeddings.py @@ -5,6 +5,7 @@ import json from sqlite_utils import Database from sqlite_utils.db import Table +import time from typing import cast, Any, Dict, Iterable, List, Optional, Tuple @@ -133,6 +134,7 @@ def embed( "embedding": encode(embedding), "content": text if store else None, "metadata": json.dumps(metadata) if metadata else None, + "updated": int(time.time()), }, replace=True, ) @@ -184,6 +186,7 @@ def embed_multi_with_metadata( "embedding": llm.encode(embedding), "content": text if store else None, "metadata": json.dumps(metadata) if metadata else None, + "updated": int(time.time()), } for (embedding, (id, text, metadata)) in zip(embeddings, batch) ), diff --git a/llm/embeddings_migrations.py b/llm/embeddings_migrations.py index 7e1c590f..5da1e950 100644 --- a/llm/embeddings_migrations.py +++ b/llm/embeddings_migrations.py @@ -1,4 +1,5 @@ from sqlite_migrate import Migrations +import time embeddings_migrations = Migrations("llm.embeddings") @@ -22,3 +23,14 @@ def m001_create_tables(db): @embeddings_migrations() def m002_foreign_key(db): db["embeddings"].add_foreign_key("collection_id", "collections", "id") + + +@embeddings_migrations() +def m003_add_updated(db): + db["embeddings"].add_column("updated", int) + # Pretty-print the schema + db["embeddings"].transform() + # Assume anything existing was last updated right now + db.query( + "update embeddings set updated = ? where updated is null", [int(time.time())] + ) diff --git a/tests/test_embed.py b/tests/test_embed.py index d7c34423..571e2b7c 100644 --- a/tests/test_embed.py +++ b/tests/test_embed.py @@ -1,8 +1,9 @@ import json import llm from llm.embeddings import Entry -import sqlite_utils import pytest +import sqlite_utils +from unittest.mock import ANY @pytest.fixture @@ -65,6 +66,7 @@ def test_collection(collection): "embedding": llm.encode([5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), "content": None, "metadata": None, + "updated": ANY, }, { "collection_id": 1, @@ -72,8 +74,10 @@ def test_collection(collection): "embedding": llm.encode([7, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), "content": None, "metadata": None, + "updated": ANY, }, ] + assert isinstance(rows[0]["updated"], int) and rows[0]["updated"] > 0 def test_similar(collection): diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py index 5fc89fd2..200cdd19 100644 --- a/tests/test_embed_cli.py +++ b/tests/test_embed_cli.py @@ -3,6 +3,7 @@ import json import pytest import sqlite_utils +from unittest.mock import ANY @pytest.mark.parametrize( @@ -98,6 +99,7 @@ def test_embed_store(user_path): ), "content": None, "metadata": None, + "updated": ANY, } ] # Should show up in 'llm embed-db collections' diff --git a/tests/test_migrate.py b/tests/test_migrate.py index 89deaae4..83617a04 100644 --- a/tests/test_migrate.py +++ b/tests/test_migrate.py @@ -91,6 +91,7 @@ def test_migrations_for_embeddings(): "embedding": bytes, "content": str, "metadata": str, + "updated": int, } assert db["embeddings"].foreign_keys[0].column == "collection_id" assert db["embeddings"].foreign_keys[0].other_table == "collections"