Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP][BUG]: Metadata Update Semantics Implementation #1637

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions chromadb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
GetResult,
WhereDocument,
UpdateCollectionMetadata,
METADATA_TOMBSTONE,
)

# Re-export types from chromadb.types
Expand All @@ -37,6 +38,7 @@
"UpdateCollectionMetadata",
"QueryResult",
"GetResult",
"METADATA_TOMBSTONE",
]

logger = logging.getLogger(__name__)
Expand Down
6 changes: 5 additions & 1 deletion chromadb/api/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
validate_where,
validate_where_document,
validate_batch,
METADATA_TOMBSTONE,
)
from chromadb.telemetry.product.events import (
CollectionAddEvent,
Expand Down Expand Up @@ -861,7 +862,10 @@ def _records(
for i, id in enumerate(ids):
metadata = None
if metadatas:
metadata = metadatas[i]
if not metadatas[i]:
metadata = {METADATA_TOMBSTONE: True}
else:
metadata = metadatas[i]

if documents:
document = documents[i]
Expand Down
5 changes: 4 additions & 1 deletion chromadb/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
# URIs
URI = str
URIs = List[URI]
METADATA_TOMBSTONE = "___METADATA_TOMBSTONE___"


def maybe_cast_one_to_many_uri(target: OneOrMany[URI]) -> URIs:
Expand Down Expand Up @@ -278,7 +279,9 @@ def validate_metadata(metadata: Metadata) -> Metadata:
f"Expected metadata key to be a str, got {key} which is a {type(key).__name__}"
)
# isinstance(True, int) evaluates to True, so we need to check for bools separately
if not isinstance(value, bool) and not isinstance(value, (str, int, float)):
if not isinstance(value, bool) and not isinstance(
value, (str, int, float, type(None))
):
raise ValueError(
f"Expected metadata value to be a str, int, float or bool, got {value} which is a {type(value).__name__}"
)
Expand Down
17 changes: 15 additions & 2 deletions chromadb/segment/impl/metadata/sqlite.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Optional, Sequence, Any, Tuple, cast, Generator, Union, Dict, List

from chromadb.api.types import METADATA_TOMBSTONE
from chromadb.segment import MetadataReader
from chromadb.ingest import Consumer
from chromadb.config import System
Expand Down Expand Up @@ -296,7 +298,7 @@ def _update_metadata(self, cur: Cursor, id: int, metadata: UpdateMetadata) -> No
"""Update the metadata for a single EmbeddingRecord"""
t = Table("embedding_metadata")
to_delete = [k for k, v in metadata.items() if v is None]
if to_delete:
if to_delete and METADATA_TOMBSTONE not in metadata.keys():
q = (
self._db.querybuilder()
.from_(t)
Expand All @@ -306,7 +308,18 @@ def _update_metadata(self, cur: Cursor, id: int, metadata: UpdateMetadata) -> No
)
sql, params = get_sql(q)
cur.execute(sql, params)

# remove the full metadata
if METADATA_TOMBSTONE in metadata.keys():
q = (
self._db.querybuilder()
.from_(t)
.where(t.id == ParameterValue(id))
.where(t.key.notin(ParameterValue(["chroma:document"])))
.delete()
)
sql, params = get_sql(q)
cur.execute(sql, params)
metadata = {k: v for k, v in metadata.items() if k == "chroma:document"}
self._insert_metadata(cur, id, metadata)

@trace_method(
Expand Down
93 changes: 93 additions & 0 deletions docs/cip/CIP-01152023_Metadata_Update_Semantics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# The issue

```python
import chromadb

client = chromadb.PersistentClient() # this is a persistent (on-disk) client, adjust as per your needs
collection = client.get_or_create_collection("mytest")
collection.add(ids=["id1"], documents=["document 1"], metadatas=[{"key_to_keep": 1, "key_to_remove": 2}])
records = collection.get(ids=["id1"])
print(records["metadatas"][0])
# {'key_to_keep': 1, 'key_to_remove': 2}
del records["metadatas"][0]["key_to_remove"] # remove the unnecessary key
print(records)
# {'ids': ['id1'], 'embeddings': None, 'metadatas': [{'key_to_keep': 1}], 'documents': ['document 1'], 'uris': None, 'data': None}
collection.update(ids=records["ids"], documents=records["documents"], embeddings=records["embeddings"],
metadatas=records["metadatas"])
# verify
records1 = collection.get(ids=["id1"])
print(records1["metadatas"][0])
# {'key_to_keep': 1, 'key_to_remove': 2}
```

## The fix

We want to support three scenarios:

- Metadata for the item is None - the metadata for that item should be deleted from `embedding_metadata`
- Metadata is not provided with the update/upsert - No changes to the metadata on any of the items being updated/upserted
- Metadata key is set to None - only the key should be deleted from the metadata for that item, rest of the keys should
be preserved

The suggested approach involves supporting `NoneType` as a metadata value, together with a special metadata value that
is inserted at the segment level to indicate that the record's metadata should be deleted. We call this special
value `___METADATA_TOMBSTONE___` to appropriately reflect its intent. We suggest that this special value also be
documented in the API docs, so that users are aware of it and know that it can be used as a
substitute for `NoneType` in metadata. The use of the tombstone value is inspired by the XML Schema implementation
of [explicit nulls](https://www.w3.org/TR/xmlschema-1/#:~:text=2.6.&text=XML%20Schema%3A%20Structures%20introduces%20a,by%20the%20corresponding%20complex%20type.),
where a specific value is sent over the wire to indicate that the value should be deleted (`xsi:nil="true"`).

Here are examples of the three scenarios:

- Metadata for the record is None - the metadata for that item should be deleted from `embedding_metadata`

```python
import chromadb

client = chromadb.Client()

col = client.get_or_create_collection("test", metadata={"test": True})
col.add(ids=["1"], documents=["test-meta-none"], metadatas=[{"test": True}])
col.update(ids=["1"], documents=["test"], metadatas=[None])
res = col.get(ids=["1"])
print(res)
assert res["metadatas"][0] is None
```

- Metadata is not provided with the update/upsert - No changes to the metadata on any of the records being
updated/upserted

> Note: The reason we want to support this is to preserve existing behavior and operations of user workflows.

```python
import chromadb

client = chromadb.Client()

col = client.get_or_create_collection("test-no-meta", metadata={"test": True})
col.add(ids=["1"], documents=["test-no-meta"], metadatas=[{"test": True, "test1": False}])
print(col.get(ids=["1"]))
col.update(ids=["1"], documents=["test1"]) # this is a bug that removes all the metadata
res = col.get(ids=["1"])
print(res)
assert res["metadatas"][0] == {"test": True, "test1": False}
```

- Metadata key is set to None - only the key should be deleted from the metadata for that record, rest of the keys
should be preserved

> Note: Given the existing codebase we assume that this is the actual intended behavior, which is only prevented by the
> lack of support for `NoneType` metadata values in validation.

```python
import chromadb

client = chromadb.Client()

col = client.get_or_create_collection("test-partial", metadata={"test": True})
col.add(ids=["1"], documents=["test"], metadatas=[{"test": True, "test1": False}])
print(col.get(ids=["1"]))
col.update(ids=["1"], documents=["test"], metadatas=[{"test1": None}])
res = col.get(ids=["1"])
assert res["metadatas"][0] == {"test": True}
```
Loading
Loading