Skip to content

Commit

Permalink
[BUG] Fix test flakiness (#1765)
Browse files Browse the repository at this point in the history
## Description of changes

*Summarize the changes made by this PR.*
 - Improvements & Bug fixes
- Fix test flakiness mentioned in
#1716 and seen after.

The problem was subtle:
- There are three places Collection state is stored: in Hypothesis's
Bundle (== test data ~randomly generated according to our models), in
our test's `.model` field, and in Chroma itself.
- Each test step gets a random Collection from the Bundle. Our test code
is responsible for doing stuff with it, modifying the test model,
executing operations on Chroma itself, and verifying that model state
matches Chroma's state.
- Hypothesis's Bundle has (but no longer will once this PR lands) a full
Collection model with all internal bookeeping, incl UUID and other
fields. So we could have two Collections in the Bundle with the same
name but different UUIDs or other fields. In our test's model and in
Chroma, there would only be one Collection.
- The bug arose when we updated metadata on one Bundle collection, on
our model, and in Chroma; then tried to read metadata from another
Bundle collection with the same name but different metadata. The fix to
this was to read expected collection metadata from our test model, not
from the Bundle. I changed line 204 to `_metadata =
self.model[coll.name]` instead of `_metadata = coll.metadata`.
- To save people others this grief in the future, I created a new
`ExternalCollection` model which only contains externally visible
Collection data. This simplifies Bundle state for this test and should
make it easier to reason about in the future.
- I also added the previously failing test cases to protect us from
regressions. Since the model is much simpler now they don't give us
much, but I feel better knowing they're there and laying the pattern for
us to add future test cases.

## Test plan
*How are these changes tested?*

- [x] Tests pass locally with `pytest` for python, `yarn test` for js

## Documentation Changes
*Are all docstrings for user-facing APIs updated if required? Do we need
to make documentation changes in the [docs
repository](https://github.com/chroma-core/docs)?*
  • Loading branch information
beggers authored Feb 24, 2024
1 parent 548032a commit fd14f2d
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 27 deletions.
17 changes: 14 additions & 3 deletions chromadb/test/property/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,18 +236,29 @@ def embedding_function_strategy(


@dataclass
class Collection:
class ExternalCollection:
"""
An external view of a collection.
"""
name: str
id: uuid.UUID
metadata: Optional[types.Metadata]
embedding_function: Optional[types.EmbeddingFunction[Embeddable]]


@dataclass
class Collection(ExternalCollection):
"""
An internal view of a collection.
"""
id: uuid.UUID
dimension: int
dtype: npt.DTypeLike
topic: str
known_metadata_keys: types.Metadata
known_document_keywords: List[str]
has_documents: bool = False
has_embeddings: bool = False
embedding_function: Optional[types.EmbeddingFunction[Embeddable]] = None


@st.composite
def collections(
Expand Down
132 changes: 108 additions & 24 deletions chromadb/test/property/test_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@
run_state_machine_as_test,
MultipleResults,
)
from typing import Dict, Optional, Any, Mapping
from typing import Dict, Optional

import numpy
import uuid
from chromadb.test.property.strategies import hashing_embedding_function


class CollectionStateMachine(RuleBasedStateMachine):
collections: Bundle[strategies.Collection]
collections: Bundle[strategies.ExternalCollection]
_model: Dict[str, Optional[types.CollectionMetadata]]

collections = Bundle("collections")
Expand All @@ -35,8 +39,8 @@ def initialize(self) -> None:

@rule(target=collections, coll=strategies.collections())
def create_coll(
self, coll: strategies.Collection
) -> MultipleResults[strategies.Collection]:
self, coll: strategies.ExternalCollection
) -> MultipleResults[strategies.ExternalCollection]:
# Metadata can either be None or a non-empty dict
if coll.name in self.model or (
coll.metadata is not None and len(coll.metadata) == 0
Expand All @@ -54,14 +58,14 @@ def create_coll(
metadata=coll.metadata,
embedding_function=coll.embedding_function,
)
self.set_model(coll.name, coll.metadata, str(coll.id))
self.set_model(coll.name, coll.metadata)

assert c.name == coll.name
assert c.metadata == self.model[coll.name]
return multiple(coll)

@rule(coll=collections)
def get_coll(self, coll: strategies.Collection) -> None:
def get_coll(self, coll: strategies.ExternalCollection) -> None:
if coll.name in self.model:
c = self.api.get_collection(name=coll.name)
assert c.name == coll.name
Expand All @@ -71,7 +75,7 @@ def get_coll(self, coll: strategies.Collection) -> None:
self.api.get_collection(name=coll.name)

@rule(coll=consumes(collections))
def delete_coll(self, coll: strategies.Collection) -> None:
def delete_coll(self, coll: strategies.ExternalCollection) -> None:
if coll.name in self.model:
self.api.delete_collection(name=coll.name)
self.delete_from_model(coll.name)
Expand All @@ -85,7 +89,7 @@ def delete_coll(self, coll: strategies.Collection) -> None:
@rule()
def list_collections(self) -> None:
colls = self.api.list_collections()
assert len(colls) == len([c for c in self.model if not c.startswith("__id__")])
assert len(colls) == len(self.model)
for c in colls:
assert c.name in self.model

Expand Down Expand Up @@ -119,9 +123,9 @@ def list_collections_with_limit_offset(self, limit: int, offset: int) -> None:
)
def get_or_create_coll(
self,
coll: strategies.Collection,
coll: strategies.ExternalCollection,
new_metadata: Optional[types.Metadata],
) -> MultipleResults[strategies.Collection]:
) -> MultipleResults[strategies.ExternalCollection]:
# Cases for get_or_create

# Case 0
Expand Down Expand Up @@ -163,7 +167,7 @@ def get_or_create_coll(
coll.metadata = (
self.model[coll.name] if new_metadata is None else new_metadata
)
self.set_model(coll.name, coll.metadata, str(coll.id))
self.set_model(coll.name, coll.metadata)

# Update API
c = self.api.get_or_create_collection(
Expand All @@ -185,20 +189,19 @@ def get_or_create_coll(
)
def modify_coll(
self,
coll: strategies.Collection,
coll: strategies.ExternalCollection,
new_metadata: types.Metadata,
new_name: Optional[str],
) -> MultipleResults[strategies.Collection]:
# early exit if a col with name exists but with diff id, possibly in another tenant/db
if coll.name in self.model and f"__id__:{coll.id}" not in self.model:
return multiple()
) -> MultipleResults[strategies.ExternalCollection]:
if coll.name not in self.model:
with pytest.raises(Exception):
c = self.api.get_collection(name=coll.name)
return multiple()

c = self.api.get_collection(name=coll.name)
_metadata: Optional[Mapping[str, Any]] = coll.metadata
print("before API:", c)
print("model:", self.model)
_metadata: Optional[Mapping[str, Any]] = self.model[coll.name]
_name: str = coll.name
if new_metadata is not None:
if len(new_metadata) == 0:
Expand All @@ -221,10 +224,12 @@ def modify_coll(
self.delete_from_model(coll.name)
coll.name = new_name
_name = new_name
self.set_model(_name, _metadata, str(coll.id))

self.set_model(_name, _metadata)
c.modify(metadata=_metadata, name=_name)
c = self.api.get_collection(name=coll.name)
print("after API:", c)
print("model:", self.model)

assert c.name == coll.name
assert c.metadata == self.model[coll.name]
Expand All @@ -234,18 +239,13 @@ def set_model(
self,
name: str,
metadata: Optional[types.CollectionMetadata],
id: Optional[str] = None,
) -> None:
model = self.model
model[name] = metadata
if id is not None:
model[f"__id__:{id}"] = metadata

def delete_from_model(self, name: str, id: Optional[str] = None) -> None:
def delete_from_model(self, name: str) -> None:
model = self.model
del model[name]
if id is not None:
del model[f"__id__:{id}"]

@property
def model(self) -> Dict[str, Optional[types.CollectionMetadata]]:
Expand All @@ -255,3 +255,87 @@ def model(self) -> Dict[str, Optional[types.CollectionMetadata]]:
def test_collections(caplog: pytest.LogCaptureFixture, api: ClientAPI) -> None:
caplog.set_level(logging.ERROR)
run_state_machine_as_test(lambda: CollectionStateMachine(api)) # type: ignore


# Below are tests that have failed in the past. If your test fails, please add
# it to protect against regressions in the test harness itself. If you need
# help doing so, talk to ben.


def test_previously_failing_one(api: ClientAPI) -> None:
state = CollectionStateMachine(api)
state.initialize()
# I don't know why the typechecker is red here. This code is correct and is
# pulled from the logs.
(v1,) = state.get_or_create_coll(
coll=strategies.ExternalCollection(
name='jjn2yjLW1zp2T\n',
metadata=None,
embedding_function=hashing_embedding_function(dtype=numpy.float32, dim=863)
),
new_metadata=None
)
(v6,) = state.get_or_create_coll(
coll=strategies.ExternalCollection(
name='jjn2yjLW1zp2T\n',
metadata=None,
embedding_function=hashing_embedding_function(dtype=numpy.float32, dim=863)
),
new_metadata=None
)
state.modify_coll(
coll=v1,
new_metadata={'7': -1281, 'fGe': -0.0, 'K5j': 'im'},
new_name=None
)
state.modify_coll(
coll=v6,
new_metadata=None,
new_name=None
)


# https://github.com/chroma-core/chroma/commit/cf476d70f0cebb7c87cb30c7172ba74d6ea175cd#diff-e81868b665d149bb315d86890dea6fc6a9fc9fc9ea3089aa7728142b54f622c5R210
def test_previously_failing_two(api: ClientAPI) -> None:
state = CollectionStateMachine(api)
state.initialize()
(v13,) = state.get_or_create_coll(
coll=strategies.ExternalCollection(
name='C1030',
metadata={},
embedding_function=hashing_embedding_function(dim=2, dtype=numpy.float32)
),
new_metadata=None
)
(v15,) = state.modify_coll(
coll=v13,
new_metadata={'0': '10', '40': '0', 'p1nviWeL7fO': 'qN', '7b': 'YS', 'VYWq4LEMWjCo': True},
new_name='OF5F0MzbQg\n'
)
state.get_or_create_coll(
coll=strategies.ExternalCollection(
name='VS0QGh',
metadata={'h': 5.681951615025145e-227, 'A1': 61126, 'uhUhLEEMfeC_kN': 2147483647, 'weF': 'pSP', 'B3DSaP': False, '6H533K': 1.192092896e-07},
embedding_function=hashing_embedding_function(dim=1915, dtype=numpy.float32)
),
new_metadata={'xVW09xUpDZA': 31734,
'g': 1.1,
'n1dUTalF-MY': -1000000.0,
'y': 'G3EtXTZ',
'ugXZ_hK': 5494
}
)
(v17,) = state.modify_coll(
coll=v15,
new_metadata={'L35J2S': 'K0l026'},
new_name='Ai1\n'
)
(v18,) = state.get_or_create_coll(coll=v13, new_metadata=None)
state.get_or_create_coll(
coll=strategies.ExternalCollection(
name='VS0QGh',
metadata=None,
embedding_function=hashing_embedding_function(dim=326, dtype=numpy.float16)
),
new_metadata=None
)

0 comments on commit fd14f2d

Please sign in to comment.