From 56a6a16bc71edb281156cfec18dec2361a8ae4cd Mon Sep 17 00:00:00 2001 From: hammadb Date: Wed, 27 Nov 2024 10:53:27 -0800 Subject: [PATCH 1/5] [BUG] Bound tokenizers, patch CVP test --- .../property/test_cross_version_persist.py | 12 ++++++++-- chromadb/test/utils/cross_version.py | 23 +++++++++++++++---- pyproject.toml | 2 +- requirements.txt | 2 +- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/chromadb/test/property/test_cross_version_persist.py b/chromadb/test/property/test_cross_version_persist.py index 1a2510c5f05..677426a2082 100644 --- a/chromadb/test/property/test_cross_version_persist.py +++ b/chromadb/test/property/test_cross_version_persist.py @@ -22,6 +22,7 @@ import chromadb.test.property.invariants as invariants from packaging import version as packaging_version import re +import sys import multiprocessing from chromadb.config import Settings from chromadb.api.client import Client as ClientCreator @@ -38,7 +39,7 @@ version_re = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+$") # Some modules do not work across versions, since we upgrade our support for them, and should be explicitly reimported in the subprocess -VERSIONED_MODULES = ["pydantic", "numpy"] +VERSIONED_MODULES = ["pydantic", "numpy", "tokenizers"] def versions() -> List[str]: @@ -148,7 +149,14 @@ def configurations(versions: List[str]) -> List[Tuple[str, Settings]]: def version_settings(request) -> Generator[Tuple[str, Settings], None, None]: configuration = request.param version = configuration[0] - install_version(version) + + # Version <3.9 requires bounding tokenizers<=0.20.3 + (major, minor, patch) = sys.version_info[:3] + if major == 3 and minor < 9: + install_version(version, {"tokenizers": "<=0.20.3"}) + else: + install_version(version, {}) + yield configuration # Cleanup the installed version path = get_path_to_version_install(version) diff --git a/chromadb/test/utils/cross_version.py b/chromadb/test/utils/cross_version.py index 737b7f8da97..287aa154b91 100644 --- a/chromadb/test/utils/cross_version.py +++ b/chromadb/test/utils/cross_version.py @@ -3,7 +3,7 @@ import os import tempfile from types import ModuleType -from typing import List +from typing import Dict, List base_install_dir = tempfile.gettempdir() + "/persistence_test_chromadb_versions" @@ -38,16 +38,16 @@ def get_path_to_version_library(version: str) -> str: return get_path_to_version_install(version) + "/chromadb/__init__.py" -def install_version(version: str) -> None: +def install_version(version: str, dep_overrides: Dict[str, str]) -> None: # Check if already installed version_library = get_path_to_version_library(version) if os.path.exists(version_library): return path = get_path_to_version_install(version) - install(f"chromadb=={version}", path) + install(f"chromadb=={version}", path, dep_overrides) -def install(pkg: str, path: str) -> int: +def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int: # -q -q to suppress pip output to ERROR level # https://pip.pypa.io/en/stable/cli/pip/#quiet print("Purging pip cache") @@ -60,6 +60,21 @@ def install(pkg: str, path: str) -> int: "purge", ] ) + + for dep, operator_version in dep_overrides.items(): + print(f"Installing {dep} version {operator_version}") + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "-q", + "-q", + "install", + f"'{dep}{operator_version}'", + ] + ) + print(f"Installing chromadb version {pkg} to {path}") return subprocess.check_call( [ diff --git a/pyproject.toml b/pyproject.toml index 140a519f083..9fb9759dabd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ 'opentelemetry-exporter-otlp-proto-grpc>=1.2.0', 'opentelemetry-instrumentation-fastapi>=0.41b0', 'opentelemetry-sdk>=1.2.0', - 'tokenizers >= 0.13.2', + 'tokenizers >= 0.13.2, <= 0.20.3', 'pypika >= 0.48.9', 'tqdm >= 4.65.0', 'overrides >= 7.3.1', diff --git a/requirements.txt b/requirements.txt index b7b621faf2a..19b079af0eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ pypika>=0.48.9 PyYAML>=6.0.0 rich>=10.11.0 tenacity>=8.2.3 -tokenizers>=0.13.2 +tokenizers>=0.13.2,<=0.20.3 tqdm>=4.65.0 typer>=0.9.0 typing_extensions>=4.5.0 From 3512a05dea88e8ab53ce3be33df920f36ff709d6 Mon Sep 17 00:00:00 2001 From: hammadb Date: Wed, 27 Nov 2024 11:00:05 -0800 Subject: [PATCH 2/5] strings --- chromadb/test/utils/cross_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chromadb/test/utils/cross_version.py b/chromadb/test/utils/cross_version.py index 287aa154b91..ebe0acf1305 100644 --- a/chromadb/test/utils/cross_version.py +++ b/chromadb/test/utils/cross_version.py @@ -71,7 +71,7 @@ def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int: "-q", "-q", "install", - f"'{dep}{operator_version}'", + f"{dep}{operator_version}", ] ) From 9574d51f854e5f29e98546fb5d779c8bd04e8ef1 Mon Sep 17 00:00:00 2001 From: hammadb Date: Wed, 27 Nov 2024 11:19:51 -0800 Subject: [PATCH 3/5] Bump tokenizers --- chromadb/test/utils/cross_version.py | 32 ++++++---------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/chromadb/test/utils/cross_version.py b/chromadb/test/utils/cross_version.py index ebe0acf1305..6480803f2fa 100644 --- a/chromadb/test/utils/cross_version.py +++ b/chromadb/test/utils/cross_version.py @@ -61,31 +61,13 @@ def install(pkg: str, path: str, dep_overrides: Dict[str, str]) -> int: ] ) + command = [sys.executable, "-m", "pip", "-q", "-q", "install", pkg] + for dep, operator_version in dep_overrides.items(): - print(f"Installing {dep} version {operator_version}") - subprocess.check_call( - [ - sys.executable, - "-m", - "pip", - "-q", - "-q", - "install", - f"{dep}{operator_version}", - ] - ) + command.append(f"{dep}{operator_version}") + + command.append("--no-binary=chroma-hnswlib") + command.append(f"--target={path}") print(f"Installing chromadb version {pkg} to {path}") - return subprocess.check_call( - [ - sys.executable, - "-m", - "pip", - "-q", - "-q", - "install", - pkg, - "--no-binary=chroma-hnswlib", - "--target={}".format(path), - ] - ) + return subprocess.check_call(command) From 7cf256936db4ed72f047aee224a7ca3f0cec9540 Mon Sep 17 00:00:00 2001 From: hammadb Date: Wed, 27 Nov 2024 11:55:14 -0800 Subject: [PATCH 4/5] missing arg --- chromadb/test/client/test_http_client_v1_compatability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chromadb/test/client/test_http_client_v1_compatability.py b/chromadb/test/client/test_http_client_v1_compatability.py index 9e49f4fb61e..b4a8ef40b41 100644 --- a/chromadb/test/client/test_http_client_v1_compatability.py +++ b/chromadb/test/client/test_http_client_v1_compatability.py @@ -41,7 +41,7 @@ def test_http_client_bw_compatibility() -> None: port = sys.settings.chroma_server_http_port old_version = "0.5.11" # Module with known v1 client - install_version(old_version) + install_version(old_version, {}) ctx = multiprocessing.get_context("spawn") conn1, conn2 = multiprocessing.Pipe() From a338c186c9bf58ccb8083a6ee9f399c2f737de40 Mon Sep 17 00:00:00 2001 From: hammadb Date: Wed, 27 Nov 2024 12:33:54 -0800 Subject: [PATCH 5/5] install_version call elsewhere needs to be patched --- .../client/test_http_client_v1_compatability.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/chromadb/test/client/test_http_client_v1_compatability.py b/chromadb/test/client/test_http_client_v1_compatability.py index b4a8ef40b41..983770ee7ad 100644 --- a/chromadb/test/client/test_http_client_v1_compatability.py +++ b/chromadb/test/client/test_http_client_v1_compatability.py @@ -1,6 +1,6 @@ import multiprocessing from unittest.mock import patch - +import sys as pysys from multiprocessing.connection import Connection from chromadb.config import System @@ -41,7 +41,16 @@ def test_http_client_bw_compatibility() -> None: port = sys.settings.chroma_server_http_port old_version = "0.5.11" # Module with known v1 client - install_version(old_version, {}) + + # Version <3.9 requires bounding tokenizers<=0.20.3 + # TOOD(hammadb): This code is duplicated in test_cross_version_persist.py + # for expediency on 11/27/2024 I am copy pasting rather than refactoring + # to DRY. Refactor later. + (major, minor, _) = pysys.version_info[:3] + if major == 3 and minor < 9: + install_version(old_version, {"tokenizers": "<=0.20.3"}) + else: + install_version(old_version, {}) ctx = multiprocessing.get_context("spawn") conn1, conn2 = multiprocessing.Pipe()