From 4117e7dc661cb9e5697086f9a4c6d988cc1b98bc Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Sat, 18 May 2024 09:53:44 +0200 Subject: [PATCH] fix: URL-encode document names in document index client requests Prior to this commit, we did not URL-encode document names when querying the document index. This caused problems when names have special characters, i.e., '/' or '?'. We do not need to URL-encode other parts of the URL (e.g., namespace or collection) as these are already constrained to URL-safe characters. In addition to this fix, this commit includes the following changes: - update `DocumentPath.from_slash_separated_str` to correctly delimit paths whose document name contains a forward slash. Ideally we'd remove both this and `DocumentPath.to_slash_separated_str` in the future. - parametrize tests to include special characters and increase coverage - migrate to new document index collection to avoid CI conflicts --- CHANGELOG.md | 1 + .../document_index/document_index.py | 12 ++- .../document_index/test_document_index.py | 75 ++++++++++++++----- 3 files changed, 65 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59bc57f2e..375325ca2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - Add `elo_qa_eval` tutorial notebook describing the use of an (incremental) Elo evaluation use case for QA models. ### Fixes - `ExpandChunks`-task is now fast even for very large documents +- The document index client now correctly URL-encodes document names in its queries. ### Deprecations ... diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index a641dc45b..d968f4f90 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -2,6 +2,7 @@ from http import HTTPStatus from json import dumps from typing import Annotated, Any, Literal, Mapping, Optional, Sequence +from urllib.parse import quote import requests from pydantic import BaseModel, Field @@ -100,6 +101,9 @@ class DocumentPath(BaseModel, frozen=True): collection_path: CollectionPath document_name: str + def encoded_document_name(self) -> str: + return quote(self.document_name, safe="") + @classmethod def from_json(cls, document_path_json: Mapping[str, str]) -> "DocumentPath": return cls( @@ -115,7 +119,7 @@ def to_slash_separated_str(self) -> str: @classmethod def from_slash_separated_str(cls, path: str) -> "DocumentPath": - split = path.split("/") + split = path.split("/", 2) assert len(split) == 3 return cls( collection_path=CollectionPath( @@ -492,7 +496,7 @@ def add_document( Currently only supports text. """ - url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}" response = requests.put( url, data=dumps(contents._to_modalities_json()), headers=self.headers ) @@ -505,7 +509,7 @@ def delete_document(self, document_path: DocumentPath) -> None: document_path: Consists of `collection_path` and name of document to be deleted. """ - url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}" response = requests.delete(url, headers=self.headers) self._raise_for_status(response) @@ -519,7 +523,7 @@ def document(self, document_path: DocumentPath) -> DocumentContents: Content of the retrieved document. """ - url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}" response = requests.get(url, headers=self.headers) self._raise_for_status(response) return DocumentContents._from_modalities_json(response.json()) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 47c65b347..650c1fd1c 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -22,16 +22,8 @@ def aleph_alpha_namespace() -> str: @fixture def collection_path(aleph_alpha_namespace: str) -> CollectionPath: - return CollectionPath(namespace=aleph_alpha_namespace, collection="ci-collection") - - -@fixture -def document_path( - document_index: DocumentIndexClient, collection_path: CollectionPath -) -> DocumentPath: - document_index.create_collection(collection_path) - return DocumentPath( - collection_path=collection_path, document_name="Example Document" + return CollectionPath( + namespace=aleph_alpha_namespace, collection="intelligence-layer-sdk-ci" ) @@ -89,12 +81,29 @@ def test_document_index_creates_collection( @pytest.mark.internal +@pytest.mark.parametrize( + "document_name", + [ + "Example Document", + "!@#$%^&*()-_+={}[]\\|;:'\"<>,.?/~`", + ], +) def test_document_index_adds_document( document_index: DocumentIndexClient, - document_path: DocumentPath, + collection_path: CollectionPath, document_contents: DocumentContents, + document_name: str, ) -> None: + document_path = DocumentPath( + collection_path=collection_path, + document_name=document_name, + ) document_index.add_document(document_path, document_contents) + + assert any( + d.document_path == document_path + for d in document_index.documents(collection_path) + ) assert document_contents == document_index.document(document_path) @@ -115,11 +124,20 @@ def test_document_index_searches_asymmetrically( @pytest.mark.internal +@pytest.mark.parametrize( + "document_name", + [ + "Document to be deleted", + "Document to be deleted !@#$%^&*()-_+={}[]\\|;:'\"<>,.?/~`", + ], +) def test_document_index_deletes_document( - document_index: DocumentIndexClient, collection_path: CollectionPath + document_index: DocumentIndexClient, + collection_path: CollectionPath, + document_name: str, ) -> None: document_path = DocumentPath( - collection_path=collection_path, document_name="Document to be deleted" + collection_path=collection_path, document_name=document_name ) document_contents = DocumentContents.from_text("Some text...") @@ -145,11 +163,30 @@ def test_document_index_raises_on_getting_non_existing_document( ) -def test_document_path_from_string() -> None: - abc = DocumentPath.from_slash_separated_str("a/b/c") - assert abc == DocumentPath( - collection_path=CollectionPath(namespace="a", collection="b"), document_name="c" - ) +@pytest.mark.parametrize( + "slash_separated_str,expected_document_path", + [ + ( + "a/b/c", + DocumentPath( + collection_path=CollectionPath(namespace="a", collection="b"), + document_name="c", + ), + ), + ( + "a/b/c/d", + DocumentPath( + collection_path=CollectionPath(namespace="a", collection="b"), + document_name="c/d", + ), + ), + ], +) +def test_document_path_from_string( + slash_separated_str: str, expected_document_path: DocumentPath +) -> None: + actual_document_path = DocumentPath.from_slash_separated_str(slash_separated_str) + assert actual_document_path == expected_document_path with raises(AssertionError): DocumentPath.from_slash_separated_str("a/c") @@ -159,7 +196,7 @@ def test_document_list_all_documents( ) -> None: filter_result = document_index.documents(collection_path) - assert len(filter_result) == 2 + assert len(filter_result) == 3 def test_document_list_max_n_documents(