From 08331e1a1e6fef8b3aef07ff2f6c6ffb99a6c852 Mon Sep 17 00:00:00 2001 From: Til Theunissen <166376512+TilTheunissenAA@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:51:16 +0100 Subject: [PATCH] feat(document-index): retrieve chunks of an indexed document (#1161) * feat(document-index): retrieve chunks of an indexed document * docs(document-index): handle unindexed document in chunks-endpoint example --------- Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com> --- CHANGELOG.md | 1 + src/documentation/document_index.ipynb | 24 ++++++++ .../document_index/document_index.py | 57 +++++++++++++++++++ .../document_index/test_document_index.py | 46 ++++++++++++++- 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7160f4f67..e34ffeefc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects - Add progressbar to the `Runner` to be able to track the `Run` - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution` +- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document. ### Fixes ... diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index aae97a93f..91ef81fb2 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -23,6 +23,7 @@ " IndexPath,\n", " InstructableEmbed,\n", " LimitedConcurrencyClient,\n", + " ResourceNotFound,\n", " SemanticEmbed,\n", ")\n", "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n", @@ -262,6 +263,29 @@ "document_index.documents(collection_path)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once a document is indexed, we can also have a look at its chunks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " chunks = document_index.chunks(\n", + " DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n", + " index_name=INDEX,\n", + " )\n", + " print(chunks)\n", + "except ResourceNotFound:\n", + " pass # This is expected if the document is still embedding." + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 1ee8539a8..6c160d170 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -367,6 +367,38 @@ def _from_search_response( ) +class DocumentChunk(BaseModel): + """A chunk of a document. + + Note: + Currently only supports text-only documents. + + Args: + document_path: Path to the document that the chunk originates from. + section: Content of the chunk. + position: Position of the chunk within the document. + """ + + document_path: DocumentPath + section: str + position: DocumentTextPosition + + @classmethod + def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk": + assert chunk_response["start"]["item"] == chunk_response["end"]["item"] + assert chunk_response["section"][0]["modality"] == "text" + + return cls( + document_path=DocumentPath.from_json(chunk_response["document_path"]), + section=chunk_response["section"][0]["text"], + position=DocumentTextPosition( + item=chunk_response["start"]["item"], + start_position=chunk_response["start"]["position"], + end_position=chunk_response["end"]["position"], + ), + ) + + class DocumentIndexError(RuntimeError): """Raised in case of any `DocumentIndexClient`-related errors. @@ -880,6 +912,31 @@ def search( self._raise_for_status(response) return [DocumentSearchResult._from_search_response(r) for r in response.json()] + def chunks( + self, document_path: DocumentPath, index_name: str + ) -> Sequence[DocumentChunk]: + """Retrieve all chunks of an indexed document. + + If the document is still indexing, a ResourceNotFound error is raised. + + Args: + document_path: Path to the document. + index_name: Name of the index to retrieve chunks from. + + Returns: + List of all chunks of the indexed document. + """ + url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks" + url = urljoin(self._base_document_index_url, url_suffix) + + response = requests.get(url, headers=self.headers) + self._raise_for_status(response) + return [ + DocumentChunk._from_chunk_response(r) + for r in response.json() + if len(r["section"]) > 0 and r["section"][0]["modality"] == "text" + ] + def _raise_for_status(self, response: requests.Response) -> None: try: response.raise_for_status() diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 97f2bed2e..4843f8d65 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -19,8 +19,13 @@ InvalidInput, ResourceNotFound, SearchQuery, + SemanticEmbed, +) +from tests.conftest_document_index import ( + random_embedding_config, + random_identifier, + retry, ) -from tests.conftest_document_index import random_embedding_config, retry @pytest.mark.internal @@ -752,3 +757,42 @@ def test_document_indexes_works( document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: document_index.progress(random_collection) + + +def test_retrieve_chunks( + document_index: DocumentIndexClient, + random_collection: CollectionPath, + document_index_namespace: str, +) -> None: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(random_collection, index_name) + + document_path = DocumentPath( + collection_path=random_collection, + document_name="document-with-chunks", + ) + document_contents = DocumentContents( + contents=[ + # because chunk size is 512, this item will be split into 2 chunks + " token" * 750, + "final chunk", + ], + ) + document_index.add_document(document_path, document_contents) + + @retry + def chunks() -> None: + chunks = document_index.chunks(document_path, index_name) + assert len(chunks) == 3 + + chunks()