From 08331e1a1e6fef8b3aef07ff2f6c6ffb99a6c852 Mon Sep 17 00:00:00 2001
From: Til Theunissen <166376512+TilTheunissenAA@users.noreply.github.com>
Date: Mon, 16 Dec 2024 15:51:16 +0100
Subject: [PATCH 1/2] feat(document-index): retrieve chunks of an indexed
 document (#1161)

* feat(document-index): retrieve chunks of an indexed document

* docs(document-index): handle unindexed document in chunks-endpoint example

---------

Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
---
 CHANGELOG.md                              |  1 +
 src/documentation/document_index.ipynb    | 24 ++++++++
 .../document_index/document_index.py      | 57 +++++++++++++++++++
 .../document_index/test_document_index.py | 46 ++++++++++++++-
 4 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7160f4f6..e34ffeef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
 - Add progressbar to the `Runner` to be able to track the `Run`
 - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
+- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.
 
 ### Fixes
...
diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb
index aae97a93..91ef81fb 100644
--- a/src/documentation/document_index.ipynb
+++ b/src/documentation/document_index.ipynb
@@ -23,6 +23,7 @@
     "    IndexPath,\n",
     "    InstructableEmbed,\n",
     "    LimitedConcurrencyClient,\n",
+    "    ResourceNotFound,\n",
     "    SemanticEmbed,\n",
     ")\n",
     "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n",
@@ -262,6 +263,29 @@
     "document_index.documents(collection_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once a document is indexed, we can also have a look at its chunks:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    chunks = document_index.chunks(\n",
+    "        DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
+    "        index_name=INDEX,\n",
+    "    )\n",
+    "    print(chunks)\n",
+    "except ResourceNotFound:\n",
+    "    pass  # This is expected if the document is still embedding."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py
index 1ee8539a..6c160d17 100644
--- a/src/intelligence_layer/connectors/document_index/document_index.py
+++ b/src/intelligence_layer/connectors/document_index/document_index.py
@@ -367,6 +367,38 @@ def _from_search_response(
         )
 
 
+class DocumentChunk(BaseModel):
+    """A chunk of a document.
+
+    Note:
+        Currently only supports text-only documents.
+
+    Args:
+        document_path: Path to the document that the chunk originates from.
+        section: Content of the chunk.
+        position: Position of the chunk within the document.
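+
+    Example:
+        A purely illustrative value (not output of a real Document Index
+        request) could be ``DocumentChunk(document_path=..., section="first
+        chunk", position=DocumentTextPosition(item=0, start_position=0,
+        end_position=11))``, i.e. a chunk spanning positions 0-11 of the
+        document's first content item.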
+    """
+
+    document_path: DocumentPath
+    section: str
+    position: DocumentTextPosition
+
+    @classmethod
+    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
+        assert chunk_response["start"]["item"] == chunk_response["end"]["item"]
+        assert chunk_response["section"][0]["modality"] == "text"
+
+        return cls(
+            document_path=DocumentPath.from_json(chunk_response["document_path"]),
+            section=chunk_response["section"][0]["text"],
+            position=DocumentTextPosition(
+                item=chunk_response["start"]["item"],
+                start_position=chunk_response["start"]["position"],
+                end_position=chunk_response["end"]["position"],
+            ),
+        )
+
+
 class DocumentIndexError(RuntimeError):
     """Raised in case of any `DocumentIndexClient`-related errors.
 
@@ -880,6 +912,31 @@ def search(
         self._raise_for_status(response)
         return [DocumentSearchResult._from_search_response(r) for r in response.json()]
 
+    def chunks(
+        self, document_path: DocumentPath, index_name: str
+    ) -> Sequence[DocumentChunk]:
+        """Retrieve all chunks of an indexed document.
+
+        If the document is still indexing, a ResourceNotFound error is raised.
+
+        Args:
+            document_path: Path to the document.
+            index_name: Name of the index to retrieve chunks from.
+
+        Returns:
+            List of all chunks of the indexed document.
+        """
+        url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks"
+        url = urljoin(self._base_document_index_url, url_suffix)
+
+        response = requests.get(url, headers=self.headers)
+        self._raise_for_status(response)
+        return [
+            DocumentChunk._from_chunk_response(r)
+            for r in response.json()
+            if len(r["section"]) > 0 and r["section"][0]["modality"] == "text"
+        ]
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()
diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py
index 97f2bed2..4843f8d6 100644
--- a/tests/connectors/document_index/test_document_index.py
+++ b/tests/connectors/document_index/test_document_index.py
@@ -19,8 +19,13 @@
     InvalidInput,
     ResourceNotFound,
     SearchQuery,
+    SemanticEmbed,
+)
+from tests.conftest_document_index import (
+    random_embedding_config,
+    random_identifier,
+    retry,
 )
-from tests.conftest_document_index import random_embedding_config, retry
 
 
 @pytest.mark.internal
@@ -752,3 +757,42 @@ def test_document_indexes_works(
     document_index: DocumentIndexClient, random_collection: CollectionPath
 ) -> None:
     document_index.progress(random_collection)
+
+
+def test_retrieve_chunks(
+    document_index: DocumentIndexClient,
+    random_collection: CollectionPath,
+    document_index_namespace: str,
+) -> None:
+    index_name = random_identifier()
+    index_path = IndexPath(namespace=document_index_namespace, index=index_name)
+    index_configuration = IndexConfiguration(
+        chunk_size=512,
+        chunk_overlap=0,
+        embedding=SemanticEmbed(
+            representation="asymmetric",
+            model_name="luminous-base",
+        ),
+    )
+    document_index.create_index(index_path, index_configuration)
+    document_index.assign_index_to_collection(random_collection, index_name)
+
+    document_path = DocumentPath(
+        collection_path=random_collection,
+        document_name="document-with-chunks",
+    )
+    document_contents = DocumentContents(
+        contents=[
+            # because chunk size is 512, this item will be split into 2 chunks
+            " token" * 750,
+            "final chunk",
+        ],
+    )
+    document_index.add_document(document_path, document_contents)
+
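+    # Why 3 chunks are expected below: the first content item (750 repetitions
+    # of " token") exceeds chunk_size=512 and is split into two chunks, and
+    # "final chunk" becomes a third. @retry polls because the chunks endpoint
+    # raises ResourceNotFound until the document has finished embedding.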
+    @retry
+    def chunks() -> None:
+        chunks = document_index.chunks(document_path, index_name)
+        assert len(chunks) == 3
+
+    chunks()

From 26c5a24ab2200170917487357810722274647468 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 17 Dec 2024 09:33:47 +0100
Subject: [PATCH 2/2] build(deps): bump pydantic from 2.10.2 to 2.10.3 (#1173)

Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.10.2 to 2.10.3.
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v2.10.2...v2.10.3)

---
updated-dependencies:
- dependency-name: pydantic
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
---
 poetry.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index b38d99a2..e841859e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4160,13 +4160,13 @@ files = [
 
 [[package]]
 name = "pydantic"
-version = "2.10.2"
+version = "2.10.3"
 description = "Data validation using Python type hints"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic-2.10.2-py3-none-any.whl", hash = "sha256:cfb96e45951117c3024e6b67b25cdc33a3cb7b2fa62e239f7af1378358a1d99e"},
-    {file = "pydantic-2.10.2.tar.gz", hash = "sha256:2bc2d7f17232e0841cbba4641e65ba1eb6fafb3a08de3a091ff3ce14a197c4fa"},
+    {file = "pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d"},
+    {file = "pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9"},
 ]
 
 [package.dependencies]