From 08331e1a1e6fef8b3aef07ff2f6c6ffb99a6c852 Mon Sep 17 00:00:00 2001
From: Til Theunissen <166376512+TilTheunissenAA@users.noreply.github.com>
Date: Mon, 16 Dec 2024 15:51:16 +0100
Subject: [PATCH 1/2] feat(document-index): retrieve chunks of an indexed
 document (#1161)

* feat(document-index): retrieve chunks of an indexed document

* docs(document-index): handle unindexed document in chunks-endpoint example

---------

Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
---
 CHANGELOG.md                              |  1 +
 src/documentation/document_index.ipynb    | 24 ++++++++
 .../document_index/document_index.py      | 57 +++++++++++++++++++
 .../document_index/test_document_index.py | 46 ++++++++++++++-
 4 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7160f4f6..e34ffeef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
 - Add progressbar to the `Runner` to be able to track the `Run`
 - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
+- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.
 
 ### Fixes
...
diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb
index aae97a93..91ef81fb 100644
--- a/src/documentation/document_index.ipynb
+++ b/src/documentation/document_index.ipynb
@@ -23,6 +23,7 @@
     "    IndexPath,\n",
     "    InstructableEmbed,\n",
     "    LimitedConcurrencyClient,\n",
+    "    ResourceNotFound,\n",
     "    SemanticEmbed,\n",
     ")\n",
     "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n",
@@ -262,6 +263,29 @@
     "document_index.documents(collection_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once a document is indexed, we can also have a look at its chunks:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    chunks = document_index.chunks(\n",
+    "        DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
+    "        index_name=INDEX,\n",
+    "    )\n",
+    "    print(chunks)\n",
+    "except ResourceNotFound:\n",
+    "    pass  # This is expected if the document is still embedding."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py
index 1ee8539a..6c160d17 100644
--- a/src/intelligence_layer/connectors/document_index/document_index.py
+++ b/src/intelligence_layer/connectors/document_index/document_index.py
@@ -367,6 +367,38 @@ def _from_search_response(
         )
 
 
+class DocumentChunk(BaseModel):
+    """A chunk of a document.
+
+    Note:
+        Currently only supports text-only documents.
+
+    Args:
+        document_path: Path to the document that the chunk originates from.
+        section: Content of the chunk.
+        position: Position of the chunk within the document.
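+
+    Example:
+        A purely illustrative value (not output of a real Document Index
+        request) could be ``DocumentChunk(document_path=..., section="first
+        chunk", position=DocumentTextPosition(item=0, start_position=0,
+        end_position=11))``, i.e. a chunk spanning positions 0-11 of the
+        document's first content item.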
+    """
+
+    document_path: DocumentPath
+    section: str
+    position: DocumentTextPosition
+
+    @classmethod
+    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
+        assert chunk_response["start"]["item"] == chunk_response["end"]["item"]
+        assert chunk_response["section"][0]["modality"] == "text"
+
+        return cls(
+            document_path=DocumentPath.from_json(chunk_response["document_path"]),
+            section=chunk_response["section"][0]["text"],
+            position=DocumentTextPosition(
+                item=chunk_response["start"]["item"],
+                start_position=chunk_response["start"]["position"],
+                end_position=chunk_response["end"]["position"],
+            ),
+        )
+
+
 class DocumentIndexError(RuntimeError):
     """Raised in case of any `DocumentIndexClient`-related errors.
 
@@ -880,6 +912,31 @@ def search(
         self._raise_for_status(response)
         return [DocumentSearchResult._from_search_response(r) for r in response.json()]
 
+    def chunks(
+        self, document_path: DocumentPath, index_name: str
+    ) -> Sequence[DocumentChunk]:
+        """Retrieve all chunks of an indexed document.
+
+        If the document is still indexing, a ResourceNotFound error is raised.
+
+        Args:
+            document_path: Path to the document.
+            index_name: Name of the index to retrieve chunks from.
+
+        Returns:
+            List of all chunks of the indexed document.
+        """
+        url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks"
+        url = urljoin(self._base_document_index_url, url_suffix)
+
+        response = requests.get(url, headers=self.headers)
+        self._raise_for_status(response)
+        return [
+            DocumentChunk._from_chunk_response(r)
+            for r in response.json()
+            if len(r["section"]) > 0 and r["section"][0]["modality"] == "text"
+        ]
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()
diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py
index 97f2bed2..4843f8d6 100644
--- a/tests/connectors/document_index/test_document_index.py
+++ b/tests/connectors/document_index/test_document_index.py
@@ -19,8 +19,13 @@
     InvalidInput,
     ResourceNotFound,
     SearchQuery,
+    SemanticEmbed,
+)
+from tests.conftest_document_index import (
+    random_embedding_config,
+    random_identifier,
+    retry,
 )
-from tests.conftest_document_index import random_embedding_config, retry
 
 
 @pytest.mark.internal
@@ -752,3 +757,42 @@ def test_document_indexes_works(
     document_index: DocumentIndexClient, random_collection: CollectionPath
 ) -> None:
     document_index.progress(random_collection)
+
+
+def test_retrieve_chunks(
+    document_index: DocumentIndexClient,
+    random_collection: CollectionPath,
+    document_index_namespace: str,
+) -> None:
+    index_name = random_identifier()
+    index_path = IndexPath(namespace=document_index_namespace, index=index_name)
+    index_configuration = IndexConfiguration(
+        chunk_size=512,
+        chunk_overlap=0,
+        embedding=SemanticEmbed(
+            representation="asymmetric",
+            model_name="luminous-base",
+        ),
+    )
+    document_index.create_index(index_path, index_configuration)
+    document_index.assign_index_to_collection(random_collection, index_name)
+
+    document_path = DocumentPath(
+        collection_path=random_collection,
+        document_name="document-with-chunks",
+    )
+    document_contents = DocumentContents(
+        contents=[
+            # because chunk size is 512, this item will be split into 2 chunks
+            " token" * 750,
+            "final chunk",
+        ],
+    )
+    document_index.add_document(document_path, document_contents)
+
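+    # Why 3 chunks are expected below: the first content item (750 repetitions
+    # of " token") exceeds chunk_size=512 and is split into two chunks, and
+    # "final chunk" becomes a third. @retry polls because the chunks endpoint
+    # raises ResourceNotFound until the document has finished embedding.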
+    @retry
+    def chunks() -> None:
+        chunks = document_index.chunks(document_path, index_name)
+        assert len(chunks) == 3
+
+    chunks()

From 26c5a24ab2200170917487357810722274647468 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 17 Dec 2024 09:33:47 +0100
Subject: [PATCH 2/2] build(deps): bump pydantic from 2.10.2 to 2.10.3 (#1173)

Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.10.2 to 2.10.3.
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v2.10.2...v2.10.3)

---
updated-dependencies:
- dependency-name: pydantic
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Sebastian Niehus <165138846+SebastianNiehusAA@users.noreply.github.com>
---
 poetry.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index b38d99a2..e841859e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4160,13 +4160,13 @@ files = [
 
 [[package]]
 name = "pydantic"
-version = "2.10.2"
+version = "2.10.3"
 description = "Data validation using Python type hints"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic-2.10.2-py3-none-any.whl", hash = "sha256:cfb96e45951117c3024e6b67b25cdc33a3cb7b2fa62e239f7af1378358a1d99e"},
-    {file = "pydantic-2.10.2.tar.gz", hash = "sha256:2bc2d7f17232e0841cbba4641e65ba1eb6fafb3a08de3a091ff3ce14a197c4fa"},
+    {file = "pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d"},
+    {file = "pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9"},
 ]
 
 [package.dependencies]