feat(document-index): retrieve chunks of an indexed document

Aleph-Alpha · Dec 2, 2024 · 874d0c2 · 874d0c2
1 parent eab6ed1
commit 874d0c2
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
 - Add progressbar to the `Runner` to be able to track the `Run`
 - Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
+- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.
 
 ### Fixes
 ...

diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb
@@ -264,6 +264,25 @@
     "document_index.documents(collection_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once a document is indexed, we can also have a look at its chunks:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_index.chunks(\n",
+    "    DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
+    "    index_name=INDEX,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py
@@ -367,6 +367,38 @@ def _from_search_response(
         )
 
 
+class DocumentChunk(BaseModel):
+    """A chunk of a document.
+
+    Note:
+        Currently only supports text-only documents.
+
+    Args:
+        document_path: Path to the document that the chunk originates from.
+        section: Content of the chunk.
+        position: Position of the chunk within the document.
+    """
+
+    document_path: DocumentPath
+    section: str
+    position: DocumentTextPosition
+
+    @classmethod
+    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
+        assert chunk_response["start"]["item"] == chunk_response["end"]["item"]
+        assert chunk_response["section"][0]["modality"] == "text"
+
+        return cls(
+            document_path=DocumentPath.from_json(chunk_response["document_path"]),
+            section=chunk_response["section"][0]["text"],
+            position=DocumentTextPosition(
+                item=chunk_response["start"]["item"],
+                start_position=chunk_response["start"]["position"],
+                end_position=chunk_response["end"]["position"],
+            ),
+        )
+
+
 class DocumentIndexError(RuntimeError):
     """Raised in case of any `DocumentIndexClient`-related errors.
 
@@ -910,6 +942,31 @@ def search(
         self._raise_for_status(response)
         return [DocumentSearchResult._from_search_response(r) for r in response.json()]
 
+    def chunks(
+        self, document_path: DocumentPath, index_name: str
+    ) -> Sequence[DocumentChunk]:
+        """Retrieve all chunks of an indexed document.
+
+        If the document is still indexing, a ResourceNotFound error is raised.
+
+        Args:
+            document_path: Path to the document.
+            index_name: Name of the index to retrieve chunks from.
+
+        Returns:
+            List of all chunks of the indexed document.
+        """
+        url_suffix = f"collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.encoded_document_name()}/indexes/{index_name}/chunks"
+        url = urljoin(self._base_document_index_url, url_suffix)
+
+        response = requests.get(url, headers=self.headers)
+        self._raise_for_status(response)
+        return [
+            DocumentChunk._from_chunk_response(r)
+            for r in response.json()
+            if len(r["section"]) > 0 and r["section"][0]["modality"] == "text"
+        ]
+
     def _raise_for_status(self, response: requests.Response) -> None:
         try:
             response.raise_for_status()

diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py
@@ -1077,3 +1077,30 @@ def test_document_indexes_works(
     document_index: DocumentIndexClient, random_collection_path: CollectionPath
 ) -> None:
     document_index.progress(random_collection_path)
+
+
+def test_retrieve_chunks(
+    document_index: DocumentIndexClient, random_collection_path: CollectionPath
+) -> None:
+    document_path = DocumentPath(
+        collection_path=random_collection_path,
+        document_name="document-with-chunks",
+    )
+    document_contents = DocumentContents(
+        contents=[
+            # because chunk size is 512, this item will be split into 2 chunks
+            " token" * 750,
+            "final chunk",
+        ],
+    )
+    document_index.add_document(document_path, document_contents)
+
+    index_name = "ci-intelligence-layer"
+    document_index.assign_index_to_collection(random_collection_path, index_name)
+
+    @retry
+    def chunks() -> None:
+        chunks = document_index.chunks(document_path, index_name)
+        assert len(chunks) == 3
+
+    chunks()