Skip to content

Commit

Permalink
feat(document-index): retrieve chunks of an indexed document
Browse files Browse the repository at this point in the history
  • Loading branch information
TilTheunissenAA committed Dec 2, 2024
1 parent eab6ed1 commit 874d0c2
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add `create_project` bool to `StudioClient.__init__()` to enable users to automatically create their Studio projects
- Add a progress bar to the `Runner` to track the progress of a `Run`
- Add `StudioClient.submit_benchmark_lineages` function and include it in `StudioClient.submit_benchmark_execution`
- Add method `DocumentIndexClient.chunks()` for retrieving all text chunks of a document.

### Fixes
...
Expand Down
19 changes: 19 additions & 0 deletions src/documentation/document_index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,25 @@
"document_index.documents(collection_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once a document is indexed, we can also have a look at its chunks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_index.chunks(\n",
" DocumentPath(collection_path=collection_path, document_name=document_1[\"name\"]),\n",
" index_name=INDEX,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
57 changes: 57 additions & 0 deletions src/intelligence_layer/connectors/document_index/document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,38 @@ def _from_search_response(
)


class DocumentChunk(BaseModel):
    """A chunk of a document.

    Note:
        Currently only supports text-only documents.

    Args:
        document_path: Path to the document that the chunk originates from.
        section: Content of the chunk.
        position: Position of the chunk within the document.
    """

    document_path: DocumentPath
    section: str
    position: DocumentTextPosition

    @classmethod
    def _from_chunk_response(cls, chunk_response: Mapping[str, Any]) -> "DocumentChunk":
        """Build a chunk from one entry of the chunks endpoint response.

        Args:
            chunk_response: Single chunk entry with the keys `document_path`,
                `section`, `start` and `end`.

        Returns:
            The chunk described by `chunk_response`.

        Raises:
            ValueError: If the chunk starts and ends in different items, or if
                its first section entry is not of modality "text".
        """
        start = chunk_response["start"]
        end = chunk_response["end"]

        # Explicit checks instead of `assert`: assertions are removed when Python
        # runs with `-O`, which would let unsupported chunks through silently.
        if start["item"] != end["item"]:
            raise ValueError(
                "Chunk spans multiple items "
                f"(start item {start['item']}, end item {end['item']}); "
                "only chunks within a single item are supported."
            )

        first_section = chunk_response["section"][0]
        if first_section["modality"] != "text":
            raise ValueError(
                f"Unsupported chunk modality {first_section['modality']!r}; "
                "only text chunks are supported."
            )

        return cls(
            document_path=DocumentPath.from_json(chunk_response["document_path"]),
            section=first_section["text"],
            position=DocumentTextPosition(
                item=start["item"],
                start_position=start["position"],
                end_position=end["position"],
            ),
        )


class DocumentIndexError(RuntimeError):
"""Raised in case of any `DocumentIndexClient`-related errors.
Expand Down Expand Up @@ -910,6 +942,31 @@ def search(
self._raise_for_status(response)
return [DocumentSearchResult._from_search_response(r) for r in response.json()]

def chunks(
    self, document_path: DocumentPath, index_name: str
) -> Sequence[DocumentChunk]:
    """Fetch the text chunks of an indexed document.

    If the document is still indexing, a ResourceNotFound error is raised.

    Args:
        document_path: Path to the document.
        index_name: Name of the index to retrieve chunks from.

    Returns:
        List of the document's chunks in the given index. Entries whose section
        is empty or whose first section entry is not text are skipped.
    """
    collection = document_path.collection_path
    url_suffix = (
        f"collections/{collection.namespace}/{collection.collection}"
        f"/docs/{document_path.encoded_document_name()}"
        f"/indexes/{index_name}/chunks"
    )
    response = requests.get(
        urljoin(self._base_document_index_url, url_suffix), headers=self.headers
    )
    self._raise_for_status(response)

    text_chunks = []
    for raw_chunk in response.json():
        section = raw_chunk["section"]
        # Only text chunks can be represented as a DocumentChunk.
        if len(section) == 0 or section[0]["modality"] != "text":
            continue
        text_chunks.append(DocumentChunk._from_chunk_response(raw_chunk))
    return text_chunks

def _raise_for_status(self, response: requests.Response) -> None:
try:
response.raise_for_status()
Expand Down
27 changes: 27 additions & 0 deletions tests/connectors/document_index/test_document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,3 +1077,30 @@ def test_document_indexes_works(
document_index: DocumentIndexClient, random_collection_path: CollectionPath
) -> None:
document_index.progress(random_collection_path)


def test_retrieve_chunks(
    document_index: DocumentIndexClient, random_collection_path: CollectionPath
) -> None:
    """Check that an indexed two-item document is returned as three chunks."""
    index_name = "ci-intelligence-layer"
    document_path = DocumentPath(
        collection_path=random_collection_path,
        document_name="document-with-chunks",
    )
    # With a chunk size of 512, the long first item is split into 2 chunks;
    # the short second item forms the third chunk.
    long_item = " token" * 750
    document_contents = DocumentContents(contents=[long_item, "final chunk"])

    document_index.add_document(document_path, document_contents)
    document_index.assign_index_to_collection(random_collection_path, index_name)

    # Retried because the assertion can only pass once the chunks are retrievable.
    @retry
    def assert_three_chunks() -> None:
        retrieved = document_index.chunks(document_path, index_name)
        assert len(retrieved) == 3

    assert_three_chunks()

0 comments on commit 874d0c2

Please sign in to comment.