Skip to content

Commit

Permalink
[DI-133] feat(document-index): integrate chunk overlap config
Browse files Browse the repository at this point in the history
- you can now specify a chunk overlap when configuring an index
- add validation to ensure 0 <= overlap < chunk_size
- add validation to ensure chunk_size is specified and valid
  • Loading branch information
Michael-JB committed Aug 9, 2024
1 parent ebe4663 commit 6e8c0e5
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

### Features
- Add `StudioClient` as connector to PhariaStudio for submitting traces.
- You can now specify a `chunk_overlap` when creating an index in the Document Index.

### Fixes
...
Expand Down
14 changes: 12 additions & 2 deletions src/intelligence_layer/connectors/document_index/document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from urllib.parse import quote

import requests
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, model_validator
from requests import HTTPError

from intelligence_layer.connectors.base.json_serializable import JsonSerializable
Expand All @@ -29,11 +29,20 @@ class IndexConfiguration(BaseModel):
Args:
embedding_type: "symmetric" or "asymmetric" embedding type.
chunk_overlap: The maximum number of tokens of overlap between consecutive chunks. Must be
non-negative and less than `chunk_size`.
chunk_size: The maximum size of the chunks in tokens to be used for the index. Must be
greater than 0 and at most 2046.
"""

embedding_type: Literal["symmetric", "asymmetric"]
chunk_size: int
chunk_overlap: int = Field(default=0, ge=0)
chunk_size: int = Field(..., gt=0, le=2046)

@model_validator(mode="after")
def validate_chunk_overlap(self) -> "IndexConfiguration":
    """Ensure the configured overlap is strictly smaller than the chunk size.

    This is a cross-field check, so it runs as an "after" model validator:
    it receives the already-constructed model instance and must hand that
    same instance back.

    Returns:
        The validated model instance, unchanged.

    Raises:
        ValueError: If `chunk_overlap` is greater than or equal to `chunk_size`.
    """
    # An "after" validator is an instance method; naming the first parameter
    # `cls` would make pydantic treat it as a classmethod instead.
    if self.chunk_overlap >= self.chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")
    return self


class DocumentContents(BaseModel):
Expand Down Expand Up @@ -427,6 +436,7 @@ def index_configuration(self, index_path: IndexPath) -> IndexConfiguration:
response_json: Mapping[str, Any] = response.json()
return IndexConfiguration(
embedding_type=response_json["embedding_type"],
chunk_overlap=response_json["chunk_overlap"],
chunk_size=response_json["chunk_size"],
)

Expand Down
14 changes: 14 additions & 0 deletions tests/connectors/document_index/test_document_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from http import HTTPStatus

import pytest
from pydantic import ValidationError
from pytest import fixture, raises

from intelligence_layer.connectors.document_index.document_index import (
Expand All @@ -9,6 +10,7 @@
DocumentFilterQueryParams,
DocumentIndexClient,
DocumentPath,
IndexConfiguration,
IndexPath,
ResourceNotFound,
SearchQuery,
Expand Down Expand Up @@ -233,6 +235,17 @@ def test_document_path_is_immutable() -> None:
assert dictionary[path] == 1


def test_index_configuration_rejects_invalid_chunk_overlap() -> None:
    """An overlap equal to the chunk size must be rejected at construction."""
    # `chunk_overlap == chunk_size` is the smallest overlap that violates the
    # strict "less than" requirement, so it pins the boundary of the check.
    with raises(
        ValidationError, match="chunk_overlap must be less than chunk_size"
    ):
        IndexConfiguration(
            chunk_size=128, chunk_overlap=128, embedding_type="asymmetric"
        )


def test_document_indexes_are_returned(
document_index: DocumentIndexClient, collection_path: CollectionPath
) -> None:
Expand All @@ -243,4 +256,5 @@ def test_document_indexes_are_returned(
)

assert index_configuration.embedding_type == "asymmetric"
assert index_configuration.chunk_overlap == 0
assert index_configuration.chunk_size == 512

0 comments on commit 6e8c0e5

Please sign in to comment.