From 4e93262422b4c6eaad5b446fedd9d626080be65b Mon Sep 17 00:00:00 2001 From: niklas Date: Thu, 2 Nov 2023 11:51:04 +0100 Subject: [PATCH] Refine DocumentIndexClient --- src/intelligence_layer/connectors/__init__.py | 2 +- .../document_index/document_index.py | 206 ++++++++++++++++-- .../retrievers/document_index_retriever.py | 8 +- tests/conftest.py | 12 +- .../document_index/test_document_index.py | 14 +- 5 files changed, 204 insertions(+), 38 deletions(-) diff --git a/src/intelligence_layer/connectors/__init__.py b/src/intelligence_layer/connectors/__init__.py index 0f199d3dd..8d1fff400 100644 --- a/src/intelligence_layer/connectors/__init__.py +++ b/src/intelligence_layer/connectors/__init__.py @@ -1,4 +1,4 @@ -from .document_index.document_index import DocumentIndex +from .document_index.document_index import DocumentIndexClient from .retrievers.base_retriever import BaseRetriever, Document, SearchResult from .retrievers.document_index_retriever import DocumentIndexRetriever from .retrievers.qdrant_in_memory_retriever import ( diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 5faacb908..846a18cea 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -7,6 +7,15 @@ class DocumentContents(BaseModel): + """Actual content of a document. + + Note: + Currently only supports text-only documents. + + Args: + contents: List of text items. + """ + contents: Sequence[str] @classmethod @@ -33,11 +42,27 @@ def _to_modalities_json(self) -> Sequence[Mapping[str, str]]: class CollectionPath(BaseModel): + """Path to a collection. + + Args: + namespace: Holds collections. + collection: Holds documents. + Unique within a namespace. + """ + namespace: str collection: str class DocumentPath(BaseModel): + """Path to a document. + + Args: + collection_path: Path to a collection. 
+ document_name: Points to a document. + Unique within a collection. + """ + collection_path: CollectionPath document_name: str @@ -53,6 +78,15 @@ def _from_json(cls, document_path_json: Mapping[str, str]) -> "DocumentPath": class DocumentInfo(BaseModel): + """Presents an overview of a document. + + Args: + document_path: Path to a document. + created: When this version of the document was created. + Equivalent to when it was last updated. + version: How many times the document was updated. + """ + document_path: DocumentPath created: datetime version: int @@ -71,12 +105,32 @@ def _from_list_documents_response( class SearchQuery(BaseModel): + """Query to search through a collection with. + + Args: + query: Actual text to be searched with. + max_results: Max number of search results to be retrieved by the query. + Must be greater than or equal to 0. + min_score: Min score needed for a search result to be returned. + Must be between 0 and 1. + """ + query: str max_results: int = Field(..., ge=0) min_score: float = Field(..., ge=0.0, le=1.0) class DocumentSearchResult(BaseModel): + """Result of a search query for one individual section. + + Args: + document_path: Path to the document that the section originates from. + section: Actual section of the document that was found as a match to the query. + score: Actual search score of the section found. + Generally, higher scores correspond to better matches. + Will be between 0 and 1. + """ + document_path: DocumentPath section: str score: float @@ -92,7 +146,20 @@ def _from_search_response( ) -class DocumentIndex: +class DocumentIndexError(Exception): + """Raised in case of any `DocumentIndexClient`-related errors. + + Attributes: + message: The error message as returned by the Document Index. + status_code: The http error code. 
+ """ + + def __init__(self, message: str, status_code: int) -> None: + super().__init__(message) + self.status_code = status_code + + +class DocumentIndexClient: """Client for the Document Index allowing handling documents and search. Document Index is a tool for managing collections of documents, enabling operations such as creation, deletion, listing, and searching. @@ -104,21 +171,25 @@ class DocumentIndex: Example: - >>> document_index = DocumentIndex(os.getenv("AA_TOKEN")) - >>> document_index.create_collection(namespace="my_namespace", collection="germany_facts_collection") + >>> document_index = DocumentIndexClient(os.getenv("AA_TOKEN")) + >>> collection_path = CollectionPath( + >>> namespace="my_namespace", + >>> collection="germany_facts_collection" + >>> ) + >>> document_index.create_collection(collection_path) >>> document_index.add_document( - >>> document_path=CollectionPath( - >>> namespace="my_namespace", - >>> collection="germany_facts_collection", - >>> document_name="Fun facts about Germany", - >>> ) - >>> content=DocumentContents.from_text("Germany is a country located in ...") + >>> document_path=DocumentPath( + >>> collection_path=collection_path, + >>> document_name="Fun facts about Germany" + >>> ), + >>> contents=DocumentContents.from_text("Germany is a country located in ...") >>> ) - >>> documents = document_index.search( - >>> namespace="my_namespace", - >>> collection="germany_facts_collection", - >>> query: "What is the capital of Germany", - >>> max_results=4, - >>> min_score: 0.5 + >>> search_result = document_index.asymmetric_search( + >>> collection_path=collection_path, + >>> search_query=SearchQuery( + >>> query="What is the capital of Germany", + >>> max_results=4, + >>> min_score=0.5 + >>> ) >>> ) """ @@ -134,20 +205,51 @@ def __init__( "Authorization": f"Bearer {token}", } + def _raise_for_status(self, response: requests.Response) -> None: + try: + response.raise_for_status() + except: + raise DocumentIndexError(response.text, response.status_code) + def create_collection(self, 
collection_path: CollectionPath) -> None: + """Creates a collection at the path. + + Note: + Collection's name must be unique within a namespace. + + Args: + collection_path: Path to the collection of interest. + """ + url = f"{self._base_document_index_url}/collections/{collection_path.namespace}/{collection_path.collection}" response = requests.put(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) def delete_collection(self, collection_path: CollectionPath) -> None: + """Deletes the collection at the path. + + Args: + collection_path: Path to the collection of interest. + """ + url = f"{self._base_document_index_url}/collections/{collection_path.namespace}/{collection_path.collection}" response = requests.delete(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) def list_collections(self, namespace: str) -> Sequence[str]: + """Lists all collections within a namespace. + + Args: + namespace: For a collection of documents. + Typically corresponds to an organization. + + Returns: + List of all collections' names. + """ + url = f"{self._base_document_index_url}/collections/{namespace}" response = requests.get(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) collections: Sequence[str] = response.json() return collections @@ -156,29 +258,67 @@ def add_document( document_path: DocumentPath, contents: DocumentContents, ) -> None: + """Add a document to a collection. + + Note: + If a document with the same `document_path` exists, it will be updated with the new `contents`. + + Args: + document_path: Consists of `collection_path` and name of document to be created. + contents: Actual content of the document. + Currently only supports text. 
+ """ + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" data = { "schema_version": "V1", "contents": contents._to_modalities_json(), } response = requests.put(url, data=json.dumps(data), headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) def delete_document(self, document_path: DocumentPath) -> None: + """Delete a document from a collection. + + Args: + document_path: Consists of `collection_path` and name of document to be deleted. + """ + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" response = requests.delete(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) def document(self, document_path: DocumentPath) -> DocumentContents: + """Retrieve a document from a collection. + + Args: + document_path: Consists of `collection_path` and name of document to be retrieved. + + Returns: + Content of the retrieved document. + """ + url = f"{self._base_document_index_url}/collections/{document_path.collection_path.namespace}/{document_path.collection_path.collection}/docs/{document_path.document_name}" response = requests.get(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) return DocumentContents._from_modalities_json(response.json()) def list_documents(self, collection_path: CollectionPath) -> Sequence[DocumentInfo]: + """List all documents within a collection. + + Note: + Does not return each document's content. + + Args: + collection_path: Path to the collection of interest. + + Returns: + Overview of all documents within the collection. 
+ """ + url = f"{self._base_document_index_url}/collections/{collection_path.namespace}/{collection_path.collection}/docs" response = requests.get(url, headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) return [DocumentInfo._from_list_documents_response(r) for r in response.json()] def search( @@ -187,6 +327,18 @@ def search( index: str, search_query: SearchQuery, ) -> Sequence[DocumentSearchResult]: + """Search through a collection with a `search_query`. + + Args: + collection_path: Path to the collection of interest. + index: Name of the search configuration. + Currently only supports "asymmetric". + search_query: The query to search with. + + Returns: + Result of the search operation. Will be empty if nothing was retrieved. + """ + url = f"{self._base_document_index_url}/collections/{collection_path.namespace}/{collection_path.collection}/indexes/{index}/search" data = { "query": [{"modality": "text", "text": search_query.query}], @@ -195,7 +347,7 @@ def search( "filter": [{"with": [{"modality": "text"}]}], } response = requests.post(url, data=json.dumps(data), headers=self.headers) - response.raise_for_status() + self._raise_for_status(response) return [DocumentSearchResult._from_search_response(r) for r in response.json()] def asymmetric_search( @@ -203,4 +355,14 @@ def asymmetric_search( collection_path: CollectionPath, search_query: SearchQuery, ) -> Sequence[DocumentSearchResult]: + """Search through a collection with a `search_query` using the asymmetric search configuration. + + Args: + collection_path: Path to the collection of interest. + search_query: The query to search with. + + Returns: + Result of the search operation. Will be empty if nothing was retrieved. 
+ """ + return self.search(collection_path, "asymmetric", search_query) diff --git a/src/intelligence_layer/connectors/retrievers/document_index_retriever.py b/src/intelligence_layer/connectors/retrievers/document_index_retriever.py index 542222f03..d091383b0 100644 --- a/src/intelligence_layer/connectors/retrievers/document_index_retriever.py +++ b/src/intelligence_layer/connectors/retrievers/document_index_retriever.py @@ -2,7 +2,7 @@ from intelligence_layer.connectors.document_index.document_index import ( CollectionPath, - DocumentIndex, + DocumentIndexClient, SearchQuery, ) from intelligence_layer.connectors.retrievers.base_retriever import ( @@ -13,14 +13,14 @@ class DocumentIndexRetriever(BaseRetriever): - """Search through documents within collections in the `DocumentIndex`. + """Search through documents within collections in the `DocumentIndexClient`. We initialize this Retriever with a collection & namespace names, and we can find the documents in the collection most semanticly similar to our query. Args: document_index: Client offering functionality for search. - namespace: The namespace within the `DocumentIndex` where all collections are stored. + namespace: The namespace within the `DocumentIndexClient` where all collections are stored. collection: The collection within the namespace that holds the desired documents. k: The (top) number of documents to be returned by search. threshold: The mimumum value of cosine similarity between the query vector and the document vector. 
@@ -34,7 +34,7 @@ class DocumentIndexRetriever(BaseRetriever): def __init__( self, - document_index: DocumentIndex, + document_index: DocumentIndexClient, namespace: str, collection: str, k: int, diff --git a/tests/conftest.py b/tests/conftest.py index 64657a67a..06e1c0dd2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,9 @@ from intelligence_layer.connectors.retrievers.document_index_retriever import ( DocumentIndexRetriever, ) -from intelligence_layer.connectors.document_index.document_index import DocumentIndex +from intelligence_layer.connectors.document_index.document_index import ( + DocumentIndexClient, +) from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( QdrantInMemoryRetriever, RetrieverType, @@ -69,12 +71,14 @@ def symmetric_in_memory_retriever( @fixture -def document_index(token: str) -> DocumentIndex: - return DocumentIndex(token) +def document_index(token: str) -> DocumentIndexClient: + return DocumentIndexClient(token) @fixture -def document_index_retriever(document_index: DocumentIndex) -> DocumentIndexRetriever: +def document_index_retriever( + document_index: DocumentIndexClient, +) -> DocumentIndexRetriever: return DocumentIndexRetriever( document_index, namespace="aleph-alpha", collection="wikipedia-de", k=2 ) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index d853c52a3..f81a2a4fd 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -3,7 +3,7 @@ from intelligence_layer.connectors.document_index.document_index import ( CollectionPath, DocumentContents, - DocumentIndex, + DocumentIndexClient, DocumentPath, SearchQuery, ) @@ -16,7 +16,7 @@ def collection_path() -> CollectionPath: @fixture def document_path( - document_index: DocumentIndex, collection_path: CollectionPath + document_index: DocumentIndexClient, collection_path: CollectionPath ) 
-> DocumentPath: document_index.create_collection(collection_path) return DocumentPath( @@ -46,7 +46,7 @@ def document_contents() -> DocumentContents: @pytest.mark.internal def test_document_index_creates_collection( - document_index: DocumentIndex, collection_path: CollectionPath + document_index: DocumentIndexClient, collection_path: CollectionPath ) -> None: document_index.create_collection(collection_path) collections = document_index.list_collections(collection_path.namespace) @@ -56,7 +56,7 @@ def test_document_index_creates_collection( @pytest.mark.internal def test_document_index_adds_document( - document_index: DocumentIndex, + document_index: DocumentIndexClient, document_path: DocumentPath, document_contents: DocumentContents, ) -> None: @@ -68,7 +68,7 @@ def test_document_index_adds_document( @pytest.mark.internal def test_document_index_searches_asymmetrically( - document_index: DocumentIndex, collection_path: CollectionPath + document_index: DocumentIndexClient, collection_path: CollectionPath ) -> None: document_path = DocumentPath( collection_path=collection_path, @@ -84,7 +84,7 @@ def test_document_index_searches_asymmetrically( @pytest.mark.internal def test_document_index_gets_document( - document_index: DocumentIndex, document_path: DocumentPath + document_index: DocumentIndexClient, document_path: DocumentPath ) -> None: document = document_index.document(document_path) @@ -93,7 +93,7 @@ def test_document_index_gets_document( @pytest.mark.internal def test_document_index_deletes_document( - document_index: DocumentIndex, collection_path: CollectionPath + document_index: DocumentIndexClient, collection_path: CollectionPath ) -> None: document_path = DocumentPath( collection_path=collection_path, document_name="Document to be deleted"