Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge from private compass-sdk #11

Merged
merged 1 commit into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,40 @@ To insert parsed documents into a `Compass` index, the Compass SDK provides a `C
allows you to interact with a Compass API server. The Compass API is also a RESTful API that allows you to create,
delete, and search documents in a Compass index. To install a Compass API service, please refer to the
[Compass documentation](https://github.com/cohere-ai/compass)

## Quickstart Snippet

Fill in your Compass URL, username, password, and the path to your test data below for an end-to-end run of parsing, indexing, and searching.

```python
from compass_sdk import MetadataConfig, MetadataStrategy
from compass_sdk.compass import CompassClient
from compass_sdk.parser import CompassParserClient

# Connection settings: fill in the details of your Compass deployment.
url = "<COMPASS_URL>"
username = "<COMPASS_USERNAME>"
password = "<COMPASS_PASSWORD>"

index = "test-index"
data_to_index = "<PATH_TO_TEST_DATA>"  # folder containing the files to parse

# Parse the files before indexing. The credentials are passed explicitly;
# otherwise the username/password defined above would never be used.
parser_url = url + "/parse"
parsing_client = CompassParserClient(parser_url=parser_url, username=username, password=password)
metadata_config = MetadataConfig(
    metadata_strategy=MetadataStrategy.Command_R,
    commandr_extractable_attributes=["date", "link", "page_title", "authors"],
)

docs_to_index = parsing_client.process_folder(folder_path=data_to_index, metadata_config=metadata_config)

# Create the index and insert the parsed documents
compass_client = CompassClient(index_url=url, username=username, password=password)
compass_client.create_index(index_name=index)
compass_client.insert_docs(index_name=index, docs=docs_to_index)

# Search the index and preview the last returned hit (the only one, since top_k=1)
results = compass_client.search(index_name=index, query="test", top_k=1)
print(f"Results preview: \n {results.result['hits'][-1]} ... \n \n ")
```
34 changes: 27 additions & 7 deletions compass_sdk/compass.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def __init__(self, message="The maximum error rate was exceeded. Stopping the in
class CompassClient:
def __init__(
self,
index_url: str = "http://localhost:80",
*,
index_url: str,
username: Optional[str] = None,
password: Optional[str] = None,
logger_level: LoggerLevel = LoggerLevel.INFO,
Expand All @@ -92,6 +93,7 @@ def __init__(
"put_documents_batch": self.session.post,
"search_documents": self.session.post,
"add_context": self.session.post,
"refresh": self.session.post,
}
self.function_endpoint = {
"create_index": "/api/v1/indexes/{index_name}",
Expand All @@ -103,10 +105,11 @@ def __init__(
"put_documents_batch": "/api/v1/batch/indexes/{index_name}",
"search_documents": "/api/v1/indexes/{index_name}/documents/search",
"add_context": "/api/v1/indexes/{index_name}/documents/add_context/{doc_id}",
"refresh": "/api/v1/indexes/{index_name}/refresh",
}
logger.setLevel(logger_level.value)

def create_index(self, index_name: str):
def create_index(self, *, index_name: str):
"""
Create an index in Compass
:param index_name: the name of the index
Expand All @@ -119,7 +122,20 @@ def create_index(self, index_name: str):
sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS,
)

def delete_index(self, index_name: str):
def refresh(self, *, index_name: str):
    """
    Send a "refresh" request for an index to the Compass API.

    The request is issued with the module's default retry count and
    default sleep interval between retries.

    :param index_name: the name of the index to refresh (keyword-only)
    :return: the response from the Compass API
    """
    request_kwargs = {
        "function": "refresh",
        "index_name": index_name,
        "max_retries": DEFAULT_MAX_RETRIES,
        "sleep_retry_seconds": DEFAULT_SLEEP_RETRY_SECONDS,
    }
    return self._send_request(**request_kwargs)

def delete_index(self, *, index_name: str):
"""
Delete an index from Compass
:param index_name: the name of the index
Expand All @@ -132,7 +148,7 @@ def delete_index(self, index_name: str):
sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS,
)

def delete_document(self, index_name: str, doc_id: str):
def delete_document(self, *, index_name: str, doc_id: str):
"""
Delete a document from Compass
:param index_name: the name of the index
Expand All @@ -147,7 +163,7 @@ def delete_document(self, index_name: str, doc_id: str):
sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS,
)

def get_document(self, index_name: str, doc_id: str):
def get_document(self, *, index_name: str, doc_id: str):
"""
Get a document from Compass
:param index_name: the name of the index
Expand Down Expand Up @@ -176,6 +192,7 @@ def list_indexes(self):

def add_context(
self,
*,
index_name: str,
doc_id: str,
context: Dict,
Expand Down Expand Up @@ -203,6 +220,7 @@ def add_context(

def insert_doc(
self,
*,
index_name: str,
doc: CompassDocument,
max_retries: int = DEFAULT_MAX_RETRIES,
Expand All @@ -219,7 +237,7 @@ def insert_doc(
index_name=index_name, docs=iter([doc]), max_retries=max_retries, sleep_retry_seconds=sleep_retry_seconds
)

def insert_docs_batch(self, uuid: str, index_name: str):
def insert_docs_batch(self, *, uuid: str, index_name: str):
"""
Insert a batch of parsed documents into an index in Compass
:param uuid: the uuid of the batch
Expand All @@ -233,7 +251,7 @@ def insert_docs_batch(self, uuid: str, index_name: str):
sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS,
)

def batch_status(self, uuid: str):
def batch_status(self, *, uuid: str):
"""
Get the status of a batch
:param uuid: the uuid of the batch
Expand All @@ -251,6 +269,7 @@ def batch_status(self, uuid: str):

def insert_docs(
self,
*,
index_name: str,
docs: Iterator[CompassDocument],
max_chunks_per_request: int = DEFAULT_MAX_CHUNKS_PER_REQUEST,
Expand Down Expand Up @@ -374,6 +393,7 @@ def _get_request_blocks(

def search(
self,
*,
index_name: str,
query: str,
top_k: int = 10,
Expand Down
16 changes: 10 additions & 6 deletions compass_sdk/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class CompassParserClient:

def __init__(
self,
parser_url: str = "http://localhost:8080",
*,
parser_url: str,
parser_config: ParserConfig = ParserConfig(),
metadata_config: MetadataConfig = MetadataConfig(),
username: Optional[str] = None,
Expand All @@ -46,10 +47,9 @@ def __init__(
):
"""
Initializes the CompassParserClient with the specified parser_url, parser_config, and metadata_config.
The default parser_url is "http://localhost:8080". The parser_config and metadata_config are optional,
and if not provided, the default configurations will be used. If the parser/metadata configs are provided,
they will be used for all subsequent files processed by the client unless specific configs are passed
when calling the process_file or process_files methods.
The parser_config and metadata_config are optional, and if not provided, the default configurations will be used.
If the parser/metadata configs are provided, they will be used for all subsequent files processed by the client
unless specific configs are passed when calling the process_file or process_files methods.

:param parser_url: the URL of the CompassParser API
:param parser_config: the parser configuration to use when processing files if no parser configuration
Expand All @@ -70,6 +70,7 @@ def __init__(

def process_folder(
self,
*,
folder_path: str,
allowed_extensions: Optional[List[str]] = None,
recursive: bool = False,
Expand Down Expand Up @@ -104,6 +105,7 @@ def process_folder(

def process_files(
self,
*,
filenames: List[str],
file_ids: Optional[List[str]] = None,
parser_config: Optional[ParserConfig] = None,
Expand Down Expand Up @@ -164,6 +166,7 @@ def _get_metadata(doc: CompassDocument, custom_context: Optional[Fn_or_Dict] = N

def process_file(
self,
*,
filename: str,
file_id: Optional[str] = None,
parser_config: Optional[ParserConfig] = None,
Expand Down Expand Up @@ -229,7 +232,7 @@ def process_file(

return docs

def batch_upload(self, zip_file_path: str) -> str:
def batch_upload(self, *, zip_file_path: str) -> str:
"""
Uploads a zip file to the server for offline processing. The zip file should contain the files to process.
The zip file is sent to the server, and the server will process each file in the zip file using the default
Expand Down Expand Up @@ -282,6 +285,7 @@ def batch_status(self, uuid: str) -> str:

def batch_run(
self,
*,
uuid: str,
file_name_to_doc_ids: Optional[Dict[str, str]] = None,
parser_config: Optional[ParserConfig] = None,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description = "Compass SDK"

[tool.poetry.dependencies]
fsspec = "2024.2.0"
joblib = "*"
joblib = "1.4.2"
pydantic = ">=2.6.3"
python = ">=3.9,<3.12"
requests = ">=2.25.0,<3.0.0"
Expand Down
21 changes: 14 additions & 7 deletions tests/test_compass_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,58 @@


def test_delete_url_formatted_with_doc_and_index(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.delete_document(index_name="test_index", doc_id="test_id")
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes/test_index/documents/test_id"
assert requests_mock.request_history[0].method == "DELETE"


def test_create_index_formatted_with_index(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.create_index(index_name="test_index")
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes/test_index"
assert requests_mock.request_history[0].method == "PUT"


def test_put_documents_payload_and_url_exist(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.insert_docs(index_name="test_index", docs=iter([CompassDocument()]))
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes/test_index/documents"
assert requests_mock.request_history[0].method == "PUT"
assert "docs" in requests_mock.request_history[0].json()


def test_put_document_payload_and_url_exist(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.insert_doc(index_name="test_index", doc=CompassDocument())
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes/test_index/documents"
assert requests_mock.request_history[0].method == "PUT"
assert "docs" in requests_mock.request_history[0].json()


def test_list_indices_is_valid(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.list_indexes()
assert requests_mock.request_history[0].method == "GET"
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes"


def test_get_documents_is_valid(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.get_document(index_name="test_index", doc_id="test_id")
assert requests_mock.request_history[0].method == "GET"
assert requests_mock.request_history[0].url == "http://test.com/api/v1/indexes/test_index/documents/test_id"


def test_refresh_is_valid(requests_mock):
    """Refreshing an index must issue a POST to that index's refresh endpoint."""
    client = CompassClient(index_url="http://test.com")
    client.refresh(index_name="test_index")

    # Inspect the first request recorded by the mock.
    first_request = requests_mock.request_history[0]
    assert first_request.method == "POST"
    assert first_request.url == "http://test.com/api/v1/indexes/test_index/refresh"


def test_add_context_is_valid(requests_mock):
compass = CompassClient("http://test.com")
compass = CompassClient(index_url="http://test.com")
compass.add_context(index_name="test_index", doc_id="test_id", context={"fake": "context"})
assert requests_mock.request_history[0].method == "POST"
assert (
Expand Down
Loading
Loading