From 94be863b5ae8c845fe5a4a6146a559e408eee7a7 Mon Sep 17 00:00:00 2001 From: Javier <160494147+javier-cohere@users.noreply.github.com> Date: Tue, 22 Oct 2024 20:03:55 +0200 Subject: [PATCH] Adapt to new compass parser changes (#20) - `CompassDocumentChunk.origin` is no longer a pydantic model, but a `Dict[str, Any]` so that we can add arbitrary non-numeric fields such as `"tab_name": "some name"` for spreadsheets. - Remove `is_dataset` as a parameter. --------- Signed-off-by: javier-cohere Co-authored-by: javier-cohere --- compass_sdk/__init__.py | 23 ++++++++++------------- compass_sdk/compass.py | 1 + compass_sdk/parser.py | 12 ------------ compass_sdk/utils.py | 2 +- pyproject.toml | 2 +- 5 files changed, 13 insertions(+), 27 deletions(-) diff --git a/compass_sdk/__init__.py b/compass_sdk/__init__.py index 5975f64..d9671d3 100644 --- a/compass_sdk/__init__.py +++ b/compass_sdk/__init__.py @@ -80,6 +80,7 @@ class CompassDocumentMetadata(ValidatedModel): doc_id: str = "" filename: str = "" meta: List = [] + parent_doc_id: str = "" class CompassDocumentStatus(str, Enum): @@ -104,16 +105,16 @@ class CompassSdkStage(str, Enum): Indexing = "indexing" -class CompassDocumentChunkOrigin(BaseModel): - page_number: Optional[int] = None - - class CompassDocumentChunk(BaseModel): chunk_id: str sort_id: str doc_id: str + parent_doc_id: str content: Dict[str, Any] - origin: Optional[CompassDocumentChunkOrigin] = None + origin: Optional[Dict[str, Any]] = None + + def parent_doc_is_split(self): + return self.doc_id != self.parent_doc_id class CompassDocument(ValidatedModel): @@ -130,12 +131,12 @@ class CompassDocument(ValidatedModel): filebytes: bytes = b"" metadata: CompassDocumentMetadata = CompassDocumentMetadata() content: Dict[str, str] = {} + content_type: Optional[str] = None elements: List[Any] = [] chunks: List[CompassDocumentChunk] = [] index_fields: List[str] = [] errors: List[Dict[CompassSdkStage, str]] = [] ignore_metadata_errors: bool = True - is_dataset: bool = False markdown: Optional[str] = None def has_data(self) -> bool: @@ -201,8 +202,6 @@ def _missing_(cls, value): class MetadataConfig(ValidatedModel): """ Configuration class for metadata detection. - :param pre_build_detectors: whether to pre-build all metadata detectors. If set to False (default), - detectors will be built on the fly when needed :param metadata_strategy: the metadata detection strategy to use. One of: - No_Metadata: no metadata is inferred - Heuristics: metadata is inferred using heuristics @@ -219,7 +218,6 @@ class MetadataConfig(ValidatedModel): """ - pre_build_detectors: bool = False metadata_strategy: MetadataStrategy = MetadataStrategy.No_Metadata cohere_api_key: Optional[str] = getenv(COHERE_API_ENV_VAR, None) commandr_model_name: str = "command-r" @@ -286,7 +284,6 @@ class ParserConfig(ValidatedModel): allowed_image_types: Optional[List[str]] = None min_chars_per_element: int = DEFAULT_MIN_CHARS_PER_ELEMENT skip_infer_table_types: List[str] = SKIP_INFER_TABLE_TYPES - detect_datasets: bool = True parsing_strategy: ParsingStrategy = ParsingStrategy.Fast parsing_model: ParsingModel = ParsingModel.Marker @@ -309,7 +306,8 @@ class Chunk(BaseModel): chunk_id: str sort_id: int content: Dict[str, Any] - origin: Optional[CompassDocumentChunkOrigin] = None + origin: Optional[Dict[str, Any]] = None + parent_doc_id: str class Document(BaseModel): @@ -319,6 +317,7 @@ class Document(BaseModel): doc_id: str path: str + parent_doc_id: str content: Dict[str, Any] chunks: List[Chunk] index_fields: List[str] = [] @@ -383,14 +382,12 @@ class ProcessFileParameters(ValidatedModel): parser_config: ParserConfig metadata_config: MetadataConfig doc_id: Optional[str] = None - is_dataset: Optional[bool] = None class ProcessFilesParameters(ValidatedModel): doc_ids: Optional[List[str]] = None parser_config: ParserConfig metadata_config: MetadataConfig - are_datasets: Optional[bool] = None class BatchProcessFilesParameters(ProcessFilesParameters): diff --git a/compass_sdk/compass.py b/compass_sdk/compass.py index 91e89e0..e8efe05 100644 --- a/compass_sdk/compass.py +++ b/compass_sdk/compass.py @@ -442,6 +442,7 @@ def _get_request_blocks( doc, Document( doc_id=doc.metadata.doc_id, + parent_doc_id=doc.metadata.parent_doc_id, path=doc.metadata.filename, content=doc.content, chunks=[Chunk(**c.model_dump()) for c in doc.chunks], diff --git a/compass_sdk/parser.py b/compass_sdk/parser.py index e020305..b9e25d8 100644 --- a/compass_sdk/parser.py +++ b/compass_sdk/parser.py @@ -110,7 +110,6 @@ def process_files( file_ids: Optional[List[str]] = None, parser_config: Optional[ParserConfig] = None, metadata_config: Optional[MetadataConfig] = None, - are_datasets: Optional[List[bool]] = None, custom_context: Optional[Fn_or_Dict] = None, ) -> Iterable[CompassDocument]: """ @@ -129,7 +128,6 @@ def process_files( :param file_ids: List of ids for the files :param parser_config: ParserConfig object (applies the same config to all docs) :param metadata_config: MetadataConfig object (applies the same config to all docs) - :param are_datasets: List of booleans indicating whether each file is a dataset :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable. Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary. @@ -143,7 +141,6 @@ def process_file(i: int) -> List[CompassDocument]: file_id=file_ids[i] if file_ids else None, parser_config=parser_config, metadata_config=metadata_config, - is_dataset=are_datasets[i] if are_datasets else None, custom_context=custom_context, ) @@ -171,7 +168,6 @@ def process_file( file_id: Optional[str] = None, parser_config: Optional[ParserConfig] = None, metadata_config: Optional[MetadataConfig] = None, - is_dataset: Optional[bool] = None, custom_context: Optional[Fn_or_Dict] = None, ) -> List[CompassDocument]: """ @@ -184,10 +180,6 @@ def process_file( :param file_id: Id for the file :param parser_config: ParserConfig object with the config to use for parsing the file :param metadata_config: MetadataConfig object with the config to use for extracting metadata for each document - :param is_dataset: Boolean indicating whether the file is a dataset. If True, the file will be processed - as a dataset and multiple CompassDocument objects might be returned (one per dataset record). Otherwise, - the file will be processed as a single document (e.g., a PDF file). Default is None, which means that - the server will try to infer whether the file is a dataset or not. :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable. Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary. @@ -211,7 +203,6 @@ def process_file( parser_config=parser_config, metadata_config=metadata_config, doc_id=file_id, - is_dataset=is_dataset, ) auth = (self.username, self.password) if self.username and self.password else None res = self.session.post( @@ -249,7 +240,6 @@ def batch_upload(self, *, zip_file_path: str) -> str: zip_data = zip_file.read() res = self.session.post( url=f"{self.parser_url}/v1/batch/upload", - data={"data": {"is_dataset": False}}, files={"file": ("data.zip", zip_data)}, auth=auth, ) @@ -290,7 +280,6 @@ def batch_run( file_name_to_doc_ids: Optional[Dict[str, str]] = None, parser_config: Optional[ParserConfig] = None, metadata_config: Optional[MetadataConfig] = None, - are_datasets: Optional[bool] = None, ) -> List[CompassDocument]: parser_config = parser_config or self.parser_config @@ -301,7 +290,6 @@ def batch_run( file_name_to_doc_ids=file_name_to_doc_ids, parser_config=parser_config, metadata_config=metadata_config, - are_datasets=are_datasets, ) auth = (self.username, self.password) if self.username and self.password else None res = self.session.post( diff --git a/compass_sdk/utils.py b/compass_sdk/utils.py index 3d5e6ce..ab2e150 100644 --- a/compass_sdk/utils.py +++ b/compass_sdk/utils.py @@ -56,7 +56,7 @@ def open_document(document_path) -> CompassDocument: fs = get_fs(document_path) with fs.open(document_path, "rb") as f: val = f.read() - if isinstance(val, bytes): + if val is not None and isinstance(val, bytes): doc.filebytes = val else: raise Exception(f"Expected bytes, got {type(val)}") diff --git a/pyproject.toml b/pyproject.toml index 1f7438a..0372e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "compass-sdk" -version = "0.1.1" +version = "0.2.0" authors = [] description = "Compass SDK"