From 94be863b5ae8c845fe5a4a6146a559e408eee7a7 Mon Sep 17 00:00:00 2001
From: Javier <160494147+javier-cohere@users.noreply.github.com>
Date: Tue, 22 Oct 2024 20:03:55 +0200
Subject: [PATCH] Adapt to new compass parser changes (#20)

- `CompassDocumentChunk.origin` and `Chunk.origin` are no longer a pydantic
model (`CompassDocumentChunkOrigin` is removed) but a `Dict[str, Any]`, so
that we can add arbitrary fields, including non-numeric ones such as
`"tab_name": "some name"` for spreadsheets.
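- Remove the `is_dataset` / `are_datasets` parameters, together with the
`CompassDocument.is_dataset` field and the `ParserConfig.detect_datasets`
option.
- Remove `MetadataConfig.pre_build_detectors`.
- Add `parent_doc_id` to `CompassDocumentMetadata`, `CompassDocumentChunk`,
`Chunk` and `Document`, and add `CompassDocument.content_type`.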

---------

Signed-off-by: javier-cohere <javi@Javi-Morales.local>
Co-authored-by: javier-cohere <javi@Javi-Morales.local>
---
 compass_sdk/__init__.py | 23 ++++++++++-------------
 compass_sdk/compass.py  |  1 +
 compass_sdk/parser.py   | 12 ------------
 compass_sdk/utils.py    |  2 +-
 pyproject.toml          |  2 +-
 5 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/compass_sdk/__init__.py b/compass_sdk/__init__.py
index 5975f64..d9671d3 100644
--- a/compass_sdk/__init__.py
+++ b/compass_sdk/__init__.py
@@ -80,6 +80,7 @@ class CompassDocumentMetadata(ValidatedModel):
     doc_id: str = ""
     filename: str = ""
     meta: List = []
+    parent_doc_id: str = ""
 
 
 class CompassDocumentStatus(str, Enum):
@@ -104,16 +105,16 @@ class CompassSdkStage(str, Enum):
     Indexing = "indexing"
 
 
-class CompassDocumentChunkOrigin(BaseModel):
-    page_number: Optional[int] = None
-
-
 class CompassDocumentChunk(BaseModel):
     chunk_id: str
     sort_id: str
     doc_id: str
+    parent_doc_id: str
     content: Dict[str, Any]
-    origin: Optional[CompassDocumentChunkOrigin] = None
+    origin: Optional[Dict[str, Any]] = None
+
+    def parent_doc_is_split(self):
+        return self.doc_id != self.parent_doc_id
 
 
 class CompassDocument(ValidatedModel):
@@ -130,12 +131,12 @@ class CompassDocument(ValidatedModel):
     filebytes: bytes = b""
     metadata: CompassDocumentMetadata = CompassDocumentMetadata()
     content: Dict[str, str] = {}
+    content_type: Optional[str] = None
     elements: List[Any] = []
     chunks: List[CompassDocumentChunk] = []
     index_fields: List[str] = []
     errors: List[Dict[CompassSdkStage, str]] = []
     ignore_metadata_errors: bool = True
-    is_dataset: bool = False
     markdown: Optional[str] = None
 
     def has_data(self) -> bool:
@@ -201,8 +202,6 @@ def _missing_(cls, value):
 class MetadataConfig(ValidatedModel):
     """
     Configuration class for metadata detection.
-    :param pre_build_detectors: whether to pre-build all metadata detectors. If set to False (default),
-        detectors will be built on the fly when needed
     :param metadata_strategy: the metadata detection strategy to use. One of:
         - No_Metadata: no metadata is inferred
         - Heuristics: metadata is inferred using heuristics
@@ -219,7 +218,6 @@ class MetadataConfig(ValidatedModel):
 
     """
 
-    pre_build_detectors: bool = False
     metadata_strategy: MetadataStrategy = MetadataStrategy.No_Metadata
     cohere_api_key: Optional[str] = getenv(COHERE_API_ENV_VAR, None)
     commandr_model_name: str = "command-r"
@@ -286,7 +284,6 @@ class ParserConfig(ValidatedModel):
     allowed_image_types: Optional[List[str]] = None
     min_chars_per_element: int = DEFAULT_MIN_CHARS_PER_ELEMENT
     skip_infer_table_types: List[str] = SKIP_INFER_TABLE_TYPES
-    detect_datasets: bool = True
     parsing_strategy: ParsingStrategy = ParsingStrategy.Fast
     parsing_model: ParsingModel = ParsingModel.Marker
 
@@ -309,7 +306,8 @@ class Chunk(BaseModel):
     chunk_id: str
     sort_id: int
     content: Dict[str, Any]
-    origin: Optional[CompassDocumentChunkOrigin] = None
+    origin: Optional[Dict[str, Any]] = None
+    parent_doc_id: str
 
 
 class Document(BaseModel):
@@ -319,6 +317,7 @@ class Document(BaseModel):
 
     doc_id: str
     path: str
+    parent_doc_id: str
     content: Dict[str, Any]
     chunks: List[Chunk]
     index_fields: List[str] = []
@@ -383,14 +382,12 @@ class ProcessFileParameters(ValidatedModel):
     parser_config: ParserConfig
     metadata_config: MetadataConfig
     doc_id: Optional[str] = None
-    is_dataset: Optional[bool] = None
 
 
 class ProcessFilesParameters(ValidatedModel):
     doc_ids: Optional[List[str]] = None
     parser_config: ParserConfig
     metadata_config: MetadataConfig
-    are_datasets: Optional[bool] = None
 
 
 class BatchProcessFilesParameters(ProcessFilesParameters):
diff --git a/compass_sdk/compass.py b/compass_sdk/compass.py
index 91e89e0..e8efe05 100644
--- a/compass_sdk/compass.py
+++ b/compass_sdk/compass.py
@@ -442,6 +442,7 @@ def _get_request_blocks(
                         doc,
                         Document(
                             doc_id=doc.metadata.doc_id,
+                            parent_doc_id=doc.metadata.parent_doc_id,
                             path=doc.metadata.filename,
                             content=doc.content,
                             chunks=[Chunk(**c.model_dump()) for c in doc.chunks],
diff --git a/compass_sdk/parser.py b/compass_sdk/parser.py
index e020305..b9e25d8 100644
--- a/compass_sdk/parser.py
+++ b/compass_sdk/parser.py
@@ -110,7 +110,6 @@ def process_files(
         file_ids: Optional[List[str]] = None,
         parser_config: Optional[ParserConfig] = None,
         metadata_config: Optional[MetadataConfig] = None,
-        are_datasets: Optional[List[bool]] = None,
         custom_context: Optional[Fn_or_Dict] = None,
     ) -> Iterable[CompassDocument]:
         """
@@ -129,7 +128,6 @@ def process_files(
         :param file_ids: List of ids for the files
         :param parser_config: ParserConfig object (applies the same config to all docs)
         :param metadata_config: MetadataConfig object (applies the same config to all docs)
-        :param are_datasets: List of booleans indicating whether each file is a dataset
         :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable.
             Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary.
 
@@ -143,7 +141,6 @@ def process_file(i: int) -> List[CompassDocument]:
                 file_id=file_ids[i] if file_ids else None,
                 parser_config=parser_config,
                 metadata_config=metadata_config,
-                is_dataset=are_datasets[i] if are_datasets else None,
                 custom_context=custom_context,
             )
 
@@ -171,7 +168,6 @@ def process_file(
         file_id: Optional[str] = None,
         parser_config: Optional[ParserConfig] = None,
         metadata_config: Optional[MetadataConfig] = None,
-        is_dataset: Optional[bool] = None,
         custom_context: Optional[Fn_or_Dict] = None,
     ) -> List[CompassDocument]:
         """
@@ -184,10 +180,6 @@ def process_file(
         :param file_id: Id for the file
         :param parser_config: ParserConfig object with the config to use for parsing the file
         :param metadata_config: MetadataConfig object with the config to use for extracting metadata for each document
-        :param is_dataset: Boolean indicating whether the file is a dataset. If True, the file will be processed
-            as a dataset and multiple CompassDocument objects might be returned (one per dataset record). Otherwise,
-            the file will be processed as a single document (e.g., a PDF file). Default is None, which means that
-            the server will try to infer whether the file is a dataset or not.
         :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable.
             Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary.
 
@@ -211,7 +203,6 @@ def process_file(
             parser_config=parser_config,
             metadata_config=metadata_config,
             doc_id=file_id,
-            is_dataset=is_dataset,
         )
         auth = (self.username, self.password) if self.username and self.password else None
         res = self.session.post(
@@ -249,7 +240,6 @@ def batch_upload(self, *, zip_file_path: str) -> str:
             zip_data = zip_file.read()
             res = self.session.post(
                 url=f"{self.parser_url}/v1/batch/upload",
-                data={"data": {"is_dataset": False}},
                 files={"file": ("data.zip", zip_data)},
                 auth=auth,
             )
@@ -290,7 +280,6 @@ def batch_run(
         file_name_to_doc_ids: Optional[Dict[str, str]] = None,
         parser_config: Optional[ParserConfig] = None,
         metadata_config: Optional[MetadataConfig] = None,
-        are_datasets: Optional[bool] = None,
     ) -> List[CompassDocument]:
 
         parser_config = parser_config or self.parser_config
@@ -301,7 +290,6 @@ def batch_run(
             file_name_to_doc_ids=file_name_to_doc_ids,
             parser_config=parser_config,
             metadata_config=metadata_config,
-            are_datasets=are_datasets,
         )
         auth = (self.username, self.password) if self.username and self.password else None
         res = self.session.post(
diff --git a/compass_sdk/utils.py b/compass_sdk/utils.py
index 3d5e6ce..ab2e150 100644
--- a/compass_sdk/utils.py
+++ b/compass_sdk/utils.py
@@ -56,7 +56,7 @@ def open_document(document_path) -> CompassDocument:
         fs = get_fs(document_path)
         with fs.open(document_path, "rb") as f:
             val = f.read()
-            if isinstance(val, bytes):
+            if val is not None and isinstance(val, bytes):
                 doc.filebytes = val
             else:
                 raise Exception(f"Expected bytes, got {type(val)}")
diff --git a/pyproject.toml b/pyproject.toml
index 1f7438a..0372e89 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "compass-sdk"
-version = "0.1.1"
+version = "0.2.0"
 authors = []
 description = "Compass SDK"