Skip to content

Commit

Permalink
Adapt to new compass parser changes (#20)
Browse files Browse the repository at this point in the history
- `CompassDocumentChunk.origin` is no longer a pydantic model; it is now a
plain `Dict[str, Any]`, so that we can add arbitrary non-numeric fields such as
`"tab_name": "some name"` for spreadsheets.
- Remove `is_dataset` (and its plural counterpart `are_datasets`) as a parameter.

---------

Signed-off-by: javier-cohere <[email protected]>
Co-authored-by: javier-cohere <[email protected]>
  • Loading branch information
javier-cohere and javier-cohere authored Oct 22, 2024
1 parent 87d4569 commit 94be863
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 27 deletions.
23 changes: 10 additions & 13 deletions compass_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class CompassDocumentMetadata(ValidatedModel):
doc_id: str = ""
filename: str = ""
meta: List = []
parent_doc_id: str = ""


class CompassDocumentStatus(str, Enum):
Expand All @@ -104,16 +105,16 @@ class CompassSdkStage(str, Enum):
Indexing = "indexing"


class CompassDocumentChunkOrigin(BaseModel):
page_number: Optional[int] = None


class CompassDocumentChunk(BaseModel):
chunk_id: str
sort_id: str
doc_id: str
parent_doc_id: str
content: Dict[str, Any]
origin: Optional[CompassDocumentChunkOrigin] = None
origin: Optional[Dict[str, Any]] = None

def parent_doc_is_split(self):
return self.doc_id != self.parent_doc_id


class CompassDocument(ValidatedModel):
Expand All @@ -130,12 +131,12 @@ class CompassDocument(ValidatedModel):
filebytes: bytes = b""
metadata: CompassDocumentMetadata = CompassDocumentMetadata()
content: Dict[str, str] = {}
content_type: Optional[str] = None
elements: List[Any] = []
chunks: List[CompassDocumentChunk] = []
index_fields: List[str] = []
errors: List[Dict[CompassSdkStage, str]] = []
ignore_metadata_errors: bool = True
is_dataset: bool = False
markdown: Optional[str] = None

def has_data(self) -> bool:
Expand Down Expand Up @@ -201,8 +202,6 @@ def _missing_(cls, value):
class MetadataConfig(ValidatedModel):
"""
Configuration class for metadata detection.
:param pre_build_detectors: whether to pre-build all metadata detectors. If set to False (default),
detectors will be built on the fly when needed
:param metadata_strategy: the metadata detection strategy to use. One of:
- No_Metadata: no metadata is inferred
- Heuristics: metadata is inferred using heuristics
Expand All @@ -219,7 +218,6 @@ class MetadataConfig(ValidatedModel):
"""

pre_build_detectors: bool = False
metadata_strategy: MetadataStrategy = MetadataStrategy.No_Metadata
cohere_api_key: Optional[str] = getenv(COHERE_API_ENV_VAR, None)
commandr_model_name: str = "command-r"
Expand Down Expand Up @@ -286,7 +284,6 @@ class ParserConfig(ValidatedModel):
allowed_image_types: Optional[List[str]] = None
min_chars_per_element: int = DEFAULT_MIN_CHARS_PER_ELEMENT
skip_infer_table_types: List[str] = SKIP_INFER_TABLE_TYPES
detect_datasets: bool = True
parsing_strategy: ParsingStrategy = ParsingStrategy.Fast
parsing_model: ParsingModel = ParsingModel.Marker

Expand All @@ -309,7 +306,8 @@ class Chunk(BaseModel):
chunk_id: str
sort_id: int
content: Dict[str, Any]
origin: Optional[CompassDocumentChunkOrigin] = None
origin: Optional[Dict[str, Any]] = None
parent_doc_id: str


class Document(BaseModel):
Expand All @@ -319,6 +317,7 @@ class Document(BaseModel):

doc_id: str
path: str
parent_doc_id: str
content: Dict[str, Any]
chunks: List[Chunk]
index_fields: List[str] = []
Expand Down Expand Up @@ -383,14 +382,12 @@ class ProcessFileParameters(ValidatedModel):
parser_config: ParserConfig
metadata_config: MetadataConfig
doc_id: Optional[str] = None
is_dataset: Optional[bool] = None


class ProcessFilesParameters(ValidatedModel):
doc_ids: Optional[List[str]] = None
parser_config: ParserConfig
metadata_config: MetadataConfig
are_datasets: Optional[bool] = None


class BatchProcessFilesParameters(ProcessFilesParameters):
Expand Down
1 change: 1 addition & 0 deletions compass_sdk/compass.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ def _get_request_blocks(
doc,
Document(
doc_id=doc.metadata.doc_id,
parent_doc_id=doc.metadata.parent_doc_id,
path=doc.metadata.filename,
content=doc.content,
chunks=[Chunk(**c.model_dump()) for c in doc.chunks],
Expand Down
12 changes: 0 additions & 12 deletions compass_sdk/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ def process_files(
file_ids: Optional[List[str]] = None,
parser_config: Optional[ParserConfig] = None,
metadata_config: Optional[MetadataConfig] = None,
are_datasets: Optional[List[bool]] = None,
custom_context: Optional[Fn_or_Dict] = None,
) -> Iterable[CompassDocument]:
"""
Expand All @@ -129,7 +128,6 @@ def process_files(
:param file_ids: List of ids for the files
:param parser_config: ParserConfig object (applies the same config to all docs)
:param metadata_config: MetadataConfig object (applies the same config to all docs)
:param are_datasets: List of booleans indicating whether each file is a dataset
:param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable.
Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary.
Expand All @@ -143,7 +141,6 @@ def process_file(i: int) -> List[CompassDocument]:
file_id=file_ids[i] if file_ids else None,
parser_config=parser_config,
metadata_config=metadata_config,
is_dataset=are_datasets[i] if are_datasets else None,
custom_context=custom_context,
)

Expand Down Expand Up @@ -171,7 +168,6 @@ def process_file(
file_id: Optional[str] = None,
parser_config: Optional[ParserConfig] = None,
metadata_config: Optional[MetadataConfig] = None,
is_dataset: Optional[bool] = None,
custom_context: Optional[Fn_or_Dict] = None,
) -> List[CompassDocument]:
"""
Expand All @@ -184,10 +180,6 @@ def process_file(
:param file_id: Id for the file
:param parser_config: ParserConfig object with the config to use for parsing the file
:param metadata_config: MetadataConfig object with the config to use for extracting metadata for each document
:param is_dataset: Boolean indicating whether the file is a dataset. If True, the file will be processed
as a dataset and multiple CompassDocument objects might be returned (one per dataset record). Otherwise,
the file will be processed as a single document (e.g., a PDF file). Default is None, which means that
the server will try to infer whether the file is a dataset or not.
:param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable.
Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary.
Expand All @@ -211,7 +203,6 @@ def process_file(
parser_config=parser_config,
metadata_config=metadata_config,
doc_id=file_id,
is_dataset=is_dataset,
)
auth = (self.username, self.password) if self.username and self.password else None
res = self.session.post(
Expand Down Expand Up @@ -249,7 +240,6 @@ def batch_upload(self, *, zip_file_path: str) -> str:
zip_data = zip_file.read()
res = self.session.post(
url=f"{self.parser_url}/v1/batch/upload",
data={"data": {"is_dataset": False}},
files={"file": ("data.zip", zip_data)},
auth=auth,
)
Expand Down Expand Up @@ -290,7 +280,6 @@ def batch_run(
file_name_to_doc_ids: Optional[Dict[str, str]] = None,
parser_config: Optional[ParserConfig] = None,
metadata_config: Optional[MetadataConfig] = None,
are_datasets: Optional[bool] = None,
) -> List[CompassDocument]:

parser_config = parser_config or self.parser_config
Expand All @@ -301,7 +290,6 @@ def batch_run(
file_name_to_doc_ids=file_name_to_doc_ids,
parser_config=parser_config,
metadata_config=metadata_config,
are_datasets=are_datasets,
)
auth = (self.username, self.password) if self.username and self.password else None
res = self.session.post(
Expand Down
2 changes: 1 addition & 1 deletion compass_sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def open_document(document_path) -> CompassDocument:
fs = get_fs(document_path)
with fs.open(document_path, "rb") as f:
val = f.read()
if isinstance(val, bytes):
if val is not None and isinstance(val, bytes):
doc.filebytes = val
else:
raise Exception(f"Expected bytes, got {type(val)}")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "compass-sdk"
version = "0.1.1"
version = "0.2.0"
authors = []
description = "Compass SDK"

Expand Down

0 comments on commit 94be863

Please sign in to comment.