Skip to content

Commit

Permalink
Sync sdk with internal
Browse files: browse the repository at this point in the history
  • Loading branch information
benrules3 committed Jun 27, 2024
1 parent 696e948 commit 9a0c951
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 19 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@ on:
workflow_dispatch: {}

jobs:

test_client:
runs-on: large
runs-on: ubuntu-latest

permissions:
contents: read
Expand Down
8 changes: 3 additions & 5 deletions compass_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@
from os import getenv
from typing import Any, Callable, Dict, List, Optional, Union

from pydantic import BaseModel
from typing_extensions import TypedDict

from compass_sdk.constants import (
COHERE_API_ENV_VAR,
DEFAULT_COMMANDR_EXTRACTABLE_ATTRIBUTES,
DEFAULT_COMMANDR_PROMPT,
DEFAULT_MAX_TOKENS_METADATA,
DEFAULT_MIN_CHARS_PER_ELEMENT,
DEFAULT_MIN_NUM_CHUNKS_IN_TITLE,
DEFAULT_MIN_NUM_TOKENS_CHUNK,
Expand All @@ -19,6 +15,8 @@
METADATA_HEURISTICS_ATTRIBUTES,
SKIP_INFER_TABLE_TYPES,
)
from pydantic import BaseModel
from typing_extensions import TypedDict


class Logger:
Expand Down Expand Up @@ -296,7 +294,7 @@ class ParserConfig(ValidatedModel):
num_tokens_overlap: int = DEFAULT_NUM_TOKENS_CHUNK_OVERLAP
min_chunk_tokens: int = DEFAULT_MIN_NUM_TOKENS_CHUNK
num_chunks_in_title: int = DEFAULT_MIN_NUM_CHUNKS_IN_TITLE
max_tokens_metadata: int = DEFAULT_MAX_TOKENS_METADATA
max_tokens_metadata: int = 1000
include_tables: bool = True

# Formatting configuration
Expand Down
11 changes: 5 additions & 6 deletions compass_sdk/compass.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests
from joblib import Parallel, delayed
from pydantic import BaseModel
from requests.exceptions import InvalidSchema
from tenacity import RetryError, retry, retry_if_not_exception_type, stop_after_attempt, wait_fixed
from tqdm import tqdm

from compass_sdk import (
BatchPutDocumentsInput,
Chunk,
Expand All @@ -31,6 +25,11 @@
DEFAULT_MAX_RETRIES,
DEFAULT_SLEEP_RETRY_SECONDS,
)
from joblib import Parallel, delayed
from pydantic import BaseModel
from requests.exceptions import InvalidSchema
from tenacity import RetryError, retry, retry_if_not_exception_type, stop_after_attempt, wait_fixed
from tqdm import tqdm


@dataclass
Expand Down
2 changes: 0 additions & 2 deletions compass_sdk/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,9 @@
DEFAULT_NUM_TOKENS_PER_CHUNK = 500
DEFAULT_NUM_TOKENS_CHUNK_OVERLAP = 15
DEFAULT_MIN_NUM_TOKENS_CHUNK = 5
DEFAULT_MAX_TOKENS_METADATA = 50
DEFAULT_MIN_NUM_CHUNKS_IN_TITLE = 1

DEFAULT_WIDTH_HEIGHT_VERTICAL_RATIO = 0.6
NUM_ADDITIONAL_CHARS_FOR_METADATA = 100
SKIP_INFER_TABLE_TYPES = ["jpg", "png", "xls", "xlsx", "heic"]

# Metadata detection constants
Expand Down
3 changes: 1 addition & 2 deletions compass_sdk/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import requests

from compass_sdk import (
BatchProcessFilesParameters,
CompassDocument,
Expand Down Expand Up @@ -196,7 +195,7 @@ def process_file(
return []
if len(doc.filebytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES:
logger.error(
f"File too large, supported file size is {DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_1000} "
f"File too large, supported file size is {DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000} "
f"mb, filename {doc.metadata.filename}"
)
return []
Expand Down
3 changes: 1 addition & 2 deletions compass_sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from typing import Callable, Iterable, Iterator, List, Optional, TypeVar

import fsspec
from fsspec import AbstractFileSystem

from compass_sdk import CompassDocument, CompassDocumentMetadata, CompassSdkStage
from fsspec import AbstractFileSystem

T = TypeVar("T")
U = TypeVar("U")
Expand Down

0 comments on commit 9a0c951

Please sign in to comment.