Skip to content

Commit

Permalink
test: Add more size estimation tests from our s3 bucket (#3514)
Browse files Browse the repository at this point in the history
This test currently fails as we underestimate by 2x

Co-authored-by: Jay Chia <[email protected]@users.noreply.github.com>
  • Loading branch information
jaychia and Jay Chia authored Dec 9, 2024
1 parent a99d2ab commit 6390afa
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions tests/test_size_estimations.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,23 @@ def test_canonical_files_in_hf(path):
size_on_disk = int(response.headers["Content-Length"])

assert_close(size_on_disk, get_scantask_estimated_size(path, size_on_disk), get_actual_size(path))


@pytest.mark.parametrize(
    "path",
    [
        "s3://daft-public-datasets/tpch_iceberg_sf1000.db/lineitem/data/L_SHIPDATE_month=1992-01/00000-6694-fa4594d5-f624-407c-8640-5b6db8150470-00001.parquet",
    ],
    ids=[
        "lineitem",
    ],
)
def test_canonical_files_in_s3(path):
    """Check scan-task size estimation against canonical Parquet files in S3.

    Fetches the on-disk size via an S3 HEAD request, then asserts that the
    scan task's estimated size is close to both the on-disk size and the
    actual materialized size of the file.
    """
    # Local import: boto3 is only needed for this S3-backed test case.
    import boto3

    s3 = boto3.client("s3")
    # Use removeprefix rather than replace: replace("s3://", "") would strip
    # *every* occurrence of the substring, mangling a key that happened to
    # contain "s3://" mid-path; we only want to drop the URL scheme.
    bucket, key = path.removeprefix("s3://").split("/", 1)
    response = s3.head_object(Bucket=bucket, Key=key)
    size_on_disk = response["ContentLength"]

    assert_close(size_on_disk, get_scantask_estimated_size(path, size_on_disk), get_actual_size(path))

0 comments on commit 6390afa

Please sign in to comment.