Skip to content

Commit

Permalink
Add more permutations
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Chia committed Nov 23, 2024
1 parent d2fdeec commit a23ee04
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions tests/test_size_estimations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pyarrow as pa
import pyarrow.parquet as papq
import pytest

from daft.table.micropartition import MicroPartition

Expand All @@ -20,12 +21,29 @@ def get_actual_size(pq_path: pathlib.Path, columns: list[str] | None = None) ->


def assert_close(expected: int, actual: int, pct: float = 0.05):
    """Assert that ``expected`` lies within a relative tolerance of ``actual``.

    The relative error is ``abs(actual - expected) / abs(expected)`` and must be
    strictly below ``pct``.

    Args:
        expected: The value under test; also the denominator of the relative error.
        actual: The reference value to compare against.
        pct: Maximum allowed relative error as a fraction (``0.05`` means 5%).

    Raises:
        AssertionError: If the relative error is not strictly below ``pct``.
    """
    if expected == 0:
        # A relative error against zero is undefined; only an exact match is
        # "close". This also avoids a ZeroDivisionError masking the real failure.
        is_close = actual == 0
    else:
        # Divide by the magnitude so a negative ``expected`` cannot yield a
        # negative ratio that trivially passes the check.
        is_close = abs(actual - expected) / abs(expected) < pct
    assert is_close, f"Expected estimations {expected} to be within {pct} of actual: {actual}"


@pytest.mark.parametrize("compression", ["snappy", None])
@pytest.mark.parametrize("use_dictionary", [True, False])
@pytest.mark.parametrize("unique", [True, False])
def test_estimations_strings(tmpdir, use_dictionary, compression, unique):
    """Compare the estimated and actual size of a single-column string Parquet file.

    Writes 1000 strings (all distinct when ``unique`` is true, otherwise all
    identical) to ``strings.pq`` under ``tmpdir`` using the given
    ``use_dictionary`` and ``compression`` writer options, then checks the result
    of ``get_scantask_estimated_size`` against ``get_actual_size``.
    """
    pq_path = tmpdir / "strings.pq"
    prefix = "a" * 100
    # Distinct values get a numeric suffix; the repeated case reuses one value.
    data = [f"{prefix}{i}" for i in range(1000)] if unique else [prefix] * 1000
    tbl = pa.table({"foo": data})
    papq.write_table(tbl, pq_path, use_dictionary=use_dictionary, compression=compression)
    # assert_close raises on mismatch and returns None, so it must not be
    # wrapped in another ``assert`` (that would fail on every successful check).
    assert_close(get_scantask_estimated_size(pq_path), get_actual_size(pq_path))


@pytest.mark.parametrize("compression", ["snappy", None])
@pytest.mark.parametrize("use_dictionary", [True, False])
@pytest.mark.parametrize("unique", [True, False])
def test_estimations_ints(tmpdir, use_dictionary, compression, unique):
    """Compare the estimated and actual size of a single-column integer Parquet file.

    Writes 1000 integers (``0..999`` when ``unique`` is true, otherwise all
    ``1``) to ``ints.pq`` under ``tmpdir`` using the given ``use_dictionary`` and
    ``compression`` writer options, then checks the result of
    ``get_scantask_estimated_size`` against ``get_actual_size``.
    """
    pq_path = tmpdir / "ints.pq"

    data = list(range(1000)) if unique else [1] * 1000
    tbl = pa.table({"foo": data})
    papq.write_table(tbl, pq_path, use_dictionary=use_dictionary, compression=compression)
    # assert_close raises on mismatch and returns None, so it must not be
    # wrapped in another ``assert`` (that would fail on every successful check).
    assert_close(get_scantask_estimated_size(pq_path), get_actual_size(pq_path))

0 comments on commit a23ee04

Please sign in to comment.