Skip to content

Commit

Permalink
Adds unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Chia committed Dec 6, 2024
1 parent 7307360 commit 50c6605
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions tests/io/test_split_scan_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@ def parquet_files(tmpdir):
return tmpdir


@pytest.fixture(scope="function")
def many_parquet_files(tmpdir):
    """Populate ``tmpdir`` with 20 small Parquet files and return the directory.

    Every file holds a single ``data`` column containing 100 copies of the
    string ``"aaa"``. The rows are written with ``row_group_size=10`` (so each
    file is split into 10 row groups) and with dictionary encoding disabled.
    """
    # The table contents are identical for every file, so build it once.
    table = pa.table({"data": ["aaa"] * 100})

    for i in range(20):
        destination = tmpdir / f"file.{i}.pq"
        papq.write_table(
            table,
            str(destination),
            row_group_size=10,
            use_dictionary=False,
        )

    return tmpdir


def test_split_parquet_read(parquet_files):
with daft.execution_config_ctx(
scan_tasks_min_size_bytes=1,
Expand All @@ -25,3 +36,13 @@ def test_split_parquet_read(parquet_files):
df = daft.read_parquet(str(parquet_files))
assert df.num_partitions() == 10, "Should have 10 partitions since we will split the file"
assert df.to_pydict() == {"data": ["aaa"] * 100}


def test_split_parquet_read_many_files(many_parquet_files):
    """Check that a directory of 20 Parquet files is read as 200 partitions.

    The read runs under an execution config whose scan-task size bounds are
    set to 1 (min) and 10 (max) bytes. The test then asserts both the
    partition count and the full contents of the ``data`` column.
    """
    source_dir = str(many_parquet_files)
    expected_contents = {"data": ["aaa"] * 2000}

    with daft.execution_config_ctx(scan_tasks_min_size_bytes=1, scan_tasks_max_size_bytes=10):
        df = daft.read_parquet(source_dir)

        partition_count = df.num_partitions()
        assert partition_count == 200, "Should have 200 partitions since we will split all files"

        assert df.to_pydict() == expected_contents

0 comments on commit 50c6605

Please sign in to comment.