From 50c6605196b9d1152f744284adf4074714b2539b Mon Sep 17 00:00:00 2001
From: Jay Chia
Date: Thu, 5 Dec 2024 14:56:29 -0800
Subject: [PATCH] Adds unit test

---
 tests/io/test_split_scan_tasks.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/io/test_split_scan_tasks.py b/tests/io/test_split_scan_tasks.py
index 7b92655668..ba3a1432f4 100644
--- a/tests/io/test_split_scan_tasks.py
+++ b/tests/io/test_split_scan_tasks.py
@@ -17,6 +17,17 @@ def parquet_files(tmpdir):
     return tmpdir
 
 
+@pytest.fixture(scope="function")
+def many_parquet_files(tmpdir):
+    """Writes 20 Parquet files, each with 10 row groups of roughly 100 bytes each."""
+    for i in range(20):
+        tbl = pa.table({"data": ["aaa"] * 100})
+        path = tmpdir / f"file.{i}.pq"
+        papq.write_table(tbl, str(path), row_group_size=10, use_dictionary=False)
+
+    return tmpdir
+
+
 def test_split_parquet_read(parquet_files):
     with daft.execution_config_ctx(
         scan_tasks_min_size_bytes=1,
@@ -25,3 +36,13 @@
         df = daft.read_parquet(str(parquet_files))
         assert df.num_partitions() == 10, "Should have 10 partitions since we will split the file"
         assert df.to_pydict() == {"data": ["aaa"] * 100}
+
+
+def test_split_parquet_read_many_files(many_parquet_files):
+    with daft.execution_config_ctx(
+        scan_tasks_min_size_bytes=1,
+        scan_tasks_max_size_bytes=10,
+    ):
+        df = daft.read_parquet(str(many_parquet_files))
+        assert df.num_partitions() == 200, "Should have 200 partitions since we will split all files"
+        assert df.to_pydict() == {"data": ["aaa"] * 2000}
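
Note (not part of the patch): for reviewers who want to reproduce the new test's behavior outside pytest, below is a minimal standalone sketch. It assumes daft and pyarrow are installed, and it reuses only the calls that already appear in the patch (pa.table, papq.write_table, daft.execution_config_ctx, daft.read_parquet); the temporary-directory handling is added here for self-containment.

import pathlib
import tempfile

import pyarrow as pa
import pyarrow.parquet as papq

import daft

with tempfile.TemporaryDirectory() as tmp:
    tmpdir = pathlib.Path(tmp)

    # Mirror the many_parquet_files fixture: 20 files, 10 row groups apiece.
    for i in range(20):
        tbl = pa.table({"data": ["aaa"] * 100})
        papq.write_table(tbl, str(tmpdir / f"file.{i}.pq"), row_group_size=10, use_dictionary=False)

    # Force aggressive splitting: any scan task larger than 10 bytes is split,
    # so each ~100-byte row group should become its own partition.
    with daft.execution_config_ctx(
        scan_tasks_min_size_bytes=1,
        scan_tasks_max_size_bytes=10,
    ):
        df = daft.read_parquet(str(tmpdir))
        print(df.num_partitions())  # expected: 200 (20 files x 10 row groups)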