# /// script
# dependencies = [
#     "getdaft[ray]",  # NOTE(review): script imports daft and uses the Ray runner; confirm PyPI name/extra
# ]
# ///
"""Benchmark reading one large + many small Parquet files with Daft on Ray.

Measures wall-clock time for plan explain, a preview (`show`), and the full
materialization (`collect`, skipped with --dry-run), so the effect of the
aggressive scan-task splitting config can be compared across runs.
"""

import argparse
import time

import daft


def get_args():
    """Parse the benchmark's CLI flags and return the parsed namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--enable-optimized-splits",
        action="store_true",
        default=False,
        help="Whether to enable the new splits",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=False,
        help="Skip the full collect() materialization",
    )
    return parser.parse_args()


def _timed(label, fn):
    """Run ``fn()`` and print how long it took.

    Uses a monotonic clock (perf_counter) rather than time.time(), which can
    jump under NTP adjustment and skew benchmark numbers.
    """
    start = time.perf_counter()
    fn()
    print(f"{label} took: {time.perf_counter() - start}s")


if __name__ == "__main__":
    daft.context.set_runner_ray()

    args = get_args()

    if args.enable_optimized_splits:
        daft.set_execution_config(enable_aggressive_scantask_splitting=True)

    # One large file plus 10 copies of a small file: deliberately uneven file
    # sizes to exercise scan-task splitting/merging behavior.
    df = daft.read_parquet(
        [
            "s3://daft-public-data/test_fixtures/parquet/large-fake-data.parquet",
        ]
        + [
            "s3://daft-public-data/test_fixtures/parquet/small-fake-data.parquet",
        ]
        * 10
    )

    _timed("Explain", lambda: df.explain(True))
    _timed("Show", lambda: df.show())

    if not args.dry_run:
        # The expensive step: full materialization of the dataframe.
        _timed("Collect", lambda: df.collect())