diff --git a/src/driutils/benchmarking/partitioned_date_queries.py b/src/driutils/benchmarking/partitioned_date_queries.py index 90bbf5e..ca25724 100644 --- a/src/driutils/benchmarking/partitioned_date_queries.py +++ b/src/driutils/benchmarking/partitioned_date_queries.py @@ -7,7 +7,7 @@ (proposed format) 'partitioned_date_site': cosmos-test/structure/dataset=dataset_type/site=site/date=YYYY-MM-DD/data.parquet -User can select which strcuture to query. +User can select which structure to query. Each query profile is saved to ./profile.json. Final metrics are written to csv. """ @@ -52,44 +52,37 @@ def extract_metrics(profile: str | os.PathLike) -> pl.DataFrame: metrics["result_set_size_(Mb)"] = p["result_set_size"] / 1048576 metrics["rows_scanned"] = p["cumulative_rows_scanned"] metrics["cpu_time_(s)"] = p["cpu_time"] - # metrics["read_parquet_operator_time_(s)"] = p["children"][0]["children"][0]["operator_timing"] return pl.DataFrame(metrics) -def query_one_site_one_date(base_path, dataset): +def query_one_site_one_date(base_path, dataset): # noqa: ANN001, ANN201 # Test a very small return with partition filter return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') WHERE date='2023-09-27' AND SITE_ID='BUNNY'""" -def query_one_site(base_path, dataset): - # Test a very small return without partition filter - return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') - WHERE SITE_ID='BUNNY'""" - - -def query_multi_dates_using_conditionals_month(base_path, dataset): +def query_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals return f""" SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') - WHERE date >= '2019-01-01' AND date <= '2019-01-31' + WHERE date >= '2019-01-01' AND date <= '2019-01-31' AND SITE_ID='BUNNY' """ -def query_multi_dates_using_conditionals_year(base_path, dataset): +def query_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals return f""" SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') - WHERE date >= '2019-01-01' AND date <= '2019-12-31' + WHERE date >= '2019-01-01' AND date <= '2019-12-31' AND SITE_ID='BUNNY' """ -def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): +def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals # Non partitioned column used @@ -101,7 +94,7 @@ def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, datase """ -def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): +def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals # Non partitioned column used @@ -113,45 +106,49 @@ def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset """ -def query_multi_dates_using_hive_types_month(base_path, dataset): +def query_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}}) - WHERE date BETWEEN '2019-01-01' AND '2019-01-31' + FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') + WHERE date BETWEEN '2019-01-01' AND '2019-01-31' AND SITE_ID='BUNNY' """ -def query_multi_dates_using_hive_types_year(base_path, dataset): +def query_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}}) - WHERE date BETWEEN '2019-01-01' AND '2019-12-31' + FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') + WHERE date BETWEEN '2019-01-01' AND '2019-12-31' AND SITE_ID='BUNNY' """ -def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): +def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN # Non partitioned column used + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}}) + FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') WHERE date BETWEEN '2019-01-01' AND '2019-01-31' AND SITE_ID IN ('BUNNY', 'ALIC1') """ -def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): +def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN # Non partitioned column used + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}}) + FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet') WHERE date BETWEEN '2019-01-01' AND '2019-12-31' AND SITE_ID IN ('BUNNY', 'ALIC1') """ @@ -179,7 +176,6 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): """) queries = [ - # query_one_site(BASE_BUCKET_PATH, DATASET), query_one_site_one_date(BASE_BUCKET_PATH, DATASET), query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET), query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET), diff --git a/src/driutils/benchmarking/partitioned_date_site_queries.py b/src/driutils/benchmarking/partitioned_date_site_queries.py index 4e9326c..6b46d49 100644 --- a/src/driutils/benchmarking/partitioned_date_site_queries.py +++ b/src/driutils/benchmarking/partitioned_date_site_queries.py @@ -7,7 +7,7 @@ (proposed format) 'partitioned_date_site': cosmos-test/structure/dataset=dataset_type/site=site/date=YYYY-MM-DD/data.parquet -User can select which strcuture to query. +User can select which structure to query. Each query profile is saved to ./profile.json. Final metrics are written to csv. """ @@ -52,113 +52,107 @@ def extract_metrics(profile: str | os.PathLike) -> pl.DataFrame: metrics["result_set_size_(Mb)"] = p["result_set_size"] / 1048576 metrics["rows_scanned"] = p["cumulative_rows_scanned"] metrics["cpu_time_(s)"] = p["cpu_time"] - # metrics["read_parquet_operator_time_(s)"] = p["children"][0]["children"][0]["operator_timing"] return pl.DataFrame(metrics) -def query_one_site_one_date(base_path, dataset): +def query_one_site_one_date(base_path, dataset): # noqa: ANN001, ANN201 # Test a very small return with partition filter - return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') - WHERE date='2017-09-27' AND site='BUNNY'""" + return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/ + site=BUNNY/date=2017-09-27/data.parquet')""" -def query_one_site(base_path, dataset): - # Test a very small return without partition filter - return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') - WHERE site='BUNNY'""" - - -def query_multi_dates_using_conditionals_month(base_path, dataset): +def query_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') + FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/*/data.parquet') WHERE date >= '2019-01-01' AND date <= '2019-01-31' """ -def query_multi_dates_using_conditionals_year(base_path, dataset): +def query_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') + FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/*/data.parquet') WHERE date >= '2019-01-01' AND date <= '2019-12-31' """ -def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): +def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals - # Non partitioned column used return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') + FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet') WHERE date >= '2019-01-01' AND date <= '2019-01-31' AND site IN ('BUNNY', 'ALIC1') """ -def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): +def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are filtered using conditionals - # Non partitioned column used return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet') + FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet') WHERE date >= '2019-01-01' AND date <= '2019-12-31' AND site IN ('BUNNY', 'ALIC1') """ -def query_multi_dates_using_hive_types_month(base_path, dataset): +def query_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}}) + FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet') WHERE date BETWEEN '2019-01-01' AND '2019-01-31' """ -def query_multi_dates_using_hive_types_year(base_path, dataset): +def query_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}}) + FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet') WHERE date BETWEEN '2019-01-01' AND '2019-12-31' """ -def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): +def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN - # Non partitioned column used + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}}) + FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet') WHERE date BETWEEN '2019-01-01' AND '2019-01-31' AND site IN ('BUNNY', 'ALIC1') """ -def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): +def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201 # Test larger and more complex query parameters # Dates are hive types and filtered using BETWEEN - # Non partitioned column used + # Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type return f""" SELECT {COLUMNS_SQL} - FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet') + FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet') WHERE date BETWEEN '2015-01-01' AND '2015-12-31' + AND site IN ('BUNNY', 'ALIC1') """ if __name__ == "__main__": # Setup basic duckdb connection - conn = duckdb.connect(config = { 'threads': 75 }) + conn = duckdb.connect(config={"threads": 64}) conn.execute(""" INSTALL httpfs; @@ -178,16 +172,15 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): """) queries = [ - # query_one_site(BASE_BUCKET_PATH, DATASET), - # query_one_site_one_date(BASE_BUCKET_PATH, DATASET), - # query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET), - # query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET), - # query_multi_sites_and_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET), - # query_multi_sites_and_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET), - # query_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET), - # query_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET), - # query_multi_sites_and_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET), - query_multi_sites_and_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET) + query_one_site_one_date(BASE_BUCKET_PATH, DATASET), + query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET), + query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET), + query_multi_sites_and_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET), + query_multi_sites_and_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET), + query_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET), + query_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET), + query_multi_sites_and_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET), + query_multi_sites_and_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET), ] # Create empty dataframe to store the results @@ -198,7 +191,9 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): # Query profile is saved to ./profile.json new_df = conn.sql(query).pl() - new_df.write_csv('./test.csv') + + # Write out to csv to test all data returned + new_df.write_csv("./test.csv") # Extract whats need from the profiler df = extract_metrics(profile=OUTPUT_PROFILE)