Skip to content

Commit

Permalink
final tidy up to match confluence page
Browse files Browse the repository at this point in the history
  • Loading branch information
nkshaw23 committed Nov 6, 2024
1 parent 19a6f79 commit 39e8fa0
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 69 deletions.
48 changes: 22 additions & 26 deletions src/driutils/benchmarking/partitioned_date_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
(proposed format)
'partitioned_date_site': cosmos-test/structure/dataset=dataset_type/site=site/date=YYYY-MM-DD/data.parquet
User can select which strcuture to query.
User can select which structure to query.
Each query profile is saved to ./profile.json. Final metrics are written to csv.
"""
Expand Down Expand Up @@ -52,44 +52,37 @@ def extract_metrics(profile: str | os.PathLike) -> pl.DataFrame:
metrics["result_set_size_(Mb)"] = p["result_set_size"] / 1048576
metrics["rows_scanned"] = p["cumulative_rows_scanned"]
metrics["cpu_time_(s)"] = p["cpu_time"]
# metrics["read_parquet_operator_time_(s)"] = p["children"][0]["children"][0]["operator_timing"]

return pl.DataFrame(metrics)


def query_one_site_one_date(base_path, dataset):
def query_one_site_one_date(base_path, dataset): # noqa: ANN001, ANN201
# Test a very small return with partition filter
return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date='2023-09-27' AND SITE_ID='BUNNY'"""


def query_one_site(base_path: str, dataset: str) -> str:
    """Build a query returning every row for a single site, with no date filter.

    Unlike its siblings, this deliberately omits the `date` partition filter so
    the benchmark measures a scan constrained only by a non-partitioned column.

    Args:
        base_path: Root bucket path holding the `dataset=.../date=...` partitions.
        dataset: Dataset-type partition value to substitute into the glob path.

    Returns:
        A DuckDB SQL string selecting the benchmark columns for site BUNNY.
    """
    # Test a very small return without partition filter
    return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
    WHERE SITE_ID='BUNNY'"""


def query_multi_dates_using_conditionals_month(base_path, dataset):
def query_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-01-31'
WHERE date >= '2019-01-01' AND date <= '2019-01-31' AND SITE_ID='BUNNY'
"""


def query_multi_dates_using_conditionals_year(base_path, dataset):
def query_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-12-31'
WHERE date >= '2019-01-01' AND date <= '2019-12-31' AND SITE_ID='BUNNY'
"""


def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset):
def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
# Non partitioned column used
Expand All @@ -101,7 +94,7 @@ def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, datase
"""


def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset):
def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
# Non partitioned column used
Expand All @@ -113,45 +106,49 @@ def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset
"""


def query_multi_dates_using_hive_types_month(base_path, dataset):
def query_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}})
WHERE date BETWEEN '2019-01-01' AND '2019-01-31'
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-01-31' AND SITE_ID='BUNNY'
"""


def query_multi_dates_using_hive_types_year(base_path, dataset):
def query_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}})
WHERE date BETWEEN '2019-01-01' AND '2019-12-31'
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-12-31' AND SITE_ID='BUNNY'
"""


def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset):
def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Non partitioned column used
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}})
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-01-31'
AND SITE_ID IN ('BUNNY', 'ALIC1')
"""


def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset):
def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Non partitioned column used
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet', hive_types = {{date: DATE}})
FROM read_parquet('{base_path}/dataset={dataset}/*/*.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-12-31'
AND SITE_ID IN ('BUNNY', 'ALIC1')
"""
Expand Down Expand Up @@ -179,7 +176,6 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset):
""")

queries = [
# query_one_site(BASE_BUCKET_PATH, DATASET),
query_one_site_one_date(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET),
Expand Down
81 changes: 38 additions & 43 deletions src/driutils/benchmarking/partitioned_date_site_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
(proposed format)
'partitioned_date_site': cosmos-test/structure/dataset=dataset_type/site=site/date=YYYY-MM-DD/data.parquet
User can select which strcuture to query.
User can select which structure to query.
Each query profile is saved to ./profile.json. Final metrics are written to csv.
"""
Expand Down Expand Up @@ -52,113 +52,107 @@ def extract_metrics(profile: str | os.PathLike) -> pl.DataFrame:
metrics["result_set_size_(Mb)"] = p["result_set_size"] / 1048576
metrics["rows_scanned"] = p["cumulative_rows_scanned"]
metrics["cpu_time_(s)"] = p["cpu_time"]
# metrics["read_parquet_operator_time_(s)"] = p["children"][0]["children"][0]["operator_timing"]

return pl.DataFrame(metrics)


def query_one_site_one_date(base_path, dataset):
def query_one_site_one_date(base_path, dataset): # noqa: ANN001, ANN201
# Test a very small return with partition filter
return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
WHERE date='2017-09-27' AND site='BUNNY'"""
return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/
site=BUNNY/date=2017-09-27/data.parquet')"""


def query_one_site(base_path: str, dataset: str) -> str:
    """Build a query returning every row for a single site, with no date filter.

    Targets the site/date double-partitioned layout (`.../site=.../date=.../`),
    hence the three-level glob; filters on the partitioned `site` column only.

    Args:
        base_path: Root bucket path holding the `dataset=.../site=.../date=...` partitions.
        dataset: Dataset-type partition value to substitute into the glob path.

    Returns:
        A DuckDB SQL string selecting the benchmark columns for site BUNNY.
    """
    # Test a very small return without partition filter
    return f"""SELECT {COLUMNS_SQL} FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
    WHERE site='BUNNY'"""


def query_multi_dates_using_conditionals_month(base_path, dataset):
def query_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/*/data.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-01-31'
"""


def query_multi_dates_using_conditionals_year(base_path, dataset):
def query_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/*/data.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-12-31'
"""


def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset):
def query_multi_sites_and_multi_dates_using_conditionals_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
# Non partitioned column used
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-01-31'
AND site IN ('BUNNY', 'ALIC1')
"""


def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset):
def query_multi_sites_and_multi_dates_using_conditionals_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are filtered using conditionals
# Non partitioned column used
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet')
FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet')
WHERE date >= '2019-01-01' AND date <= '2019-12-31'
AND site IN ('BUNNY', 'ALIC1')
"""


def query_multi_dates_using_hive_types_month(base_path, dataset):
def query_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}})
FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-01-31'
"""


def query_multi_dates_using_hive_types_year(base_path, dataset):
def query_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}})
FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-12-31'
"""


def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset):
def query_multi_sites_and_multi_dates_using_hive_types_month(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Non partitioned column used
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/*/*/*.parquet', hive_types = {{date: DATE}})
FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet')
WHERE date BETWEEN '2019-01-01' AND '2019-01-31'
AND site IN ('BUNNY', 'ALIC1')
"""


def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset):
def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset): # noqa: ANN001, ANN201
# Test larger and more complex query parameters
# Dates are hive types and filtered using BETWEEN
# Non partitioned column used
# Fields of type DATE automatically picked up by duckdb so no need to specify as a hive type
return f"""
SELECT {COLUMNS_SQL}
FROM read_parquet('{base_path}/dataset={dataset}/site=BUNNY/date=*/data.parquet')
FROM read_parquet('{base_path}/dataset={dataset}/site=*/date=*/data.parquet')
WHERE date BETWEEN '2015-01-01' AND '2015-12-31'
AND site IN ('BUNNY', 'ALIC1')
"""


if __name__ == "__main__":
# Setup basic duckdb connection
conn = duckdb.connect(config = { 'threads': 75 })
conn = duckdb.connect(config={"threads": 64})

conn.execute("""
INSTALL httpfs;
Expand All @@ -178,16 +172,15 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset):
""")

queries = [
# query_one_site(BASE_BUCKET_PATH, DATASET),
# query_one_site_one_date(BASE_BUCKET_PATH, DATASET),
# query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET),
# query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET),
# query_multi_sites_and_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET),
# query_multi_sites_and_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET),
# query_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET),
# query_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET),
# query_multi_sites_and_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET),
query_multi_sites_and_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET)
query_one_site_one_date(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET),
query_multi_sites_and_multi_dates_using_conditionals_month(BASE_BUCKET_PATH, DATASET),
query_multi_sites_and_multi_dates_using_conditionals_year(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET),
query_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET),
query_multi_sites_and_multi_dates_using_hive_types_month(BASE_BUCKET_PATH, DATASET),
query_multi_sites_and_multi_dates_using_hive_types_year(BASE_BUCKET_PATH, DATASET),
]

# Create empty dataframe to store the results
Expand All @@ -198,7 +191,9 @@ def query_multi_sites_and_multi_dates_using_hive_types_year(base_path, dataset):

# Query profile is saved to ./profile.json
new_df = conn.sql(query).pl()
new_df.write_csv('./test.csv')

# Write out to csv to test all data returned
new_df.write_csv("./test.csv")

# Extract what's needed from the profiler
df = extract_metrics(profile=OUTPUT_PROFILE)
Expand Down

0 comments on commit 39e8fa0

Please sign in to comment.