Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CHORE]: tpc-ds datagen #3103

Merged
merged 6 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ IS_M1 ?= 0
HYPOTHESIS_MAX_EXAMPLES ?= 100
HYPOTHESIS_SEED ?= 0

# TPC-DS
SCALE_FACTOR ?= 1
OUTPUT_DIR ?= data/tpc-ds/


ifeq ($(OS),Windows_NT)
VENV_BIN=$(VENV)/Scripts
else
Expand Down Expand Up @@ -56,6 +61,10 @@ build-release: check-toolchain .venv ## Compile and install a faster Daft binar
test: .venv build ## Run tests
HYPOTHESIS_MAX_EXAMPLES=$(HYPOTHESIS_MAX_EXAMPLES) $(VENV_BIN)/pytest --hypothesis-seed=$(HYPOTHESIS_SEED)

.PHONY: dsdgen
dsdgen: .venv ## Generate TPC-DS data
$(VENV_BIN)/python benchmarking/tpcds/datagen.py --scale-factor=$(SCALE_FACTOR) --tpcds-gen-folder=$(OUTPUT_DIR)

.PHONY: clean
clean:
rm -rf $(VENV)
39 changes: 39 additions & 0 deletions benchmarking/tpcds/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import argparse
import logging
import os

import duckdb

logger = logging.getLogger(__name__)


def gen_tpcds(basedir: str, scale_factor: float):
db = duckdb.connect(f"{basedir}/tpcds.db")
db.sql(f"call dsdgen(sf = {scale_factor})")
if not os.path.exists(basedir):
os.makedirs(basedir)
for item in db.sql("show tables").fetchall():
tbl = item[0]
print(f"Exporting {tbl} to {basedir}/{tbl}.parquet")
db.sql(f"COPY {tbl} TO '{basedir}/{tbl}.parquet'")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tpcds-gen-folder",
default="data/tpcds-dbgen",
help="Path to the folder containing the TPCH dbgen tool and generated data",
)
parser.add_argument("--scale-factor", default=0.01, help="Scale factor to run on in GB", type=float)

args = parser.parse_args()
num_parts = args.scale_factor

logger.info(
"Generating data at %s with: scale_factor=%s",
args.tpch_gen_folder,
args.scale_factor,
)

gen_tpcds(basedir=args.tpch_gen_folder, scale_factor=args.scale_factor)
23 changes: 23 additions & 0 deletions benchmarking/tpcds/queries/01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
WITH customer_total_return AS
(SELECT sr_customer_sk AS ctr_customer_sk,
sr_store_sk AS ctr_store_sk,
sum(sr_return_amt) AS ctr_total_return
FROM store_returns,
date_dim
WHERE sr_returned_date_sk = d_date_sk
AND d_year = 2000
GROUP BY sr_customer_sk,
sr_store_sk)
SELECT c_customer_id
FROM customer_total_return ctr1,
store,
customer
WHERE ctr1.ctr_total_return >
(SELECT avg(ctr_total_return)*1.2
FROM customer_total_return ctr2
WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk)
AND s_store_sk = ctr1.ctr_store_sk
AND s_state = 'TN'
AND ctr1.ctr_customer_sk = c_customer_sk
ORDER BY c_customer_id
LIMIT 100;
79 changes: 79 additions & 0 deletions benchmarking/tpcds/queries/02.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
WITH wscs AS
(SELECT sold_date_sk,
sales_price
FROM
(SELECT ws_sold_date_sk sold_date_sk,
ws_ext_sales_price sales_price
FROM web_sales
UNION ALL SELECT cs_sold_date_sk sold_date_sk,
cs_ext_sales_price sales_price
FROM catalog_sales) sq1),
wswscs AS
(SELECT d_week_seq,
sum(CASE
WHEN (d_day_name='Sunday') THEN sales_price
ELSE NULL
END) sun_sales,
sum(CASE
WHEN (d_day_name='Monday') THEN sales_price
ELSE NULL
END) mon_sales,
sum(CASE
WHEN (d_day_name='Tuesday') THEN sales_price
ELSE NULL
END) tue_sales,
sum(CASE
WHEN (d_day_name='Wednesday') THEN sales_price
ELSE NULL
END) wed_sales,
sum(CASE
WHEN (d_day_name='Thursday') THEN sales_price
ELSE NULL
END) thu_sales,
sum(CASE
WHEN (d_day_name='Friday') THEN sales_price
ELSE NULL
END) fri_sales,
sum(CASE
WHEN (d_day_name='Saturday') THEN sales_price
ELSE NULL
END) sat_sales
FROM wscs,
date_dim
WHERE d_date_sk = sold_date_sk
GROUP BY d_week_seq)
SELECT d_week_seq1,
round(sun_sales1/sun_sales2, 2) r1,
round(mon_sales1/mon_sales2, 2) r2,
round(tue_sales1/tue_sales2, 2) r3,
round(wed_sales1/wed_sales2, 2) r4,
round(thu_sales1/thu_sales2, 2) r5,
round(fri_sales1/fri_sales2, 2) r6,
round(sat_sales1/sat_sales2, 2)
FROM
(SELECT wswscs.d_week_seq d_week_seq1,
sun_sales sun_sales1,
mon_sales mon_sales1,
tue_sales tue_sales1,
wed_sales wed_sales1,
thu_sales thu_sales1,
fri_sales fri_sales1,
sat_sales sat_sales1
FROM wswscs,
date_dim
WHERE date_dim.d_week_seq = wswscs.d_week_seq
AND d_year = 2001) y,
(SELECT wswscs.d_week_seq d_week_seq2,
sun_sales sun_sales2,
mon_sales mon_sales2,
tue_sales tue_sales2,
wed_sales wed_sales2,
thu_sales thu_sales2,
fri_sales fri_sales2,
sat_sales sat_sales2
FROM wswscs,
date_dim
WHERE date_dim.d_week_seq = wswscs.d_week_seq
AND d_year = 2001+1) z
WHERE d_week_seq1 = d_week_seq2-53
ORDER BY d_week_seq1 NULLS FIRST;
18 changes: 18 additions & 0 deletions benchmarking/tpcds/queries/03.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SELECT dt.d_year,
item.i_brand_id brand_id,
item.i_brand brand,
sum(ss_ext_sales_price) sum_agg
FROM date_dim dt,
store_sales,
item
WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
AND store_sales.ss_item_sk = item.i_item_sk
AND item.i_manufact_id = 128
AND dt.d_moy=11
GROUP BY dt.d_year,
item.i_brand,
item.i_brand_id
ORDER BY dt.d_year,
sum_agg DESC,
brand_id
LIMIT 100;
119 changes: 119 additions & 0 deletions benchmarking/tpcds/queries/04.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
WITH year_total AS
(SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total,
's' sale_type
FROM customer,
store_sales,
date_dim
WHERE c_customer_sk = ss_customer_sk
AND ss_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year
UNION ALL SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2)) year_total,
'c' sale_type
FROM customer,
catalog_sales,
date_dim
WHERE c_customer_sk = cs_bill_customer_sk
AND cs_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year
UNION ALL SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2)) year_total,
'w' sale_type
FROM customer,
web_sales,
date_dim
WHERE c_customer_sk = ws_bill_customer_sk
AND ws_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year)
SELECT t_s_secyear.customer_id,
t_s_secyear.customer_first_name,
t_s_secyear.customer_last_name,
t_s_secyear.customer_preferred_cust_flag
FROM year_total t_s_firstyear,
year_total t_s_secyear,
year_total t_c_firstyear,
year_total t_c_secyear,
year_total t_w_firstyear,
year_total t_w_secyear
WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id
AND t_s_firstyear.customer_id = t_c_secyear.customer_id
AND t_s_firstyear.customer_id = t_c_firstyear.customer_id
AND t_s_firstyear.customer_id = t_w_firstyear.customer_id
AND t_s_firstyear.customer_id = t_w_secyear.customer_id
AND t_s_firstyear.sale_type = 's'
AND t_c_firstyear.sale_type = 'c'
AND t_w_firstyear.sale_type = 'w'
AND t_s_secyear.sale_type = 's'
AND t_c_secyear.sale_type = 'c'
AND t_w_secyear.sale_type = 'w'
AND t_s_firstyear.dyear = 2001
AND t_s_secyear.dyear = 2001+1
AND t_c_firstyear.dyear = 2001
AND t_c_secyear.dyear = 2001+1
AND t_w_firstyear.dyear = 2001
AND t_w_secyear.dyear = 2001+1
AND t_s_firstyear.year_total > 0
AND t_c_firstyear.year_total > 0
AND t_w_firstyear.year_total > 0
AND CASE
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total
ELSE NULL
END > CASE
WHEN t_s_firstyear.year_total > 0 THEN t_s_secyear.year_total / t_s_firstyear.year_total
ELSE NULL
END
AND CASE
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total
ELSE NULL
END > CASE
WHEN t_w_firstyear.year_total > 0 THEN t_w_secyear.year_total / t_w_firstyear.year_total
ELSE NULL
END
ORDER BY t_s_secyear.customer_id NULLS FIRST,
t_s_secyear.customer_first_name NULLS FIRST,
t_s_secyear.customer_last_name NULLS FIRST,
t_s_secyear.customer_preferred_cust_flag NULLS FIRST
LIMIT 100;
Loading
Loading