forked from Eventual-Inc/Daft
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[CHORE]: tpc-ds datagen (Eventual-Inc#3103)
a whole bunch of boilerplate for tpc-ds benchmarking and testing. wanted to keep this separate from others as there's not much functionality here, just adding a `dsdgen` command to the makefile to generate tpc-ds datasets. I called it `dsdgen` because that's what duckdb calls it, and this uses the duckdb implementation to generate all of the datasets. The answers were copied from [duckdb/duckdb/extension/tpcds/dsdgen/answers](https://github.com/duckdb/duckdb/tree/10c42435f1805ee4415faa5d6da4943e8c98fa55/extension/tpcds/dsdgen/answers) Usage: ```sh # defaults to sf=1 and dir=data/tpc-ds > make dsdgen > make dsdgen SCALE_FACTOR=<scale_factor> OUTPUT_DIR=<output_dir> ``` ## Notes for reviewer Most files here are boilerplate. The only relevant files are: - Makefile - requirements_dev.txt - benchmarking/tpc-ds/datagen.py
- Loading branch information
1 parent
d91a856
commit cc73b36
Showing
103 changed files
with
5,493 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import argparse | ||
import logging | ||
import os | ||
|
||
import duckdb | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def gen_tpcds(basedir: str, scale_factor: float): | ||
db = duckdb.connect(f"{basedir}/tpcds.db") | ||
db.sql(f"call dsdgen(sf = {scale_factor})") | ||
if not os.path.exists(basedir): | ||
os.makedirs(basedir) | ||
for item in db.sql("show tables").fetchall(): | ||
tbl = item[0] | ||
print(f"Exporting {tbl} to {basedir}/{tbl}.parquet") | ||
db.sql(f"COPY {tbl} TO '{basedir}/{tbl}.parquet'") | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument( | ||
"--tpcds-gen-folder", | ||
default="data/tpcds-dbgen", | ||
help="Path to the folder containing the TPCH dbgen tool and generated data", | ||
) | ||
parser.add_argument("--scale-factor", default=0.01, help="Scale factor to run on in GB", type=float) | ||
|
||
args = parser.parse_args() | ||
num_parts = args.scale_factor | ||
|
||
logger.info( | ||
"Generating data at %s with: scale_factor=%s", | ||
args.tpch_gen_folder, | ||
args.scale_factor, | ||
) | ||
|
||
gen_tpcds(basedir=args.tpch_gen_folder, scale_factor=args.scale_factor) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
WITH customer_total_return AS | ||
(SELECT sr_customer_sk AS ctr_customer_sk, | ||
sr_store_sk AS ctr_store_sk, | ||
sum(sr_return_amt) AS ctr_total_return | ||
FROM store_returns, | ||
date_dim | ||
WHERE sr_returned_date_sk = d_date_sk | ||
AND d_year = 2000 | ||
GROUP BY sr_customer_sk, | ||
sr_store_sk) | ||
SELECT c_customer_id | ||
FROM customer_total_return ctr1, | ||
store, | ||
customer | ||
WHERE ctr1.ctr_total_return > | ||
(SELECT avg(ctr_total_return)*1.2 | ||
FROM customer_total_return ctr2 | ||
WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk) | ||
AND s_store_sk = ctr1.ctr_store_sk | ||
AND s_state = 'TN' | ||
AND ctr1.ctr_customer_sk = c_customer_sk | ||
ORDER BY c_customer_id | ||
LIMIT 100; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
WITH wscs AS | ||
(SELECT sold_date_sk, | ||
sales_price | ||
FROM | ||
(SELECT ws_sold_date_sk sold_date_sk, | ||
ws_ext_sales_price sales_price | ||
FROM web_sales | ||
UNION ALL SELECT cs_sold_date_sk sold_date_sk, | ||
cs_ext_sales_price sales_price | ||
FROM catalog_sales) sq1), | ||
wswscs AS | ||
(SELECT d_week_seq, | ||
sum(CASE | ||
WHEN (d_day_name='Sunday') THEN sales_price | ||
ELSE NULL | ||
END) sun_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Monday') THEN sales_price | ||
ELSE NULL | ||
END) mon_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Tuesday') THEN sales_price | ||
ELSE NULL | ||
END) tue_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Wednesday') THEN sales_price | ||
ELSE NULL | ||
END) wed_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Thursday') THEN sales_price | ||
ELSE NULL | ||
END) thu_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Friday') THEN sales_price | ||
ELSE NULL | ||
END) fri_sales, | ||
sum(CASE | ||
WHEN (d_day_name='Saturday') THEN sales_price | ||
ELSE NULL | ||
END) sat_sales | ||
FROM wscs, | ||
date_dim | ||
WHERE d_date_sk = sold_date_sk | ||
GROUP BY d_week_seq) | ||
SELECT d_week_seq1, | ||
round(sun_sales1/sun_sales2, 2) r1, | ||
round(mon_sales1/mon_sales2, 2) r2, | ||
round(tue_sales1/tue_sales2, 2) r3, | ||
round(wed_sales1/wed_sales2, 2) r4, | ||
round(thu_sales1/thu_sales2, 2) r5, | ||
round(fri_sales1/fri_sales2, 2) r6, | ||
round(sat_sales1/sat_sales2, 2) | ||
FROM | ||
(SELECT wswscs.d_week_seq d_week_seq1, | ||
sun_sales sun_sales1, | ||
mon_sales mon_sales1, | ||
tue_sales tue_sales1, | ||
wed_sales wed_sales1, | ||
thu_sales thu_sales1, | ||
fri_sales fri_sales1, | ||
sat_sales sat_sales1 | ||
FROM wswscs, | ||
date_dim | ||
WHERE date_dim.d_week_seq = wswscs.d_week_seq | ||
AND d_year = 2001) y, | ||
(SELECT wswscs.d_week_seq d_week_seq2, | ||
sun_sales sun_sales2, | ||
mon_sales mon_sales2, | ||
tue_sales tue_sales2, | ||
wed_sales wed_sales2, | ||
thu_sales thu_sales2, | ||
fri_sales fri_sales2, | ||
sat_sales sat_sales2 | ||
FROM wswscs, | ||
date_dim | ||
WHERE date_dim.d_week_seq = wswscs.d_week_seq | ||
AND d_year = 2001+1) z | ||
WHERE d_week_seq1 = d_week_seq2-53 | ||
ORDER BY d_week_seq1 NULLS FIRST; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
SELECT dt.d_year, | ||
item.i_brand_id brand_id, | ||
item.i_brand brand, | ||
sum(ss_ext_sales_price) sum_agg | ||
FROM date_dim dt, | ||
store_sales, | ||
item | ||
WHERE dt.d_date_sk = store_sales.ss_sold_date_sk | ||
AND store_sales.ss_item_sk = item.i_item_sk | ||
AND item.i_manufact_id = 128 | ||
AND dt.d_moy=11 | ||
GROUP BY dt.d_year, | ||
item.i_brand, | ||
item.i_brand_id | ||
ORDER BY dt.d_year, | ||
sum_agg DESC, | ||
brand_id | ||
LIMIT 100; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
WITH year_total AS | ||
(SELECT c_customer_id customer_id, | ||
c_first_name customer_first_name, | ||
c_last_name customer_last_name, | ||
c_preferred_cust_flag customer_preferred_cust_flag, | ||
c_birth_country customer_birth_country, | ||
c_login customer_login, | ||
c_email_address customer_email_address, | ||
d_year dyear, | ||
sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total, | ||
's' sale_type | ||
FROM customer, | ||
store_sales, | ||
date_dim | ||
WHERE c_customer_sk = ss_customer_sk | ||
AND ss_sold_date_sk = d_date_sk | ||
GROUP BY c_customer_id, | ||
c_first_name, | ||
c_last_name, | ||
c_preferred_cust_flag, | ||
c_birth_country, | ||
c_login, | ||
c_email_address, | ||
d_year | ||
UNION ALL SELECT c_customer_id customer_id, | ||
c_first_name customer_first_name, | ||
c_last_name customer_last_name, | ||
c_preferred_cust_flag customer_preferred_cust_flag, | ||
c_birth_country customer_birth_country, | ||
c_login customer_login, | ||
c_email_address customer_email_address, | ||
d_year dyear, | ||
sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2)) year_total, | ||
'c' sale_type | ||
FROM customer, | ||
catalog_sales, | ||
date_dim | ||
WHERE c_customer_sk = cs_bill_customer_sk | ||
AND cs_sold_date_sk = d_date_sk | ||
GROUP BY c_customer_id, | ||
c_first_name, | ||
c_last_name, | ||
c_preferred_cust_flag, | ||
c_birth_country, | ||
c_login, | ||
c_email_address, | ||
d_year | ||
UNION ALL SELECT c_customer_id customer_id, | ||
c_first_name customer_first_name, | ||
c_last_name customer_last_name, | ||
c_preferred_cust_flag customer_preferred_cust_flag, | ||
c_birth_country customer_birth_country, | ||
c_login customer_login, | ||
c_email_address customer_email_address, | ||
d_year dyear, | ||
sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2)) year_total, | ||
'w' sale_type | ||
FROM customer, | ||
web_sales, | ||
date_dim | ||
WHERE c_customer_sk = ws_bill_customer_sk | ||
AND ws_sold_date_sk = d_date_sk | ||
GROUP BY c_customer_id, | ||
c_first_name, | ||
c_last_name, | ||
c_preferred_cust_flag, | ||
c_birth_country, | ||
c_login, | ||
c_email_address, | ||
d_year) | ||
SELECT t_s_secyear.customer_id, | ||
t_s_secyear.customer_first_name, | ||
t_s_secyear.customer_last_name, | ||
t_s_secyear.customer_preferred_cust_flag | ||
FROM year_total t_s_firstyear, | ||
year_total t_s_secyear, | ||
year_total t_c_firstyear, | ||
year_total t_c_secyear, | ||
year_total t_w_firstyear, | ||
year_total t_w_secyear | ||
WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id | ||
AND t_s_firstyear.customer_id = t_c_secyear.customer_id | ||
AND t_s_firstyear.customer_id = t_c_firstyear.customer_id | ||
AND t_s_firstyear.customer_id = t_w_firstyear.customer_id | ||
AND t_s_firstyear.customer_id = t_w_secyear.customer_id | ||
AND t_s_firstyear.sale_type = 's' | ||
AND t_c_firstyear.sale_type = 'c' | ||
AND t_w_firstyear.sale_type = 'w' | ||
AND t_s_secyear.sale_type = 's' | ||
AND t_c_secyear.sale_type = 'c' | ||
AND t_w_secyear.sale_type = 'w' | ||
AND t_s_firstyear.dyear = 2001 | ||
AND t_s_secyear.dyear = 2001+1 | ||
AND t_c_firstyear.dyear = 2001 | ||
AND t_c_secyear.dyear = 2001+1 | ||
AND t_w_firstyear.dyear = 2001 | ||
AND t_w_secyear.dyear = 2001+1 | ||
AND t_s_firstyear.year_total > 0 | ||
AND t_c_firstyear.year_total > 0 | ||
AND t_w_firstyear.year_total > 0 | ||
AND CASE | ||
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total | ||
ELSE NULL | ||
END > CASE | ||
WHEN t_s_firstyear.year_total > 0 THEN t_s_secyear.year_total / t_s_firstyear.year_total | ||
ELSE NULL | ||
END | ||
AND CASE | ||
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total | ||
ELSE NULL | ||
END > CASE | ||
WHEN t_w_firstyear.year_total > 0 THEN t_w_secyear.year_total / t_w_firstyear.year_total | ||
ELSE NULL | ||
END | ||
ORDER BY t_s_secyear.customer_id NULLS FIRST, | ||
t_s_secyear.customer_first_name NULLS FIRST, | ||
t_s_secyear.customer_last_name NULLS FIRST, | ||
t_s_secyear.customer_preferred_cust_flag NULLS FIRST | ||
LIMIT 100; |
Oops, something went wrong.