Skip to content

Commit

Permalink
[CHORE]: tpc-ds datagen (Eventual-Inc#3103)
Browse files Browse the repository at this point in the history
a whole bunch of boilerplate for tpc-ds benchmarking and testing. 

wanted to keep this separate from others as there's not much
functionality here, just adding a `dsdgen` command to the makefile to
generate tpc-ds datasets. I called it `dsdgen` because that's what
duckdb calls it, and this uses the duckdb implementation to generate all
of the datasets.

The answers were copied from
[duckdb/duckdb/extension/tpcds/dsdgen/answers](https://github.com/duckdb/duckdb/tree/10c42435f1805ee4415faa5d6da4943e8c98fa55/extension/tpcds/dsdgen/answers)

Usage:

```sh
# defaults to sf=1 and dir=data/tpc-ds
> make dsdgen
> make dsdgen SCALE_FACTOR=<scale_factor> OUTPUT_DIR=<output_dir>
```

## Notes for reviewer

Most files here are boilerplate. 

The only relevant files are:
- Makefile
- requirements_dev.txt
- benchmarking/tpc-ds/datagen.py
  • Loading branch information
universalmind303 authored and sagiahrac committed Nov 4, 2024
1 parent d91a856 commit cc73b36
Show file tree
Hide file tree
Showing 103 changed files with 5,493 additions and 1 deletion.
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ IS_M1 ?= 0
HYPOTHESIS_MAX_EXAMPLES ?= 100
HYPOTHESIS_SEED ?= 0

# TPC-DS
SCALE_FACTOR ?= 1
OUTPUT_DIR ?= data/tpc-ds/


ifeq ($(OS),Windows_NT)
VENV_BIN=$(VENV)/Scripts
else
Expand Down Expand Up @@ -56,6 +61,10 @@ build-release: check-toolchain .venv ## Compile and install a faster Daft binar
test: .venv build ## Run tests
HYPOTHESIS_MAX_EXAMPLES=$(HYPOTHESIS_MAX_EXAMPLES) $(VENV_BIN)/pytest --hypothesis-seed=$(HYPOTHESIS_SEED)

.PHONY: dsdgen
dsdgen: .venv ## Generate TPC-DS data
$(VENV_BIN)/python benchmarking/tpcds/datagen.py --scale-factor=$(SCALE_FACTOR) --tpcds-gen-folder=$(OUTPUT_DIR)

.PHONY: clean
clean:
rm -rf $(VENV)
39 changes: 39 additions & 0 deletions benchmarking/tpcds/datagen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import argparse
import logging
import os

import duckdb

logger = logging.getLogger(__name__)


def gen_tpcds(basedir: str, scale_factor: float):
db = duckdb.connect(f"{basedir}/tpcds.db")
db.sql(f"call dsdgen(sf = {scale_factor})")
if not os.path.exists(basedir):
os.makedirs(basedir)
for item in db.sql("show tables").fetchall():
tbl = item[0]
print(f"Exporting {tbl} to {basedir}/{tbl}.parquet")
db.sql(f"COPY {tbl} TO '{basedir}/{tbl}.parquet'")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tpcds-gen-folder",
default="data/tpcds-dbgen",
help="Path to the folder containing the TPCH dbgen tool and generated data",
)
parser.add_argument("--scale-factor", default=0.01, help="Scale factor to run on in GB", type=float)

args = parser.parse_args()
num_parts = args.scale_factor

logger.info(
"Generating data at %s with: scale_factor=%s",
args.tpch_gen_folder,
args.scale_factor,
)

gen_tpcds(basedir=args.tpch_gen_folder, scale_factor=args.scale_factor)
23 changes: 23 additions & 0 deletions benchmarking/tpcds/queries/01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
WITH customer_total_return AS
(SELECT sr_customer_sk AS ctr_customer_sk,
sr_store_sk AS ctr_store_sk,
sum(sr_return_amt) AS ctr_total_return
FROM store_returns,
date_dim
WHERE sr_returned_date_sk = d_date_sk
AND d_year = 2000
GROUP BY sr_customer_sk,
sr_store_sk)
SELECT c_customer_id
FROM customer_total_return ctr1,
store,
customer
WHERE ctr1.ctr_total_return >
(SELECT avg(ctr_total_return)*1.2
FROM customer_total_return ctr2
WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk)
AND s_store_sk = ctr1.ctr_store_sk
AND s_state = 'TN'
AND ctr1.ctr_customer_sk = c_customer_sk
ORDER BY c_customer_id
LIMIT 100;
79 changes: 79 additions & 0 deletions benchmarking/tpcds/queries/02.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
WITH wscs AS
(SELECT sold_date_sk,
sales_price
FROM
(SELECT ws_sold_date_sk sold_date_sk,
ws_ext_sales_price sales_price
FROM web_sales
UNION ALL SELECT cs_sold_date_sk sold_date_sk,
cs_ext_sales_price sales_price
FROM catalog_sales) sq1),
wswscs AS
(SELECT d_week_seq,
sum(CASE
WHEN (d_day_name='Sunday') THEN sales_price
ELSE NULL
END) sun_sales,
sum(CASE
WHEN (d_day_name='Monday') THEN sales_price
ELSE NULL
END) mon_sales,
sum(CASE
WHEN (d_day_name='Tuesday') THEN sales_price
ELSE NULL
END) tue_sales,
sum(CASE
WHEN (d_day_name='Wednesday') THEN sales_price
ELSE NULL
END) wed_sales,
sum(CASE
WHEN (d_day_name='Thursday') THEN sales_price
ELSE NULL
END) thu_sales,
sum(CASE
WHEN (d_day_name='Friday') THEN sales_price
ELSE NULL
END) fri_sales,
sum(CASE
WHEN (d_day_name='Saturday') THEN sales_price
ELSE NULL
END) sat_sales
FROM wscs,
date_dim
WHERE d_date_sk = sold_date_sk
GROUP BY d_week_seq)
SELECT d_week_seq1,
round(sun_sales1/sun_sales2, 2) r1,
round(mon_sales1/mon_sales2, 2) r2,
round(tue_sales1/tue_sales2, 2) r3,
round(wed_sales1/wed_sales2, 2) r4,
round(thu_sales1/thu_sales2, 2) r5,
round(fri_sales1/fri_sales2, 2) r6,
round(sat_sales1/sat_sales2, 2)
FROM
(SELECT wswscs.d_week_seq d_week_seq1,
sun_sales sun_sales1,
mon_sales mon_sales1,
tue_sales tue_sales1,
wed_sales wed_sales1,
thu_sales thu_sales1,
fri_sales fri_sales1,
sat_sales sat_sales1
FROM wswscs,
date_dim
WHERE date_dim.d_week_seq = wswscs.d_week_seq
AND d_year = 2001) y,
(SELECT wswscs.d_week_seq d_week_seq2,
sun_sales sun_sales2,
mon_sales mon_sales2,
tue_sales tue_sales2,
wed_sales wed_sales2,
thu_sales thu_sales2,
fri_sales fri_sales2,
sat_sales sat_sales2
FROM wswscs,
date_dim
WHERE date_dim.d_week_seq = wswscs.d_week_seq
AND d_year = 2001+1) z
WHERE d_week_seq1 = d_week_seq2-53
ORDER BY d_week_seq1 NULLS FIRST;
18 changes: 18 additions & 0 deletions benchmarking/tpcds/queries/03.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
SELECT dt.d_year,
item.i_brand_id brand_id,
item.i_brand brand,
sum(ss_ext_sales_price) sum_agg
FROM date_dim dt,
store_sales,
item
WHERE dt.d_date_sk = store_sales.ss_sold_date_sk
AND store_sales.ss_item_sk = item.i_item_sk
AND item.i_manufact_id = 128
AND dt.d_moy=11
GROUP BY dt.d_year,
item.i_brand,
item.i_brand_id
ORDER BY dt.d_year,
sum_agg DESC,
brand_id
LIMIT 100;
119 changes: 119 additions & 0 deletions benchmarking/tpcds/queries/04.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
WITH year_total AS
(SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total,
's' sale_type
FROM customer,
store_sales,
date_dim
WHERE c_customer_sk = ss_customer_sk
AND ss_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year
UNION ALL SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2)) year_total,
'c' sale_type
FROM customer,
catalog_sales,
date_dim
WHERE c_customer_sk = cs_bill_customer_sk
AND cs_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year
UNION ALL SELECT c_customer_id customer_id,
c_first_name customer_first_name,
c_last_name customer_last_name,
c_preferred_cust_flag customer_preferred_cust_flag,
c_birth_country customer_birth_country,
c_login customer_login,
c_email_address customer_email_address,
d_year dyear,
sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2)) year_total,
'w' sale_type
FROM customer,
web_sales,
date_dim
WHERE c_customer_sk = ws_bill_customer_sk
AND ws_sold_date_sk = d_date_sk
GROUP BY c_customer_id,
c_first_name,
c_last_name,
c_preferred_cust_flag,
c_birth_country,
c_login,
c_email_address,
d_year)
SELECT t_s_secyear.customer_id,
t_s_secyear.customer_first_name,
t_s_secyear.customer_last_name,
t_s_secyear.customer_preferred_cust_flag
FROM year_total t_s_firstyear,
year_total t_s_secyear,
year_total t_c_firstyear,
year_total t_c_secyear,
year_total t_w_firstyear,
year_total t_w_secyear
WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id
AND t_s_firstyear.customer_id = t_c_secyear.customer_id
AND t_s_firstyear.customer_id = t_c_firstyear.customer_id
AND t_s_firstyear.customer_id = t_w_firstyear.customer_id
AND t_s_firstyear.customer_id = t_w_secyear.customer_id
AND t_s_firstyear.sale_type = 's'
AND t_c_firstyear.sale_type = 'c'
AND t_w_firstyear.sale_type = 'w'
AND t_s_secyear.sale_type = 's'
AND t_c_secyear.sale_type = 'c'
AND t_w_secyear.sale_type = 'w'
AND t_s_firstyear.dyear = 2001
AND t_s_secyear.dyear = 2001+1
AND t_c_firstyear.dyear = 2001
AND t_c_secyear.dyear = 2001+1
AND t_w_firstyear.dyear = 2001
AND t_w_secyear.dyear = 2001+1
AND t_s_firstyear.year_total > 0
AND t_c_firstyear.year_total > 0
AND t_w_firstyear.year_total > 0
AND CASE
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total
ELSE NULL
END > CASE
WHEN t_s_firstyear.year_total > 0 THEN t_s_secyear.year_total / t_s_firstyear.year_total
ELSE NULL
END
AND CASE
WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / t_c_firstyear.year_total
ELSE NULL
END > CASE
WHEN t_w_firstyear.year_total > 0 THEN t_w_secyear.year_total / t_w_firstyear.year_total
ELSE NULL
END
ORDER BY t_s_secyear.customer_id NULLS FIRST,
t_s_secyear.customer_first_name NULLS FIRST,
t_s_secyear.customer_last_name NULLS FIRST,
t_s_secyear.customer_preferred_cust_flag NULLS FIRST
LIMIT 100;
Loading

0 comments on commit cc73b36

Please sign in to comment.