This code allows to enhance the queries plans with data-driven cardinalities.
In addition, install the following packages
pip3 install spflow --no-deps
pip3 install -r requirements.txt
Generate HDF files
python3 deepdb.py --generate_all_hdfs --csv_path ../zero-shot-data/datasets --hdf_path ../zero-shot-data/deepdb/datasets --max_rows_per_hdf_file 100000000
Train DeepDB models. For each dataset, we will train both binary RSPNs (for every join) as well as single table RSPNs.
timeout 14400 python3 deepdb.py --generate_ensemble --dataset airline --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/airline --hdf_path ../zero-shot-data/deepdb/datasets/airline/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/airline/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset imdb --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/imdb --hdf_path ../zero-shot-data/deepdb/datasets/imdb/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/imdb/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset ssb --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/ssb --hdf_path ../zero-shot-data/deepdb/datasets/ssb/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/ssb/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset tpc_h --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/tpc_h --hdf_path ../zero-shot-data/deepdb/datasets/tpc_h/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/tpc_h/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset walmart --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/walmart --hdf_path ../zero-shot-data/deepdb/datasets/walmart/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/walmart/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset financial --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/financial --hdf_path ../zero-shot-data/deepdb/datasets/financial/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/financial/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset basketball --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/basketball --hdf_path ../zero-shot-data/deepdb/datasets/basketball/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/basketball/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset accidents --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/accidents --hdf_path ../zero-shot-data/deepdb/datasets/accidents/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/accidents/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset movielens --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/movielens --hdf_path ../zero-shot-data/deepdb/datasets/movielens/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/movielens/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset baseball --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/baseball --hdf_path ../zero-shot-data/deepdb/datasets/baseball/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/baseball/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset hepatitis --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/hepatitis --hdf_path ../zero-shot-data/deepdb/datasets/hepatitis/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/hepatitis/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset tournament --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/tournament --hdf_path ../zero-shot-data/deepdb/datasets/tournament/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/tournament/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset credit --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/credit --hdf_path ../zero-shot-data/deepdb/datasets/credit/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/credit/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset employee --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/employee --hdf_path ../zero-shot-data/deepdb/datasets/employee/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/employee/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset consumer --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/consumer --hdf_path ../zero-shot-data/deepdb/datasets/consumer/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/consumer/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset geneea --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/geneea --hdf_path ../zero-shot-data/deepdb/datasets/geneea/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/geneea/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset genome --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/genome --hdf_path ../zero-shot-data/deepdb/datasets/genome/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/genome/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset carcinogenesis --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/carcinogenesis --hdf_path ../zero-shot-data/deepdb/datasets/carcinogenesis/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/carcinogenesis/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset seznam --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/seznam --hdf_path ../zero-shot-data/deepdb/datasets/seznam/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/seznam/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset fhnk --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy single --csv_path ../zero-shot-data/datasets/fhnk --hdf_path ../zero-shot-data/deepdb/datasets/fhnk/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/fhnk/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset airline --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/airline --hdf_path ../zero-shot-data/deepdb/datasets/airline/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/airline/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset imdb --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/imdb --hdf_path ../zero-shot-data/deepdb/datasets/imdb/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/imdb/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset ssb --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/ssb --hdf_path ../zero-shot-data/deepdb/datasets/ssb/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/ssb/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset tpc_h --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/tpc_h --hdf_path ../zero-shot-data/deepdb/datasets/tpc_h/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/tpc_h/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset walmart --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/walmart --hdf_path ../zero-shot-data/deepdb/datasets/walmart/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/walmart/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset financial --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/financial --hdf_path ../zero-shot-data/deepdb/datasets/financial/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/financial/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset basketball --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/basketball --hdf_path ../zero-shot-data/deepdb/datasets/basketball/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/basketball/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset accidents --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/accidents --hdf_path ../zero-shot-data/deepdb/datasets/accidents/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/accidents/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset movielens --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/movielens --hdf_path ../zero-shot-data/deepdb/datasets/movielens/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/movielens/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset baseball --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/baseball --hdf_path ../zero-shot-data/deepdb/datasets/baseball/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/baseball/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset hepatitis --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/hepatitis --hdf_path ../zero-shot-data/deepdb/datasets/hepatitis/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/hepatitis/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset tournament --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/tournament --hdf_path ../zero-shot-data/deepdb/datasets/tournament/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/tournament/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset credit --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/credit --hdf_path ../zero-shot-data/deepdb/datasets/credit/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/credit/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset employee --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/employee --hdf_path ../zero-shot-data/deepdb/datasets/employee/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/employee/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset consumer --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/consumer --hdf_path ../zero-shot-data/deepdb/datasets/consumer/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/consumer/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset geneea --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/geneea --hdf_path ../zero-shot-data/deepdb/datasets/geneea/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/geneea/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset genome --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/genome --hdf_path ../zero-shot-data/deepdb/datasets/genome/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/genome/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset carcinogenesis --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/carcinogenesis --hdf_path ../zero-shot-data/deepdb/datasets/carcinogenesis/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/carcinogenesis/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset seznam --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/seznam --hdf_path ../zero-shot-data/deepdb/datasets/seznam/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/seznam/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
timeout 14400 python3 deepdb.py --generate_ensemble --dataset fhnk --samples_per_spn 10000000 10000000 1000000 1000000 1000000 --ensemble_strategy relationship --csv_path ../zero-shot-data/datasets/fhnk --hdf_path ../zero-shot-data/deepdb/datasets/fhnk/gen_single_light --ensemble_path ../zero-shot-data/deepdb/datasets/fhnk/spn_ensembles --max_rows_per_hdf_file 100000000 --rdc_threshold 0.3 --post_sampling_factor 10 10 5 1 1
Note that we need special parsing here (--parse_baseline)
DATASETS=("credit" "accidents" "airline" "hockey" "employee" "basketball" "baseball" "ssb" "carcinogenesis" "financial" "fhnk" "consumer" "geneea" "genome" "hepatitis" "movielens" "seznam" "tournament" "walmart" "tpc_h" "imdb")
for DS in ${DATASETS[*]}; do
echo $DS;
python3 run_benchmark.py --parse_baseline --parse_run --database postgres --source ../zero-shot-data/runs/raw/$DS/workload_100k_s1_c8220.json --target ../zero-shot-data/runs/parsed_plans/$DS/workload_100k_s1_c8220.json
done;
Augment entire workload
python3 deepdb.py --augment_all_workload_cardinalities --csv_path ../zero-shot-data/datasets/ --ensemble_location ../zero-shot-data/deepdb/datasets/ --source_workload_dir ../zero-shot-data/runs/parsed_plans/ --source_workloads index_workload_100k_s2_c8220.json job-light_c8220.json scale_c8220.json synthetic_c8220.json workload_100k_s1_c8220.json --workload_target_paths ../zero-shot-data/runs/deepdb_augmented/
In case, just a single workload should be annotated, use
python3 deepdb.py --augment_cardinalities --dataset walmart --csv_path ../zero-shot-data/datasets/walmart --ensemble_location ../zero-shot-data/deepdb/datasets/walmart/spn_ensembles/ensemble_relationships_walmart_10000000.pkl --source_workloads ../zero-shot-data/runs/parsed_plans/walmart/workload_100k_s1_c8220.json --workload_target_paths ../zero-shot-data/runs/deepdb_augmented/walmart/workload_100k_s1_c8220.json