Skip to content

Commit

Permalink
Merge pull request #98 from snowplow/feature/web/bigquery/integration…
Browse files Browse the repository at this point in the history
…_tests

Feature/web/bigquery/integration tests
  • Loading branch information
bill-warner authored Jun 16, 2021
2 parents a0e2f32 + 0be98b5 commit a38e76b
Show file tree
Hide file tree
Showing 21 changed files with 283 additions and 10 deletions.
36 changes: 36 additions & 0 deletions .scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Note that this script does not enforce dependencies, rather runs the playbooks i
-d (dryRun) use sql-runner dry run
-o (output path) path to store output of sql-runner to sql file (to be used in conjunction with p)
-t (target template) path to target template to use (minimizes risk of credential leak)
-v (variable template) path to variable template. Any variables in this template will override any corresponding variables within each playbook for the run.
```

**Examples:**
Expand Down Expand Up @@ -178,6 +179,41 @@ bash .scripts/pr_check.sh -b ~/pathTo/sql-runner -d bigquery -m web;
# Runs the pr check testing script against bigquery
```

## integration_test.sh

Runs 5 end-to-end runs of the standard model in 1 day increments, using the integration test dataset. The actual derived tables are then checked against the expected derived tables. The standard tests are also performed on the derived tables.

We recommend using a virtual environment for python, eg. `pyenv` or `virtualenv` - for example using the latter:

```bash
virtualenv ~/myenv
source ~/myenv/bin/activate
```

Before running, make sure to install python requirements (python3 required):

```bash
cd data-models/.test
pip3 install -r requirements.txt
```

**Arguments:**

```
-b (binary) path to sql-runner binary [required]
-d (database) target database for expectations [required]
-a (auth) optional credentials for database target
-m (model) target model to run i.e. web or mobile [required]
```

**Examples:**

```bash
bash .scripts/integration_test.sh -b ~/pathTo/sql-runner -d bigquery -m web

# Runs the integration testing script against bigquery
```

### `run_playbooks.sh` (deprecated)

Deprecated - `run_config.sh` provides a simpler instrumentation for this functionality.
Expand Down
1 change: 1 addition & 0 deletions .scripts/e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# -b (binary) path to sql-runner binary
# -d (database) target database for expectations
# -a (auth) optional credentials for database target
# -m (model) target model to run i.e. web or mobile

while getopts 'b:d:a:m:' v
do
Expand Down
54 changes: 54 additions & 0 deletions .scripts/integration_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash

# Runs 5 end-to-end runs of the standard model against the integration test
# dataset, then validates the derived tables against the expected results
# and runs the standard test suites on them.
#
# Expected input:
# -b (binary) path to sql-runner binary [required]
# -d (database) target database for expectations [required]
# -a (auth) optional credentials for database target
# -m (model) target model to run i.e. web or mobile [required]

while getopts 'b:d:a:m:' opt
do
  case $opt in
    b) SQL_RUNNER_PATH=$OPTARG ;;
    d) DATABASE=$OPTARG ;;
    a) CREDENTIALS=$OPTARG ;;
    m) MODEL=$OPTARG ;;
  esac
done

repo_root_path=$( cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd -P )
script_path="${repo_root_path}/.scripts"
config_dir="${repo_root_path}/${MODEL}/v1/${DATABASE}/sql-runner/configs"

# Credentials reach sql-runner via env vars; keep any already-exported values
# (e.g. CI secrets) in preference to the -a argument.
export BIGQUERY_CREDS=${BIGQUERY_CREDS:-$CREDENTIALS}
export REDSHIFT_PASSWORD=${REDSHIFT_PASSWORD:-$CREDENTIALS}
export SNOWFLAKE_PASSWORD=${SNOWFLAKE_PASSWORD:-$CREDENTIALS}

echo "integration_check: Starting 5 runs"

for i in {1..5}; do

  echo "integration_check: Starting run $i";

  # Use the parsed binary path (-b) rather than a hard-coded 'sql-runner',
  # and invoke run_config.sh via script_path so the script works from any
  # working directory. NOTE: the -v argument stays repo-root-relative on
  # purpose — run_config.sh prefixes it with its own root path.
  bash "$script_path/run_config.sh" -b "$SQL_RUNNER_PATH" -c "$config_dir/pre_test.json" -t "$script_path/templates/$DATABASE.yml.tmpl" -v ".test/integration_tests/$MODEL/v1/${DATABASE}_variables.yml.tmpl" || exit 1;

  echo "integration_check: Checking actual vs. expected for the events_staged table";

  bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c "events_staged_integration_test_${i}" || exit 1;

  bash "$script_path/run_config.sh" -b "$SQL_RUNNER_PATH" -c "$config_dir/post_test.json" -t "$script_path/templates/$DATABASE.yml.tmpl" -v ".test/integration_tests/$MODEL/v1/${DATABASE}_variables.yml.tmpl" || exit 1;

  echo "integration_check: run $i done";

done || exit 1

echo "integration_check: Checking actual vs. expected for derived tables";

bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c perm_integration_test_tables || exit 1;

echo "integration_check: Checking standard tests against derived tables";

bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c perm_tables || exit 1;

echo "integration_check: Done"
1 change: 1 addition & 0 deletions .scripts/pr_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# -b (binary) path to sql-runner binary
# -d (database) target database for expectations
# -a (auth) optional credentials for database target
# -m (model) target model to run i.e. web or mobile

while getopts 'b:d:a:m:' v
do
Expand Down
15 changes: 12 additions & 3 deletions .scripts/run_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,19 @@
# -d (dryRun) use sql-runner dry run
# -o (output path) path to store output of sql-runner to sql file (to be used in conjunction with p)
# -t (target template) path to target template to use (minimizes risk of credential leak)
# -v (variables template) path to variables template to use

while getopts 'pdb:c:a:o:t:' v
while getopts 'pdb:c:a:o:t:v:' opt
do
case $v in
case $opt in
b) SQL_RUNNER_PATH=$OPTARG ;;
c) CONFIG_PATH=$OPTARG ;;
a) CREDENTIALS=$OPTARG ;;
p) FILL_TEMPLATES='-fillTemplates' ;;
d) DRY_RUN='-dryRun' ;;
o) OUTPUT_PATH=$OPTARG ;;
t) TARGET_TEMPLATE=$OPTARG
t) TARGET_TEMPLATE=$OPTARG ;;
v) VARIABLES_TEMPLATE=$OPTARG
esac
done

Expand Down Expand Up @@ -72,6 +74,13 @@ do

fi

if [ ! -z "$VARIABLES_TEMPLATE" ]; then

# Sub in any variables if specified
awk -F':' 'NR==FNR{a[$2]=$0;next} /:variables:/{flag=1} /:steps:/{flag=0} a[$2]&&flag{$0=a[$2]}1' $root_path/$VARIABLES_TEMPLATE $root_path/tmp/current_playbook.yml > $root_path/tmp/current_playbook.tmp && mv $root_path/tmp/current_playbook.tmp $root_path/tmp/current_playbook.yml

fi

# If printing sql to file, mkdirs and set path vars
if [ ! -z "$OUTPUT_PATH" ]; then
mkdir -p $OUTPUT_PATH
Expand Down
1 change: 1 addition & 0 deletions .scripts/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# -d (database) target database for expectations
# -c (config) expectation config name
# -a (auth) optional credentials for database target
# -m (model) target model to run i.e. web or mobile

while getopts 'd:c:a:m:' v
do
Expand Down
2 changes: 1 addition & 1 deletion .test/great_expectations/expectations/web/v1/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0"
},
"great_expectations.__version__": "0.12.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{
"data_asset_type": "Dataset",
"expectation_suite_name": "integration_tests",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "long_session"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "null_page_view_id"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "null_domain_userid"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "null_domain_sessionid"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "dupe_event_id_same_collector_tstamp"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "dupe_event_id_diff_collector_tstamp"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "dupe_page_view_id_diff_derived_tstamp"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "late_arriving_dvc_created_sent"
}
},
{
"expectation_type": "expect_column_values_to_be_null",
"kwargs": {
"column": "clean_session"
}
}
],
"meta": {
"versions": {
"test_suite_version": "1.1.1",
"bigquery_model_version": "1.0.3"
},
"__comment__": "expect_column_values_to_be_null on column stray_page_ping has been removed as it is a known issue (https://github.com/snowplow/data-models/issues/92)",
"great_expectations.__version__": "0.12.0"
}
}

2 changes: 1 addition & 1 deletion .test/great_expectations/expectations/web/v1/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
Expand Down
2 changes: 1 addition & 1 deletion .test/great_expectations/expectations/web/v1/sessions.json
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
Expand Down
2 changes: 1 addition & 1 deletion .test/great_expectations/expectations/web/v1/users.json
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
],
"meta": {
"versions": {
"test_suite_version": "1.1.0",
"test_suite_version": "1.1.1",
"redshift_model_version": "1.2.0",
"bigquery_model_version": "1.0.3",
"snowflake_model_version": "1.0.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"validation_operator_name": "action_list_operator",
"batches": [
{
"batch_kwargs": {
"datasource": "bigquery",
"query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_1 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check",
"bigquery_temp_table": "ge_test_derived_events_staged_integration"
},
"expectation_suite_names": ["web.v1.integration_tests"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"validation_operator_name": "action_list_operator",
"batches": [
{
"batch_kwargs": {
"datasource": "bigquery",
"query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_2 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check",
"bigquery_temp_table": "ge_test_derived_events_staged_integration"
},
"expectation_suite_names": ["web.v1.integration_tests"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"validation_operator_name": "action_list_operator",
"batches": [
{
"batch_kwargs": {
"datasource": "bigquery",
"query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_3 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check",
"bigquery_temp_table": "ge_test_derived_events_staged_integration"
},
"expectation_suite_names": ["web.v1.integration_tests"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"validation_operator_name": "action_list_operator",
"batches": [
{
"batch_kwargs": {
"datasource": "bigquery",
"query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_4 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check",
"bigquery_temp_table": "ge_test_derived_events_staged_integration"
},
"expectation_suite_names": ["web.v1.integration_tests"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"validation_operator_name": "action_list_operator",
"batches": [
{
"batch_kwargs": {
"datasource": "bigquery",
"query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_5 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check",
"bigquery_temp_table": "ge_test_derived_events_staged_integration"
},
"expectation_suite_names": ["web.v1.integration_tests"]
}
]
}
Loading

0 comments on commit a38e76b

Please sign in to comment.