From 8b1b157e1df644f2f18b7af5ad27fbfe2b86c9fb Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Thu, 2 May 2024 15:00:21 -0300 Subject: [PATCH 1/8] feat: add customs dbt tests --- tests/generic/custom_relationships.sql | 56 +++++++++++++++++++ .../custom_unique_combination_of_columns.sql | 49 ++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 tests/generic/custom_relationships.sql create mode 100644 tests/generic/custom_unique_combination_of_columns.sql diff --git a/tests/generic/custom_relationships.sql b/tests/generic/custom_relationships.sql new file mode 100644 index 00000000..618e4a30 --- /dev/null +++ b/tests/generic/custom_relationships.sql @@ -0,0 +1,56 @@ +{% test custom_relationships( + model, + column_name, + to, + field, + ignore_values=None, + proportion_allowed_failures=0 +) %} + + {{ config(severity="error") }} + + with + child as ( + select {{ column_name }} as child_value + from {{ model }} + {% if ignore_values %} + where {{ column_name }} not in ({{ ignore_values | join(", ") }}) + {% endif %} + ), + parent as (select {{ field }} as parent_value from {{ to }}), + validation as ( + select child.child_value + from child + left join parent on child.child_value = parent.parent_value + where parent.parent_value is null + ), + summary as ( + select + count(*) as total_missing, + (select count(*) from child) as total_child_records, + round( + ((count(*) * 100) / (select count(*) from child)), 2 + ) as failure_rate + from validation + ) + + select + total_missing, + total_child_records, + failure_rate, + case + when failure_rate > {{ proportion_allowed_failures }} + then + 'Test failed: Failure rate of ' + || failure_rate + || '% exceeds allowed proportion of ' + || '{{ proportion_allowed_failures }}%' + else + 'Test passed: Failure rate of ' + || failure_rate + || '% within acceptable limits' + end as result_message + from summary + where failure_rate > {{ proportion_allowed_failures }} + +{% endtest %} diff --git a/tests/generic/custom_unique_combination_of_columns.sql b/tests/generic/custom_unique_combination_of_columns.sql new file mode 100644 index 00000000..32b6c339 --- /dev/null +++ b/tests/generic/custom_unique_combination_of_columns.sql @@ -0,0 +1,49 @@ +{% test custom_unique_combinations_of_columns( + model, combination_of_columns, proportion_allowed_failures=5 +) %} + + {{ config(severity="error") }} + + {%- set column_list = combination_of_columns %} + {%- set columns_csv = column_list | join(", ") %} + + with + validation_data as ( + select {{ columns_csv }}, count(*) as duplicates_count + from {{ model }} + group by {{ columns_csv }} + having count(*) > 1 + ), + summary as ( + select duplicates_count, (select count(*) from {{ model }}) as total_rows + from validation_data + ), + + final_summary as ( + select + duplicates_count, + total_rows, + round(((duplicates_count * 100) / total_rows), 2) as proportion + from summary + ) + + select + duplicates_count, + total_rows, + proportion, + case + when proportion > {{ proportion_allowed_failures }} + then + 'Test failed: Proportion of non-unique ' + || proportion + || '% exceeds allowed proportion ' + || '{{ proportion_allowed_failures }}%' + else + 'Test passed: Proportion of non-unique ' + || proportion + || '% within acceptable limits' + end as log_message + from final_summary + where proportion > {{ proportion_allowed_failures }} + +{% endtest %} From 35adfb2b25480b681945087db186490162ae9c78 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Thu, 2 May 2024 16:27:30 -0300 Subject: [PATCH 2/8] =?UTF-8?q?deixa=20a=20propor=C3=A7=C3=A3o=20como=20um?= =?UTF-8?q?=20n=C3=BAmero=20entre=200=20e=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/generic/custom_relationships.sql | 2 +- tests/generic/custom_unique_combination_of_columns.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generic/custom_relationships.sql b/tests/generic/custom_relationships.sql index 618e4a30..32697dee 100644 --- a/tests/generic/custom_relationships.sql +++ b/tests/generic/custom_relationships.sql @@ -29,7 +29,7 @@ count(*) as total_missing, (select count(*) from child) as total_child_records, round( - ((count(*) * 100) / (select count(*) from child)), 2 + ((count(*)) / (select count(*) from child)), 2 ) as failure_rate from validation ) diff --git a/tests/generic/custom_unique_combination_of_columns.sql b/tests/generic/custom_unique_combination_of_columns.sql index 32b6c339..e823f9ee 100644 --- a/tests/generic/custom_unique_combination_of_columns.sql +++ b/tests/generic/custom_unique_combination_of_columns.sql @@ -23,7 +23,7 @@ select duplicates_count, total_rows, - round(((duplicates_count * 100) / total_rows), 2) as proportion + round(((duplicates_count) / total_rows), 2) as proportion from summary ) From 80d99c815726d9998f5b55ff9515ce7b2e42a734 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 07:43:00 -0300 Subject: [PATCH 3/8] feat: enhance custom_get_where_subquery macro with new place holders --- macros/custom_get_where_subquery.sql | 70 +++++++++++++++++++++------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/macros/custom_get_where_subquery.sql b/macros/custom_get_where_subquery.sql index 1dac0fd3..8b33dddc 100644 --- a/macros/custom_get_where_subquery.sql +++ b/macros/custom_get_where_subquery.sql @@ -1,16 +1,51 @@ --- This macro is used to get a subquery with a where clause that can be used in a test --- to filter the data to be tested. The macro looks for a where clause in the model's --- config (schema.yml) and replaces the placeholder "__most_recent_year_month__" with --- the maximum --- year and month found in the relation. The macro returns a subquery with the where --- thats used --- to filter the data to be tested -{% macro get_where_subquery(relation) -%} +-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#where--__most_recent_year_month__--__most_recent_date__--__most_recent_year__ +{% macro get_where_subquery(relation) %} {% set where = config.get("where", "") %} {% if where %} + {% set max_year_query = "" %} + {% set max_date_query = "" %} + {% set max_year = "" %} + {% set max_date = "" %} + + {# This block looks for __most_recent_year__ placeholder #} + {% if "__most_recent_year__" in where %} + {% set max_year_query = ( + "select max(cast(ano as int64)) as max_year from " ~ relation + ) %} + {% set max_year_result = run_query(max_year_query) %} + {% if execute and max_year_result.rows[0][0] %} + {% set max_year = max_year_result.rows[0][0] %} + {% set where = where | replace( + "__most_recent_year__", "ano = '" ~ max_year ~ "'" + ) %} + {% do log( + "The test will filter by the most recent year: " + ~ max_year, + info=True, + ) %} + {% endif %} + {% endif %} + + {# This block looks for __most_recent_date__ placeholder #} + {% if "__most_recent_date__" in where %} + {% set max_date_query = "select max(data) as max_date from " ~ relation %} + {% set max_date_result = run_query(max_date_query) %} + {% if execute and max_date_result.rows[0][0] %} + {% set max_date = max_date_result.rows[0][0] %} + {% set where = where | replace( + "__most_recent_date__", "data = '" ~ max_date ~ "'" + ) %} + {% do log( + "The test will filter by the most recent date: " + ~ max_date, + info=True, + ) %} + {% endif %} + {% endif %} + + {# This block looks for __most_recent_year_month__ placeholder #} {% if "__most_recent_year_month__" in where %} - {# Construct a query to find the maximum date using ano and mes columns #} {% set max_date_query = ( "select format_date('%Y-%m', max(date(cast(ano as int64), cast(mes as int64), 1))) as max_date from " ~ relation @@ -18,9 +53,6 @@ {% set max_date_result = run_query(max_date_query) %} {% if execute %} - {# % do log(max_date_query, info=True) %#} - {# % do log(max_date_result, info=True) %#} - {# Extract the maximum year and month from the max_date #} {% set max_date = max_date_result.rows[0][0] %} {% set max_year = max_date[:4] %} {% set max_month = max_date[5:7] %} @@ -37,11 +69,15 @@ {% endif %} {% endif %} - {%- set filtered -%} - (select * from {{ relation }} where {{ where }}) dbt_subquery - {%- endset -%} - + {# Return the filtered subquery #} + {% set filtered = ( + "(select * from " + ~ relation + ~ " where " + ~ where + ~ ") dbt_subquery" + ) %} {% do return(filtered) %} {% else %} {% do return(relation) %} {% endif %} -{%- endmacro %} +{% endmacro %} From f53f40e8ba692249f0ca911dda6d1cd5e940607a Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 07:44:10 -0300 Subject: [PATCH 4/8] feat: refactor names and proportion formating of custom tests --- tests/generic/custom_relationships.sql | 8 +++----- .../custom_unique_combination_of_columns.sql | 14 +++++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/generic/custom_relationships.sql b/tests/generic/custom_relationships.sql index 32697dee..65af9f4e 100644 --- a/tests/generic/custom_relationships.sql +++ b/tests/generic/custom_relationships.sql @@ -4,7 +4,7 @@ to, field, ignore_values=None, - proportion_allowed_failures=0 + proportion_allowed_failures=0.05 ) %} {{ config(severity="error") }} @@ -14,7 +14,7 @@ select {{ column_name }} as child_value from {{ model }} {% if ignore_values %} - where {{ column_name }} not in ({{ ignore_values | join(", ") }}) + where {{ column_name }} not in ('{{ ignore_values | join("', '") }}') {% endif %} ), parent as (select {{ field }} as parent_value from {{ to }}), @@ -28,9 +28,7 @@ select count(*) as total_missing, (select count(*) from child) as total_child_records, - round( - ((count(*)) / (select count(*) from child)), 2 - ) as failure_rate + round(count(*) / (select count(*) from child), 2) as failure_rate from validation ) diff --git a/tests/generic/custom_unique_combination_of_columns.sql b/tests/generic/custom_unique_combination_of_columns.sql index e823f9ee..05c9a364 100644 --- a/tests/generic/custom_unique_combination_of_columns.sql +++ b/tests/generic/custom_unique_combination_of_columns.sql @@ -1,5 +1,5 @@ {% test custom_unique_combinations_of_columns( - model, combination_of_columns, proportion_allowed_failures=5 + model, combination_of_columns, proportion_allowed_failures=0.05 ) %} {{ config(severity="error") }} @@ -23,27 +23,27 @@ select duplicates_count, total_rows, - round(((duplicates_count) / total_rows), 2) as proportion + round(duplicates_count / total_rows, 2) as failure_rate from summary ) select duplicates_count, total_rows, - proportion, + failure_rate, case - when proportion > {{ proportion_allowed_failures }} + when failure_rate > {{ proportion_allowed_failures }} then 'Test failed: Proportion of non-unique ' - || proportion + || failure_rate || '% exceeds allowed proportion ' || '{{ proportion_allowed_failures }}%' else 'Test passed: Proportion of non-unique ' - || proportion + || failure_rate || '% within acceptable limits' end as log_message from final_summary - where proportion > {{ proportion_allowed_failures }} + where failure_rate > {{ proportion_allowed_failures }} {% endtest %} From a39bef47c405cc6ee730e4cb6cfae5a1ed6a6f12 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 08:49:28 -0300 Subject: [PATCH 5/8] feat: add beta custom_dicionaries test --- tests/generic/custom_dictionaries.sql | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/generic/custom_dictionaries.sql diff --git a/tests/generic/custom_dictionaries.sql b/tests/generic/custom_dictionaries.sql new file mode 100644 index 00000000..1f574f8d --- /dev/null +++ b/tests/generic/custom_dictionaries.sql @@ -0,0 +1,42 @@ +-- o objetivo do teste é verificar se todos os valores que estão presentes em colunas +-- de tabelas com dicionário também estão nos dicionários +{% test custom_dictionaries( + model, table_id, dictionary_model_name, columns_covered_by_dictionary +) %} + + {{ config(severity="error") }} + + {% for column_name in columns_covered_by_dictionary %} + + with + left_table as ( + select {{ column_name }} as id + from {{ model }} + where {{ column_name }} is not null + ), + + right_table as ( + select chave as id + from {{ dictionary_model_name }} + where + valor is not null + and id_tabela = '{{ table_id }}' + and nome_coluna = '{{ column_name }}' + ), + + exceptions as ( + select left_table.id + from left_table + left join right_table on left_table.id = right_table.id + where right_table.id is null + ) + + select '{{ column_name }}' as failed_column, id as missing_value + from exceptions + {% if not loop.last %} + union all + {% endif %} + + {% endfor %} + +{% endtest %} From c45b815a3794aec050a4ee68b1bbc33476a41642 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 11:08:11 -0300 Subject: [PATCH 6/8] feat: add custom_dicionaries test --- tests/generic/custom_dictionaries.sql | 62 ++++++++++++++++----------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/tests/generic/custom_dictionaries.sql b/tests/generic/custom_dictionaries.sql index 1f574f8d..5e648328 100644 --- a/tests/generic/custom_dictionaries.sql +++ b/tests/generic/custom_dictionaries.sql @@ -1,42 +1,52 @@ --- o objetivo do teste é verificar se todos os valores que estão presentes em colunas --- de tabelas com dicionário também estão nos dicionários +-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#dicionários {% test custom_dictionaries( - model, table_id, dictionary_model_name, columns_covered_by_dictionary + model, dictionary_model_name, table_id, columns_covered_by_dictionary ) %} - {{ config(severity="error") }} - {% for column_name in columns_covered_by_dictionary %} + {%- set combined_query_parts = [] -%} + {%- set union_parts = [] -%} - with - left_table as ( + {%- for column_name in columns_covered_by_dictionary %} + {% set subquery_name = "exceptions_" ~ loop.index %} + {% set left_table_name = "left_table_" ~ loop.index %} + {% set right_table_name = "right_table_" ~ loop.index %} + + {% set subquery %} + {{ left_table_name }} as ( select {{ column_name }} as id from {{ model }} where {{ column_name }} is not null ), - - right_table as ( - select chave as id + {{ right_table_name }} as ( + select chave from {{ dictionary_model_name }} - where - valor is not null - and id_tabela = '{{ table_id }}' - and nome_coluna = '{{ column_name }}' + where valor is not null + and id_tabela = '{{ table_id }}' + and nome_coluna = '{{ column_name }}' ), - - exceptions as ( - select left_table.id - from left_table - left join right_table on left_table.id = right_table.id - where right_table.id is null + {{ subquery_name }} as ( + select '{{ column_name }}' as failed_column, id as missing_value + from {{ left_table_name }} + left join {{ right_table_name }} on {{ left_table_name }}.id = {{ right_table_name }}.chave + where {{ right_table_name }}.chave is null ) + {% endset %} + + {%- do combined_query_parts.append(subquery) -%} + {%- do union_parts.append(subquery_name) -%} + {%- endfor %} + + {# Combine all CTEs into a single WITH clause and then union all results #} + {% set final_query %} + with + {{ combined_query_parts | join(', ') }} + + select distinct failed_column, missing_value from {{ union_parts | join(' union all select distinct failed_column, missing_value from ') }} + {% endset %} - select '{{ column_name }}' as failed_column, id as missing_value - from exceptions - {% if not loop.last %} - union all - {% endif %} + {% do log(final_query, info=True) %} - {% endfor %} + {{ return(final_query) }} {% endtest %} From 8c5953d61f8a548f697a4ca3fadd9018b0ba5451 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 13:41:00 -0300 Subject: [PATCH 7/8] feat: improve names of custom_dictionaries macro --- tests/generic/custom_dictionaries.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generic/custom_dictionaries.sql b/tests/generic/custom_dictionaries.sql index 5e648328..50b62c53 100644 --- a/tests/generic/custom_dictionaries.sql +++ b/tests/generic/custom_dictionaries.sql @@ -9,8 +9,8 @@ {%- for column_name in columns_covered_by_dictionary %} {% set subquery_name = "exceptions_" ~ loop.index %} - {% set left_table_name = "left_table_" ~ loop.index %} - {% set right_table_name = "right_table_" ~ loop.index %} + {% set left_table_name = "data_table_" ~ loop.index %} + {% set right_table_name = "dictionary_table_" ~ loop.index %} {% set subquery %} {{ left_table_name }} as ( From c1318e5a59b654e7f7337274727977f7c33d7974 Mon Sep 17 00:00:00 2001 From: folhesgabriel Date: Fri, 3 May 2024 13:49:24 -0300 Subject: [PATCH 8/8] remove query log from custom_dictionaries --- tests/generic/custom_dictionaries.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/generic/custom_dictionaries.sql b/tests/generic/custom_dictionaries.sql index 50b62c53..b60b1a10 100644 --- a/tests/generic/custom_dictionaries.sql +++ b/tests/generic/custom_dictionaries.sql @@ -45,8 +45,6 @@ select distinct failed_column, missing_value from {{ union_parts | join(' union all select distinct failed_column, missing_value from ') }} {% endset %} - {% do log(final_query, info=True) %} - {{ return(final_query) }} {% endtest %}