Skip to content

Commit

Permalink
Merge pull request #570 from basedosdados/staging/add_custom_tests
Browse files Browse the repository at this point in the history
[dbt] add custom tests
  • Loading branch information
folhesgabriel authored May 6, 2024
2 parents f0b94cc + 12ba509 commit 4a4bf03
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 17 deletions.
70 changes: 53 additions & 17 deletions macros/custom_get_where_subquery.sql
Original file line number Diff line number Diff line change
@@ -1,26 +1,58 @@
-- This macro is used to get a subquery with a where clause that can be used in a test
-- to filter the data to be tested. The macro looks for a where clause in the model's
-- config (schema.yml) and replaces the placeholder "__most_recent_year_month__" with
-- the maximum
-- year and month found in the relation. The macro returns a subquery with the where
-- clause that is used
-- to filter the data to be tested
{% macro get_where_subquery(relation) -%}
-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#where--__most_recent_year_month__--__most_recent_date__--__most_recent_year__
{% macro get_where_subquery(relation) %}
{% set where = config.get("where", "") %}

{% if where %}
{% set max_year_query = "" %}
{% set max_date_query = "" %}
{% set max_year = "" %}
{% set max_date = "" %}

{# This block looks for __most_recent_year__ placeholder #}
{% if "__most_recent_year__" in where %}
{% set max_year_query = (
"select max(cast(ano as int64)) as max_year from " ~ relation
) %}
{% set max_year_result = run_query(max_year_query) %}
{% if execute and max_year_result.rows[0][0] %}
{% set max_year = max_year_result.rows[0][0] %}
{% set where = where | replace(
"__most_recent_year__", "ano = '" ~ max_year ~ "'"
) %}
{% do log(
"The test will filter by the most recent year: "
~ max_year,
info=True,
) %}
{% endif %}
{% endif %}

{# This block looks for __most_recent_date__ placeholder #}
{% if "__most_recent_date__" in where %}
{% set max_date_query = "select max(data) as max_date from " ~ relation %}
{% set max_date_result = run_query(max_date_query) %}
{% if execute and max_date_result.rows[0][0] %}
{% set max_date = max_date_result.rows[0][0] %}
{% set where = where | replace(
"__most_recent_date__", "data = '" ~ max_date ~ "'"
) %}
{% do log(
"The test will filter by the most recent date: "
~ max_date,
info=True,
) %}
{% endif %}
{% endif %}

{# This block looks for __most_recent_year_month__ placeholder #}
{% if "__most_recent_year_month__" in where %}
{# Construct a query to find the maximum date using ano and mes columns #}
{% set max_date_query = (
"select format_date('%Y-%m', max(date(cast(ano as int64), cast(mes as int64), 1))) as max_date from "
~ relation
) %}
{% set max_date_result = run_query(max_date_query) %}

{% if execute %}
{# % do log(max_date_query, info=True) %#}
{# % do log(max_date_result, info=True) %#}
{# Extract the maximum year and month from the max_date #}
{% set max_date = max_date_result.rows[0][0] %}
{% set max_year = max_date[:4] %}
{% set max_month = max_date[5:7] %}
Expand All @@ -37,11 +69,15 @@
{% endif %}
{% endif %}

{%- set filtered -%}
(select * from {{ relation }} where {{ where }}) dbt_subquery
{%- endset -%}

{# Return the filtered subquery #}
{% set filtered = (
"(select * from "
~ relation
~ " where "
~ where
~ ") dbt_subquery"
) %}
{% do return(filtered) %}
{% else %} {% do return(relation) %}
{% endif %}
{%- endmacro %}
{% endmacro %}
50 changes: 50 additions & 0 deletions tests/generic/custom_dictionaries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#dicionários
{% test custom_dictionaries(
    model, dictionary_model_name, table_id, columns_covered_by_dictionary
) %}
    {#
        Generic test: every non-null value of each covered column must have a
        matching `chave` in the dictionary relation.

        Arguments:
            model: relation under test.
            dictionary_model_name: relation holding the dictionary. The query reads
                its `chave`, `valor`, `id_tabela` and `nome_coluna` columns.
            table_id: value compared against the dictionary's `id_tabela`.
            columns_covered_by_dictionary: non-empty list of column names of
                `model` to validate.

        Result: one row per distinct (failed_column, missing_value) pair that has
        no dictionary entry. The test passes when no rows are returned.

        NOTE(review): the join compares the model column directly with `chave`;
        this presumably relies on both having the same type - confirm.
    #}
    {{ config(severity="error") }}

    {# An empty list would render "with select ... from", which is invalid SQL
       and surfaces as an opaque warehouse error. Fail early with a clear message. #}
    {%- if not columns_covered_by_dictionary -%}
        {{
            exceptions.raise_compiler_error(
                "custom_dictionaries: 'columns_covered_by_dictionary' must list at least one column (table_id='"
                ~ table_id
                ~ "')"
            )
        }}
    {%- endif -%}

    {%- set combined_query_parts = [] -%}
    {%- set union_parts = [] -%}

    {# Build three CTEs per column: values in the data, keys in the dictionary,
       and the values that have no key (anti-join). The loop index keeps the
       CTE names unique. #}
    {%- for column_name in columns_covered_by_dictionary %}
        {% set subquery_name = "exceptions_" ~ loop.index %}
        {% set left_table_name = "data_table_" ~ loop.index %}
        {% set right_table_name = "dictionary_table_" ~ loop.index %}

        {% set subquery %}
            {{ left_table_name }} as (
                select {{ column_name }} as id
                from {{ model }}
                where {{ column_name }} is not null
            ),
            {{ right_table_name }} as (
                select chave
                from {{ dictionary_model_name }}
                where valor is not null
                and id_tabela = '{{ table_id }}'
                and nome_coluna = '{{ column_name }}'
            ),
            {{ subquery_name }} as (
                select '{{ column_name }}' as failed_column, id as missing_value
                from {{ left_table_name }}
                left join {{ right_table_name }} on {{ left_table_name }}.id = {{ right_table_name }}.chave
                where {{ right_table_name }}.chave is null
            )
        {% endset %}

        {%- do combined_query_parts.append(subquery) -%}
        {%- do union_parts.append(subquery_name) -%}
    {%- endfor %}

    {# Combine all CTEs into a single WITH clause and then union all results #}
    {% set final_query %}
        with
        {{ combined_query_parts | join(', ') }}

        select distinct failed_column, missing_value from {{ union_parts | join(' union all select distinct failed_column, missing_value from ') }}
    {% endset %}

    {{ return(final_query) }}

{% endtest %}
54 changes: 54 additions & 0 deletions tests/generic/custom_relationships.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{% test custom_relationships(
    model,
    column_name,
    to,
    field,
    ignore_values=None,
    proportion_allowed_failures=0.05
) %}
    {#
        Generic test: referential integrity with a tolerance. Fails when the
        share of child rows whose `column_name` has no match in `to`.`field`
        is greater than `proportion_allowed_failures`.

        Arguments:
            model: child relation under test.
            column_name: column of `model` holding the foreign key.
            to: parent relation.
            field: column of `to` that `column_name` must match.
            ignore_values: optional list of values excluded from the check.
                They are rendered as quoted string literals.
            proportion_allowed_failures: maximum tolerated proportion (0-1) of
                unmatched child rows. Default 0.05.

        Result: at most one row (total_missing, total_child_records,
        failure_rate, result_message), returned only when the limit is exceeded.

        NOTE(review): with `ignore_values` set, NULL child values are also
        dropped by the NOT IN predicate; without it they never match a parent
        and are counted as missing. Confirm whether NULLs should always be
        excluded.
    #}

    {{ config(severity="error") }}

    with
        child as (
            select {{ column_name }} as child_value
            from {{ model }}
            {% if ignore_values %}
                where {{ column_name }} not in ('{{ ignore_values | join("', '") }}')
            {% endif %}
        ),
        parent as (select {{ field }} as parent_value from {{ to }}),
        -- child rows without a matching parent value (anti-join)
        validation as (
            select child.child_value
            from child
            left join parent on child.child_value = parent.parent_value
            where parent.parent_value is null
        ),
        summary as (
            select
                count(*) as total_missing,
                (select count(*) from child) as total_child_records,
                -- nullif: an empty child set yields NULL instead of a
                -- division-by-zero error, so the test passes with no rows.
                -- Kept unrounded so the threshold comparison is exact.
                count(*)
                / nullif((select count(*) from child), 0) as exact_failure_rate
            from validation
        )

    select
        total_missing,
        total_child_records,
        round(exact_failure_rate, 2) as failure_rate,
        -- Both figures are proportions (0-1), so no percent sign is appended.
        -- The explicit cast avoids relying on implicit float-to-string coercion.
        'Test failed: Failure rate of '
        || cast(round(exact_failure_rate, 4) as string)
        || ' exceeds allowed proportion of '
        || '{{ proportion_allowed_failures }}' as result_message
    from summary
    -- compare the unrounded rate: rounding first would let e.g. 0.054 pass a
    -- 0.05 limit and would turn any rate below 0.005 into 0
    where exact_failure_rate > {{ proportion_allowed_failures }}

{% endtest %}
49 changes: 49 additions & 0 deletions tests/generic/custom_unique_combination_of_columns.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{% test custom_unique_combinations_of_columns(
    model, combination_of_columns, proportion_allowed_failures=0.05
) %}
    {#
        Generic test: uniqueness of a column combination with a tolerance.
        Fails when the share of rows of `model` that belong to a duplicated
        combination is greater than `proportion_allowed_failures`.

        Arguments:
            model: relation under test.
            combination_of_columns: list of column names that together should
                identify a row.
            proportion_allowed_failures: maximum tolerated proportion (0-1) of
                non-unique rows. Default 0.05.

        Result: at most one row (duplicates_count, total_rows, failure_rate,
        log_message), returned only when the limit is exceeded.
        `duplicates_count` is the number of rows whose combination occurs more
        than once.
    #}

    {{ config(severity="error") }}

    {%- set columns_csv = combination_of_columns | join(", ") %}

    with
        -- one row per combination that occurs more than once
        duplicated_groups as (
            select {{ columns_csv }}, count(*) as rows_in_group
            from {{ model }}
            group by {{ columns_csv }}
            having count(*) > 1
        ),
        -- Aggregate over ALL duplicated groups. Rating each group on its own
        -- would let a table made entirely of duplicated pairs pass, because
        -- every single group is a tiny share of the table.
        summary as (
            select
                coalesce(sum(rows_in_group), 0) as duplicates_count,
                (select count(*) from {{ model }}) as total_rows
            from duplicated_groups
        ),
        final_summary as (
            select
                duplicates_count,
                total_rows,
                -- unrounded, and NULL (not an error) for an empty model
                duplicates_count / nullif(total_rows, 0) as exact_failure_rate
            from summary
        )

    select
        duplicates_count,
        total_rows,
        round(exact_failure_rate, 2) as failure_rate,
        -- Both figures are proportions (0-1), so no percent sign is appended.
        -- The explicit cast avoids relying on implicit float-to-string coercion.
        'Test failed: Proportion of non-unique '
        || cast(round(exact_failure_rate, 4) as string)
        || ' exceeds allowed proportion '
        || '{{ proportion_allowed_failures }}' as log_message
    from final_summary
    -- compare the unrounded rate so rounding cannot hide a violation
    where exact_failure_rate > {{ proportion_allowed_failures }}

{% endtest %}

0 comments on commit 4a4bf03

Please sign in to comment.