Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dbt] add custom tests #570

Merged
merged 11 commits into from
May 6, 2024
70 changes: 53 additions & 17 deletions macros/custom_get_where_subquery.sql
Original file line number Diff line number Diff line change
@@ -1,26 +1,58 @@
-- This macro is used to get a subquery with a where clause that can be used in a test
-- to filter the data to be tested. The macro looks for a where clause in the model's
-- config (schema.yml) and replaces the placeholder "__most_recent_year_month__" with
-- the maximum year and month found in the relation. The macro returns a subquery with
-- the where clause that is used to filter the data to be tested.
{% macro get_where_subquery(relation) -%}
-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#where--__most_recent_year_month__--__most_recent_date__--__most_recent_year__
{% macro get_where_subquery(relation) %}
{% set where = config.get("where", "") %}

{% if where %}
{% set max_year_query = "" %}
{% set max_date_query = "" %}
{% set max_year = "" %}
{% set max_date = "" %}

{# This block looks for __most_recent_year__ placeholder #}
{% if "__most_recent_year__" in where %}
{% set max_year_query = (
"select max(cast(ano as int64)) as max_year from " ~ relation
) %}
{% set max_year_result = run_query(max_year_query) %}
{% if execute and max_year_result.rows[0][0] %}
{% set max_year = max_year_result.rows[0][0] %}
{% set where = where | replace(
"__most_recent_year__", "ano = '" ~ max_year ~ "'"
) %}
{% do log(
"The test will filter by the most recent year: "
~ max_year,
info=True,
) %}
{% endif %}
{% endif %}

{# This block looks for __most_recent_date__ placeholder #}
{% if "__most_recent_date__" in where %}
{% set max_date_query = "select max(data) as max_date from " ~ relation %}
{% set max_date_result = run_query(max_date_query) %}
{% if execute and max_date_result.rows[0][0] %}
{% set max_date = max_date_result.rows[0][0] %}
{% set where = where | replace(
"__most_recent_date__", "data = '" ~ max_date ~ "'"
) %}
{% do log(
"The test will filter by the most recent date: "
~ max_date,
info=True,
) %}
{% endif %}
{% endif %}

{# This block looks for __most_recent_year_month__ placeholder #}
{% if "__most_recent_year_month__" in where %}
{# Construct a query to find the maximum date using ano and mes columns #}
{% set max_date_query = (
"select format_date('%Y-%m', max(date(cast(ano as int64), cast(mes as int64), 1))) as max_date from "
~ relation
) %}
{% set max_date_result = run_query(max_date_query) %}

{% if execute %}
{# % do log(max_date_query, info=True) %#}
{# % do log(max_date_result, info=True) %#}
{# Extract the maximum year and month from the max_date #}
{% set max_date = max_date_result.rows[0][0] %}
{% set max_year = max_date[:4] %}
{% set max_month = max_date[5:7] %}
Expand All @@ -37,11 +69,15 @@
{% endif %}
{% endif %}

{%- set filtered -%}
(select * from {{ relation }} where {{ where }}) dbt_subquery
{%- endset -%}

{# Return the filtered subquery #}
{% set filtered = (
"(select * from "
~ relation
~ " where "
~ where
~ ") dbt_subquery"
) %}
{% do return(filtered) %}
{% else %} {% do return(relation) %}
{% endif %}
{%- endmacro %}
{% endmacro %}
50 changes: 50 additions & 0 deletions tests/generic/custom_dictionaries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#dicionários
{% test custom_dictionaries(
    model, dictionary_model_name, table_id, columns_covered_by_dictionary
) %}
    {#
        Checks that every non-null value of each listed column has a matching
        key in the dictionary relation.

        Args:
            model: relation under test.
            dictionary_model_name: relation interpolated as-is into the FROM
                clause; it is read through its chave, valor, id_tabela and
                nome_coluna columns.
            table_id: value compared against id_tabela to select the
                dictionary rows for this table.
            columns_covered_by_dictionary: list of column names; each one is
                compared against nome_coluna and checked independently.

        Returns one row per (failed_column, missing_value) pair that has no
        dictionary entry with a non-null valor. No rows means the test passes.

        Both sides of the comparison are cast to string so that a non-string
        column can still be joined to chave, and so that the missing_value
        column has one type across every branch of the final UNION ALL.
    #}
    {{ config(severity="error") }}

    {%- set combined_query_parts = [] -%}
    {%- set union_parts = [] -%}

    {%- for column_name in columns_covered_by_dictionary %}
        {# loop.index keeps the CTE names unique for each checked column #}
        {% set subquery_name = "exceptions_" ~ loop.index %}
        {% set left_table_name = "data_table_" ~ loop.index %}
        {% set right_table_name = "dictionary_table_" ~ loop.index %}

        {% set subquery %}
            {{ left_table_name }} as (
                select cast({{ column_name }} as string) as id
                from {{ model }}
                where {{ column_name }} is not null
            ),
            {{ right_table_name }} as (
                select cast(chave as string) as chave
                from {{ dictionary_model_name }}
                where valor is not null
                and id_tabela = '{{ table_id }}'
                and nome_coluna = '{{ column_name }}'
            ),
            {{ subquery_name }} as (
                select '{{ column_name }}' as failed_column, id as missing_value
                from {{ left_table_name }}
                left join {{ right_table_name }} on {{ left_table_name }}.id = {{ right_table_name }}.chave
                where {{ right_table_name }}.chave is null
            )
        {% endset %}

        {%- do combined_query_parts.append(subquery) -%}
        {%- do union_parts.append(subquery_name) -%}
    {%- endfor %}

    {# Combine all CTEs into a single WITH clause and then union all results #}
    {% set final_query %}
        with
        {{ combined_query_parts | join(', ') }}

        select distinct failed_column, missing_value from {{ union_parts | join(' union all select distinct failed_column, missing_value from ') }}
    {% endset %}

    {{ return(final_query) }}

{% endtest %}
54 changes: 54 additions & 0 deletions tests/generic/custom_relationships.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{% test custom_relationships(
    model,
    column_name,
    to,
    field,
    ignore_values=None,
    proportion_allowed_failures=0.05
) %}
    {#
        Relationship test with a tolerance: fails only when the share of
        child values that have no match in the parent exceeds
        proportion_allowed_failures.

        Args:
            model: child relation under test.
            column_name: child column holding the reference.
            to: parent relation.
            field: parent column that column_name must match.
            ignore_values: optional list of child values to leave out of the
                check; every item is rendered as a quoted string literal.
            proportion_allowed_failures: threshold compared against
                failure_rate (a fraction of child rows, not a percentage).

        Returns a single row (total_missing, total_child_records,
        failure_rate, result_message) when the threshold is exceeded and no
        rows otherwise.

        Notes:
            - failure_rate is rounded to two decimals before it is compared
              with the threshold.
            - When ignore_values is given, the NOT IN filter also drops NULL
              child values; without it, NULL child values stay in the child
              set, never match a parent and are counted as missing.
            - An empty child set yields failure_rate = 0 (test passes)
              instead of a division-by-zero error.
    #}

    {{ config(severity="error") }}

    with
        child as (
            select {{ column_name }} as child_value
            from {{ model }}
            {% if ignore_values %}
                where {{ column_name }} not in ('{{ ignore_values | join("', '") }}')
            {% endif %}
        ),
        parent as (select {{ field }} as parent_value from {{ to }}),
        validation as (
            select child.child_value
            from child
            left join parent on child.child_value = parent.parent_value
            where parent.parent_value is null
        ),
        summary as (
            select
                count(*) as total_missing,
                (select count(*) from child) as total_child_records,
                -- nullif turns an empty child set into NULL instead of a
                -- division by zero; coalesce reports that case as a 0 rate
                coalesce(
                    round(count(*) / nullif((select count(*) from child), 0), 2),
                    0
                ) as failure_rate
            from validation
        )

    select
        total_missing,
        total_child_records,
        failure_rate,
        case
            when failure_rate > {{ proportion_allowed_failures }}
            then
                'Test failed: Failure rate of '
                || failure_rate
                || '% exceeds allowed proportion of '
                || '{{ proportion_allowed_failures }}%'
            else
                'Test passed: Failure rate of '
                || failure_rate
                || '% within acceptable limits'
        end as result_message
    from summary
    where failure_rate > {{ proportion_allowed_failures }}

{% endtest %}
49 changes: 49 additions & 0 deletions tests/generic/custom_unique_combination_of_columns.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{% test custom_unique_combinations_of_columns(
    model, combination_of_columns, proportion_allowed_failures=0.05
) %}
    {#
        Uniqueness test with a tolerance: fails only when the share of rows
        whose combination_of_columns value is not unique exceeds
        proportion_allowed_failures.

        Args:
            model: relation under test.
            combination_of_columns: list of column names that together are
                expected to identify a row.
            proportion_allowed_failures: threshold compared against
                failure_rate (a fraction of all rows, not a percentage).

        Returns a single row (duplicates_count, total_rows, failure_rate,
        log_message) when the threshold is exceeded and no rows otherwise.

        duplicates_count is the total number of rows that belong to any
        duplicated combination. The rate is computed over the whole table,
        so many small duplicate groups add up instead of each group being
        compared with the threshold on its own. failure_rate is rounded to
        two decimals before the comparison.
    #}

    {{ config(severity="error") }}

    {%- set columns_csv = combination_of_columns | join(", ") %}

    with
        -- one row per combination that occurs more than once
        validation_data as (
            select {{ columns_csv }}, count(*) as duplicates_count
            from {{ model }}
            group by {{ columns_csv }}
            having count(*) > 1
        ),
        -- collapse to a single row: all duplicated rows vs. all rows
        summary as (
            select
                coalesce(sum(duplicates_count), 0) as duplicates_count,
                (select count(*) from {{ model }}) as total_rows
            from validation_data
        ),

        final_summary as (
            select
                duplicates_count,
                total_rows,
                -- nullif guards the empty-model case (0 rows)
                coalesce(
                    round(duplicates_count / nullif(total_rows, 0), 2), 0
                ) as failure_rate
            from summary
        )

    select
        duplicates_count,
        total_rows,
        failure_rate,
        case
            when failure_rate > {{ proportion_allowed_failures }}
            then
                'Test failed: Proportion of non-unique '
                || failure_rate
                || '% exceeds allowed proportion '
                || '{{ proportion_allowed_failures }}%'
            else
                'Test passed: Proportion of non-unique '
                || failure_rate
                || '% within acceptable limits'
        end as log_message
    from final_summary
    where failure_rate > {{ proportion_allowed_failures }}

{% endtest %}
Loading