Merge pull request #570 from basedosdados/staging/add_custom_tests

[dbt] add custom tests
basedosdados · May 6, 2024 · 4a4bf03 · 4a4bf03
2 parents f0b94cc + 12ba509
commit 4a4bf03
Show file tree

Hide file tree

Showing 4 changed files with 206 additions and 17 deletions.
diff --git a/macros/custom_get_where_subquery.sql b/macros/custom_get_where_subquery.sql
@@ -1,26 +1,58 @@
--- This macro is used to get a subquery with a where clause that can be used in a test
--- to filter the data to be tested. The macro looks for a where clause in the model's
--- config (schema.yml) and replaces the placeholder "__most_recent_year_month__" with
--- the maximum
--- year and month found in the relation. The macro returns a subquery with the where
--- thats used
--- to filter the data to be tested
-{% macro get_where_subquery(relation) -%}
+-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#where--__most_recent_year_month__--__most_recent_date__--__most_recent_year__
+{% macro get_where_subquery(relation) %}
     {% set where = config.get("where", "") %}
 
     {% if where %}
+        {% set max_year_query = "" %}
+        {% set max_date_query = "" %}
+        {% set max_year = "" %}
+        {% set max_date = "" %}
+
+        {# This block looks for __most_recent_year__  placeholder #}
+        {% if "__most_recent_year__" in where %}
+            {% set max_year_query = (
+                "select max(cast(ano as int64)) as max_year from " ~ relation
+            ) %}
+            {% set max_year_result = run_query(max_year_query) %}
+            {% if execute and max_year_result.rows[0][0] %}
+                {% set max_year = max_year_result.rows[0][0] %}
+                {% set where = where | replace(
+                    "__most_recent_year__", "ano = '" ~ max_year ~ "'"
+                ) %}
+                {% do log(
+                    "The test will filter by the most recent year: "
+                    ~ max_year,
+                    info=True,
+                ) %}
+            {% endif %}
+        {% endif %}
+
+        {# This block looks for __most_recent_date__  placeholder #}
+        {% if "__most_recent_date__" in where %}
+            {% set max_date_query = "select max(data) as max_date from " ~ relation %}
+            {% set max_date_result = run_query(max_date_query) %}
+            {% if execute and max_date_result.rows[0][0] %}
+                {% set max_date = max_date_result.rows[0][0] %}
+                {% set where = where | replace(
+                    "__most_recent_date__", "data = '" ~ max_date ~ "'"
+                ) %}
+                {% do log(
+                    "The test will filter by the most recent date: "
+                    ~ max_date,
+                    info=True,
+                ) %}
+            {% endif %}
+        {% endif %}
+
+        {# This block looks for __most_recent_year_month__  placeholder #}
         {% if "__most_recent_year_month__" in where %}
-            {# Construct a query to find the maximum date using ano and mes columns #}
             {% set max_date_query = (
                 "select format_date('%Y-%m', max(date(cast(ano as int64), cast(mes as int64), 1))) as max_date from "
                 ~ relation
             ) %}
             {% set max_date_result = run_query(max_date_query) %}
 
             {% if execute %}
-                {# % do log(max_date_query, info=True) %#}
-                {# % do log(max_date_result, info=True) %#}
-                {# Extract the maximum year and month from the max_date #}
                 {% set max_date = max_date_result.rows[0][0] %}
                 {% set max_year = max_date[:4] %}
                 {% set max_month = max_date[5:7] %}
@@ -37,11 +69,15 @@
             {% endif %}
         {% endif %}
 
-        {%- set filtered -%}
-            (select * from {{ relation }} where {{ where }}) dbt_subquery
-        {%- endset -%}
-
+        {# Return the filtered subquery #}
+        {% set filtered = (
+            "(select * from "
+            ~ relation
+            ~ " where "
+            ~ where
+            ~ ") dbt_subquery"
+        ) %}
         {% do return(filtered) %}
     {% else %} {% do return(relation) %}
     {% endif %}
-{%- endmacro %}
+{% endmacro %}
diff --git a/tests/generic/custom_dictionaries.sql b/tests/generic/custom_dictionaries.sql
@@ -0,0 +1,50 @@
+-- https://github.com/basedosdados/pipelines/wiki/Incluindo-testes-no-seu-modelo#dicionários
+{% test custom_dictionaries(
+    model, dictionary_model_name, table_id, columns_covered_by_dictionary
+) %}
+    {#
+        Generic test: every non-null value found in each listed column of `model`
+        must appear as a `chave` in the dictionary relation.
+
+        Arguments:
+            model: relation under test.
+            dictionary_model_name: relation holding the dictionary; the query reads
+                its `chave`, `valor`, `id_tabela` and `nome_coluna` columns.
+            table_id: value matched against the dictionary's `id_tabela`.
+            columns_covered_by_dictionary: list of column names of `model` to check.
+
+        Returns one row per (failed_column, missing_value) pair that has no
+        matching dictionary entry; zero rows means the test passes.
+    #}
+    {{ config(severity="error") }}
+
+    {# An empty list would render "with select ... from", i.e. invalid SQL with an
+       opaque error. Fail with an explicit message instead. Guarded by `execute`
+       so that project parsing is not affected. #}
+    {% if execute and not columns_covered_by_dictionary %}
+        {% do exceptions.raise_compiler_error(
+            "custom_dictionaries: columns_covered_by_dictionary must list at least one column"
+        ) %}
+    {% endif %}
+
+    {%- set combined_query_parts = [] -%}
+    {%- set union_parts = [] -%}
+
+    {# Build three CTEs per column: the column values, the dictionary keys for that
+       column, and the values without a matching key. CTE names are suffixed with
+       the loop index so they stay unique. #}
+    {%- for column_name in columns_covered_by_dictionary %}
+        {% set subquery_name = "exceptions_" ~ loop.index %}
+        {% set left_table_name = "data_table_" ~ loop.index %}
+        {% set right_table_name = "dictionary_table_" ~ loop.index %}
+
+        {# Both sides are cast to string so the join and the final union all work
+           even when the checked columns are not all of the same type. #}
+        {% set subquery %}
+            {{ left_table_name }} as (
+                select cast({{ column_name }} as string) as id
+                from {{ model }}
+                where {{ column_name }} is not null
+            ),
+            {{ right_table_name }} as (
+                select cast(chave as string) as chave
+                from {{ dictionary_model_name }}
+                where valor is not null
+                and id_tabela = '{{ table_id }}'
+                and nome_coluna = '{{ column_name }}'
+            ),
+            {{ subquery_name }} as (
+                select '{{ column_name }}' as failed_column, id as missing_value
+                from {{ left_table_name }}
+                left join {{ right_table_name }} on {{ left_table_name }}.id = {{ right_table_name }}.chave
+                where {{ right_table_name }}.chave is null
+            )
+        {% endset %}
+
+        {%- do combined_query_parts.append(subquery) -%}
+        {%- do union_parts.append(subquery_name) -%}
+    {%- endfor %}
+
+    {# Combine all CTEs into a single WITH clause and then union all results #}
+    {% set final_query %}
+        with
+        {{ combined_query_parts | join(', ') }}
+
+        select distinct failed_column, missing_value from {{ union_parts | join(' union all select distinct failed_column, missing_value from ') }}
+    {% endset %}
+
+    {{ return(final_query) }}
+
+{% endtest %}
diff --git a/tests/generic/custom_relationships.sql b/tests/generic/custom_relationships.sql
@@ -0,0 +1,54 @@
+{% test custom_relationships(
+    model,
+    column_name,
+    to,
+    field,
+    ignore_values=None,
+    proportion_allowed_failures=0.05
+) %}
+    {#
+        Generic test: tolerant referential-integrity check. Counts the values of
+        `column_name` in `model` that have no match in `to`.`field` and fails when
+        their share of the child rows exceeds `proportion_allowed_failures`.
+
+        Arguments:
+            model: child relation under test.
+            column_name: child column holding the reference.
+            to: parent relation.
+            field: parent column that child values must match.
+            ignore_values: optional list of child values excluded from the check;
+                they are rendered as quoted string literals.
+            proportion_allowed_failures: maximum accepted share (0-1) of unmatched
+                child rows. Defaults to 0.05.
+
+        Returns a single summary row when the limit is exceeded, zero rows
+        otherwise.
+    #}
+
+    {{ config(severity="error") }}
+
+    with
+        child as (
+            select {{ column_name }} as child_value
+            from {{ model }}
+            {# NOTE: `not in` also drops rows whose value is NULL, so NULL child
+               values are only counted when `ignore_values` is not set. #}
+            {% if ignore_values %}
+                where {{ column_name }} not in ('{{ ignore_values | join("', '") }}')
+            {% endif %}
+        ),
+        parent as (select {{ field }} as parent_value from {{ to }}),
+        validation as (
+            select child.child_value
+            from child
+            left join parent on child.child_value = parent.parent_value
+            where parent.parent_value is null
+        ),
+        summary as (
+            select
+                count(*) as total_missing,
+                (select count(*) from child) as total_child_records,
+                {# safe_divide yields NULL instead of a division-by-zero error when
+                   the child set is empty; a NULL rate is filtered out below, so an
+                   empty child relation passes. #}
+                round(
+                    safe_divide(count(*), (select count(*) from child)), 2
+                ) as failure_rate
+            from validation
+        )
+
+    {# NOTE(review): the rate is rounded to two decimals before it is compared with
+       the threshold; kept as in the original. #}
+    {# Only failing rows pass the filter below, so no "passed" message is needed.
+       `failure_rate` is a proportion: it is scaled by 100 and cast to string
+       explicitly before being concatenated into the message. #}
+    select
+        total_missing,
+        total_child_records,
+        failure_rate,
+        'Test failed: Failure rate of '
+        || cast(round(failure_rate * 100, 2) as string)
+        || '% exceeds allowed proportion of '
+        || '{{ (proportion_allowed_failures * 100) | round(2) }}%' as result_message
+    from summary
+    where failure_rate > {{ proportion_allowed_failures }}
+
+{% endtest %}
diff --git a/tests/generic/custom_unique_combination_of_columns.sql b/tests/generic/custom_unique_combination_of_columns.sql
@@ -0,0 +1,49 @@
+{% test custom_unique_combinations_of_columns(
+    model, combination_of_columns, proportion_allowed_failures=0.05
+) %}
+    {#
+        Generic test: tolerant uniqueness check on a combination of columns. Fails
+        when the share of rows that belong to a repeated combination exceeds
+        `proportion_allowed_failures`.
+
+        Arguments:
+            model: relation under test.
+            combination_of_columns: list of column names expected to be unique
+                together.
+            proportion_allowed_failures: maximum accepted share (0-1) of rows in
+                repeated combinations. Defaults to 0.05.
+
+        Returns a single summary row when the limit is exceeded, zero rows
+        otherwise.
+    #}
+
+    {{ config(severity="error") }}
+
+    {%- set columns_csv = combination_of_columns | join(", ") %}
+
+    with
+        {# One row per combination that occurs more than once. #}
+        validation_data as (
+            select {{ columns_csv }}, count(*) as duplicates_count
+            from {{ model }}
+            group by {{ columns_csv }}
+            having count(*) > 1
+        ),
+        {# Aggregate over all repeated combinations so the rate reflects the whole
+           table rather than each group on its own. Without a group by this always
+           yields exactly one row, hence the coalesce for the no-duplicates case. #}
+        summary as (
+            select
+                coalesce(sum(duplicates_count), 0) as duplicates_count,
+                (select count(*) from {{ model }}) as total_rows
+            from validation_data
+        ),
+
+        final_summary as (
+            select
+                duplicates_count,
+                total_rows,
+                {# safe_divide returns NULL for an empty model instead of raising a
+                   division-by-zero error; a NULL rate is filtered out below. #}
+                round(safe_divide(duplicates_count, total_rows), 2) as failure_rate
+            from summary
+        )
+
+    {# Only failing rows pass the filter below, so no "passed" message is needed.
+       `failure_rate` is a proportion: it is scaled by 100 and cast to string
+       explicitly before being concatenated into the message. #}
+    select
+        duplicates_count,
+        total_rows,
+        failure_rate,
+        'Test failed: Proportion of non-unique '
+        || cast(round(failure_rate * 100, 2) as string)
+        || '% exceeds allowed proportion '
+        || '{{ (proportion_allowed_failures * 100) | round(2) }}%' as log_message
+    from final_summary
+    where failure_rate > {{ proportion_allowed_failures }}
+
+{% endtest %}