Skip to content

Commit

Permalink
Add a median measure (#75)
Browse files Browse the repository at this point in the history
* Add a median measurement.

* Fix duplicate anchor.

* Add median to all_measures.

* Update: calculate the median only for numeric values.

* Update: change tests to cover only numeric data types.

* Update README: Add median to measurements.

* Fix bigquery query for median.

* Update test profiles.

* Do not apply median to struct data types.
  • Loading branch information
toohsk authored Jun 21, 2023
1 parent 2ea0ede commit b2cc1ad
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* `min`*^: Minimum column value
* `max`*^: Maximum column value
* `avg`**^: Average column value
* `median`**^: Median column value
* `std_dev_population`**^: Population standard deviation
* `std_dev_sample`**^: Sample standard deviation
* `profiled_at`: Profile calculation date and time
Expand Down
5 changes: 5 additions & 0 deletions integration_tests/models/profile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ models:
- not_null:
where: *column_is_numeric

- name: median
tests:
- not_null:
where: *column_is_numeric

- name: profiled_at
tests:
- not_null
2 changes: 1 addition & 1 deletion integration_tests/models/profile_exclude_measures.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- depends_on: {{ ref("test_data_default") }}
{% if execute %}
{{ dbt_profiler.get_profile(relation=ref("test_data_default"), exclude_measures=["avg", "std_dev_population", "std_dev_sample"]) }}
{{ dbt_profiler.get_profile(relation=ref("test_data_default"), exclude_measures=["avg", "median", "std_dev_population", "std_dev_sample"]) }}
{% endif %}
2 changes: 1 addition & 1 deletion integration_tests/models/profile_exclude_measures.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ models:
- name: profile_exclude_measures
tests:
- dbt_expectations.expect_table_columns_to_not_contain_set:
column_list: ["avg", "std_dev_population", "std_dev_sample"]
column_list: ["avg", "median", "std_dev_population", "std_dev_sample"]
transform: lower
12 changes: 12 additions & 0 deletions macros/get_profile.sql
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"min",
"max",
"avg",
"median",
"std_dev_population",
"std_dev_sample"
] -%}
Expand Down Expand Up @@ -98,6 +99,9 @@
{% if "avg" not in exclude_measures -%}
{{ dbt_profiler.measure_avg(column_name, data_type) }} as avg,
{%- endif %}
{% if "median" not in exclude_measures -%}
{{ dbt_profiler.measure_median(column_name, data_type) }} as median,
{%- endif %}
{% if "std_dev_population" not in exclude_measures -%}
{{ dbt_profiler.measure_std_dev_population(column_name, data_type) }} as std_dev_population,
{%- endif %}
Expand Down Expand Up @@ -150,6 +154,7 @@
"min",
"max",
"avg",
"median",
"std_dev_population",
"std_dev_sample"
] -%}
Expand Down Expand Up @@ -234,6 +239,9 @@
{% if "avg" not in exclude_measures -%}
{{ dbt_profiler.measure_avg(column_name, data_type) }} as avg,
{%- endif %}
{% if "median" not in exclude_measures -%}
{{ dbt_profiler.measure_median(column_name, data_type) }} as median,
{%- endif %}
{% if "std_dev_population" not in exclude_measures -%}
{{ dbt_profiler.measure_std_dev_population(column_name, data_type) }} as std_dev_population,
{%- endif %}
Expand Down Expand Up @@ -288,6 +296,7 @@
"min",
"max",
"avg",
"median",
"std_dev_population",
"std_dev_sample"
] -%}
Expand Down Expand Up @@ -367,6 +376,9 @@
{% if "avg" not in exclude_measures -%}
{{ dbt_profiler.measure_avg(column_name, data_type) }} as avg,
{%- endif %}
{% if "median" not in exclude_measures -%}
{{ dbt_profiler.measure_median(column_name, data_type) }} as median,
{%- endif %}
{% if "std_dev_population" not in exclude_measures -%}
{{ dbt_profiler.measure_std_dev_population(column_name, data_type) }} as std_dev_population,
{%- endif %}
Expand Down
46 changes: 46 additions & 0 deletions macros/measures.sql
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,52 @@ case when count(distinct {{ adapter.quote(column_name) }}) = count(*) then 1 els
{%- endmacro -%}


{# measure_median ------------------------------------------------- #}

{%- macro measure_median(column_name, data_type) -%}
{#-
  Entry point for the median measure.

  Looks up the adapter-specific `measure_median` implementation in the
  `dbt_profiler` namespace via `adapter.dispatch` and returns whatever that
  implementation produces for the given column.

  Args:
    column_name: name of the column to profile (passed through unchanged).
    data_type:   data type of the column (passed through unchanged).
-#}
{%- set median_impl = adapter.dispatch("measure_median", macro_namespace="dbt_profiler") -%}
{{ return(median_impl(column_name, data_type)) }}
{%- endmacro -%}

{%- macro default__measure_median(column_name, data_type) -%}
{#-
  Default median implementation.

  Renders `median(<quoted column>)` when the data type is numeric and not a
  struct; for every other data type it renders a NULL cast to the adapter's
  numeric type so the measure column is still present.

  Args:
    column_name: name of the column to profile; quoted with `adapter.quote`.
    data_type:   data type of the column, used to decide whether a median applies.
-#}
{%- set median_applies = dbt_profiler.is_numeric_dtype(data_type) and not dbt_profiler.is_struct_dtype(data_type) -%}
{%- if not median_applies -%}
    cast(null as {{ dbt.type_numeric() }})
{%- else -%}
    median({{ adapter.quote(column_name) }})
{%- endif -%}
{%- endmacro -%}

{%- macro bigquery__measure_median(column_name, data_type) -%}
{#-
  BigQuery median implementation.

  For numeric, non-struct data types this renders the element at offset 50 of
  `APPROX_QUANTILES(<quoted column>, 100)`, i.e. an approximate median.
  Any other data type renders a NULL cast to the adapter's numeric type.

  Args:
    column_name: name of the column to profile; quoted with `adapter.quote`.
    data_type:   data type of the column, used to decide whether a median applies.
-#}
{#- Guard first: non-numeric or struct types get a typed NULL. -#}
{%- if not dbt_profiler.is_numeric_dtype(data_type) or dbt_profiler.is_struct_dtype(data_type) -%}
    cast(null as {{ dbt.type_numeric() }})
{%- else -%}
    APPROX_QUANTILES({{ adapter.quote(column_name) }}, 100)[OFFSET(50)]
{%- endif -%}
{%- endmacro -%}

{%- macro postgres__measure_median(column_name, data_type) -%}
{#-
  Postgres median implementation.

  For numeric, non-struct data types this renders the ordered-set aggregate
  `percentile_cont(0.5) within group (order by <quoted column>)`.
  Any other data type renders a NULL cast to the adapter's numeric type.

  Args:
    column_name: name of the column to profile; quoted with `adapter.quote`.
    data_type:   data type of the column, used to decide whether a median applies.
-#}
{%- set is_plain_numeric = dbt_profiler.is_numeric_dtype(data_type) and not dbt_profiler.is_struct_dtype(data_type) -%}
{%- if is_plain_numeric -%}
    {%- set ordered_column = adapter.quote(column_name) -%}
    percentile_cont(0.5) within group (order by {{ ordered_column }})
{%- else -%}
    cast(null as {{ dbt.type_numeric() }})
{%- endif -%}
{%- endmacro -%}

{%- macro sql_server__measure_median(column_name, data_type) -%}
{#-
  SQL Server median implementation.

  For numeric, non-struct data types this renders
  `percentile_cont(0.5) within group (order by <quoted column>) over ()`.
  Any other data type renders a NULL cast to the adapter's numeric type.

  T-SQL takes the percentile as the only argument of PERCENTILE_CONT and the
  ordering column in a WITHIN GROUP clause; the previous
  `percentile_cont(<column>, 0.5) over ()` form is not valid T-SQL.

  Args:
    column_name: name of the column to profile; quoted with `adapter.quote`.
    data_type:   data type of the column, used to decide whether a median applies.

  NOTE(review): `adapter.dispatch` resolves `<adapter type>__measure_median`.
  If the SQL Server adapter reports its type as `sqlserver`, this
  `sql_server__` prefix will not match and the default implementation is used
  instead -- confirm the adapter type name.

  NOTE(review): PERCENTILE_CONT is an analytic (window) function in T-SQL, not
  an aggregate. Verify that the calling query can select it next to aggregate
  measures; it may need to be computed in a separate subquery.
-#}
{%- if dbt_profiler.is_numeric_dtype(data_type) and not dbt_profiler.is_struct_dtype(data_type) -%}
    percentile_cont(0.5) within group (order by {{ adapter.quote(column_name) }}) over ()
{%- else -%}
    cast(null as {{ dbt.type_numeric() }})
{%- endif -%}

{%- endmacro -%}

{# measure_std_dev_population ------------------------------------------------- #}

{%- macro measure_std_dev_population(column_name, data_type) -%}
Expand Down

0 comments on commit b2cc1ad

Please sign in to comment.