Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to FULL OUTER JOIN for derived & ratio metrics #842

Merged
merged 8 commits into from
Nov 4, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Breaking Changes-20231102-182815.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Breaking Changes
body: Use FULL OUTER JOIN to combine input metrics for derived metrics. This is a change from using INNER JOIN and may result in changes in output.
time: 2023-11-02T18:28:15.181064-07:00
custom:
Author: courtneyholcomb
Issue: "842"
4 changes: 2 additions & 2 deletions metricflow/dataflow/builder/dataflow_plan_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,14 +241,14 @@ def _build_metrics_output_node(
f"For {metric.type} metric: {metric_spec}, needed metrics are:\n"
f"{pformat_big_objects(metric_input_specs=metric_input_specs)}"
)

join_type = SqlJoinType.FULL_OUTER if metric.type is MetricType.DERIVED else SqlJoinType.INNER
compute_metrics_node = ComputeMetricsNode(
parent_node=self._build_metrics_output_node(
metric_specs=metric_input_specs,
queried_linkable_specs=queried_linkable_specs,
where_constraint=where_constraint,
time_range_constraint=time_range_constraint,
combine_metrics_join_type=SqlJoinType.INNER,
combine_metrics_join_type=join_type,
),
metric_specs=[metric_spec],
)
Expand Down
71 changes: 39 additions & 32 deletions metricflow/test/integration/test_cases/itest_metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ integration_test:
FROM {{source_schema}}.fct_bookings
GROUP BY ds
) b
JOIN (
FULL OUTER JOIN (
SELECT
SUM(1) AS views
, ds
Expand Down Expand Up @@ -519,7 +519,7 @@ integration_test:
GROUP BY
ds
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
SUM(1) AS lux_listings
, created_at AS metric_time__day
Expand Down Expand Up @@ -643,27 +643,34 @@ integration_test:
group_bys: [listing__is_lux_latest]
check_query: |
SELECT
bk.booking_value / NULLIF(vw.views, 0) AS booking_value_per_view
, bk.is_lux AS listing__is_lux_latest
booking_value / NULLIF(views, 0) AS booking_value_per_view
, listing__is_lux_latest
FROM (
SELECT
SUM(a.booking_value) AS booking_value
,b.is_lux
FROM {{ source_schema }}.fct_bookings a
LEFT OUTER JOIN {{ source_schema }}.dim_listings_latest b
ON a.listing_id = b.listing_id
GROUP BY 2
) bk
INNER JOIN (
SELECT
SUM(1) AS views
,d.is_lux
FROM {{ source_schema }}.fct_views c
LEFT OUTER JOIN {{ source_schema }}.dim_listings_latest d
ON c.listing_id = d.listing_id
GROUP BY 2
) vw
ON bk.is_lux = vw.is_lux OR (bk.is_lux IS NULL AND vw.is_lux IS NULL)
MAX(bk.booking_value) AS booking_value
, MAX(vw.views) AS views
, COALESCE(bk.is_lux, vw.is_lux) AS listing__is_lux_latest
FROM (
SELECT
SUM(a.booking_value) AS booking_value
,b.is_lux
FROM {{ source_schema }}.fct_bookings a
LEFT OUTER JOIN {{ source_schema }}.dim_listings_latest b
ON a.listing_id = b.listing_id
GROUP BY 2
) bk
FULL OUTER JOIN (
SELECT
SUM(1) AS views
,d.is_lux
FROM {{ source_schema }}.fct_views c
LEFT OUTER JOIN {{ source_schema }}.dim_listings_latest d
ON c.listing_id = d.listing_id
GROUP BY 2
) vw
ON bk.is_lux = vw.is_lux
GROUP BY 3
) x
---
integration_test:
name: derived_metric_with_offset_window
Expand All @@ -683,7 +690,7 @@ integration_test:
GROUP BY
metric_time__day
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
c.ds AS metric_time__day
, d.bookings_2_weeks_ago AS bookings_2_weeks_ago
Expand Down Expand Up @@ -718,7 +725,7 @@ integration_test:
GROUP BY
metric_time__day
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
c.ds AS metric_time__day
, d.bookings_at_start_of_month AS bookings_at_start_of_month
Expand Down Expand Up @@ -760,7 +767,7 @@ integration_test:
) f
ON {{ render_date_sub("g", "ds", 1, TimeGranularity.MONTH) }} = f.metric_time__day
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
c.ds AS metric_time__day
, d.bookings AS month_start_bookings
Expand Down Expand Up @@ -808,7 +815,7 @@ integration_test:
check_query: |
SELECT
booking_value - instant_booking_value AS booking_value_sub_instant
, a.metric_time__day
, COALESCE(a.metric_time__day, b.metric_time__day) AS metric_time__day
FROM (
SELECT
SUM(booking_value) AS instant_booking_value
Expand All @@ -818,7 +825,7 @@ integration_test:
GROUP BY
ds
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
SUM(booking_value) AS booking_value
, ds AS metric_time__day
Expand All @@ -843,7 +850,7 @@ integration_test:
FROM (
SELECT
booking_value - instant_booking_value AS booking_value_sub_instant
, a.metric_time__day AS metric_time__day
, COALESCE(a.metric_time__day, b.metric_time__day) AS metric_time__day
FROM (
SELECT
SUM(booking_value) AS instant_booking_value
Expand All @@ -853,7 +860,7 @@ integration_test:
GROUP BY
ds
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
SUM(booking_value) AS booking_value
, ds AS metric_time__day
Expand Down Expand Up @@ -901,7 +908,7 @@ integration_test:
FROM {{ source_schema }}.fct_bookings
GROUP BY metric_time__week
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
{{ render_date_trunc("c.ds", TimeGranularity.WEEK) }} AS metric_time__week
, SUM(d.bookings_at_start_of_month) AS bookings_at_start_of_month
Expand Down Expand Up @@ -944,7 +951,7 @@ integration_test:
ON {{ render_date_sub("g", "ds", 1, TimeGranularity.MONTH) }} = f.metric_time__day
GROUP BY metric_time__year
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
{{ render_date_trunc("c.ds", TimeGranularity.YEAR) }} AS metric_time__year
, SUM(d.bookings) AS month_start_bookings
Expand Down Expand Up @@ -1024,7 +1031,7 @@ integration_test:
FROM {{ source_schema }}.fct_bookings
GROUP BY metric_time__week, metric_time__month
) a
INNER JOIN (
FULL OUTER JOIN (
SELECT
{{ render_date_trunc("c.ds", TimeGranularity.WEEK) }} AS metric_time__week
, {{ render_date_trunc("c.ds", TimeGranularity.MONTH) }} AS metric_time__month
Expand Down Expand Up @@ -1290,7 +1297,7 @@ integration_test:
) subq_3
ON subq_5.ds = subq_3.metric_time__day
) subq_7
INNER JOIN (
FULL OUTER JOIN (
SELECT
subq_11.ds AS metric_time__day
, SUM(subq_9.bookings) AS bookings_2_weeks_ago
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
<CombineMetricsNode>
<!-- description = Combine Metrics -->
<!-- node_id = cbm_0 -->
<!-- join type = SqlJoinType.INNER -->
<!-- join type = SqlJoinType.FULL_OUTER -->
<!-- de-duplication method = post-join aggregation across all dimensions -->
<ComputeMetricsNode>
<!-- description = Compute Metrics via Expressions -->
<!-- node_id = cm_0 -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
<CombineMetricsNode>
<!-- description = Combine Metrics -->
<!-- node_id = cbm_0 -->
<!-- join type = SqlJoinType.INNER -->
<!-- join type = SqlJoinType.FULL_OUTER -->
<!-- de-duplication method = post-join aggregation across all dimensions -->
<ComputeMetricsNode>
<!-- description = Compute Metrics via Expressions -->
<!-- node_id = cm_0 -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
<CombineMetricsNode>
<!-- description = Combine Metrics -->
<!-- node_id = cbm_0 -->
<!-- join type = SqlJoinType.INNER -->
<!-- join type = SqlJoinType.FULL_OUTER -->
<!-- de-duplication method = post-join aggregation across all dimensions -->
<ComputeMetricsNode>
<!-- description = Compute Metrics via Expressions -->
<!-- node_id = cm_0 -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ FROM (
-- Combine Metrics
SELECT
COALESCE(subq_4.metric_time__day, subq_9.metric_time__day) AS metric_time__day
, subq_4.ref_bookings AS ref_bookings
, subq_9.bookings AS bookings
, MAX(subq_4.ref_bookings) AS ref_bookings
, MAX(subq_9.bookings) AS bookings
FROM (
-- Compute Metrics via Expressions
SELECT
Expand Down Expand Up @@ -224,7 +224,7 @@ FROM (
metric_time__day
) subq_3
) subq_4
INNER JOIN (
FULL OUTER JOIN (
-- Compute Metrics via Expressions
SELECT
subq_8.metric_time__day
Expand Down Expand Up @@ -441,13 +441,7 @@ FROM (
) subq_8
) subq_9
ON
(
Copy link
Contributor Author

@courtneyholcomb courtneyholcomb Nov 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tlento do you have context on why for INNER JOIN we add this NULL column comparison, but for FULL OUTER JOIN we don't? Wondering if that's intentional or something I should fix!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The NULL comparison is needed because NULL = NULL evaluates to NULL, which is false-y in SQL comparisons, but we have to include sets in the INNER JOIN where the dimension values on both sides are NULL. If a row doesn't match on an INNER JOIN it gets discarded, so we must handle that case in the join condition.

For FULL OUTER JOIN we can't do OR conditions (or inequalities) - a bunch of our engines don't support them. Everything has to be a strict intersection of equality checks. However, if you do a FULL OUTER JOIN and a comparison fails due to a NULL = NULL check, you still get the rows from both sides. They're just not matched to anything, so you effectively get one row per aggregated measure source. That's why we have all these MAX expressions, because that's the hack I put in to deduplicate those output rows.

I have to say I find all of this absurd, but that is where we are.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thank you!! Super helpful

subq_4.metric_time__day = subq_9.metric_time__day
) OR (
(
subq_4.metric_time__day IS NULL
) AND (
subq_9.metric_time__day IS NULL
)
)
subq_4.metric_time__day = subq_9.metric_time__day
GROUP BY
metric_time__day
) subq_10
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ FROM (
-- Combine Metrics
SELECT
COALESCE(subq_15.metric_time__day, subq_20.metric_time__day) AS metric_time__day
, subq_15.ref_bookings AS ref_bookings
, subq_20.bookings AS bookings
, MAX(subq_15.ref_bookings) AS ref_bookings
, MAX(subq_20.bookings) AS bookings
FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
Expand All @@ -27,7 +27,7 @@ FROM (
GROUP BY
metric_time__day
) subq_15
INNER JOIN (
FULL OUTER JOIN (
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
Expand All @@ -47,13 +47,7 @@ FROM (
metric_time__day
) subq_20
ON
(
subq_15.metric_time__day = subq_20.metric_time__day
) OR (
(
subq_15.metric_time__day IS NULL
) AND (
subq_20.metric_time__day IS NULL
)
)
subq_15.metric_time__day = subq_20.metric_time__day
GROUP BY
metric_time__day
) subq_21
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ FROM (
-- Combine Metrics
SELECT
COALESCE(subq_4.metric_time__day, subq_12.metric_time__day) AS metric_time__day
, subq_4.bookings AS bookings
, subq_12.bookings_at_start_of_month AS bookings_at_start_of_month
, MAX(subq_4.bookings) AS bookings
, MAX(subq_12.bookings_at_start_of_month) AS bookings_at_start_of_month
FROM (
-- Compute Metrics via Expressions
SELECT
Expand Down Expand Up @@ -224,7 +224,7 @@ FROM (
metric_time__day
) subq_3
) subq_4
INNER JOIN (
FULL OUTER JOIN (
-- Compute Metrics via Expressions
SELECT
subq_11.metric_time__day
Expand Down Expand Up @@ -541,13 +541,7 @@ FROM (
) subq_11
) subq_12
ON
(
subq_4.metric_time__day = subq_12.metric_time__day
) OR (
(
subq_4.metric_time__day IS NULL
) AND (
subq_12.metric_time__day IS NULL
)
)
subq_4.metric_time__day = subq_12.metric_time__day
GROUP BY
metric_time__day
) subq_13
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ FROM (
-- Combine Metrics
SELECT
COALESCE(subq_18.metric_time__day, subq_26.metric_time__day) AS metric_time__day
, subq_18.bookings AS bookings
, subq_26.bookings_at_start_of_month AS bookings_at_start_of_month
, MAX(subq_18.bookings) AS bookings
, MAX(subq_26.bookings_at_start_of_month) AS bookings_at_start_of_month
FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
Expand All @@ -27,7 +27,7 @@ FROM (
GROUP BY
metric_time__day
) subq_18
INNER JOIN (
FULL OUTER JOIN (
-- Join to Time Spine Dataset
-- Pass Only Elements:
-- ['bookings', 'metric_time__day']
Expand All @@ -51,13 +51,7 @@ FROM (
metric_time__day
) subq_26
ON
(
subq_18.metric_time__day = subq_26.metric_time__day
) OR (
(
subq_18.metric_time__day IS NULL
) AND (
subq_26.metric_time__day IS NULL
)
)
subq_18.metric_time__day = subq_26.metric_time__day
GROUP BY
metric_time__day
) subq_27
Loading