From c0978289474c9cc87ee046a87b4c2132cf40d0b1 Mon Sep 17 00:00:00 2001 From: dgitis Date: Sat, 13 Apr 2024 16:29:47 -0700 Subject: [PATCH 01/12] user base tables partial --- macros/base_select.sql | 47 ++++++++++++++++++- .../base/base_ga4__pseudonymous_users.sql | 30 ++++++++++++ models/staging/src_ga4.yml | 8 +++- 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 models/staging/base/base_ga4__pseudonymous_users.sql diff --git a/macros/base_select.sql b/macros/base_select.sql index 1f374539..ca8ba231 100644 --- a/macros/base_select.sql +++ b/macros/base_select.sql @@ -159,4 +159,49 @@ WHEN event_name = 'purchase' THEN 1 ELSE 0 END AS is_purchase -{% endmacro %} \ No newline at end of file +{% endmacro %} + +{% macro base_select_usr_source() %} + {{ return(adapter.dispatch('base_select_usr_source', 'ga4')()) }} +{% endmacro %} + +{% macro default__base_select_usr_source() %} + , user_info.last_active_timestamp_micros + , user_info.user_first_touch_timestamp_micros + , user_info.first_purchase_date + , device.operating_system + , device.category + , device.mobile_brand_name + , device.mobile_model_name + , device.unified_screen_name + , geo.city + , geo.country + , geo.continent + , geo.region + , user_ltv.revenue_in_usd + , user_ltv.sessions + , user_ltv.engagement_time_millis + , user_ltv.purchases + , user_ltv.engaged_sessions + , user_ltv.session_duration_micros + , predictions.in_app_purchase_score_7d + , predictions.purchase_score_7d + , predictions.churn_score_7d + , predictions.revenue_28d_in_usd + , privacy_info.is_limited_ad_tracking + , privacy_info.is_ads_personalization_allowed + , occurrence_date + , last_updated_date + {% for up in var('user_properties', []) %} -- don't have sample data; need to verify + , (select value.string_value from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_string_value + , (select value.set_timestamp_micros from unnest(user_properties) where key = '{{up}}') as {{up | 
lower | replace(" ", "_")}}_set_timestamp_micros + , (select value.user_property_name from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_user_property_name + {% endfor %} + {% for aud in var('audiences', []) %} -- this should be good, though + , (select id from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_id + , (select name from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_name + , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros + , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros + , (select npa from unnest(audiences) where name = '{{aud}}') as {{aud | replace(" ", "_")}}_npa + {% endfor %} +{% endmacro %} diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql new file mode 100644 index 00000000..82a99e4f --- /dev/null +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -0,0 +1,30 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days')) %} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} + +{{ + config( + -- todo multi-site + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by={ + "field": "occurence_date", + "data_type": "date", + }, + partitions = partitions_to_replace, + ) +}} + +with source as ( + select + pseudo_user_id + , stream_id + {{ ga4.base_select_usr_source() }} + from {{ source('ga4', 'pseudonymous_users') }} + {% if is_incremental() %} + where parse_date('%Y%m%d', left(replace(_table_suffix, 'intraday_', ''), 8)) in ({{ partitions_to_replace | join(',') }}) + {% 
endif %} +) + +select * from source diff --git a/models/staging/src_ga4.yml b/models/staging/src_ga4.yml index 29104767..e3eb3059 100644 --- a/models/staging/src_ga4.yml +++ b/models/staging/src_ga4.yml @@ -13,4 +13,10 @@ sources: tables: - name: events identifier: events_* # Scan across all sharded event tables. Use the 'start_date' variable to limit this scan - description: Main events table exported by GA4. Sharded by date. \ No newline at end of file + description: Main events table exported by GA4. Sharded by date. + - name: pseudonymous_users + identifier: pseudonymous_users_* + description: Daily sharded pseudonymous_users (client_id) table exported by GA4 + - name: users + identifier: users_* + description: Daily sharded users (user_id) table exported by GA4 \ No newline at end of file From 8571d400ed8121a46b5fceb0af55ce7909610f9b Mon Sep 17 00:00:00 2001 From: dgitis Date: Fri, 19 Apr 2024 16:05:53 -0700 Subject: [PATCH 02/12] combine_property_data --- macros/combine_property_data.sql | 55 +++++++++++++------ .../base/base_ga4__pseudonymous_users.sql | 2 +- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index 67ef31cc..623858a5 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -14,25 +14,44 @@ {% set earliest_shard_to_retrieve = var('start_date')|int %} {% endif %} - {% for property_id in var('property_ids') %} {%- set schema_name = "analytics_" + property_id|string -%} - {# Copy intraday tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE 
`{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {# Copy daily tables and drop old intraday table #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; - DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; - {%- endif -%} - {% endfor %} + {% if this == 'base_ga4__events' %} + {# Copy intraday tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {# Copy daily tables and drop old intraday table #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_', '') -%} + 
{%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; + DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + {%- endif -%} + {% endfor %} + {% elif this == 'base_ga4__pseudonymous_users' %} + {# Copy pseudonymous_users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% elif this == 'base_ga4__users' %} + {# Copy users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% endif %} {% endfor %} {% endmacro %} \ No newline at end of file diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql index 82a99e4f..82f8c6cd 100644 
--- a/models/staging/base/base_ga4__pseudonymous_users.sql +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -5,7 +5,7 @@ {{ config( - -- todo multi-site + pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", materialized = 'incremental', incremental_strategy = 'insert_overwrite', partition_by={ From 1c83c7db8e22f289f67dcc354db3f8dda8af0763 Mon Sep 17 00:00:00 2001 From: dgitis Date: Fri, 3 May 2024 16:26:35 -0700 Subject: [PATCH 03/12] combine_property_data macro --- macros/base_select.sql | 14 +-- macros/combine_back.sql | 104 ++++++++++++++++++ macros/combine_property_data.sql | 44 ++++---- .../base/base_ga4__pseudonymous_users.sql | 4 +- 4 files changed, 138 insertions(+), 28 deletions(-) create mode 100644 macros/combine_back.sql diff --git a/macros/base_select.sql b/macros/base_select.sql index ca8ba231..dd4fc92f 100644 --- a/macros/base_select.sql +++ b/macros/base_select.sql @@ -190,18 +190,18 @@ , predictions.revenue_28d_in_usd , privacy_info.is_limited_ad_tracking , privacy_info.is_ads_personalization_allowed - , occurrence_date - , last_updated_date + , parse_date('%Y%m%d' , occurrence_date) as occurrence_date + , parse_date('%Y%m%d' , last_updated_date) as last_updated_date {% for up in var('user_properties', []) %} -- don't have sample data; need to verify , (select value.string_value from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_string_value , (select value.set_timestamp_micros from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_set_timestamp_micros , (select value.user_property_name from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_user_property_name {% endfor %} {% for aud in var('audiences', []) %} -- this should be good, though - , (select id from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_id - , (select name from unnest(audiences) where name = 
'{{aud}}') as {{aud | lower | replace(" ", "_")}}_name - , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros - , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as {{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros - , (select npa from unnest(audiences) where name = '{{aud}}') as {{aud | replace(" ", "_")}}_npa + , (select id from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_id + , (select name from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_name + , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros + , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros + , (select npa from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_npa {% endfor %} {% endmacro %} diff --git a/macros/combine_back.sql b/macros/combine_back.sql new file mode 100644 index 00000000..dfee6656 --- /dev/null +++ b/macros/combine_back.sql @@ -0,0 +1,104 @@ + + +{%- macro combine_property_data() -%} + {{ return(adapter.dispatch('combine_property_data', 'ga4')()) }} +{%- endmacro -%} + +{% macro default__combine_property_data() %} + + create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; + + {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} + + create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; + + {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} + {% if not should_full_refresh() %} + {% set earliest_shard_to_retrieve = 
(modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} + {% set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} + {% else %} + {# Otherwise use 'start_date' variable #} + + {% set earliest_shard_to_retrieve = var('start_date')|int %} + {# Otherwise use 'start_date' variable #} + + {% set earliest_shard_to_retrieve = var('start_date')|int %} + {% endif %} + {% for property_id in var('property_ids') %} + {%- set schema_name = "analytics_" + property_id|string -%} + {% if this.name == 'base_ga4__events' %} + {# Copy intraday tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {# Copy daily tables and drop old intraday table #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; + DROP TABLE IF EXISTS 
`{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + {%- endif -%} + {% endfor %} + {% elif this.name == 'base_ga4__pseudonymous_users' %} + {# Copy pseudonymous_users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% elif this.name == 'base_ga4__users' %} + {# Copy users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% endif %} + {% if this.name == 'base_ga4__events' %} + {# Copy intraday tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE 
`{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {# Copy daily tables and drop old intraday table #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; + DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + {%- endif -%} + {% endfor %} + {% elif this.name == 'base_ga4__pseudonymous_users' %} + {# Copy pseudonymous_users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% elif this.name == 'base_ga4__users' %} + {# Copy users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set 
relation_suffix = relation.identifier|replace('users_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {% endif %} + {% endfor %} +{% endmacro %} \ No newline at end of file diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index 623858a5..fd83fc33 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -3,55 +3,61 @@ {%- endmacro -%} {% macro default__combine_property_data() %} - - create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; - - {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} {% if not should_full_refresh() %} - {% set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} + {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} + {%- set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int -%} {% else %} - {# Otherwise use 'start_date' variable #} - - {% set earliest_shard_to_retrieve = var('start_date')|int %} + {# Otherwise use 'start_date' variable #} + {%- set earliest_shard_to_retrieve = var('start_date')|int -%} {% endif %} {% for property_id in var('property_ids') %} {%- set schema_name = "analytics_" + property_id|string -%} - {% if this == 'base_ga4__events' %} + {%- set combine_specified_property_data_query -%} + create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; + {% if this.name == 'base_ga4__events' %} {# Copy intraday tables #} {%- set relations = 
dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} {% for relation in relations %} {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + create or replace table `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; {%- endif -%} {% endfor %} + {# Copy daily tables and drop old intraday table #} {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} {% for relation in relations %} {%- set relation_suffix = relation.identifier|replace('events_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; - DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + create or replace table `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; + drop table if exists `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; {%- endif -%} {% endfor %} - {% elif this == 'base_ga4__pseudonymous_users' %} + {% elif this.name == 'base_ga4__pseudonymous_users' %} {# Copy 
pseudonymous_users tables #} {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} {% for relation in relations %} {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; + create or replace table `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; {%- endif -%} {% endfor %} - {% elif this == 'base_ga4__users' %} - {# Copy users tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} + {% elif this.name == 'base_ga4__users' %} + {# Copy users tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', database=var('source_project')) -%} {% for relation in relations %} {%- set relation_suffix = relation.identifier|replace('users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; + create or replace table `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; {%- endif -%} {% endfor %} {% endif %} + {%- endset -%} + {% do run_query(combine_specified_property_data_query) 
%} + + {% if execute %} + {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ ".events_*` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ ".events_YYYYMMDD" ~ property_id ~ "`.", True) }} + {% endif %} {% endfor %} -{% endmacro %} \ No newline at end of file +{% endmacro %} + diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql index 82f8c6cd..9f50b179 100644 --- a/models/staging/base/base_ga4__pseudonymous_users.sql +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -2,14 +2,14 @@ {% for i in range(var('static_incremental_days')) %} {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} {% endfor %} - +{{ log("this.name: " ~ this.name, True)}} {{ config( pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", materialized = 'incremental', incremental_strategy = 'insert_overwrite', partition_by={ - "field": "occurence_date", + "field": "occurrence_date", "data_type": "date", }, partitions = partitions_to_replace, From 459c4bf221d7fe2b8f3eef7c28519fcfa572b5d6 Mon Sep 17 00:00:00 2001 From: dgitis Date: Fri, 3 May 2024 16:28:37 -0700 Subject: [PATCH 04/12] minor cleanup --- macros/combine_back.sql | 104 ------------------------------- macros/combine_property_data.sql | 2 +- 2 files changed, 1 insertion(+), 105 deletions(-) delete mode 100644 macros/combine_back.sql diff --git a/macros/combine_back.sql b/macros/combine_back.sql deleted file mode 100644 index dfee6656..00000000 --- a/macros/combine_back.sql +++ /dev/null @@ -1,104 +0,0 @@ - - -{%- macro combine_property_data() -%} - {{ return(adapter.dispatch('combine_property_data', 'ga4')()) }} -{%- endmacro -%} - -{% macro default__combine_property_data() %} - - create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; - - {# If incremental, then use static_incremental_days variable 
to find earliest shard to copy #} - - create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; - - {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} - {% if not should_full_refresh() %} - {% set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} - {% set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} - {% else %} - {# Otherwise use 'start_date' variable #} - - {% set earliest_shard_to_retrieve = var('start_date')|int %} - {# Otherwise use 'start_date' variable #} - - {% set earliest_shard_to_retrieve = var('start_date')|int %} - {% endif %} - {% for property_id in var('property_ids') %} - {%- set schema_name = "analytics_" + property_id|string -%} - {% if this.name == 'base_ga4__events' %} - {# Copy intraday tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {# Copy daily tables and drop old intraday table #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_', '') -%} - {%- if relation_suffix|int >= 
earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; - DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; - {%- endif -%} - {% endfor %} - {% elif this.name == 'base_ga4__pseudonymous_users' %} - {# Copy pseudonymous_users tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {% elif this.name == 'base_ga4__users' %} - {# Copy users tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('users_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {% endif %} - {% if this.name == 'base_ga4__events' %} - {# Copy intraday tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', database=var('source_project')) -%} - 
{% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {# Copy daily tables and drop old intraday table #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('events_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; - DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; - {%- endif -%} - {% endfor %} - {% elif this.name == 'base_ga4__pseudonymous_users' %} - {# Copy pseudonymous_users tables #} - {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {% elif this.name == 'base_ga4__users' %} - {# Copy users tables #} - {%- set relations = 
dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='users_%', exclude='pseudonymous_users_%', database=var('source_project')) -%} - {% for relation in relations %} - {%- set relation_suffix = relation.identifier|replace('users_', '') -%} - {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} - CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; - {%- endif -%} - {% endfor %} - {% endif %} - {% endfor %} -{% endmacro %} \ No newline at end of file diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index fd83fc33..53179f54 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -54,7 +54,7 @@ {% endif %} {%- endset -%} {% do run_query(combine_specified_property_data_query) %} - + -- Log needs to be adjusted for different source and target tables {% if execute %} {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ ".events_*` to `" ~ target.project ~ "." 
~ var('combined_dataset') ~ ".events_YYYYMMDD" ~ property_id ~ "`.", True) }} {% endif %} From 417557365a48d51a838b8f9994f33c0eda38cb3c Mon Sep 17 00:00:00 2001 From: dgitis Date: Fri, 19 Apr 2024 16:05:53 -0700 Subject: [PATCH 05/12] combine_property_data --- macros/combine_property_data.sql | 10 ++++++++-- models/staging/base/base_ga4__pseudonymous_users.sql | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index 53179f54..e0e5e6fc 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -12,6 +12,7 @@ {% endif %} {% for property_id in var('property_ids') %} {%- set schema_name = "analytics_" + property_id|string -%} + {% set modifications = [] %} {%- set combine_specified_property_data_query -%} create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; {% if this.name == 'base_ga4__events' %} @@ -21,6 +22,7 @@ {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + {% do modifications.append( {'source_partition': 'events_intraday_' + relation_suffix , 'destination_partition': 'events_intraday_' + relation_suffix + property_id } ) %} {%- endif -%} {% endfor %} @@ -31,6 +33,7 @@ {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; drop table if exists `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + {% do modifications.append( {'source_partition': 
'events_' + relation_suffix , 'destination_partition': 'events_' + relation_suffix + property_id } ) %} {%- endif -%} {% endfor %} {% elif this.name == 'base_ga4__pseudonymous_users' %} @@ -40,6 +43,7 @@ {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; + {% do modifications.append( {'source_partition': 'pseudonymous_users_' + relation_suffix , 'destination_partition': 'pseudonymous_users_' + relation_suffix + property_id } ) %} {%- endif -%} {% endfor %} {% elif this.name == 'base_ga4__users' %} @@ -49,14 +53,16 @@ {%- set relation_suffix = relation.identifier|replace('users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; + {% do modifications.append( {'source_partition': 'users_' + relation_suffix , 'destination_partition': 'users_' + relation_suffix + property_id } ) %} {%- endif -%} {% endfor %} {% endif %} {%- endset -%} {% do run_query(combine_specified_property_data_query) %} - -- Log needs to be adjusted for different source and target tables {% if execute %} - {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ ".events_*` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ ".events_YYYYMMDD" ~ property_id ~ "`.", True) }} + {% for modification in modifications%} + {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ "." ~ modification.source_partition ~"` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ "." 
~ modification.destination_partition ~ "`.", True) }} + {% endfor %} {% endif %} {% endfor %} {% endmacro %} diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql index 9f50b179..830f11a1 100644 --- a/models/staging/base/base_ga4__pseudonymous_users.sql +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -2,7 +2,6 @@ {% for i in range(var('static_incremental_days')) %} {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} {% endfor %} -{{ log("this.name: " ~ this.name, True)}} {{ config( pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", @@ -23,7 +22,7 @@ with source as ( {{ ga4.base_select_usr_source() }} from {{ source('ga4', 'pseudonymous_users') }} {% if is_incremental() %} - where parse_date('%Y%m%d', left(replace(_table_suffix, 'intraday_', ''), 8)) in ({{ partitions_to_replace | join(',') }}) + where parse_date('%Y%m%d', right(_table_suffix, 8)) in ({{ partitions_to_replace | join(',') }}) {% endif %} ) From a7aa28ee7def8ac382dd241c555169ae4cb7acae Mon Sep 17 00:00:00 2001 From: dgitis Date: Wed, 17 Jul 2024 15:18:22 -0700 Subject: [PATCH 06/12] working multi-site user tables --- macros/combine_property_data.sql | 3 ++- models/staging/base/base_ga4__pseudonymous_users.sql | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index e0e5e6fc..7ce1859c 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -39,6 +39,7 @@ {% elif this.name == 'base_ga4__pseudonymous_users' %} {# Copy pseudonymous_users tables #} {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='pseudonymous_users_%', database=var('source_project')) -%} + {{ log("Relations: " ~ relations ) }} {% for relation in relations %} {%- set relation_suffix = 
relation.identifier|replace('pseudonymous_users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} @@ -61,7 +62,7 @@ {% do run_query(combine_specified_property_data_query) %} {% if execute %} {% for modification in modifications%} - {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ "." ~ modification.source_partition ~"` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ "." ~ modification.destination_partition ~ "`.", True) }} + {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ "." ~ modification.source_partition ~"` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ "." ~ modification.destination_partition ~"`", True) }} {% endfor %} {% endif %} {% endfor %} diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql index 830f11a1..b091dfc7 100644 --- a/models/staging/base/base_ga4__pseudonymous_users.sql +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -22,7 +22,7 @@ with source as ( {{ ga4.base_select_usr_source() }} from {{ source('ga4', 'pseudonymous_users') }} {% if is_incremental() %} - where parse_date('%Y%m%d', right(_table_suffix, 8)) in ({{ partitions_to_replace | join(',') }}) + where parse_date('%Y%m%d', left(_table_suffix, 8)) in ({{ partitions_to_replace | join(',') }}) {% endif %} ) From de6850650d16be8241a5195efb795b4bdec32467 Mon Sep 17 00:00:00 2001 From: dgitis Date: Wed, 17 Jul 2024 15:37:55 -0700 Subject: [PATCH 07/12] staging models --- macros/base_select.sql | 48 +++++++++---------- .../base/base_ga4__pseudonymous_users.sql | 1 + models/staging/base/base_ga4__users.sql | 29 +++++++++++ .../staging/stg_ga4__pseudonymous_users.sql | 11 +++++ models/staging/stg_ga4__users.sql | 9 ++++ 5 files changed, 74 insertions(+), 24 deletions(-) create mode 100644 models/staging/base/base_ga4__users.sql create mode 100644 models/staging/stg_ga4__pseudonymous_users.sql create mode 100644 
models/staging/stg_ga4__users.sql diff --git a/macros/base_select.sql b/macros/base_select.sql index dd4fc92f..53c4685b 100644 --- a/macros/base_select.sql +++ b/macros/base_select.sql @@ -166,30 +166,30 @@ {% endmacro %} {% macro default__base_select_usr_source() %} - , user_info.last_active_timestamp_micros - , user_info.user_first_touch_timestamp_micros - , user_info.first_purchase_date - , device.operating_system - , device.category - , device.mobile_brand_name - , device.mobile_model_name - , device.unified_screen_name - , geo.city - , geo.country - , geo.continent - , geo.region - , user_ltv.revenue_in_usd - , user_ltv.sessions - , user_ltv.engagement_time_millis - , user_ltv.purchases - , user_ltv.engaged_sessions - , user_ltv.session_duration_micros - , predictions.in_app_purchase_score_7d - , predictions.purchase_score_7d - , predictions.churn_score_7d - , predictions.revenue_28d_in_usd - , privacy_info.is_limited_ad_tracking - , privacy_info.is_ads_personalization_allowed + , user_info.last_active_timestamp_micros as user_info_last_active_timestamp_micros + , user_info.user_first_touch_timestamp_micros as user_info_user_first_touch_timestamp_micros + , user_info.first_purchase_date as user_info_first_purchase_date + , device.operating_system as device_operating_system + , device.category as device_category + , device.mobile_brand_name as device_mobile_brand_name + , device.mobile_model_name as device_mobile_model_name + , device.unified_screen_name as device_unified_sceen_name + , geo.city as geo_city + , geo.country as geo_country + , geo.continent as geo_continent + , geo.region as geo_region + , user_ltv.revenue_in_usd as user_ltv_revenue_in_usd + , user_ltv.sessions as user_ltv_sessions + , user_ltv.engagement_time_millis as user_ltv_engagement_time_millis + , user_ltv.purchases as user_ltv_purchases + , user_ltv.engaged_sessions as user_ltv_engaged_sessions + , user_ltv.session_duration_micros as user_ltv_session_duration_micros + , 
predictions.in_app_purchase_score_7d as predictions_in_app_purchase_score_7d + , predictions.purchase_score_7d as predictions_purchase_score_7d + , predictions.churn_score_7d as predictions_churn_score_7d + , predictions.revenue_28d_in_usd as predictions_revenue_28d_in_usd + , privacy_info.is_limited_ad_tracking as privacy_info_is_limited_ad_tracking + , privacy_info.is_ads_personalization_allowed as privacy_info_is_ads_personalization_allowed , parse_date('%Y%m%d' , occurrence_date) as occurrence_date , parse_date('%Y%m%d' , last_updated_date) as last_updated_date {% for up in var('user_properties', []) %} -- don't have sample data; need to verify diff --git a/models/staging/base/base_ga4__pseudonymous_users.sql b/models/staging/base/base_ga4__pseudonymous_users.sql index b091dfc7..37e11343 100644 --- a/models/staging/base/base_ga4__pseudonymous_users.sql +++ b/models/staging/base/base_ga4__pseudonymous_users.sql @@ -7,6 +7,7 @@ pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", materialized = 'incremental', incremental_strategy = 'insert_overwrite', + enabled=false, partition_by={ "field": "occurrence_date", "data_type": "date", diff --git a/models/staging/base/base_ga4__users.sql b/models/staging/base/base_ga4__users.sql new file mode 100644 index 00000000..00cda7d9 --- /dev/null +++ b/models/staging/base/base_ga4__users.sql @@ -0,0 +1,29 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days')) %} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} +{{ + config( + pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + enabled=false, + partition_by={ + "field": "occurrence_date", + "data_type": "date", + }, + partitions = partitions_to_replace, + ) +}} + +with source as ( + select + 
user_id + {{ ga4.base_select_usr_source() }} + from {{ source('ga4', 'users') }} + {% if is_incremental() %} + where parse_date('%Y%m%d', left(_table_suffix, 8)) in ({{ partitions_to_replace | join(',') }}) + {% endif %} +) + +select * from source diff --git a/models/staging/stg_ga4__pseudonymous_users.sql b/models/staging/stg_ga4__pseudonymous_users.sql new file mode 100644 index 00000000..b189d887 --- /dev/null +++ b/models/staging/stg_ga4__pseudonymous_users.sql @@ -0,0 +1,11 @@ +{{ + config( + materialized='view', + enabled=false + + ) +}} +select + * + , to_base64(md5(concat(user_pseudo_id, stream_id))) as client_key +from {{ref('base_ga4__pseudonymous_users')}} diff --git a/models/staging/stg_ga4__users.sql b/models/staging/stg_ga4__users.sql new file mode 100644 index 00000000..6e4fed2d --- /dev/null +++ b/models/staging/stg_ga4__users.sql @@ -0,0 +1,9 @@ +{{ + config( + materialized='view', + enabled=false + ) +}} +select + * +from {{ref('base_ga4__users')}} From 30a6cf225c0386cb40e6c7ecafdd354f1c8906aa Mon Sep 17 00:00:00 2001 From: dgitis Date: Wed, 17 Jul 2024 16:01:57 -0700 Subject: [PATCH 08/12] readme --- README.md | 138 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 97 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 5cf39293..6e0730cd 100644 --- a/README.md +++ b/README.md @@ -163,47 +163,6 @@ vars: value_type: "int_value" ``` -### User Properties - -User properties are provided by GA4 in the `user_properties` repeated field. 
The most recent user property for each user will be extracted and included in the `dim_ga4__users` model by configuring the `user_properties` variable in your project as follows: - -``` -vars: - ga4: - user_properties: - - user_property_name: "membership_level" - value_type: "int_value" - - user_property_name: "account_status" - value_type: "string_value" -``` - -### Derived User Properties - -Derived user properties are different from "User Properties" in that they are derived from event parameters. This provides additional flexibility in allowing users to turn any event parameter into a user property. - -Derived User Properties are included in the `dim_ga4__users` model and contain the latest event parameter value per user. - -``` -derived_user_properties: - - event_parameter: "[your event parameter]" - user_property_name: "[a unique name for the derived user property]" - value_type: "[string_value|int_value|float_value|double_value]" -``` - -For example: - -``` -vars: - ga4: - derived_user_properties: - - event_parameter: "page_location" - user_property_name: "most_recent_page_location" - value_type: "string_value" - - event_parameter: "another_event_param" - user_property_name: "most_recent_param" - value_type: "string_value" -``` - ### Derived Session Properties Derived session properties are similar to derived user properties, but on a per-session basis, for properties that change slowly over time. This provides additional flexibility in allowing users to turn any event parameter into a session property. @@ -280,6 +239,103 @@ vars: - name: "some_other_parameter" value_type: "string_value" ``` +# User Tables + +This package contains two sets of user tables: an original set of user tables implemented from the inception of this package and a new set of user tables designed to use the GA4 BigQuery user export tables that were released after this package was first launched. 
+ +The original user tables build one-row-per-user tables and include data like first and last device, first and last geo, user properties, and derived user properties. To build them, they need to process all-time data. Large sites might want to consider disabling these tables to save costs. + +The newer user tables leverage the GA4 user export setting. They are partitioned tables so they are more appropriate for high-traffic sites. They lose the first and last columns and derived user properties, but include user properties, audiences, user LTV, and predictive data. + +## Settings Common to Both Sets of User Tables + +### User Properties + +User properties are provided by GA4 in the `user_properties` repeated field. The most recent user property for each user will be extracted and included in the `dim_ga4__users` model by configuring the `user_properties` variable in your project as follows: + +``` +vars: + ga4: + user_properties: + - user_property_name: "membership_level" + value_type: "int_value" + - user_property_name: "account_status" + value_type: "string_value" +``` + +## dbt-GA4 Original User Table Settings + +### Derived User Properties + +Derived user properties are different from "User Properties" in that they are derived from event parameters. This provides additional flexibility in allowing users to turn any event parameter into a user property. + +Derived User Properties are included in the `dim_ga4__users` model and contain the latest event parameter value per user.
+ +``` +derived_user_properties: + - event_parameter: "[your event parameter]" + user_property_name: "[a unique name for the derived user property]" + value_type: "[string_value|int_value|float_value|double_value]" +``` + +For example: + +``` +vars: + ga4: + derived_user_properties: + - event_parameter: "page_location" + user_property_name: "most_recent_page_location" + value_type: "string_value" + - event_parameter: "another_event_param" + user_property_name: "most_recent_param" + value_type: "string_value" +``` + +## GA4 User Export Settings + +The GA4 user export models are disabled by default. + +Enable them by adding the following model configs: + +``` +models: + ga4: + staging: + base: + base_ga4__pseudonymous_users: + +enabled: true + base_ga4__users: + +enabled: true + stg_ga4__pseudonymous_users: + +enabled: true + stg_ga4__users: + +enabled: true +``` + +### Audiences + +The GA4 User Export includes an Audiences repeated record that stores the audience membership details. Audiences are enabled by adding a list of audience names that match values in the `audiences.name` fields of your `pseudonymous_users_` and `users_` tables as shown below. + +``` +vars: + ga4: + audiences: ['Purchases', 'All Users'] +``` + +This example will add the following columns to the relevant dbt-GA4 models: + +- audience_purchases_id +- audience_purchases_name +- audience_purchases_membership_start_timestamp_micros +- audience_purchases_membership_expiry_timestamp_micros +- audience_purchases_npa +- audience_all_users_id +- audience_all_users_name +- audience_all_users_membership_start_timestamp_micros +- audience_all_users_membership_expiry_timestamp_micros +- audience_all_users_npa + # Connecting to BigQuery This package assumes that BigQuery is the source of your GA4 data.
Full instructions for connecting DBT to BigQuery are here: https://docs.getdbt.com/reference/warehouse-profiles/bigquery-profile From 982cae7927923143bd05f0abf2877276295aebfb Mon Sep 17 00:00:00 2001 From: dgitis Date: Sat, 20 Jul 2024 16:04:01 -0700 Subject: [PATCH 09/12] documentation --- README.md | 2 +- macros/base_select.sql | 14 +-- .../base/base_ga4__pseudonymous_users.yml | 85 +++++++++++++++++++ models/staging/base/base_ga4__users.yml | 83 ++++++++++++++++++ models/staging/stg_ga4__client_keys.sql | 23 +++++ models/staging/stg_ga4__client_keys.yml | 14 +++ .../staging/stg_ga4__pseudonymous_users.sql | 11 --- models/staging/stg_ga4__users.sql | 12 +++ models/staging/stg_ga4__users.yml | 9 ++ 9 files changed, 229 insertions(+), 24 deletions(-) create mode 100644 models/staging/base/base_ga4__pseudonymous_users.yml create mode 100644 models/staging/base/base_ga4__users.yml create mode 100644 models/staging/stg_ga4__client_keys.sql create mode 100644 models/staging/stg_ga4__client_keys.yml delete mode 100644 models/staging/stg_ga4__pseudonymous_users.sql create mode 100644 models/staging/stg_ga4__users.yml diff --git a/README.md b/README.md index 6e0730cd..04379a97 100644 --- a/README.md +++ b/README.md @@ -307,7 +307,7 @@ models: +enabled: true base_ga4__users: +enabled: true - stg_ga4__pseudonymous_users: + stg_ga4__client_keys: +enabled: true stg_ga4__users: +enabled: true diff --git a/macros/base_select.sql b/macros/base_select.sql index 53c4685b..d07abbeb 100644 --- a/macros/base_select.sql +++ b/macros/base_select.sql @@ -192,16 +192,6 @@ , privacy_info.is_ads_personalization_allowed as privacy_info_is_ads_personalization_allowed , parse_date('%Y%m%d' , occurrence_date) as occurrence_date , parse_date('%Y%m%d' , last_updated_date) as last_updated_date - {% for up in var('user_properties', []) %} -- don't have sample data; need to verify - , (select value.string_value from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", 
"_")}}_string_value - , (select value.set_timestamp_micros from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_set_timestamp_micros - , (select value.user_property_name from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_user_property_name - {% endfor %} - {% for aud in var('audiences', []) %} -- this should be good, though - , (select id from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_id - , (select name from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_name - , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros - , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros - , (select npa from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_npa - {% endfor %} + , user_properties + , audiences {% endmacro %} diff --git a/models/staging/base/base_ga4__pseudonymous_users.yml b/models/staging/base/base_ga4__pseudonymous_users.yml new file mode 100644 index 00000000..59897faf --- /dev/null +++ b/models/staging/base/base_ga4__pseudonymous_users.yml @@ -0,0 +1,85 @@ +version: 2 + +models: + - name: base_ga4__pseudonymous_users + description: > + Base pseudo-user (client) model that pulls all fields from the pseudonymous user table of the user export. The pseudonymous user table is keyed on + the user_pseudo_id which is the cid parameter in Gtag calls and is the main parameter in the from which the dbt-GA4 client_id is + created. The table is partitioned by occurence_date. This model also flattens some fields. 
+ columns: + - name: pseudo_user_id + description: > + The user_pseudo_id is a unique identifier for a user that is not tied to any personal information. This is the main identifier + used in the GA4 property. This is the cid parameter in Gtag calls and is the main parameter from which the dbt-GA4 client_id is + created. + - name: stream_id + description: The numeric ID of the data stream from which the event originated. + - name: user_info_last_active_timestamp_micros + description: Date of the user's last activity (timestamp in microseconds). Flattened version of user_info.last_active_timestamp_micros. + - name: user_info_user_first_touch_timestamp_micros + description: Date of the user's first_open or first_visit event, whichever is earlier (timestamp in microseconds). Flattened version of user_info.user_first_touch_timestamp_micros. + - name: user_info_first_purchase_date + description: Date of the user's first purchase (YYYYMMDD). Flattened version of user_info.first_purchase_date. + - name: device_operating_system + description: Flattened version of device.operating_system. + - name: device_category + description: Category of the device (mobile, tablet, desktop). Flattened version of device.category. + - name: device_mobile_brand_name + description: Flattened version of device.mobile_brand_name. + - name: device_mobile_model_name + description: Flattened version of device.mobile_model_name. + - name: device_unified_sceen_name + description: Flattened version of device.unified_screen_name. + - name: geo_city + description: Flattened version of geo.city. + - name: geo_country + description: Flattened version of geo.country. + - name: geo_continent + description: Flattened version of geo.continent. + - name: geo_region + description: Flattened version of geo.region. + - name: user_ltv_revenue_in_usd + description: Flattened version of user_ltv.revenue_in_usd.
+ - name: user_ltv_sessions + description: Flattened version of user_ltv.sessions + - name: user_ltv_engagement_time_millis + description: Flattened version of user_ltv.engagement_time_millis + - name: user_ltv_purchases + description: Flattened version of user_ltv.purchases + - name: user_ltv_engaged_sessions + description: Flattened version of user_ltv.engaged_sessions + - name: user_ltv_session_duration_micros + description: Flattened version of user_ltv.session_duration_micros + - name: predictions_in_app_purchase_score_7d + description: > + Probability that a user who was active in the last 28 days will log an in_app_purchase event within the next 7 days. + Flattened version of predictions.in_app_purchase_score_7d. + - name: predictions_purchase_score_7d + description: > + Probability that a user who was active in the last 28 days will log a purchase event within the next 7 days. + Flattened version of predictions.purchase_score_7d. + - name: predictions_churn_score_7d + description: > + Probability that a user who was active on your app or site within the last 7 days will not be active within the next 7 days. + Flattened version of predictions.churn_score_7d. + - name: predictions_revenue_28d_in_usd + description: > + Revenue expected (in USD) from all purchase events within the next 28 days from a user who was active in the last 28 days. + Flattened version of predictions.revenue_28d_in_usd. + - name: privacy_info_is_limited_ad_tracking + description: > + The device's Limit Ad Tracking setting. Possible values include: 'true', 'false', and '(not set)'. isLimitedAdTracking returns '(not set)' if Google Analytics is not + currently able to return this device's Limit Ad Tracking setting. Flattened version of privacy_info.is_limited_ad_tracking. + - name: privacy_info_is_ads_personalization_allowed + description: > + If a user is eligible for ads personalization, isAdsPersonalizationAllowed returns 'true'.
If a user is not eligible for ads personalization, + isAdsPersonalizationAllowed returns 'false'. isAdsPersonalizationAllowed returns '(not set)' if Google Analytics is not currently able to + return whether this user is eligible for ads personalization; users where isAdsPersonalizationAllowed returns '(not set)' may or may not be + eligible for personalized ads. For personalized ads, you should treat users where isAdsPersonalizationAllowed = '(not set)' as isAdsPersonalizationAllowed = 'false' + because, in the most general case, some of the '(not set)' rows will include users that are not eligible for ads personalization. Users where + isAdsPersonalizationAllowed = 'false' can still be used for non-advertising use cases like A/B testing & data explorations. Flattened version of + privacy_info.is_ads_personalization_allowed. + - name: occurrence_date + description: Date when the record change was triggered. This is the partitioning column. + - name: last_updated_date + description: Date when the record was updated in the table. diff --git a/models/staging/base/base_ga4__users.yml b/models/staging/base/base_ga4__users.yml new file mode 100644 index 00000000..977d6076 --- /dev/null +++ b/models/staging/base/base_ga4__users.yml @@ -0,0 +1,83 @@ +version: 2 + +models: + - name: base_ga4__users + description: > + Base user model that pulls all fields from the users table of the user export. The users table is keyed on + the user_id rather than on the user_pseudo_id (the cid parameter in Gtag calls) that keys the pseudonymous users table. + The table is partitioned by occurrence_date. This model also flattens some fields. + columns: + - name: user_id + description: > + The user_id is the identifier of a user in the users table of the user export. This is the main identifier + of this model.
It takes the place of the pseudo_user_id column (the cid parameter in Gtag calls) that identifies rows in + base_ga4__pseudonymous_users. + - name: user_info_last_active_timestamp_micros + description: Date of the user's last activity (timestamp in microseconds). Flattened version of user_info.last_active_timestamp_micros. + - name: user_info_user_first_touch_timestamp_micros + description: Date of the user's first_open or first_visit event, whichever is earlier (timestamp in microseconds). Flattened version of user_info.user_first_touch_timestamp_micros. + - name: user_info_first_purchase_date + description: Date of the user's first purchase (YYYYMMDD). Flattened version of user_info.first_purchase_date. + - name: device_operating_system + description: Flattened version of device.operating_system. + - name: device_category + description: Category of the device (mobile, tablet, desktop). Flattened version of device.category. + - name: device_mobile_brand_name + description: Flattened version of device.mobile_brand_name. + - name: device_mobile_model_name + description: Flattened version of device.mobile_model_name. + - name: device_unified_sceen_name + description: Flattened version of device.unified_screen_name. + - name: geo_city + description: Flattened version of geo.city. + - name: geo_country + description: Flattened version of geo.country. + - name: geo_continent + description: Flattened version of geo.continent. + - name: geo_region + description: Flattened version of geo.region. + - name: user_ltv_revenue_in_usd + description: Flattened version of user_ltv.revenue_in_usd.
+ - name: user_ltv_sessions + description: Flattened version of user_ltv.sessions + - name: user_ltv_engagement_time_millis + description: Flattened version of user_ltv.engagement_time_millis + - name: user_ltv_purchases + description: Flattened version of user_ltv.purchases + - name: user_ltv_engaged_sessions + description: Flattened version of user_ltv.engaged_sessions + - name: user_ltv_session_duration_micros + description: Flattened version of user_ltv.session_duration_micros + - name: predictions_in_app_purchase_score_7d + description: > + Probability that a user who was active in the last 28 days will log an in_app_purchase event within the next 7 days. + Flattened version of predictions.in_app_purchase_score_7d. + - name: predictions_purchase_score_7d + description: > + Probability that a user who was active in the last 28 days will log a purchase event within the next 7 days. + Flattened version of predictions.purchase_score_7d. + - name: predictions_churn_score_7d + description: > + Probability that a user who was active on your app or site within the last 7 days will not be active within the next 7 days. + Flattened version of predictions.churn_score_7d. + - name: predictions_revenue_28d_in_usd + description: > + Revenue expected (in USD) from all purchase events within the next 28 days from a user who was active in the last 28 days. + Flattened version of predictions.revenue_28d_in_usd. + - name: privacy_info_is_limited_ad_tracking + description: > + The device's Limit Ad Tracking setting. Possible values include: 'true', 'false', and '(not set)'. isLimitedAdTracking returns '(not set)' if Google Analytics is not + currently able to return this device's Limit Ad Tracking setting. Flattened version of privacy_info.is_limited_ad_tracking. + - name: privacy_info_is_ads_personalization_allowed + description: > + If a user is eligible for ads personalization, isAdsPersonalizationAllowed returns 'true'.
If a user is not eligible for ads personalization, + isAdsPersonalizationAllowed returns 'false'. isAdsPersonalizationAllowed returns '(not set)' if Google Analytics is not currently able to + return whether this user is eligible for ads personalization; users where isAdsPersonalizationAllowed returns '(not set)' may or may not be + eligible for personalized ads. For personalized ads, you should treat users where isAdsPersonalizationAllowed = '(not set)' as isAdsPersonalizationAllowed = 'false' + because, in the most general case, some of the '(not set)' rows will include users that are not eligible for ads personalization. Users where + isAdsPersonalizationAllowed = 'false' can still be used for non-advertising use cases like A/B testing & data explorations. Flattened version of + privacy_info.is_ads_personalization_allowed. + - name: occurrence_date + description: Date when the record change was triggered. This is the partitioning column. + - name: last_updated_date + description: Date when the record was updated in the table. 
diff --git a/models/staging/stg_ga4__client_keys.sql b/models/staging/stg_ga4__client_keys.sql new file mode 100644 index 00000000..31d47699 --- /dev/null +++ b/models/staging/stg_ga4__client_keys.sql @@ -0,0 +1,23 @@ +{{ + config( + materialized='view', + enabled=false + + ) +}} +select + * + , to_base64(md5(concat(pseudo_user_id, stream_id))) as client_key + {% for up in var('user_properties', []) %} -- don't have sample data; need to verify + , (select value.string_value from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_string_value + , (select value.set_timestamp_micros from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_set_timestamp_micros + , (select value.user_property_name from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_user_property_name + {% endfor %} + {% for aud in var('audiences', []) %} -- this should be good, though + , (select id from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_id + , (select name from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_name + , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros + , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros + , (select npa from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_npa + {% endfor %} +from {{ref('base_ga4__pseudonymous_users')}} diff --git a/models/staging/stg_ga4__client_keys.yml b/models/staging/stg_ga4__client_keys.yml new file mode 100644 index 00000000..60b73e89 --- /dev/null +++ b/models/staging/stg_ga4__client_keys.yml @@ -0,0 +1,14 @@ +version: 2 + +models: + - name: stg_ga4__client_keys + 
description: > + Staging model for the base_ga4__pseudonymous_users table which pulls data from the user export. + This model is keyed on client_key which is the key used by the package for joining with client (browser/app). + This model unnests user properties and audience using variables of the same name that match the user property + or audience name and prefixing the unnested fields with those names. + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__pseudonymous_users.sql b/models/staging/stg_ga4__pseudonymous_users.sql deleted file mode 100644 index b189d887..00000000 --- a/models/staging/stg_ga4__pseudonymous_users.sql +++ /dev/null @@ -1,11 +0,0 @@ -{{ - config( - materialized='view', - enabled=false - - ) -}} -select - * - , to_base64(md5(concat(user_pseudo_id, stream_id))) as client_key -from {{ref('base_ga4__pseudonymous_users')}} diff --git a/models/staging/stg_ga4__users.sql b/models/staging/stg_ga4__users.sql index 6e4fed2d..7cee948b 100644 --- a/models/staging/stg_ga4__users.sql +++ b/models/staging/stg_ga4__users.sql @@ -6,4 +6,16 @@ }} select * + {% for up in var('user_properties', []) %} -- don't have sample data; need to verify + , (select value.string_value from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_string_value + , (select value.set_timestamp_micros from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_set_timestamp_micros + , (select value.user_property_name from unnest(user_properties) where key = '{{up}}') as {{up | lower | replace(" ", "_")}}_user_property_name + {% endfor %} + {% for aud in var('audiences', []) %} -- this should be good, though + , (select id from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_id + , (select name from unnest(audiences) where name = '{{aud}}') as 
audience_{{aud | lower | replace(" ", "_")}}_name + , (select membership_start_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_start_timestamp_micros + , (select membership_expiry_timestamp_micros from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_membership_expiry_timestamp_micros + , (select npa from unnest(audiences) where name = '{{aud}}') as audience_{{aud | lower | replace(" ", "_")}}_npa + {% endfor %} from {{ref('base_ga4__users')}} diff --git a/models/staging/stg_ga4__users.yml b/models/staging/stg_ga4__users.yml new file mode 100644 index 00000000..841c7e09 --- /dev/null +++ b/models/staging/stg_ga4__users.yml @@ -0,0 +1,9 @@ +version: 2 + +models: + - name: stg_ga4__users + description: > + Staging model for the base_ga4__users table which pulls data from the user export. + This model is keyed on user_id field. Unlike most keys in the package, this key is not hashed. + This model unnests user properties and audience using variables of the same name that match the user property + or audience name and prefixing the unnested fields with those names. 
From b9fa91df320bf27a3da0e14e034ef3041bb22665 Mon Sep 17 00:00:00 2001 From: dgitis Date: Sun, 21 Jul 2024 13:19:48 -0700 Subject: [PATCH 10/12] bug fix --- macros/combine_property_data.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index fbce816d..7ce1859c 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -6,13 +6,9 @@ {% if not should_full_refresh() %} {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} {%- set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int -%} - {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} - {%- set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int -%} {% else %} {# Otherwise use 'start_date' variable #} {%- set earliest_shard_to_retrieve = var('start_date')|int -%} - {# Otherwise use 'start_date' variable #} - {%- set earliest_shard_to_retrieve = var('start_date')|int -%} {% endif %} {% for property_id in var('property_ids') %} {%- set schema_name = "analytics_" + property_id|string -%} From f633263dd9d3fa2b93c0ab63fea0f6bf323b9bd7 Mon Sep 17 00:00:00 2001 From: dgitis Date: Sun, 21 Jul 2024 13:27:16 -0700 Subject: [PATCH 11/12] fix stg_ga4__client_keys test --- models/staging/stg_ga4__client_keys.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/models/staging/stg_ga4__client_keys.yml b/models/staging/stg_ga4__client_keys.yml index 60b73e89..32bb12f4 100644 --- a/models/staging/stg_ga4__client_keys.yml +++ b/models/staging/stg_ga4__client_keys.yml @@ -7,8 +7,9 @@ models: This model is keyed on client_key which is the key used by the package for joining with client (browser/app). 
This model unnests user properties and audience using variables of the same name that match the user property or audience name and prefixing the unnested fields with those names. + tests: + - unique: + column_name: "(client_key || occurrence_date)" columns: - name: client_key - description: Hashed combination of user_pseudo_id and stream_id - tests: - - unique \ No newline at end of file + description: Hashed combination of user_pseudo_id and stream_id \ No newline at end of file From 147db2c9b51d6daa01acf138c9441c7d3da7a52c Mon Sep 17 00:00:00 2001 From: dgitis Date: Sun, 21 Jul 2024 13:57:37 -0700 Subject: [PATCH 12/12] force property_id to string in combine_property_data macro --- macros/combine_property_data.sql | 10 +++++----- package-lock.yml | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) create mode 100644 package-lock.yml diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql index 7ce1859c..e4be8499 100644 --- a/macros/combine_property_data.sql +++ b/macros/combine_property_data.sql @@ -22,7 +22,7 @@ {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; - {% do modifications.append( {'source_partition': 'events_intraday_' + relation_suffix , 'destination_partition': 'events_intraday_' + relation_suffix + property_id } ) %} + {% do modifications.append( {'source_partition': 'events_intraday_' + relation_suffix , 'destination_partition': 'events_intraday_' + relation_suffix + property_id|string } ) %} {%- endif -%} {% endfor %} @@ -33,7 +33,7 @@ {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table 
`{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; drop table if exists `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; - {% do modifications.append( {'source_partition': 'events_' + relation_suffix , 'destination_partition': 'events_' + relation_suffix + property_id } ) %} + {% do modifications.append( {'source_partition': 'events_' + relation_suffix , 'destination_partition': 'events_' + relation_suffix + property_id|string } ) %} {%- endif -%} {% endfor %} {% elif this.name == 'base_ga4__pseudonymous_users' %} @@ -44,7 +44,7 @@ {%- set relation_suffix = relation.identifier|replace('pseudonymous_users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.pseudonymous_users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.pseudonymous_users_{{relation_suffix}}`; - {% do modifications.append( {'source_partition': 'pseudonymous_users_' + relation_suffix , 'destination_partition': 'pseudonymous_users_' + relation_suffix + property_id } ) %} + {% do modifications.append( {'source_partition': 'pseudonymous_users_' + relation_suffix , 'destination_partition': 'pseudonymous_users_' + relation_suffix + property_id|string } ) %} {%- endif -%} {% endfor %} {% elif this.name == 'base_ga4__users' %} @@ -54,7 +54,7 @@ {%- set relation_suffix = relation.identifier|replace('users_', '') -%} {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} create or replace table `{{target.project}}.{{var('combined_dataset')}}.users_{{relation_suffix}}{{property_id}}` clone `{{var('source_project')}}.analytics_{{property_id}}.users_{{relation_suffix}}`; - {% do modifications.append( {'source_partition': 'users_' + relation_suffix , 'destination_partition': 'users_' 
+ relation_suffix + property_id } ) %} + {% do modifications.append( {'source_partition': 'users_' + relation_suffix , 'destination_partition': 'users_' + relation_suffix + property_id|string } ) %} {%- endif -%} {% endfor %} {% endif %} @@ -62,7 +62,7 @@ {% do run_query(combine_specified_property_data_query) %} {% if execute %} {% for modification in modifications%} - {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id ~ "." ~ modification.source_partition ~"` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ "." ~ modification.destination_partition ~"`", True) }} + {{ log("Cloned from `" ~ var('source_project') ~ ".analytics_" ~ property_id|string ~ "." ~ modification.source_partition ~"` to `" ~ target.project ~ "." ~ var('combined_dataset') ~ "." ~ modification.destination_partition ~"`", True) }} {% endfor %} {% endif %} {% endfor %} diff --git a/package-lock.yml b/package-lock.yml new file mode 100644 index 00000000..9d479f13 --- /dev/null +++ b/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - package: dbt-labs/dbt_utils + version: 1.2.0 +sha1_hash: dd1e1feb2d2bbce79e7a255cd309a60e6548df0b