From 1f10cc78efca9a3dcb8e070c2ef2052097ea36ec Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:25:21 +0100 Subject: [PATCH 01/24] update external table columns --- .../plugins/bigquery/bigquery_external.yml | 25 +++++++++---------- macros/common/stage_external_sources.sql | 3 +++ .../common/update_external_table_columns.sql | 7 ++++++ .../bigquery/create_external_table.sql | 1 - .../bigquery/get_external_build_plan.sql | 2 +- .../update_external_table_columns.sql | 5 ++++ 6 files changed, 28 insertions(+), 15 deletions(-) create mode 100644 macros/common/update_external_table_columns.sql create mode 100644 macros/plugins/bigquery/update_external_table_columns.sql diff --git a/integration_tests/models/plugins/bigquery/bigquery_external.yml b/integration_tests/models/plugins/bigquery/bigquery_external.yml index 3b81230b..391811ff 100644 --- a/integration_tests/models/plugins/bigquery/bigquery_external.yml +++ b/integration_tests/models/plugins/bigquery/bigquery_external.yml @@ -6,16 +6,16 @@ sources: loader: Cloud Storage tables: - - name: people_csv_unpartitioned external: - location: 'gs://dbt-external-tables-testing/csv/*' + location: "gs://dbt-external-tables-testing/csv/*" options: format: csv skip_leading_rows: 1 columns: &cols-of-the-people - name: id data_type: int64 + description: id_of_column - name: first_name data_type: string - name: last_name @@ -33,24 +33,24 @@ sources: - name: people_csv_partitioned external: - location: 'gs://dbt-external-tables-testing/csv/*' + location: "gs://dbt-external-tables-testing/csv/*" options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' + hive_partition_uri_prefix: "gs://dbt-external-tables-testing/csv" partitions: &parts-of-the-people - name: section data_type: string columns: *cols-of-the-people tests: *equal-to-the-people - + - name: people_csv_schema_auto_detect external: - location: 'gs://dbt-external-tables-testing/csv/*' + location: 
"gs://dbt-external-tables-testing/csv/*" options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' + hive_partition_uri_prefix: "gs://dbt-external-tables-testing/csv" tests: *equal-to-the-people - name: people_csv_override_uris @@ -60,13 +60,12 @@ sources: format: csv skip_leading_rows: 1 uris: - - 'gs://dbt-external-tables-testing/csv/section=a/people_a.csv' - - 'gs://dbt-external-tables-testing/csv/section=b/people_b.csv' - - 'gs://dbt-external-tables-testing/csv/section=c/people_c.csv' - - 'gs://dbt-external-tables-testing/csv/section=d/people_d.csv' + - "gs://dbt-external-tables-testing/csv/section=a/people_a.csv" + - "gs://dbt-external-tables-testing/csv/section=b/people_b.csv" + - "gs://dbt-external-tables-testing/csv/section=c/people_c.csv" + - "gs://dbt-external-tables-testing/csv/section=d/people_d.csv" columns: *cols-of-the-people tests: *equal-to-the-people - # - name: people_json_unpartitioned # external: &json-people # location: 'gs://dbt-external-tables-testing/json/*' @@ -84,7 +83,7 @@ sources: # partitions: *parts-of-the-people # columns: *cols-of-the-people # tests: *equal-to-the-people -# +# # - name: people_json_schema_auto_detect # external: # location: 'gs://dbt-external-tables-testing/json/*' diff --git a/macros/common/stage_external_sources.sql b/macros/common/stage_external_sources.sql index 7fad72d5..eaf55301 100644 --- a/macros/common/stage_external_sources.sql +++ b/macros/common/stage_external_sources.sql @@ -66,6 +66,9 @@ {% endfor %} + {% set update_columns = dbt_external_tables.update_external_table_columns(node) %} + {{ update_columns }} + {% endfor %} {% endmacro %} diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql new file mode 100644 index 00000000..37ea1434 --- /dev/null +++ b/macros/common/update_external_table_columns.sql @@ -0,0 +1,7 @@ +{% macro update_external_table_columns(source_node) %} + {{ 
return(adapter.dispatch('update_external_table_columns', 'dbt_external_tables')(source_node)) }} +{% endmacro %} + +{% macro default__update_external_table_columns(source_node) %} + {% do return([]) %} +{% endmacro %} diff --git a/macros/plugins/bigquery/create_external_table.sql b/macros/plugins/bigquery/create_external_table.sql index 5e098894..d5e9dca2 100644 --- a/macros/plugins/bigquery/create_external_table.sql +++ b/macros/plugins/bigquery/create_external_table.sql @@ -1,5 +1,4 @@ {% macro bigquery__create_external_table(source_node) %} - {%- set columns = source_node.columns.values() -%} {%- set external = source_node.external -%} {%- set partitions = external.partitions -%} diff --git a/macros/plugins/bigquery/get_external_build_plan.sql b/macros/plugins/bigquery/get_external_build_plan.sql index b1ce0acd..29d85301 100644 --- a/macros/plugins/bigquery/get_external_build_plan.sql +++ b/macros/plugins/bigquery/get_external_build_plan.sql @@ -13,7 +13,7 @@ {% if create_or_replace %} {% if not dbt_external_tables.create_external_schema(source_node)|length %} {% set build_plan = build_plan + [ - dbt_external_tables.create_external_table(source_node) + dbt_external_tables.create_external_table(source_node), ] %} {% else %} {% set build_plan = build_plan + [ diff --git a/macros/plugins/bigquery/update_external_table_columns.sql b/macros/plugins/bigquery/update_external_table_columns.sql new file mode 100644 index 00000000..96a1b376 --- /dev/null +++ b/macros/plugins/bigquery/update_external_table_columns.sql @@ -0,0 +1,5 @@ +{% macro bigquery__update_external_table_columns(source_node) %} + {%- set columns = source_node.columns -%} + {%- set relation = source(source_node.source_name, source_node.name) -%} + {%- do adapter.update_columns(relation, columns) -%} +{% endmacro %} \ No newline at end of file From 5b8f60e13b46cddd235afc0da388fc21f8726220 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:31:54 +0100 Subject: [PATCH 02/24] undo 
quoting --- sample_sources/bigquery.yml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index cfe18cde..50976d5b 100644 --- a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -4,24 +4,24 @@ sources: - name: snowplow database: analytics loader: gcloud storage - + tables: - name: event description: "External table of Snowplow events, stored as CSV files in Cloud Storage" external: - location: 'gs://bucket/path/*' + location: "gs://bucket/path/*" options: format: csv skip_leading_rows: 1 - - # if you want a partitioned table, file paths MUST be Hive-style: - # 'gs://bucket/path/collector_hour=2020-01-01/' - # 'gs://bucket/path/collector_hour=2020-01-02/' (etc) - hive_partition_uri_prefix: 'gs://bucket/path/' - partitions: + + # if you want a partitioned table, file paths MUST be Hive-style: + # "gs://bucket/path/collector_hour=2020-01-01/" + # "gs://bucket/path/collector_hour=2020-01-02/" (etc) + hive_partition_uri_prefix: "gs://bucket/path/" + partitions: - name: collector_date data_type: date - + columns: - name: app_id data_type: varchar(255) @@ -35,16 +35,16 @@ sources: - name: contexts data_type: variant description: "Contexts attached to event by Tracker" - + # alternatively, BigQuery can infer your schema (columns + partitions) - name: event_inferred external: - location: 'gs://bucket/path/*' + location: "gs://bucket/path/*" options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://bucket/path/' - + hive_partition_uri_prefix: "gs://bucket/path/" + # optionally, BigQuery can pull data from multiple GCS paths, instead of just one - name: event_multiple_paths external: @@ -52,9 +52,9 @@ sources: options: format: csv skip_leading_rows: 1 - + # list all file paths with relevant source data uris: - - 'gs://bucket_a/path/*' - - 'gs://bucket_b/path/*' - - 'gs://bucket_c/more/specific/path/file.csv' + - 
"gs://bucket_a/path/*" + - "gs://bucket_b/path/*" + - "gs://bucket_c/more/specific/path/file.csv" From 3edc1ca96a08485dd86cb3165a4de6c15119fe2c Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:32:53 +0100 Subject: [PATCH 03/24] undo newline --- macros/plugins/bigquery/create_external_schema.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/macros/plugins/bigquery/create_external_schema.sql b/macros/plugins/bigquery/create_external_schema.sql index dc42b488..c930633a 100644 --- a/macros/plugins/bigquery/create_external_schema.sql +++ b/macros/plugins/bigquery/create_external_schema.sql @@ -1,4 +1,5 @@ {%- macro bigquery__create_external_schema(source_node) -%} + {%- set fqn -%} {%- if source_node.database -%} `{{ source_node.database }}`.{{ source_node.schema }} From 1f79b8205ae925e60ef2091cb35d2acc120975b2 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:33:13 +0100 Subject: [PATCH 04/24] undo comma --- macros/plugins/bigquery/get_external_build_plan.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/plugins/bigquery/get_external_build_plan.sql b/macros/plugins/bigquery/get_external_build_plan.sql index 29d85301..b1ce0acd 100644 --- a/macros/plugins/bigquery/get_external_build_plan.sql +++ b/macros/plugins/bigquery/get_external_build_plan.sql @@ -13,7 +13,7 @@ {% if create_or_replace %} {% if not dbt_external_tables.create_external_schema(source_node)|length %} {% set build_plan = build_plan + [ - dbt_external_tables.create_external_table(source_node), + dbt_external_tables.create_external_table(source_node) ] %} {% else %} {% set build_plan = build_plan + [ From c3e6c6ebfc85a7bf4f889c9b80be99b9bdf6843d Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:33:41 +0100 Subject: [PATCH 05/24] add newline --- macros/plugins/bigquery/update_external_table_columns.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/macros/plugins/bigquery/update_external_table_columns.sql b/macros/plugins/bigquery/update_external_table_columns.sql index 96a1b376..5af562e9 100644 --- a/macros/plugins/bigquery/update_external_table_columns.sql +++ b/macros/plugins/bigquery/update_external_table_columns.sql @@ -2,4 +2,4 @@ {%- set columns = source_node.columns -%} {%- set relation = source(source_node.source_name, source_node.name) -%} {%- do adapter.update_columns(relation, columns) -%} -{% endmacro %} \ No newline at end of file +{% endmacro %} From 49c0d2e8aa200463cf79c5778aedf256f520d521 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:34:52 +0100 Subject: [PATCH 06/24] undo newline --- macros/plugins/bigquery/create_external_schema.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/macros/plugins/bigquery/create_external_schema.sql b/macros/plugins/bigquery/create_external_schema.sql index c930633a..dc42b488 100644 --- a/macros/plugins/bigquery/create_external_schema.sql +++ b/macros/plugins/bigquery/create_external_schema.sql @@ -1,5 +1,4 @@ {%- macro bigquery__create_external_schema(source_node) -%} - {%- set fqn -%} {%- if source_node.database -%} `{{ source_node.database }}`.{{ source_node.schema }} From 48e3f7eb844d67d22a92f73aa5f32fee8c141659 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:35:54 +0100 Subject: [PATCH 07/24] fix --- sample_sources/bigquery.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index 50976d5b..4a7be45f 100644 --- a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -15,8 +15,8 @@ sources: skip_leading_rows: 1 # if you want a partitioned table, file paths MUST be Hive-style: - # "gs://bucket/path/collector_hour=2020-01-01/" - # "gs://bucket/path/collector_hour=2020-01-02/" (etc) + # 'gs://bucket/path/collector_hour=2020-01-01/' + # 'gs://bucket/path/collector_hour=2020-01-02/' (etc) hive_partition_uri_prefix: 
"gs://bucket/path/" partitions: - name: collector_date From 3d72989cc0e99debd2edca8d4b190909d032841c Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:42:12 +0100 Subject: [PATCH 08/24] fix quoting --- .../plugins/bigquery/bigquery_external.yml | 24 +++++++++---------- sample_sources/bigquery.yml | 24 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/integration_tests/models/plugins/bigquery/bigquery_external.yml b/integration_tests/models/plugins/bigquery/bigquery_external.yml index 391811ff..52661d7d 100644 --- a/integration_tests/models/plugins/bigquery/bigquery_external.yml +++ b/integration_tests/models/plugins/bigquery/bigquery_external.yml @@ -2,13 +2,13 @@ version: 2 sources: - name: bigquery_external - schema: "{{ target.schema }}" + schema: '{{ target.schema }}' loader: Cloud Storage tables: - name: people_csv_unpartitioned external: - location: "gs://dbt-external-tables-testing/csv/*" + location: 'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 @@ -33,11 +33,11 @@ sources: - name: people_csv_partitioned external: - location: "gs://dbt-external-tables-testing/csv/*" + location: 'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: "gs://dbt-external-tables-testing/csv" + hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' partitions: &parts-of-the-people - name: section data_type: string @@ -46,11 +46,11 @@ sources: - name: people_csv_schema_auto_detect external: - location: "gs://dbt-external-tables-testing/csv/*" + location: 'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: "gs://dbt-external-tables-testing/csv" + hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' tests: *equal-to-the-people - name: people_csv_override_uris @@ -60,10 +60,10 @@ sources: format: csv skip_leading_rows: 1 uris: - - 
"gs://dbt-external-tables-testing/csv/section=a/people_a.csv" - - "gs://dbt-external-tables-testing/csv/section=b/people_b.csv" - - "gs://dbt-external-tables-testing/csv/section=c/people_c.csv" - - "gs://dbt-external-tables-testing/csv/section=d/people_d.csv" + - 'gs://dbt-external-tables-testing/csv/section=a/people_a.csv' + - 'gs://dbt-external-tables-testing/csv/section=b/people_b.csv' + - 'gs://dbt-external-tables-testing/csv/section=c/people_c.csv' + - 'gs://dbt-external-tables-testing/csv/section=d/people_d.csv' columns: *cols-of-the-people tests: *equal-to-the-people # - name: people_json_unpartitioned @@ -79,7 +79,7 @@ sources: # location: 'gs://dbt-external-tables-testing/json/*' # options: # format: json -# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" +# hive_partition_uri_prefix: ''gs://dbt-external-tables-testing/json'' # partitions: *parts-of-the-people # columns: *cols-of-the-people # tests: *equal-to-the-people @@ -90,5 +90,5 @@ sources: # options: # format: csv # skip_leading_rows: 1 -# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" +# hive_partition_uri_prefix: ''gs://dbt-external-tables-testing/json'' # tests: *equal-to-the-people diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index 4a7be45f..fe60e2e1 100644 --- a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -7,9 +7,9 @@ sources: tables: - name: event - description: "External table of Snowplow events, stored as CSV files in Cloud Storage" + description: 'External table of Snowplow events, stored as CSV files in Cloud Storage' external: - location: "gs://bucket/path/*" + location: 'gs://bucket/path/*' options: format: csv skip_leading_rows: 1 @@ -17,7 +17,7 @@ sources: # if you want a partitioned table, file paths MUST be Hive-style: # 'gs://bucket/path/collector_hour=2020-01-01/' # 'gs://bucket/path/collector_hour=2020-01-02/' (etc) - hive_partition_uri_prefix: "gs://bucket/path/" + hive_partition_uri_prefix: 
'gs://bucket/path/' partitions: - name: collector_date data_type: date @@ -25,25 +25,25 @@ sources: columns: - name: app_id data_type: varchar(255) - description: "Application ID" + description: 'Application ID' - name: domain_sessionidx data_type: int - description: "A visit / session index" + description: 'A visit / session index' - name: etl_tstamp data_type: timestamp - description: "Timestamp event began ETL" + description: 'Timestamp event began ETL' - name: contexts data_type: variant - description: "Contexts attached to event by Tracker" + description: 'Contexts attached to event by Tracker' # alternatively, BigQuery can infer your schema (columns + partitions) - name: event_inferred external: - location: "gs://bucket/path/*" + location: 'gs://bucket/path/*' options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: "gs://bucket/path/" + hive_partition_uri_prefix: 'gs://bucket/path/' # optionally, BigQuery can pull data from multiple GCS paths, instead of just one - name: event_multiple_paths @@ -55,6 +55,6 @@ sources: # list all file paths with relevant source data uris: - - "gs://bucket_a/path/*" - - "gs://bucket_b/path/*" - - "gs://bucket_c/more/specific/path/file.csv" + - 'gs://bucket_a/path/*' + - 'gs://bucket_b/path/*' + - 'gs://bucket_c/more/specific/path/file.csv' From 74699c539476370b0ef83ed60eb7951bb098a634 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:43:35 +0100 Subject: [PATCH 09/24] fix --- sample_sources/bigquery.yml | 115 +++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 40 deletions(-) diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index fe60e2e1..d13b110d 100644 --- a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -1,60 +1,95 @@ version: 2 sources: - - name: snowplow - database: analytics - loader: gcloud storage + - name: bigquery_external + schema: "{{ target.schema }}" + loader: Cloud Storage tables: - - name: event - description: 
'External table of Snowplow events, stored as CSV files in Cloud Storage' + + - name: people_csv_unpartitioned external: - location: 'gs://bucket/path/*' + location: 'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 + columns: &cols-of-the-people + - name: id + data_type: int64 + - name: first_name + data_type: string + - name: last_name + data_type: string + - name: email + data_type: string + tests: &equal-to-the-people + - dbt_utils.equality: + compare_model: ref('people') + compare_columns: + - id + - first_name + - last_name + - email - # if you want a partitioned table, file paths MUST be Hive-style: - # 'gs://bucket/path/collector_hour=2020-01-01/' - # 'gs://bucket/path/collector_hour=2020-01-02/' (etc) - hive_partition_uri_prefix: 'gs://bucket/path/' - partitions: - - name: collector_date - data_type: date - - columns: - - name: app_id - data_type: varchar(255) - description: 'Application ID' - - name: domain_sessionidx - data_type: int - description: 'A visit / session index' - - name: etl_tstamp - data_type: timestamp - description: 'Timestamp event began ETL' - - name: contexts - data_type: variant - description: 'Contexts attached to event by Tracker' - - # alternatively, BigQuery can infer your schema (columns + partitions) - - name: event_inferred + - name: people_csv_partitioned external: - location: 'gs://bucket/path/*' + location: 'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://bucket/path/' - - # optionally, BigQuery can pull data from multiple GCS paths, instead of just one - - name: event_multiple_paths + hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' + partitions: &parts-of-the-people + - name: section + data_type: string + columns: *cols-of-the-people + tests: *equal-to-the-people + + - name: people_csv_schema_auto_detect external: - location: this is still a required property, but it will be ignored + location: 
'gs://dbt-external-tables-testing/csv/*' options: format: csv skip_leading_rows: 1 + hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' + tests: *equal-to-the-people - # list all file paths with relevant source data + - name: people_csv_override_uris + external: + location: this can be anything + options: + format: csv + skip_leading_rows: 1 uris: - - 'gs://bucket_a/path/*' - - 'gs://bucket_b/path/*' - - 'gs://bucket_c/more/specific/path/file.csv' + - 'gs://dbt-external-tables-testing/csv/section=a/people_a.csv' + - 'gs://dbt-external-tables-testing/csv/section=b/people_b.csv' + - 'gs://dbt-external-tables-testing/csv/section=c/people_c.csv' + - 'gs://dbt-external-tables-testing/csv/section=d/people_d.csv' + columns: *cols-of-the-people + tests: *equal-to-the-people + +# - name: people_json_unpartitioned +# external: &json-people +# location: 'gs://dbt-external-tables-testing/json/*' +# options: +# format: json +# columns: *cols-of-the-people +# tests: *equal-to-the-people +# +# - name: people_json_partitioned +# external: +# location: 'gs://dbt-external-tables-testing/json/*' +# options: +# format: json +# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" +# partitions: *parts-of-the-people +# columns: *cols-of-the-people +# tests: *equal-to-the-people +# +# - name: people_json_schema_auto_detect +# external: +# location: 'gs://dbt-external-tables-testing/json/*' +# options: +# format: csv +# skip_leading_rows: 1 +# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" +# tests: *equal-to-the-people \ No newline at end of file From f3dcd89f3cb4495432d5c0446a0a56ee0d03fa11 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:45:17 +0100 Subject: [PATCH 10/24] fix --- sample_sources/bigquery.yml | 119 +++++++++++++----------------------- 1 file changed, 42 insertions(+), 77 deletions(-) diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index d13b110d..224464ad 100644 --- 
a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -1,95 +1,60 @@ version: 2 sources: - - name: bigquery_external - schema: "{{ target.schema }}" - loader: Cloud Storage - + - name: snowplow + database: analytics + loader: gcloud storage + tables: - - - name: people_csv_unpartitioned + - name: event + description: "External table of Snowplow events, stored as CSV files in Cloud Storage" external: - location: 'gs://dbt-external-tables-testing/csv/*' + location: 'gs://bucket/path/*' options: format: csv skip_leading_rows: 1 - columns: &cols-of-the-people - - name: id - data_type: int64 - - name: first_name - data_type: string - - name: last_name - data_type: string - - name: email - data_type: string - tests: &equal-to-the-people - - dbt_utils.equality: - compare_model: ref('people') - compare_columns: - - id - - first_name - - last_name - - email - - - name: people_csv_partitioned - external: - location: 'gs://dbt-external-tables-testing/csv/*' - options: - format: csv - skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' - partitions: &parts-of-the-people - - name: section - data_type: string - columns: *cols-of-the-people - tests: *equal-to-the-people + + # if you want a partitioned table, file paths MUST be Hive-style: + # 'gs://bucket/path/collector_hour=2020-01-01/' + # 'gs://bucket/path/collector_hour=2020-01-02/' (etc) + hive_partition_uri_prefix: 'gs://bucket/path/' + partitions: + - name: collector_date + data_type: date - - name: people_csv_schema_auto_detect + columns: + - name: app_id + data_type: varchar(255) + description: "Application ID" + - name: domain_sessionidx + data_type: int + description: "A visit / session index" + - name: etl_tstamp + data_type: timestamp + description: "Timestamp event began ETL" + - name: contexts + data_type: variant + description: "Contexts attached to event by Tracker" + + # alternatively, BigQuery can infer your schema (columns + partitions) + - name: event_inferred 
external: - location: 'gs://dbt-external-tables-testing/csv/*' + location: 'gs://bucket/path/*' options: format: csv skip_leading_rows: 1 - hive_partition_uri_prefix: 'gs://dbt-external-tables-testing/csv' - tests: *equal-to-the-people - - - name: people_csv_override_uris + hive_partition_uri_prefix: 'gs://bucket/path/' + + # optionally, BigQuery can pull data from multiple GCS paths, instead of just one + - name: event_multiple_paths external: - location: this can be anything + location: this is still a required property, but it will be ignored options: format: csv skip_leading_rows: 1 + + # list all file paths with relevant source data uris: - - 'gs://dbt-external-tables-testing/csv/section=a/people_a.csv' - - 'gs://dbt-external-tables-testing/csv/section=b/people_b.csv' - - 'gs://dbt-external-tables-testing/csv/section=c/people_c.csv' - - 'gs://dbt-external-tables-testing/csv/section=d/people_d.csv' - columns: *cols-of-the-people - tests: *equal-to-the-people - -# - name: people_json_unpartitioned -# external: &json-people -# location: 'gs://dbt-external-tables-testing/json/*' -# options: -# format: json -# columns: *cols-of-the-people -# tests: *equal-to-the-people -# -# - name: people_json_partitioned -# external: -# location: 'gs://dbt-external-tables-testing/json/*' -# options: -# format: json -# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" -# partitions: *parts-of-the-people -# columns: *cols-of-the-people -# tests: *equal-to-the-people -# -# - name: people_json_schema_auto_detect -# external: -# location: 'gs://dbt-external-tables-testing/json/*' -# options: -# format: csv -# skip_leading_rows: 1 -# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" -# tests: *equal-to-the-people \ No newline at end of file + - 'gs://bucket_a/path/*' + - 'gs://bucket_b/path/*' + - 'gs://bucket_c/more/specific/path/file.csv' \ No newline at end of file From 550efe4e5087235c78856ca35731c6170c87cb46 Mon Sep 17 00:00:00 2001 From: 
Thomas van Latum Date: Wed, 7 Feb 2024 15:46:02 +0100 Subject: [PATCH 11/24] add description --- .../plugins/bigquery/bigquery_external.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/integration_tests/models/plugins/bigquery/bigquery_external.yml b/integration_tests/models/plugins/bigquery/bigquery_external.yml index 52661d7d..d88bde21 100644 --- a/integration_tests/models/plugins/bigquery/bigquery_external.yml +++ b/integration_tests/models/plugins/bigquery/bigquery_external.yml @@ -2,10 +2,11 @@ version: 2 sources: - name: bigquery_external - schema: '{{ target.schema }}' + schema: "{{ target.schema }}" loader: Cloud Storage tables: + - name: people_csv_unpartitioned external: location: 'gs://dbt-external-tables-testing/csv/*' @@ -15,7 +16,7 @@ sources: columns: &cols-of-the-people - name: id data_type: int64 - description: id_of_column + description: id_of_the_person - name: first_name data_type: string - name: last_name @@ -43,7 +44,7 @@ sources: data_type: string columns: *cols-of-the-people tests: *equal-to-the-people - + - name: people_csv_schema_auto_detect external: location: 'gs://dbt-external-tables-testing/csv/*' @@ -66,6 +67,7 @@ sources: - 'gs://dbt-external-tables-testing/csv/section=d/people_d.csv' columns: *cols-of-the-people tests: *equal-to-the-people + # - name: people_json_unpartitioned # external: &json-people # location: 'gs://dbt-external-tables-testing/json/*' @@ -79,16 +81,16 @@ sources: # location: 'gs://dbt-external-tables-testing/json/*' # options: # format: json -# hive_partition_uri_prefix: ''gs://dbt-external-tables-testing/json'' +# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" # partitions: *parts-of-the-people # columns: *cols-of-the-people # tests: *equal-to-the-people -# +# # - name: people_json_schema_auto_detect # external: # location: 'gs://dbt-external-tables-testing/json/*' # options: # format: csv # skip_leading_rows: 1 -# hive_partition_uri_prefix: 
''gs://dbt-external-tables-testing/json'' -# tests: *equal-to-the-people +# hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" +# tests: *equal-to-the-people \ No newline at end of file From 81830d9839ab935f4802aba5103cb9e975edea7b Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 15:46:43 +0100 Subject: [PATCH 12/24] add newlines --- integration_tests/models/plugins/bigquery/bigquery_external.yml | 2 +- sample_sources/bigquery.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration_tests/models/plugins/bigquery/bigquery_external.yml b/integration_tests/models/plugins/bigquery/bigquery_external.yml index d88bde21..62a95e06 100644 --- a/integration_tests/models/plugins/bigquery/bigquery_external.yml +++ b/integration_tests/models/plugins/bigquery/bigquery_external.yml @@ -93,4 +93,4 @@ sources: # format: csv # skip_leading_rows: 1 # hive_partition_uri_prefix: "'gs://dbt-external-tables-testing/json'" -# tests: *equal-to-the-people \ No newline at end of file +# tests: *equal-to-the-people diff --git a/sample_sources/bigquery.yml b/sample_sources/bigquery.yml index 224464ad..cfe18cde 100644 --- a/sample_sources/bigquery.yml +++ b/sample_sources/bigquery.yml @@ -57,4 +57,4 @@ sources: uris: - 'gs://bucket_a/path/*' - 'gs://bucket_b/path/*' - - 'gs://bucket_c/more/specific/path/file.csv' \ No newline at end of file + - 'gs://bucket_c/more/specific/path/file.csv' From 2222cdb14f4179505cdcfef5eaaff6087f664f16 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 16:06:32 +0100 Subject: [PATCH 13/24] test if this fixes the CI --- macros/common/stage_external_sources.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/common/stage_external_sources.sql b/macros/common/stage_external_sources.sql index eaf55301..7596c3e0 100644 --- a/macros/common/stage_external_sources.sql +++ b/macros/common/stage_external_sources.sql @@ -66,8 +66,8 @@ {% endfor %} - {% set 
update_columns = dbt_external_tables.update_external_table_columns(node) %} - {{ update_columns }} + -- {% set update_columns = dbt_external_tables.update_external_table_columns(node) %} + -- {{ update_columns }} {% endfor %} From d4a6f1b2a56cb7a3ca3309519e997acdb6ef5124 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 16:08:36 +0100 Subject: [PATCH 14/24] test --- macros/common/update_external_table_columns.sql | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 macros/common/update_external_table_columns.sql diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql deleted file mode 100644 index 37ea1434..00000000 --- a/macros/common/update_external_table_columns.sql +++ /dev/null @@ -1,7 +0,0 @@ -{% macro update_external_table_columns(source_node) %} - {{ return(adapter.dispatch('update_external_table_columns', 'dbt_external_tables')(source_node)) }} -{% endmacro %} - -{% macro default__update_external_table_columns(source_node) %} - {% do return([]) %} -{% endmacro %} From 49eb939e0dd628efbec3b0a2255d955d1d0671d0 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 16:10:46 +0100 Subject: [PATCH 15/24] test CI --- macros/common/stage_external_sources.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/macros/common/stage_external_sources.sql b/macros/common/stage_external_sources.sql index 7596c3e0..ec42c49d 100644 --- a/macros/common/stage_external_sources.sql +++ b/macros/common/stage_external_sources.sql @@ -65,9 +65,6 @@ {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ log_msg, info = true) %} {% endfor %} - - -- {% set update_columns = dbt_external_tables.update_external_table_columns(node) %} - -- {{ update_columns }} {% endfor %} From 9aeca2604d4e15b0145d0f4568a8b23fba332f63 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 16:12:35 +0100 Subject: [PATCH 16/24] Put back changes --- macros/common/stage_external_sources.sql | 3 +++ 
macros/common/update_external_table_columns.sql | 7 +++++++ 2 files changed, 10 insertions(+) create mode 100644 macros/common/update_external_table_columns.sql diff --git a/macros/common/stage_external_sources.sql b/macros/common/stage_external_sources.sql index ec42c49d..eaf55301 100644 --- a/macros/common/stage_external_sources.sql +++ b/macros/common/stage_external_sources.sql @@ -65,6 +65,9 @@ {% do log(loop_label ~ ' (' ~ loop.index ~ ') ' ~ log_msg, info = true) %} {% endfor %} + + {% set update_columns = dbt_external_tables.update_external_table_columns(node) %} + {{ update_columns }} {% endfor %} diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql new file mode 100644 index 00000000..37ea1434 --- /dev/null +++ b/macros/common/update_external_table_columns.sql @@ -0,0 +1,7 @@ +{% macro update_external_table_columns(source_node) %} + {{ return(adapter.dispatch('update_external_table_columns', 'dbt_external_tables')(source_node)) }} +{% endmacro %} + +{% macro default__update_external_table_columns(source_node) %} + {% do return([]) %} +{% endmacro %} From c5746f4dd66e48c7a8d5f6af17e739724ddac954 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 22:18:36 +0100 Subject: [PATCH 17/24] test --- macros/common/update_external_table_columns.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql index 37ea1434..4efe3062 100644 --- a/macros/common/update_external_table_columns.sql +++ b/macros/common/update_external_table_columns.sql @@ -3,5 +3,5 @@ {% endmacro %} {% macro default__update_external_table_columns(source_node) %} - {% do return([]) %} + {% return() %} {% endmacro %} From ce0059b0ce1470334c665fddc77473b310069ea1 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 22:20:05 +0100 Subject: [PATCH 18/24] fix return statement --- 
macros/common/update_external_table_columns.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql index 4efe3062..02d81e5a 100644 --- a/macros/common/update_external_table_columns.sql +++ b/macros/common/update_external_table_columns.sql @@ -3,5 +3,4 @@ {% endmacro %} {% macro default__update_external_table_columns(source_node) %} - {% return() %} {% endmacro %} From 06d47c6e985c87c9bef4d6bb0b61907bd6a91736 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Wed, 7 Feb 2024 22:21:49 +0100 Subject: [PATCH 19/24] test --- macros/common/update_external_table_columns.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/macros/common/update_external_table_columns.sql b/macros/common/update_external_table_columns.sql index 02d81e5a..a37ec52f 100644 --- a/macros/common/update_external_table_columns.sql +++ b/macros/common/update_external_table_columns.sql @@ -3,4 +3,5 @@ {% endmacro %} {% macro default__update_external_table_columns(source_node) %} + {% endmacro %} From 81976520ba028c13f5f8029903f09837c035a4d6 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Fri, 1 Mar 2024 09:38:17 +0100 Subject: [PATCH 20/24] Add trigger CI job --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 51e34d31..5b07a013 100644 --- a/README.md +++ b/README.md @@ -121,3 +121,5 @@ If you encounter issues using this package or have questions, please check the [ - post a conceptual question to the relevant database channel (#db-redshift, #dbt-snowflake, etc) in the [dbt Slack community](https://community.getdbt.com/) Additional contributions to this package are very welcome! Please create issues or open PRs against `master`. Check out [this post](https://discourse.getdbt.com/t/contributing-to-an-external-dbt-package/657) on the best workflow for contributing to a package. 
+ +Trigger CI Job \ No newline at end of file From f8873b93917794190ba46c2ab1b4649b069333d4 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Fri, 1 Mar 2024 09:50:14 +0100 Subject: [PATCH 21/24] test --- run_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/run_test.sh b/run_test.sh index 2b6c99a7..ac0cd08a 100755 --- a/run_test.sh +++ b/run_test.sh @@ -18,6 +18,7 @@ if [[ ! -f $VENV ]]; then else echo "Installing dbt-$1" pip install dbt-$1 --upgrade --pre + pip install protobuf==4.25.3 fi fi From 104646727efdd6ec74ada62d6d5ccd6fbe0e5e50 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Fri, 1 Mar 2024 09:53:07 +0100 Subject: [PATCH 22/24] Add comment --- README.md | 2 -- run_test.sh | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5b07a013..51e34d31 100644 --- a/README.md +++ b/README.md @@ -121,5 +121,3 @@ If you encounter issues using this package or have questions, please check the [ - post a conceptual question to the relevant database channel (#db-redshift, #dbt-snowflake, etc) in the [dbt Slack community](https://community.getdbt.com/) Additional contributions to this package are very welcome! Please create issues or open PRs against `master`. Check out [this post](https://discourse.getdbt.com/t/contributing-to-an-external-dbt-package/657) on the best workflow for contributing to a package. - -Trigger CI Job \ No newline at end of file diff --git a/run_test.sh b/run_test.sh index ac0cd08a..d2124c05 100755 --- a/run_test.sh +++ b/run_test.sh @@ -18,6 +18,7 @@ if [[ ! 
-f $VENV ]]; then else echo "Installing dbt-$1" pip install dbt-$1 --upgrade --pre + # Workaround until the DBT 1.7.9 is released on PyPI pip install protobuf==4.25.3 fi fi From c9281e984a288358259c9d8af8807aa11a6c2602 Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Fri, 1 Mar 2024 09:57:38 +0100 Subject: [PATCH 23/24] Core version could be out of sync with the adapter specific version --- run_test.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/run_test.sh b/run_test.sh index d2124c05..084e1bf0 100755 --- a/run_test.sh +++ b/run_test.sh @@ -18,8 +18,7 @@ if [[ ! -f $VENV ]]; then else echo "Installing dbt-$1" pip install dbt-$1 --upgrade --pre - # Workaround until the DBT 1.7.9 is released on PyPI - pip install protobuf==4.25.3 + pip install dbt-core --upgrade --pre fi fi From 1a13283847c8908dec1b1d2414cb5e032fb2f7eb Mon Sep 17 00:00:00 2001 From: Thomas van Latum Date: Fri, 1 Mar 2024 10:00:12 +0100 Subject: [PATCH 24/24] Put protobuf back --- run_test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run_test.sh b/run_test.sh index 084e1bf0..e759c89f 100755 --- a/run_test.sh +++ b/run_test.sh @@ -18,7 +18,8 @@ if [[ ! -f $VENV ]]; then else echo "Installing dbt-$1" pip install dbt-$1 --upgrade --pre - pip install dbt-core --upgrade --pre + # remove the protobuf installation when all the dbt-provider packages are updated with dbt core 1.7.9 + pip install protobuf==4.25.3 fi fi