Skip to content

Commit

Permalink
Merge pull request #72 from bqbooster/update-models
Browse files Browse the repository at this point in the history
Update Google base models based on latest documentation parsing
  • Loading branch information
Kayrnt authored Nov 19, 2024
2 parents e79291f + a46a6c8 commit 5173bff
Show file tree
Hide file tree
Showing 35 changed files with 299 additions and 292 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Under the Hood-20241119-012330.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Under the Hood
body: Update Google base models based on latest documentation parsing
time: 2024-11-19T01:23:30.413941+01:00
custom:
Author: Kayrnt
Issue: "71"
93 changes: 1 addition & 92 deletions macros/inputs/jobs_with_cost_base.sql
Original file line number Diff line number Diff line change
@@ -1,95 +1,4 @@
{#-
  jobs_with_cost_base: adds cost related formulas to a base jobs table.

  Arguments:
    table_name     -- name of the dbt model to read jobs from; it is passed to ref().
    contains_query -- when truthy, the source is expected to expose `dml_statistics` and `query`;
                      the dbt metadata found in the leading query comment is then extracted
                      (dbt_version, dbt_profile_name, dbt_target_name, dbt_model_name,
                      node_tags, dbt_execution_type).

  Project variables read:
    hourly_slot_price   -- multiplied by slot-hours to compute flat_pricing_query_cost.
    per_billed_tb_price -- multiplied by billed TiB to compute ondemand_query_cost.
    use_flat_pricing    -- selects which of the two costs is exposed as query_cost.
-#}
{% macro jobs_with_cost_base(table_name, contains_query) -%}
{# More details about base table in https://cloud.google.com/bigquery/docs/information-schema-jobs -#}
WITH base AS (
SELECT
bi_engine_statistics,
cache_hit,
creation_time,
TIMESTAMP_TRUNC(creation_time, HOUR) AS hour,
destination_table,
{% if contains_query -%}
dml_statistics,
{% endif -%}
end_time,
error_result,
job_id,
job_stages,
job_type,
labels,
parent_job_id,
priority,
project_id,
project_number,
{% if contains_query -%}
query,
-- extract the dbt info from the query comment generated by dbt:
-- the leading "/* {...} */" block is captured, then every '/' and '*' is removed
replace(replace(regexp_extract(query, r'^(\/\* \{+?[\w\W]+?\} \*\/)'), '/', ''), '*', '') AS dbt_info,
{% endif -%}
referenced_tables,
reservation_id,
start_time,
state,
statement_type,
timeline,
total_bytes_billed,
total_bytes_processed,
total_modified_partitions,
total_slot_ms,
transaction_id,
user_email,
query_info,
transferred_bytes,
materialized_view_statistics
FROM
{{ ref(table_name) }}
{#- SCRIPT jobs are excluded to avoid duplicating costs, as a script contains queries.
    A plain `statement_type != 'SCRIPT'` evaluates to NULL when statement_type is NULL
    (e.g. jobs that are not queries), which would silently drop those rows as well,
    so the NULL case is kept explicitly. #}
WHERE statement_type IS NULL OR statement_type != 'SCRIPT'
),
base_with_enriched_fields AS (
SELECT
*,
-- slot-milliseconds divided by the number of milliseconds in 24 hours
total_slot_ms / (1000 * 60 * 60 * 24) AS avg_slots,
-- bytes converted to TiB (1024^4 bytes)
total_bytes_billed / POW(1024, 4) AS total_tb_billed,
-- jobs without an end_time are measured up to the current timestamp
TIMESTAMP_DIFF(COALESCE(end_time, CURRENT_TIMESTAMP()), start_time, SECOND) AS total_time_seconds,
{% if contains_query -%}
IF(LENGTH(dbt_info) > 0, JSON_EXTRACT_SCALAR(dbt_info, '$.dbt_version'), NULL) AS dbt_version,
IF(LENGTH(dbt_info) > 0, JSON_EXTRACT_SCALAR(dbt_info, '$.profile_name'), NULL) AS dbt_profile_name,
IF(LENGTH(dbt_info) > 0, JSON_EXTRACT_SCALAR(dbt_info, '$.target_name'), NULL) AS dbt_target_name,
-- holds the raw node_id here; it is reshaped in base_with_all_pricing
IF(LENGTH(dbt_info) > 0, JSON_EXTRACT_SCALAR(dbt_info, '$.node_id'), NULL) AS dbt_model_name,
IF(LENGTH(dbt_info) > 0,
ARRAY(
SELECT JSON_VALUE(string_element, '$')
FROM UNNEST(JSON_QUERY_ARRAY(dbt_info, '$.node_tags')) AS string_element
), NULL) AS node_tags,
{% endif -%}
FROM base
),
base_with_all_pricing AS (
SELECT
{% if contains_query -%}
* EXCEPT(dbt_model_name),
{%- else -%}
*,
{% endif -%}
-- slot-milliseconds converted to slot-hours, then priced per hour
total_slot_ms / (1000 * 60 * 60) * {{ var('hourly_slot_price') }} AS flat_pricing_query_cost,
total_tb_billed * {{ var('per_billed_tb_price') }} AS ondemand_query_cost,
{% if contains_query -%}
-- the node_id prefix gives the kind of dbt node; any other prefix yields NULL
case
when dbt_model_name like 'model.%' then 'model'
when dbt_model_name like 'snapshot.%' then 'snapshot'
when dbt_model_name like 'test.%' then 'test'
end as dbt_execution_type,
-- keep the 2nd and 3rd dot-separated parts of the node_id
concat(split(dbt_model_name, '.')[safe_offset(1)], '.',split(dbt_model_name, '.')[safe_offset(2)]) as dbt_model_name
{% endif -%}
FROM base_with_enriched_fields
)
SELECT
*,
{% if var('use_flat_pricing') -%}
flat_pricing_query_cost AS query_cost
{%- else -%}
ondemand_query_cost AS query_cost
{%- endif %}
FROM base_with_all_pricing

{%- endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
For more information about granting roles, see Manage access to projects, folders, and organizations.


You might also be able to get
the required permissions through custom
roles or other predefined
roles.
-#}
You might also be able to get
the required permissions through custom
roles or other predefined
roles.
-#}

WITH base AS (
{% if project_list()|length > 0 -%}
{% for project in project_list() -%}
SELECT catalog_name, schema_name, replica_name, location, replica_primary_assigned, replica_primary_assignment_complete, creation_time, creation_complete, replication_time
SELECT catalog_name, schema_name, replica_name, location, replica_primary_assigned, replica_primary_assignment_complete, creation_time, creation_complete, replication_time, sync_status
FROM `{{ project | trim }}`.`region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`SCHEMATA_REPLICAS`
{% if not loop.last %}UNION ALL{% endif %}
{% endfor %}
Expand All @@ -37,7 +37,8 @@ replica_primary_assigned,
replica_primary_assignment_complete,
creation_time,
creation_complete,
replication_time
replication_time,
sync_status
FROM `region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`SCHEMATA_REPLICAS`
{%- endif %}
)
Expand All @@ -52,5 +53,6 @@ replica_primary_assignment_complete,
creation_time,
creation_complete,
replication_time,
sync_status,
FROM
base
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,7 @@ models:
\ This value is only visible in the secondary region.\nIf the dataset contains\
\ a table with streaming data, the value of replication_time will not be accurate."
type: TIMESTAMP
- name: sync_status
description: "The status of the sync\n between the primary and secondary\
\ replica. Returns NULL if the replica is a\n primary replica."
type: JSON
10 changes: 4 additions & 6 deletions models/base/google/jobs/information_schema_jobs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
WITH base AS (
{% if project_list()|length > 0 -%}
{% for project in project_list() -%}
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, dml_statistics, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, job_creation_reason, query_info
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, query_info, job_creation_reason
FROM `{{ project | trim }}`.`region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS`
{% if not loop.last %}UNION ALL{% endif %}
{% endfor %}
Expand All @@ -28,7 +28,6 @@ bi_engine_statistics,
cache_hit,
creation_time,
destination_table,
dml_statistics,
end_time,
error_result,
job_id,
Expand Down Expand Up @@ -56,8 +55,8 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info
query_info,
job_creation_reason
FROM `region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS`
{%- endif %}
)
Expand All @@ -67,7 +66,6 @@ SELECT
cache_hit,
creation_time,
destination_table,
dml_statistics,
end_time,
error_result,
job_id,
Expand Down Expand Up @@ -95,7 +93,7 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info,
job_creation_reason,
FROM
base
38 changes: 16 additions & 22 deletions models/base/google/jobs/information_schema_jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@ models:
- name: destination_table
description: "Destination table\n for results, if any."
type: RECORD
- name: dml_statistics
description: "If the job is a query with a DML statement, the value is a record\
\ with the\n following fields:\n\ninserted_row_count: The number of rows\
\ that were inserted.\ndeleted_row_count: The number of rows that were deleted.\n\
updated_row_count: The number of rows that were updated.\n\n For all\
\ other jobs, the value is NULL.\n This column is present in the INFORMATION_SCHEMA.JOBS_BY_USER\
\ and\n INFORMATION_SCHEMA.JOBS_BY_PROJECT views."
type: RECORD
- name: end_time
description: "The end time of this job, in milliseconds since the epoch. This\
\ field represents the\n time when the job enters the DONE state."
Expand Down Expand Up @@ -142,6 +134,22 @@ models:
description: "Statistics of\n materialized views considered in a query\
\ job. (Preview)"
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ The hash value will differ when underlying views change, or if the\
\ query implicitly\n references columns, such as SELECT *, and the table\
\ schema changes.\n \n This field appears for successful GoogleSQL\
\ queries that are not cache hits.\nquery_info.performance_insights : Performance\
\ insights for the job.\nquery_info.optimization_details : The history-based\
\ optimizations\n for the job."
type: RECORD
- name: job_creation_reason
description: "job_creation_reason.code : Specifies the high level reason why a\
\ job was created.\n Possible values are:\n \nREQUESTED: job creation\
Expand All @@ -153,17 +161,3 @@ models:
\ system has determined that the query needs to be executed as a\n \
\ job."
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ \n This field appears for successful GoogleSQL queries that\
\ are not cache hits.\nquery_info.performance_insights : Performance insights\
\ for the job.\nquery_info.optimization_details : The history-based optimizations\n\
\ for the job."
type: RECORD
8 changes: 4 additions & 4 deletions models/base/google/jobs/information_schema_jobs_by_folder.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
WITH base AS (
{% if project_list()|length > 0 -%}
{% for project in project_list() -%}
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, job_creation_reason, query_info
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, query_info, job_creation_reason
FROM `{{ project | trim }}`.`region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS_BY_FOLDER`
{% if not loop.last %}UNION ALL{% endif %}
{% endfor %}
Expand Down Expand Up @@ -55,8 +55,8 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info
query_info,
job_creation_reason
FROM `region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS_BY_FOLDER`
{%- endif %}
)
Expand Down Expand Up @@ -93,7 +93,7 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info,
job_creation_reason,
FROM
base
30 changes: 16 additions & 14 deletions models/base/google/jobs/information_schema_jobs_by_folder.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,22 @@ models:
description: "Statistics of\n materialized views considered in a query\
\ job. (Preview)"
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ The hash value will differ when underlying views change, or if the\
\ query implicitly\n references columns, such as SELECT *, and the table\
\ schema changes.\n \n This field appears for successful GoogleSQL\
\ queries that are not cache hits.\nquery_info.performance_insights : Performance\
\ insights for the job.\nquery_info.optimization_details : The history-based\
\ optimizations\n for the job."
type: RECORD
- name: job_creation_reason
description: "job_creation_reason.code : Specifies the high level reason why a\
\ job was created.\n Possible values are:\n \nREQUESTED: job creation\
Expand All @@ -145,17 +161,3 @@ models:
\ system has determined that the query needs to be executed as a\n \
\ job."
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ \n This field appears for successful GoogleSQL queries that\
\ are not cache hits.\nquery_info.performance_insights : Performance insights\
\ for the job.\nquery_info.optimization_details : The history-based optimizations\n\
\ for the job."
type: RECORD
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
WITH base AS (
{% if project_list()|length > 0 -%}
{% for project in project_list() -%}
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, job_creation_reason, query_info
SELECT bi_engine_statistics, cache_hit, creation_time, destination_table, end_time, error_result, job_id, job_stages, job_type, labels, parent_job_id, priority, project_id, project_number, query, referenced_tables, reservation_id, edition, session_info, start_time, state, statement_type, timeline, total_bytes_billed, total_bytes_processed, total_modified_partitions, total_slot_ms, transaction_id, user_email, transferred_bytes, materialized_view_statistics, query_info, job_creation_reason
FROM `{{ project | trim }}`.`region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS_BY_ORGANIZATION`
{% if not loop.last %}UNION ALL{% endif %}
{% endfor %}
Expand Down Expand Up @@ -55,8 +55,8 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info
query_info,
job_creation_reason
FROM `region-{{ var('bq_region') }}`.`INFORMATION_SCHEMA`.`JOBS_BY_ORGANIZATION`
{%- endif %}
)
Expand Down Expand Up @@ -93,7 +93,7 @@ transaction_id,
user_email,
transferred_bytes,
materialized_view_statistics,
job_creation_reason,
query_info,
job_creation_reason,
FROM
base
30 changes: 16 additions & 14 deletions models/base/google/jobs/information_schema_jobs_by_organization.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,22 @@ models:
description: "Statistics of\n materialized views considered in a query\
\ job. (Preview)"
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ The hash value will differ when underlying views change, or if the\
\ query implicitly\n references columns, such as SELECT *, and the table\
\ schema changes.\n \n This field appears for successful GoogleSQL\
\ queries that are not cache hits.\nquery_info.performance_insights : Performance\
\ insights for the job.\nquery_info.optimization_details : The history-based\
\ optimizations\n for the job."
type: RECORD
- name: job_creation_reason
description: "job_creation_reason.code : Specifies the high level reason why a\
\ job was created.\n Possible values are:\n \nREQUESTED: job creation\
Expand All @@ -145,17 +161,3 @@ models:
\ system has determined that the query needs to be executed as a\n \
\ job."
type: RECORD
- name: query_info
description: "query_info.resource_warning : The warning message that appears if\
\ the resource usage during query processing is above the internal threshold\
\ of the system. A successful query job can have the resource_warning field\
\ populated. With resource_warning, you get additional data points to optimize\
\ your queries and to set up monitoring for performance trends of an equivalent\
\ set of queries by using query_hashes.\nquery_info.query_hashes.normalized_literals\
\ : Contains the hashes of the query. normalized_literals is a hexadecimal\n\
\ STRING hash that ignores comments, parameter values, UDFs, and literals.\n\
\ \n This field appears for successful GoogleSQL queries that\
\ are not cache hits.\nquery_info.performance_insights : Performance insights\
\ for the job.\nquery_info.optimization_details : The history-based optimizations\n\
\ for the job."
type: RECORD
Loading

0 comments on commit 5173bff

Please sign in to comment.