From 1fe273a9de73a950429f00b90a48b95cf91d328f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 8 Nov 2024 20:09:29 -0500 Subject: [PATCH 01/11] Update wheel requirement from ~=0.42 to ~=0.43 (#1304) * Update wheel requirement from ~=0.42 to ~=0.43 Updates the requirements on [wheel](https://github.com/pypa/wheel) to permit the latest version. - [Release notes](https://github.com/pypa/wheel/releases) - [Changelog](https://github.com/pypa/wheel/blob/main/docs/news.rst) - [Commits](https://github.com/pypa/wheel/compare/0.42.0...0.43.0) --- updated-dependencies: - dependency-name: wheel dependency-type: direct:development ... Signed-off-by: dependabot[bot] * Add automated changelog yaml from template for bot PR --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Github Build Bot Co-authored-by: Matthew McKnight <91097623+McKnight-42@users.noreply.github.com> Co-authored-by: Mila Page <67295367+VersusFacit@users.noreply.github.com> Co-authored-by: Mike Alfare <13974384+mikealfare@users.noreply.github.com> --- .changes/unreleased/Dependencies-20240724-040744.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Dependencies-20240724-040744.yaml diff --git a/.changes/unreleased/Dependencies-20240724-040744.yaml b/.changes/unreleased/Dependencies-20240724-040744.yaml new file mode 100644 index 000000000..fd713788e --- /dev/null +++ b/.changes/unreleased/Dependencies-20240724-040744.yaml @@ -0,0 +1,6 @@ +kind: "Dependencies" +body: "Update wheel requirement from ~=0.42 to ~=0.43" +time: 2024-07-24T04:07:44.00000Z +custom: + Author: dependabot[bot] + PR: 1304 From 75142ac7f18ae94b6171461ec379b97ebf79a0e4 Mon Sep 17 00:00:00 2001 From: Yasuhisa Yoshida Date: Tue, 19 Nov 2024 03:46:14 +0900 Subject: [PATCH 02/11] Remove unused stub function (#1393) Co-authored-by: Mike Alfare <13974384+mikealfare@users.noreply.github.com> --- dbt/adapters/bigquery/impl.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py index cf5800fd3..f6470e7f7 100644 --- a/dbt/adapters/bigquery/impl.py +++ b/dbt/adapters/bigquery/impl.py @@ -95,12 +95,6 @@ def render(self): return f"{self.project}.{self.dataset}" -def _stub_relation(*args, **kwargs): - return BigQueryRelation.create( - database="", schema="", identifier="", quote_policy={}, type=BigQueryRelation.Table - ) - - @dataclass class BigqueryConfig(AdapterConfig): cluster_by: Optional[Union[List[str], str]] = None From 83bb413d770fa1adb68cdb738dde0351347811ad Mon Sep 17 00:00:00 2001 From: Mike Alfare <13974384+mikealfare@users.noreply.github.com> Date: Wed, 20 Nov 2024 00:41:32 -0500 Subject: [PATCH 03/11] Add retry factory to consolidate retry strategies across dbt-bigquery (#1395) * fix imports * create a retry factory and move relevant objects from connections * add on_error method for deadline retries * remove dependency on retry_and_handle from cancel_open * remove dependencies on retry_and_handle * remove timeout methods from connection manager * add retry to get_bq_table * move client factory to credentials module so that on_error can be moved to the retry factory in the retry module * move on_error factory to retry module * move client factories from python_submissions module to credentials module * create a clients module * retry all client factories by default * move polling from manual check in python_submissions 
module into retry_factory * move load_dataframe logic from adapter to connection manager, use the built-in timeout argument instead of a manual polling method * move upload_file logic from adapter to connection manager, use the built-in timeout argument instead of a manual polling method, remove the manual polling method * move the retry to polling for done instead of create * align new retries with original methods, simplify retry factory * create a method for the dataproc endpoint * make imports explicit, remove unused constant * update names in clients.py to follow the naming convention * update names in connections.py to follow the naming convention * update names in credentials.py to follow the naming convention * update names in python_submissions.py to follow the naming convention * update names in retry.py to follow the naming convention --------- Co-authored-by: Colin Rogers <111200756+colin-rogers-dbt@users.noreply.github.com> --- .../Under the Hood-20241107-143856.yaml | 6 + dbt/adapters/bigquery/clients.py | 69 +++ dbt/adapters/bigquery/connections.py | 434 +++++++----------- dbt/adapters/bigquery/credentials.py | 174 +++++-- dbt/adapters/bigquery/dataproc/__init__.py | 0 dbt/adapters/bigquery/dataproc/batch.py | 68 --- dbt/adapters/bigquery/impl.py | 105 ++--- dbt/adapters/bigquery/python_submissions.py | 250 +++++----- dbt/adapters/bigquery/retry.py | 128 ++++++ dbt/adapters/bigquery/utility.py | 40 +- tests/conftest.py | 8 +- tests/functional/adapter/test_json_keyfile.py | 13 +- tests/unit/test_bigquery_adapter.py | 21 +- .../unit/test_bigquery_connection_manager.py | 104 ++--- tests/unit/test_configure_dataproc_batch.py | 8 +- 15 files changed, 713 insertions(+), 715 deletions(-) create mode 100644 .changes/unreleased/Under the Hood-20241107-143856.yaml create mode 100644 dbt/adapters/bigquery/clients.py delete mode 100644 dbt/adapters/bigquery/dataproc/__init__.py delete mode 100644 dbt/adapters/bigquery/dataproc/batch.py create mode 100644 dbt/adapters/bigquery/retry.py diff --git a/.changes/unreleased/Under the Hood-20241107-143856.yaml b/.changes/unreleased/Under the Hood-20241107-143856.yaml new file mode 100644 index 000000000..db8557bf0 --- /dev/null +++ b/.changes/unreleased/Under the Hood-20241107-143856.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Create a retry factory to simplify retry strategies across dbt-bigquery +time: 2024-11-07T14:38:56.210445-05:00 +custom: + Author: mikealfare osalama + Issue: "1395" diff --git a/dbt/adapters/bigquery/clients.py b/dbt/adapters/bigquery/clients.py new file mode 100644 index 000000000..18c59fc12 --- /dev/null +++ b/dbt/adapters/bigquery/clients.py @@ -0,0 +1,69 @@ +from google.api_core.client_info import ClientInfo +from google.api_core.client_options import ClientOptions +from google.api_core.retry import Retry +from google.auth.exceptions import DefaultCredentialsError +from google.cloud.bigquery import Client as BigQueryClient +from google.cloud.dataproc_v1 import BatchControllerClient, JobControllerClient +from google.cloud.storage import Client as StorageClient + +from dbt.adapters.events.logging import AdapterLogger + +import dbt.adapters.bigquery.__version__ as dbt_version +from dbt.adapters.bigquery.credentials import ( + BigQueryCredentials, + create_google_credentials, + set_default_credentials, +) + + +_logger = AdapterLogger("BigQuery") + + +def create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: + try: + return _create_bigquery_client(credentials) + except DefaultCredentialsError: 
+ _logger.info("Please log into GCP to continue") + set_default_credentials() + return _create_bigquery_client(credentials) + + +@Retry() # google decorator. retries on transient errors with exponential backoff +def create_gcs_client(credentials: BigQueryCredentials) -> StorageClient: + return StorageClient( + project=credentials.execution_project, + credentials=create_google_credentials(credentials), + ) + + +@Retry() # google decorator. retries on transient errors with exponential backoff +def create_dataproc_job_controller_client(credentials: BigQueryCredentials) -> JobControllerClient: + return JobControllerClient( + credentials=create_google_credentials(credentials), + client_options=ClientOptions(api_endpoint=_dataproc_endpoint(credentials)), + ) + + +@Retry() # google decorator. retries on transient errors with exponential backoff +def create_dataproc_batch_controller_client( + credentials: BigQueryCredentials, +) -> BatchControllerClient: + return BatchControllerClient( + credentials=create_google_credentials(credentials), + client_options=ClientOptions(api_endpoint=_dataproc_endpoint(credentials)), + ) + + +@Retry() # google decorator. retries on transient errors with exponential backoff +def _create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: + return BigQueryClient( + credentials.execution_project, + create_google_credentials(credentials), + location=getattr(credentials, "location", None), + client_info=ClientInfo(user_agent=f"dbt-bigquery-{dbt_version.version}"), + client_options=ClientOptions(quota_project_id=credentials.quota_project), + ) + + +def _dataproc_endpoint(credentials: BigQueryCredentials) -> str: + return f"{credentials.dataproc_region}-dataproc.googleapis.com:443" diff --git a/dbt/adapters/bigquery/connections.py b/dbt/adapters/bigquery/connections.py index bda54080b..61fa87d40 100644 --- a/dbt/adapters/bigquery/connections.py +++ b/dbt/adapters/bigquery/connections.py @@ -8,17 +8,20 @@ from typing import Dict, Hashable, List, Optional, Tuple, TYPE_CHECKING import uuid -from google.api_core import client_info, client_options, retry -import google.auth -from google.auth import impersonated_credentials -import google.auth.exceptions -import google.cloud.bigquery -import google.cloud.exceptions -from google.oauth2 import ( - credentials as GoogleCredentials, - service_account as GoogleServiceAccountCredentials, +from google.auth.exceptions import RefreshError +from google.cloud.bigquery import ( + Client, + CopyJobConfig, + Dataset, + DatasetReference, + LoadJobConfig, + QueryJobConfig, + QueryPriority, + SchemaField, + Table, + TableReference, ) -from requests.exceptions import ConnectionError +from google.cloud.exceptions import BadRequest, Forbidden, NotFound from dbt_common.events.contextvars import get_node_info from dbt_common.events.functions import fire_event @@ -34,14 +37,9 @@ from dbt.adapters.events.types import SQLQuery from dbt.adapters.exceptions.connection import FailedToConnectError -import dbt.adapters.bigquery.__version__ as dbt_version -from dbt.adapters.bigquery.credentials import ( - BigQueryConnectionMethod, - Priority, - get_bigquery_defaults, - setup_default_credentials, -) -from dbt.adapters.bigquery.utility import is_base64, base64_to_string +from dbt.adapters.bigquery.clients import create_bigquery_client +from dbt.adapters.bigquery.credentials import Priority +from dbt.adapters.bigquery.retry import RetryFactory if TYPE_CHECKING: # Indirectly imported via agate_helper, which is lazy loaded further downfile. 
@@ -51,22 +49,8 @@ logger = AdapterLogger("BigQuery") -BQ_QUERY_JOB_SPLIT = "-----Query Job SQL Follows-----" - -WRITE_TRUNCATE = google.cloud.bigquery.job.WriteDisposition.WRITE_TRUNCATE -REOPENABLE_ERRORS = ( - ConnectionResetError, - ConnectionError, -) - -RETRYABLE_ERRORS = ( - google.cloud.exceptions.ServerError, - google.cloud.exceptions.BadRequest, - google.cloud.exceptions.BadGateway, - ConnectionResetError, - ConnectionError, -) +BQ_QUERY_JOB_SPLIT = "-----Query Job SQL Follows-----" @dataclass @@ -82,12 +66,10 @@ class BigQueryAdapterResponse(AdapterResponse): class BigQueryConnectionManager(BaseConnectionManager): TYPE = "bigquery" - DEFAULT_INITIAL_DELAY = 1.0 # Seconds - DEFAULT_MAXIMUM_DELAY = 3.0 # Seconds - def __init__(self, profile: AdapterRequiredConfig, mp_context: SpawnContext): super().__init__(profile, mp_context) self.jobs_by_thread: Dict[Hashable, List[str]] = defaultdict(list) + self._retry = RetryFactory(profile.credentials) @classmethod def handle_error(cls, error, message): @@ -108,19 +90,19 @@ def exception_handler(self, sql): try: yield - except google.cloud.exceptions.BadRequest as e: + except BadRequest as e: message = "Bad request while running query" self.handle_error(e, message) - except google.cloud.exceptions.Forbidden as e: + except Forbidden as e: message = "Access denied while running query" self.handle_error(e, message) - except google.cloud.exceptions.NotFound as e: + except NotFound as e: message = "Not found while running query" self.handle_error(e, message) - except google.auth.exceptions.RefreshError as e: + except RefreshError as e: message = ( "Unable to generate access token, if you're using " "impersonate_service_account, make sure your " @@ -153,15 +135,15 @@ def cancel_open(self): for thread_id, connection in self.thread_connections.items(): if connection is this_connection: continue + if connection.handle is not None and connection.state == ConnectionState.OPEN: - client = connection.handle + client: Client = connection.handle for job_id in self.jobs_by_thread.get(thread_id, []): - - def fn(): - return client.cancel_job(job_id) - - self._retry_and_handle(msg=f"Cancel job: {job_id}", conn=connection, fn=fn) - + with self.exception_handler(f"Cancel job: {job_id}"): + client.cancel_job( + job_id, + retry=self._retry.create_reopen_with_deadline(connection), + ) self.close(connection) if connection.name is not None: @@ -203,121 +185,23 @@ def format_rows_number(self, rows_number): rows_number *= 1000.0 return f"{rows_number:3.1f}{unit}".strip() - @classmethod - def get_google_credentials(cls, profile_credentials) -> GoogleCredentials: - method = profile_credentials.method - creds = GoogleServiceAccountCredentials.Credentials - - if method == BigQueryConnectionMethod.OAUTH: - credentials, _ = get_bigquery_defaults(scopes=profile_credentials.scopes) - return credentials - - elif method == BigQueryConnectionMethod.SERVICE_ACCOUNT: - keyfile = profile_credentials.keyfile - return creds.from_service_account_file(keyfile, scopes=profile_credentials.scopes) - - elif method == BigQueryConnectionMethod.SERVICE_ACCOUNT_JSON: - details = profile_credentials.keyfile_json - if is_base64(profile_credentials.keyfile_json): - details = base64_to_string(details) - return creds.from_service_account_info(details, scopes=profile_credentials.scopes) - - elif method == BigQueryConnectionMethod.OAUTH_SECRETS: - return GoogleCredentials.Credentials( - token=profile_credentials.token, - refresh_token=profile_credentials.refresh_token, - 
client_id=profile_credentials.client_id, - client_secret=profile_credentials.client_secret, - token_uri=profile_credentials.token_uri, - scopes=profile_credentials.scopes, - ) - - error = 'Invalid `method` in profile: "{}"'.format(method) - raise FailedToConnectError(error) - - @classmethod - def get_impersonated_credentials(cls, profile_credentials): - source_credentials = cls.get_google_credentials(profile_credentials) - return impersonated_credentials.Credentials( - source_credentials=source_credentials, - target_principal=profile_credentials.impersonate_service_account, - target_scopes=list(profile_credentials.scopes), - ) - - @classmethod - def get_credentials(cls, profile_credentials): - if profile_credentials.impersonate_service_account: - return cls.get_impersonated_credentials(profile_credentials) - else: - return cls.get_google_credentials(profile_credentials) - - @classmethod - @retry.Retry() # google decorator. retries on transient errors with exponential backoff - def get_bigquery_client(cls, profile_credentials): - creds = cls.get_credentials(profile_credentials) - execution_project = profile_credentials.execution_project - quota_project = profile_credentials.quota_project - location = getattr(profile_credentials, "location", None) - - info = client_info.ClientInfo(user_agent=f"dbt-bigquery-{dbt_version.version}") - options = client_options.ClientOptions(quota_project_id=quota_project) - return google.cloud.bigquery.Client( - execution_project, - creds, - location=location, - client_info=info, - client_options=options, - ) - @classmethod def open(cls, connection): - if connection.state == "open": + if connection.state == ConnectionState.OPEN: logger.debug("Connection is already open, skipping open.") return connection try: - handle = cls.get_bigquery_client(connection.credentials) - - except google.auth.exceptions.DefaultCredentialsError: - logger.info("Please log into GCP to continue") - setup_default_credentials() - - handle = cls.get_bigquery_client(connection.credentials) + connection.handle = create_bigquery_client(connection.credentials) + connection.state = ConnectionState.OPEN + return connection except Exception as e: - logger.debug( - "Got an error when attempting to create a bigquery " "client: '{}'".format(e) - ) - + logger.debug(f"""Got an error when attempting to create a bigquery " "client: '{e}'""") connection.handle = None - connection.state = "fail" - + connection.state = ConnectionState.FAIL raise FailedToConnectError(str(e)) - connection.handle = handle - connection.state = "open" - return connection - - @classmethod - def get_job_execution_timeout_seconds(cls, conn): - credentials = conn.credentials - return credentials.job_execution_timeout_seconds - - @classmethod - def get_job_retries(cls, conn) -> int: - credentials = conn.credentials - return credentials.job_retries - - @classmethod - def get_job_creation_timeout_seconds(cls, conn): - credentials = conn.credentials - return credentials.job_creation_timeout_seconds - - @classmethod - def get_job_retry_deadline_seconds(cls, conn): - credentials = conn.credentials - return credentials.job_retry_deadline_seconds - @classmethod def get_table_from_response(cls, resp) -> "agate.Table": from dbt_common.clients import agate_helper @@ -357,7 +241,6 @@ def raw_execute( dry_run: bool = False, ): conn = self.get_thread_connection() - client = conn.handle fire_event(SQLQuery(conn_name=conn.name, sql=sql, node_info=get_node_info())) @@ -373,34 +256,25 @@ def raw_execute( priority = conn.credentials.priority if 
priority == Priority.Batch: - job_params["priority"] = google.cloud.bigquery.QueryPriority.BATCH + job_params["priority"] = QueryPriority.BATCH else: - job_params["priority"] = google.cloud.bigquery.QueryPriority.INTERACTIVE + job_params["priority"] = QueryPriority.INTERACTIVE maximum_bytes_billed = conn.credentials.maximum_bytes_billed if maximum_bytes_billed is not None and maximum_bytes_billed != 0: job_params["maximum_bytes_billed"] = maximum_bytes_billed - job_creation_timeout = self.get_job_creation_timeout_seconds(conn) - job_execution_timeout = self.get_job_execution_timeout_seconds(conn) - - def fn(): + with self.exception_handler(sql): job_id = self.generate_job_id() return self._query_and_results( - client, + conn, sql, job_params, job_id, - job_creation_timeout=job_creation_timeout, - job_execution_timeout=job_execution_timeout, limit=limit, ) - query_job, iterator = self._retry_and_handle(msg=sql, conn=conn, fn=fn) - - return query_job, iterator - def execute( self, sql, auto_begin=False, fetch=None, limit: Optional[int] = None ) -> Tuple[BigQueryAdapterResponse, "agate.Table"]: @@ -528,9 +402,9 @@ def standard_to_legacy(table): _, iterator = self.raw_execute(sql, use_legacy_sql=True) return self.get_table_from_response(iterator) - def copy_bq_table(self, source, destination, write_disposition): + def copy_bq_table(self, source, destination, write_disposition) -> None: conn = self.get_thread_connection() - client = conn.handle + client: Client = conn.handle # ------------------------------------------------------------------------------- # BigQuery allows to use copy API using two different formats: @@ -558,89 +432,149 @@ def copy_bq_table(self, source, destination, write_disposition): write_disposition, ) - def copy_and_results(): - job_config = google.cloud.bigquery.CopyJobConfig(write_disposition=write_disposition) - copy_job = client.copy_table(source_ref_array, destination_ref, job_config=job_config) - timeout = self.get_job_execution_timeout_seconds(conn) or 300 - iterator = copy_job.result(timeout=timeout) - return copy_job, iterator - - self._retry_and_handle( - msg='copy table "{}" to "{}"'.format( - ", ".join(source_ref.path for source_ref in source_ref_array), - destination_ref.path, - ), - conn=conn, - fn=copy_and_results, + msg = 'copy table "{}" to "{}"'.format( + ", ".join(source_ref.path for source_ref in source_ref_array), + destination_ref.path, + ) + with self.exception_handler(msg): + copy_job = client.copy_table( + source_ref_array, + destination_ref, + job_config=CopyJobConfig(write_disposition=write_disposition), + retry=self._retry.create_reopen_with_deadline(conn), + ) + copy_job.result(timeout=self._retry.create_job_execution_timeout(fallback=300)) + + def write_dataframe_to_table( + self, + client: Client, + file_path: str, + database: str, + schema: str, + identifier: str, + table_schema: List[SchemaField], + field_delimiter: str, + fallback_timeout: Optional[float] = None, + ) -> None: + load_config = LoadJobConfig( + skip_leading_rows=1, + schema=table_schema, + field_delimiter=field_delimiter, ) + table = self.table_ref(database, schema, identifier) + self._write_file_to_table(client, file_path, table, load_config, fallback_timeout) + + def write_file_to_table( + self, + client: Client, + file_path: str, + database: str, + schema: str, + identifier: str, + fallback_timeout: Optional[float] = None, + **kwargs, + ) -> None: + config = kwargs["kwargs"] + if "schema" in config: + config["schema"] = json.load(config["schema"]) + load_config = 
LoadJobConfig(**config) + table = self.table_ref(database, schema, identifier) + self._write_file_to_table(client, file_path, table, load_config, fallback_timeout) + + def _write_file_to_table( + self, + client: Client, + file_path: str, + table: TableReference, + config: LoadJobConfig, + fallback_timeout: Optional[float] = None, + ) -> None: + + with self.exception_handler("LOAD TABLE"): + with open(file_path, "rb") as f: + job = client.load_table_from_file(f, table, rewind=True, job_config=config) + + response = job.result(retry=self._retry.create_retry(fallback=fallback_timeout)) + + if response.state != "DONE": + raise DbtRuntimeError("BigQuery Timeout Exceeded") + + elif response.error_result: + message = "\n".join(error["message"].strip() for error in response.errors) + raise DbtRuntimeError(message) @staticmethod def dataset_ref(database, schema): - return google.cloud.bigquery.DatasetReference(project=database, dataset_id=schema) + return DatasetReference(project=database, dataset_id=schema) @staticmethod def table_ref(database, schema, table_name): - dataset_ref = google.cloud.bigquery.DatasetReference(database, schema) - return google.cloud.bigquery.TableReference(dataset_ref, table_name) + dataset_ref = DatasetReference(database, schema) + return TableReference(dataset_ref, table_name) - def get_bq_table(self, database, schema, identifier): + def get_bq_table(self, database, schema, identifier) -> Table: """Get a bigquery table for a schema/model.""" conn = self.get_thread_connection() + client: Client = conn.handle # backwards compatibility: fill in with defaults if not specified database = database or conn.credentials.database schema = schema or conn.credentials.schema - table_ref = self.table_ref(database, schema, identifier) - return conn.handle.get_table(table_ref) + return client.get_table( + table=self.table_ref(database, schema, identifier), + retry=self._retry.create_reopen_with_deadline(conn), + ) - def drop_dataset(self, database, schema): + def drop_dataset(self, database, schema) -> None: conn = self.get_thread_connection() - dataset_ref = self.dataset_ref(database, schema) - client = conn.handle - - def fn(): - return client.delete_dataset(dataset_ref, delete_contents=True, not_found_ok=True) - - self._retry_and_handle(msg="drop dataset", conn=conn, fn=fn) + client: Client = conn.handle + with self.exception_handler("drop dataset"): + client.delete_dataset( + dataset=self.dataset_ref(database, schema), + delete_contents=True, + not_found_ok=True, + retry=self._retry.create_reopen_with_deadline(conn), + ) - def create_dataset(self, database, schema): + def create_dataset(self, database, schema) -> Dataset: conn = self.get_thread_connection() - client = conn.handle - dataset_ref = self.dataset_ref(database, schema) - - def fn(): - return client.create_dataset(dataset_ref, exists_ok=True) - - self._retry_and_handle(msg="create dataset", conn=conn, fn=fn) + client: Client = conn.handle + with self.exception_handler("create dataset"): + return client.create_dataset( + dataset=self.dataset_ref(database, schema), + exists_ok=True, + retry=self._retry.create_reopen_with_deadline(conn), + ) def list_dataset(self, database: str): - # the database string we get here is potentially quoted. Strip that off - # for the API call. - database = database.strip("`") + # The database string we get here is potentially quoted. + # Strip that off for the API call. 
conn = self.get_thread_connection() - client = conn.handle - - def query_schemas(): + client: Client = conn.handle + with self.exception_handler("list dataset"): # this is similar to how we have to deal with listing tables - all_datasets = client.list_datasets(project=database, max_results=10000) + all_datasets = client.list_datasets( + project=database.strip("`"), + max_results=10000, + retry=self._retry.create_reopen_with_deadline(conn), + ) return [ds.dataset_id for ds in all_datasets] - return self._retry_and_handle(msg="list dataset", conn=conn, fn=query_schemas) - def _query_and_results( self, - client, + conn, sql, job_params, job_id, - job_creation_timeout=None, - job_execution_timeout=None, limit: Optional[int] = None, ): + client: Client = conn.handle """Query the client and wait for results.""" # Cannot reuse job_config if destination is set and ddl is used - job_config = google.cloud.bigquery.QueryJobConfig(**job_params) query_job = client.query( - query=sql, job_config=job_config, job_id=job_id, timeout=job_creation_timeout + query=sql, + job_config=QueryJobConfig(**job_params), + job_id=job_id, # note, this disables retry since the job_id will have been used + timeout=self._retry.create_job_creation_timeout(), ) if ( query_job.location is not None @@ -650,37 +584,14 @@ def _query_and_results( logger.debug( self._bq_job_link(query_job.location, query_job.project, query_job.job_id) ) + + timeout = self._retry.create_job_execution_timeout() try: - iterator = query_job.result(max_results=limit, timeout=job_execution_timeout) - return query_job, iterator + iterator = query_job.result(max_results=limit, timeout=timeout) except TimeoutError: - exc = f"Operation did not complete within the designated timeout of {job_execution_timeout} seconds." + exc = f"Operation did not complete within the designated timeout of {timeout} seconds." 
raise TimeoutError(exc) - - def _retry_and_handle(self, msg, conn, fn): - """retry a function call within the context of exception_handler.""" - - def reopen_conn_on_error(error): - if isinstance(error, REOPENABLE_ERRORS): - logger.warning("Reopening connection after {!r}".format(error)) - self.close(conn) - self.open(conn) - return - - with self.exception_handler(msg): - return retry.retry_target( - target=fn, - predicate=_ErrorCounter(self.get_job_retries(conn)).count_error, - sleep_generator=self._retry_generator(), - deadline=self.get_job_retry_deadline_seconds(conn), - on_error=reopen_conn_on_error, - ) - - def _retry_generator(self): - """Generates retry intervals that exponentially back off.""" - return retry.exponential_sleep_generator( - initial=self.DEFAULT_INITIAL_DELAY, maximum=self.DEFAULT_MAXIMUM_DELAY - ) + return query_job, iterator def _labels_from_query_comment(self, comment: str) -> Dict: try: @@ -693,39 +604,6 @@ def _labels_from_query_comment(self, comment: str) -> Dict: } -class _ErrorCounter(object): - """Counts errors seen up to a threshold then raises the next error.""" - - def __init__(self, retries): - self.retries = retries - self.error_count = 0 - - def count_error(self, error): - if self.retries == 0: - return False # Don't log - self.error_count += 1 - if _is_retryable(error) and self.error_count <= self.retries: - logger.debug( - "Retry attempt {} of {} after error: {}".format( - self.error_count, self.retries, repr(error) - ) - ) - return True - else: - return False - - -def _is_retryable(error): - """Return true for errors that are unlikely to occur again if retried.""" - if isinstance(error, RETRYABLE_ERRORS): - return True - elif isinstance(error, google.api_core.exceptions.Forbidden) and any( - e["reason"] == "rateLimitExceeded" for e in error.errors - ): - return True - return False - - _SANITIZE_LABEL_PATTERN = re.compile(r"[^a-z0-9_-]") _VALIDATE_LABEL_LENGTH_LIMIT = 63 diff --git a/dbt/adapters/bigquery/credentials.py b/dbt/adapters/bigquery/credentials.py index 32f172dac..94d70a931 100644 --- a/dbt/adapters/bigquery/credentials.py +++ b/dbt/adapters/bigquery/credentials.py @@ -1,9 +1,14 @@ +import base64 +import binascii from dataclasses import dataclass, field from functools import lru_cache -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Iterable, Optional, Tuple, Union -import google.auth +from google.auth import default from google.auth.exceptions import DefaultCredentialsError +from google.auth.impersonated_credentials import Credentials as ImpersonatedCredentials +from google.oauth2.credentials import Credentials as GoogleCredentials +from google.oauth2.service_account import Credentials as ServiceAccountCredentials from mashumaro import pass_through from dbt_common.clients.system import run_cmd @@ -11,6 +16,7 @@ from dbt_common.exceptions import DbtConfigError, DbtRuntimeError from dbt.adapters.contracts.connection import Credentials from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.exceptions.connection import FailedToConnectError _logger = AdapterLogger("BigQuery") @@ -21,59 +27,22 @@ class Priority(StrEnum): Batch = "batch" -class BigQueryConnectionMethod(StrEnum): - OAUTH = "oauth" - SERVICE_ACCOUNT = "service-account" - SERVICE_ACCOUNT_JSON = "service-account-json" - OAUTH_SECRETS = "oauth-secrets" - - @dataclass class DataprocBatchConfig(ExtensibleDbtClassMixin): def __init__(self, batch_config): self.batch_config = batch_config -@lru_cache() -def get_bigquery_defaults(scopes=None) 
-> Tuple[Any, Optional[str]]: - """ - Returns (credentials, project_id) - - project_id is returned available from the environment; otherwise None - """ - # Cached, because the underlying implementation shells out, taking ~1s - try: - credentials, _ = google.auth.default(scopes=scopes) - return credentials, _ - except DefaultCredentialsError as e: - raise DbtConfigError(f"Failed to authenticate with supplied credentials\nerror:\n{e}") - - -def setup_default_credentials(): - if _gcloud_installed(): - run_cmd(".", ["gcloud", "auth", "application-default", "login"]) - else: - msg = """ - dbt requires the gcloud SDK to be installed to authenticate with BigQuery. - Please download and install the SDK, or use a Service Account instead. - - https://cloud.google.com/sdk/ - """ - raise DbtRuntimeError(msg) - - -def _gcloud_installed(): - try: - run_cmd(".", ["gcloud", "--version"]) - return True - except OSError as e: - _logger.debug(e) - return False +class _BigQueryConnectionMethod(StrEnum): + OAUTH = "oauth" + OAUTH_SECRETS = "oauth-secrets" + SERVICE_ACCOUNT = "service-account" + SERVICE_ACCOUNT_JSON = "service-account-json" @dataclass class BigQueryCredentials(Credentials): - method: BigQueryConnectionMethod = None # type: ignore + method: _BigQueryConnectionMethod = None # type: ignore # BigQuery allows an empty database / project, where it defers to the # environment for the project @@ -179,9 +148,122 @@ def __pre_deserialize__(cls, d: Dict[Any, Any]) -> Dict[Any, Any]: # `database` is an alias of `project` in BigQuery if "database" not in d: - _, database = get_bigquery_defaults() + _, database = _create_bigquery_defaults() d["database"] = database # `execution_project` default to dataset/project if "execution_project" not in d: d["execution_project"] = d["database"] return d + + +def set_default_credentials() -> None: + try: + run_cmd(".", ["gcloud", "--version"]) + except OSError as e: + _logger.debug(e) + msg = """ + dbt requires the gcloud SDK to be installed to authenticate with BigQuery. + Please download and install the SDK, or use a Service Account instead. 
+ + https://cloud.google.com/sdk/ + """ + raise DbtRuntimeError(msg) + + run_cmd(".", ["gcloud", "auth", "application-default", "login"]) + + +def create_google_credentials(credentials: BigQueryCredentials) -> GoogleCredentials: + if credentials.impersonate_service_account: + return _create_impersonated_credentials(credentials) + return _create_google_credentials(credentials) + + +def _create_impersonated_credentials(credentials: BigQueryCredentials) -> ImpersonatedCredentials: + if credentials.scopes and isinstance(credentials.scopes, Iterable): + target_scopes = list(credentials.scopes) + else: + target_scopes = [] + + return ImpersonatedCredentials( + source_credentials=_create_google_credentials(credentials), + target_principal=credentials.impersonate_service_account, + target_scopes=target_scopes, + ) + + +def _create_google_credentials(credentials: BigQueryCredentials) -> GoogleCredentials: + + if credentials.method == _BigQueryConnectionMethod.OAUTH: + creds, _ = _create_bigquery_defaults(scopes=credentials.scopes) + + elif credentials.method == _BigQueryConnectionMethod.SERVICE_ACCOUNT: + creds = ServiceAccountCredentials.from_service_account_file( + credentials.keyfile, scopes=credentials.scopes + ) + + elif credentials.method == _BigQueryConnectionMethod.SERVICE_ACCOUNT_JSON: + details = credentials.keyfile_json + if _is_base64(details): # type:ignore + details = _base64_to_string(details) + creds = ServiceAccountCredentials.from_service_account_info( + details, scopes=credentials.scopes + ) + + elif credentials.method == _BigQueryConnectionMethod.OAUTH_SECRETS: + creds = GoogleCredentials( + token=credentials.token, + refresh_token=credentials.refresh_token, + client_id=credentials.client_id, + client_secret=credentials.client_secret, + token_uri=credentials.token_uri, + scopes=credentials.scopes, + ) + + else: + raise FailedToConnectError(f"Invalid `method` in profile: '{credentials.method}'") + + return creds + + +@lru_cache() +def _create_bigquery_defaults(scopes=None) -> Tuple[Any, Optional[str]]: + """ + Returns (credentials, project_id) + + project_id is returned available from the environment; otherwise None + """ + # Cached, because the underlying implementation shells out, taking ~1s + try: + return default(scopes=scopes) + except DefaultCredentialsError as e: + raise DbtConfigError(f"Failed to authenticate with supplied credentials\nerror:\n{e}") + + +def _is_base64(s: Union[str, bytes]) -> bool: + """ + Checks if the given string or bytes object is valid Base64 encoded. + + Args: + s: The string or bytes object to check. + + Returns: + True if the input is valid Base64, False otherwise. 
+ """ + + if isinstance(s, str): + # For strings, ensure they consist only of valid Base64 characters + if not s.isascii(): + return False + # Convert to bytes for decoding + s = s.encode("ascii") + + try: + # Use the 'validate' parameter to enforce strict Base64 decoding rules + base64.b64decode(s, validate=True) + return True + except (TypeError, binascii.Error): + return False + + +def _base64_to_string(b): + return base64.b64decode(b).decode("utf-8") diff --git a/dbt/adapters/bigquery/dataproc/__init__.py b/dbt/adapters/bigquery/dataproc/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/dbt/adapters/bigquery/dataproc/batch.py b/dbt/adapters/bigquery/dataproc/batch.py deleted file mode 100644 index 59f40d246..000000000 --- a/dbt/adapters/bigquery/dataproc/batch.py +++ /dev/null @@ -1,68 +0,0 @@ -from datetime import datetime -import time -from typing import Dict, Union - -from google.cloud.dataproc_v1 import ( - Batch, - BatchControllerClient, - CreateBatchRequest, - GetBatchRequest, -) -from google.protobuf.json_format import ParseDict - -from dbt.adapters.bigquery.credentials import DataprocBatchConfig - - -_BATCH_RUNNING_STATES = [Batch.State.PENDING, Batch.State.RUNNING] -DEFAULT_JAR_FILE_URI = "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.13-0.34.0.jar" - - -def create_batch_request( - batch: Batch, batch_id: str, project: str, region: str -) -> CreateBatchRequest: - return CreateBatchRequest( - parent=f"projects/{project}/locations/{region}", - batch_id=batch_id, - batch=batch, - ) - - -def poll_batch_job( - parent: str, batch_id: str, job_client: BatchControllerClient, timeout: int -) -> Batch: - batch_name = "".join([parent, "/batches/", batch_id]) - state = Batch.State.PENDING - response = None - run_time = 0 - while state in _BATCH_RUNNING_STATES and run_time < timeout: - time.sleep(1) - response = job_client.get_batch( - request=GetBatchRequest(name=batch_name), - ) - run_time = datetime.now().timestamp() - response.create_time.timestamp() - state = response.state - if not response: - raise ValueError("No response from Dataproc") - if state != Batch.State.SUCCEEDED: - if run_time >= timeout: - raise ValueError( - f"Operation did not complete within the designated timeout of {timeout} seconds." - ) - else: - raise ValueError(response.state_message) - return response - - -def update_batch_from_config(config_dict: Union[Dict, DataprocBatchConfig], target: Batch): - try: - # updates in place - ParseDict(config_dict, target._pb) - except Exception as e: - docurl = ( - "https://cloud.google.com/dataproc-serverless/docs/reference/rpc/google.cloud.dataproc.v1" - "#google.cloud.dataproc.v1.Batch" - ) - raise ValueError( - f"Unable to parse dataproc_batch as valid batch specification. See {docurl}. 
{str(e)}" - ) from e - return target diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py index f6470e7f7..51c457129 100644 --- a/dbt/adapters/bigquery/impl.py +++ b/dbt/adapters/bigquery/impl.py @@ -1,9 +1,7 @@ from dataclasses import dataclass from datetime import datetime -import json from multiprocessing.context import SpawnContext import threading -import time from typing import ( Any, Dict, @@ -22,7 +20,7 @@ import google.auth import google.oauth2 import google.cloud.bigquery -from google.cloud.bigquery import AccessEntry, SchemaField, Table as BigQueryTable +from google.cloud.bigquery import AccessEntry, Client, SchemaField, Table as BigQueryTable import google.cloud.exceptions import pytz @@ -454,22 +452,6 @@ def get_columns_in_select_sql(self, select_sql: str) -> List[BigQueryColumn]: logger.debug("get_columns_in_select_sql error: {}".format(e)) return [] - @classmethod - def poll_until_job_completes(cls, job, timeout): - retry_count = timeout - - while retry_count > 0 and job.state != "DONE": - retry_count -= 1 - time.sleep(1) - job.reload() - - if job.state != "DONE": - raise dbt_common.exceptions.DbtRuntimeError("BigQuery Timeout Exceeded") - - elif job.error_result: - message = "\n".join(error["message"].strip() for error in job.errors) - raise dbt_common.exceptions.DbtRuntimeError(message) - def _bq_table_to_relation(self, bq_table) -> Union[BigQueryRelation, None]: if bq_table is None: return None @@ -669,55 +651,50 @@ def alter_table_add_columns(self, relation, columns): @available.parse_none def load_dataframe( self, - database, - schema, - table_name, + database: str, + schema: str, + table_name: str, agate_table: "agate.Table", - column_override, - field_delimiter, - ): - bq_schema = self._agate_to_schema(agate_table, column_override) - conn = self.connections.get_thread_connection() - client = conn.handle - - table_ref = self.connections.table_ref(database, schema, table_name) - - load_config = google.cloud.bigquery.LoadJobConfig() - load_config.skip_leading_rows = 1 - load_config.schema = bq_schema - load_config.field_delimiter = field_delimiter - job_id = self.connections.generate_job_id() - with open(agate_table.original_abspath, "rb") as f: # type: ignore - job = client.load_table_from_file( - f, table_ref, rewind=True, job_config=load_config, job_id=job_id - ) - - timeout = self.connections.get_job_execution_timeout_seconds(conn) or 300 - with self.connections.exception_handler("LOAD TABLE"): - self.poll_until_job_completes(job, timeout) + column_override: Dict[str, str], + field_delimiter: str, + ) -> None: + connection = self.connections.get_thread_connection() + client: Client = connection.handle + table_schema = self._agate_to_schema(agate_table, column_override) + file_path = agate_table.original_abspath # type: ignore + + self.connections.write_dataframe_to_table( + client, + file_path, + database, + schema, + table_name, + table_schema, + field_delimiter, + fallback_timeout=300, + ) @available.parse_none def upload_file( - self, local_file_path: str, database: str, table_schema: str, table_name: str, **kwargs + self, + local_file_path: str, + database: str, + table_schema: str, + table_name: str, + **kwargs, ) -> None: - conn = self.connections.get_thread_connection() - client = conn.handle - - table_ref = self.connections.table_ref(database, table_schema, table_name) - - load_config = google.cloud.bigquery.LoadJobConfig() - for k, v in kwargs["kwargs"].items(): - if k == "schema": - setattr(load_config, k, json.loads(v)) - else: - 
setattr(load_config, k, v) - - with open(local_file_path, "rb") as f: - job = client.load_table_from_file(f, table_ref, rewind=True, job_config=load_config) - - timeout = self.connections.get_job_execution_timeout_seconds(conn) or 300 - with self.connections.exception_handler("LOAD TABLE"): - self.poll_until_job_completes(job, timeout) + connection = self.connections.get_thread_connection() + client: Client = connection.handle + + self.connections.write_file_to_table( + client, + local_file_path, + database, + table_schema, + table_name, + fallback_timeout=300, + **kwargs, + ) @classmethod def _catalog_filter_table( @@ -753,7 +730,7 @@ def calculate_freshness_from_metadata( macro_resolver: Optional[MacroResolverProtocol] = None, ) -> Tuple[Optional[AdapterResponse], FreshnessResponse]: conn = self.connections.get_thread_connection() - client: google.cloud.bigquery.Client = conn.handle + client: Client = conn.handle table_ref = self.get_table_ref_from_relation(source) table = client.get_table(table_ref) diff --git a/dbt/adapters/bigquery/python_submissions.py b/dbt/adapters/bigquery/python_submissions.py index 93c82ca92..cd7f7d86f 100644 --- a/dbt/adapters/bigquery/python_submissions.py +++ b/dbt/adapters/bigquery/python_submissions.py @@ -1,187 +1,165 @@ -import uuid from typing import Dict, Union +import uuid -from google.api_core import retry -from google.api_core.client_options import ClientOptions -from google.api_core.future.polling import POLLING_PREDICATE -from google.cloud import storage, dataproc_v1 -from google.cloud.dataproc_v1.types.batches import Batch +from google.cloud.dataproc_v1 import Batch, CreateBatchRequest, Job, RuntimeConfig from dbt.adapters.base import PythonJobHelper from dbt.adapters.events.logging import AdapterLogger +from google.protobuf.json_format import ParseDict -from dbt.adapters.bigquery.connections import BigQueryConnectionManager -from dbt.adapters.bigquery.credentials import BigQueryCredentials -from dbt.adapters.bigquery.dataproc.batch import ( - DEFAULT_JAR_FILE_URI, - create_batch_request, - poll_batch_job, - update_batch_from_config, +from dbt.adapters.bigquery.credentials import BigQueryCredentials, DataprocBatchConfig +from dbt.adapters.bigquery.clients import ( + create_dataproc_batch_controller_client, + create_dataproc_job_controller_client, + create_gcs_client, ) +from dbt.adapters.bigquery.retry import RetryFactory + -OPERATION_RETRY_TIME = 10 -logger = AdapterLogger("BigQuery") +_logger = AdapterLogger("BigQuery") -class BaseDataProcHelper(PythonJobHelper): - def __init__(self, parsed_model: Dict, credential: BigQueryCredentials) -> None: - """_summary_ +_DEFAULT_JAR_FILE_URI = "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.13-0.34.0.jar" - Args: - credential (_type_): _description_ - """ + +class _BaseDataProcHelper(PythonJobHelper): + def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: # validate all additional stuff for python is set - schema = parsed_model["schema"] - identifier = parsed_model["alias"] - self.parsed_model = parsed_model - python_required_configs = [ - "dataproc_region", - "gcs_bucket", - ] - for required_config in python_required_configs: - if not getattr(credential, required_config): + for required_config in ["dataproc_region", "gcs_bucket"]: + if not getattr(credentials, required_config): raise ValueError( f"Need to supply {required_config} in profile to submit python job" ) - self.model_file_name = f"{schema}/{identifier}.py" - self.credential = credential - 
self.GoogleCredentials = BigQueryConnectionManager.get_credentials(credential) - self.storage_client = storage.Client( - project=self.credential.execution_project, credentials=self.GoogleCredentials - ) - self.gcs_location = "gs://{}/{}".format(self.credential.gcs_bucket, self.model_file_name) + + self._storage_client = create_gcs_client(credentials) + self._project = credentials.execution_project + self._region = credentials.dataproc_region + + schema = parsed_model["schema"] + identifier = parsed_model["alias"] + self._model_file_name = f"{schema}/{identifier}.py" + self._gcs_bucket = credentials.gcs_bucket + self._gcs_path = f"gs://{credentials.gcs_bucket}/{self._model_file_name}" # set retry policy, default to timeout after 24 hours - self.timeout = self.parsed_model["config"].get( - "timeout", self.credential.job_execution_timeout_seconds or 60 * 60 * 24 - ) - self.result_polling_policy = retry.Retry( - predicate=POLLING_PREDICATE, maximum=10.0, timeout=self.timeout - ) - self.client_options = ClientOptions( - api_endpoint="{}-dataproc.googleapis.com:443".format(self.credential.dataproc_region) + retry = RetryFactory(credentials) + self._polling_retry = retry.create_polling( + model_timeout=parsed_model["config"].get("timeout") ) - self.job_client = self._get_job_client() - def _upload_to_gcs(self, filename: str, compiled_code: str) -> None: - bucket = self.storage_client.get_bucket(self.credential.gcs_bucket) - blob = bucket.blob(filename) + def _write_to_gcs(self, compiled_code: str) -> None: + bucket = self._storage_client.get_bucket(self._gcs_bucket) + blob = bucket.blob(self._model_file_name) blob.upload_from_string(compiled_code) - def submit(self, compiled_code: str) -> dataproc_v1.types.jobs.Job: - # upload python file to GCS - self._upload_to_gcs(self.model_file_name, compiled_code) - # submit dataproc job - return self._submit_dataproc_job() - - def _get_job_client( - self, - ) -> Union[dataproc_v1.JobControllerClient, dataproc_v1.BatchControllerClient]: - raise NotImplementedError("_get_job_client not implemented") - - def _submit_dataproc_job(self) -> dataproc_v1.types.jobs.Job: - raise NotImplementedError("_submit_dataproc_job not implemented") +class ClusterDataprocHelper(_BaseDataProcHelper): + def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: + super().__init__(parsed_model, credentials) + self._job_controller_client = create_dataproc_job_controller_client(credentials) + self._cluster_name = parsed_model["config"].get( + "dataproc_cluster_name", credentials.dataproc_cluster_name + ) -class ClusterDataprocHelper(BaseDataProcHelper): - def _get_job_client(self) -> dataproc_v1.JobControllerClient: - if not self._get_cluster_name(): + if not self._cluster_name: raise ValueError( "Need to supply dataproc_cluster_name in profile or config to submit python job with cluster submission method" ) - return dataproc_v1.JobControllerClient( - client_options=self.client_options, credentials=self.GoogleCredentials - ) - def _get_cluster_name(self) -> str: - return self.parsed_model["config"].get( - "dataproc_cluster_name", self.credential.dataproc_cluster_name - ) + def submit(self, compiled_code: str) -> Job: + _logger.debug(f"Submitting cluster job to: {self._cluster_name}") - def _submit_dataproc_job(self) -> dataproc_v1.types.jobs.Job: - job = { - "placement": {"cluster_name": self._get_cluster_name()}, - "pyspark_job": { - "main_python_file_uri": self.gcs_location, + self._write_to_gcs(compiled_code) + + request = { + "project_id": self._project, 
+ "region": self._region, + "job": { + "placement": {"cluster_name": self._cluster_name}, + "pyspark_job": { + "main_python_file_uri": self._gcs_path, + }, }, } - operation = self.job_client.submit_job_as_operation( - request={ - "project_id": self.credential.execution_project, - "region": self.credential.dataproc_region, - "job": job, - } - ) - # check if job failed - response = operation.result(polling=self.result_polling_policy) + + # submit the job + operation = self._job_controller_client.submit_job_as_operation(request) + + # wait for the job to complete + response: Job = operation.result(polling=self._polling_retry) + if response.status.state == 6: raise ValueError(response.status.details) + return response -class ServerlessDataProcHelper(BaseDataProcHelper): - def _get_job_client(self) -> dataproc_v1.BatchControllerClient: - return dataproc_v1.BatchControllerClient( - client_options=self.client_options, credentials=self.GoogleCredentials - ) +class ServerlessDataProcHelper(_BaseDataProcHelper): + def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: + super().__init__(parsed_model, credentials) + self._batch_controller_client = create_dataproc_batch_controller_client(credentials) + self._batch_id = parsed_model["config"].get("batch_id", str(uuid.uuid4())) + self._jar_file_uri = parsed_model["config"].get("jar_file_uri", _DEFAULT_JAR_FILE_URI) + self._dataproc_batch = credentials.dataproc_batch - def _get_batch_id(self) -> str: - model = self.parsed_model - default_batch_id = str(uuid.uuid4()) - return model["config"].get("batch_id", default_batch_id) - - def _submit_dataproc_job(self) -> Batch: - batch_id = self._get_batch_id() - logger.info(f"Submitting batch job with id: {batch_id}") - request = create_batch_request( - batch=self._configure_batch(), - batch_id=batch_id, - region=self.credential.dataproc_region, # type: ignore - project=self.credential.execution_project, # type: ignore - ) - # make the request - self.job_client.create_batch(request=request) - return poll_batch_job( - parent=request.parent, - batch_id=batch_id, - job_client=self.job_client, - timeout=self.timeout, + def submit(self, compiled_code: str) -> Batch: + _logger.debug(f"Submitting batch job with id: {self._batch_id}") + + self._write_to_gcs(compiled_code) + + request = CreateBatchRequest( + parent=f"projects/{self._project}/locations/{self._region}", + batch=self._create_batch(), + batch_id=self._batch_id, ) - # there might be useful results here that we can parse and return - # Dataproc job output is saved to the Cloud Storage bucket - # allocated to the job. Use regex to obtain the bucket and blob info. 
- # matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri) - # output = ( - # self.storage_client - # .get_bucket(matches.group(1)) - # .blob(f"{matches.group(2)}.000000000") - # .download_as_string() - # ) - - def _configure_batch(self): + + # submit the batch + operation = self._batch_controller_client.create_batch(request) + + # wait for the batch to complete + response: Batch = operation.result(polling=self._polling_retry) + + return response + + def _create_batch(self) -> Batch: # create the Dataproc Serverless job config # need to pin dataproc version to 1.1 as it now defaults to 2.0 # https://cloud.google.com/dataproc-serverless/docs/concepts/properties # https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#runtimeconfig - batch = dataproc_v1.Batch( + batch = Batch( { - "runtime_config": dataproc_v1.RuntimeConfig( + "runtime_config": RuntimeConfig( version="1.1", properties={ "spark.executor.instances": "2", }, - ) + ), + "pyspark_batch": { + "main_python_file_uri": self._gcs_path, + "jar_file_uris": [self._jar_file_uri], + }, } ) - # Apply defaults - batch.pyspark_batch.main_python_file_uri = self.gcs_location - jar_file_uri = self.parsed_model["config"].get( - "jar_file_uri", - DEFAULT_JAR_FILE_URI, - ) - batch.pyspark_batch.jar_file_uris = [jar_file_uri] # Apply configuration from dataproc_batch key, possibly overriding defaults. - if self.credential.dataproc_batch: - batch = update_batch_from_config(self.credential.dataproc_batch, batch) + if self._dataproc_batch: + batch = _update_batch_from_config(self._dataproc_batch, batch) + return batch + + +def _update_batch_from_config( + config_dict: Union[Dict, DataprocBatchConfig], target: Batch +) -> Batch: + try: + # updates in place + ParseDict(config_dict, target._pb) + except Exception as e: + docurl = ( + "https://cloud.google.com/dataproc-serverless/docs/reference/rpc/google.cloud.dataproc.v1" + "#google.cloud.dataproc.v1.Batch" + ) + raise ValueError( + f"Unable to parse dataproc_batch as valid batch specification. See {docurl}. 
{str(e)}" + ) from e + return target diff --git a/dbt/adapters/bigquery/retry.py b/dbt/adapters/bigquery/retry.py new file mode 100644 index 000000000..391c00e46 --- /dev/null +++ b/dbt/adapters/bigquery/retry.py @@ -0,0 +1,128 @@ +from typing import Callable, Optional + +from google.api_core.exceptions import Forbidden +from google.api_core.future.polling import DEFAULT_POLLING +from google.api_core.retry import Retry +from google.cloud.bigquery.retry import DEFAULT_RETRY +from google.cloud.exceptions import BadGateway, BadRequest, ServerError +from requests.exceptions import ConnectionError + +from dbt.adapters.contracts.connection import Connection, ConnectionState +from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.exceptions.connection import FailedToConnectError + +from dbt.adapters.bigquery.clients import create_bigquery_client +from dbt.adapters.bigquery.credentials import BigQueryCredentials + + +_logger = AdapterLogger("BigQuery") + + +_SECOND = 1.0 +_MINUTE = 60 * _SECOND +_HOUR = 60 * _MINUTE +_DAY = 24 * _HOUR +_DEFAULT_INITIAL_DELAY = _SECOND +_DEFAULT_MAXIMUM_DELAY = 3 * _SECOND +_DEFAULT_POLLING_MAXIMUM_DELAY = 10 * _SECOND + + +class RetryFactory: + + def __init__(self, credentials: BigQueryCredentials) -> None: + self._retries = credentials.job_retries or 0 + self._job_creation_timeout = credentials.job_creation_timeout_seconds + self._job_execution_timeout = credentials.job_execution_timeout_seconds + self._job_deadline = credentials.job_retry_deadline_seconds + + def create_job_creation_timeout(self, fallback: float = _MINUTE) -> float: + return ( + self._job_creation_timeout or fallback + ) # keep _MINUTE here so it's not overridden by passing fallback=None + + def create_job_execution_timeout(self, fallback: float = _DAY) -> float: + return ( + self._job_execution_timeout or fallback + ) # keep _DAY here so it's not overridden by passing fallback=None + + def create_retry(self, fallback: Optional[float] = None) -> Retry: + return DEFAULT_RETRY.with_timeout(self._job_execution_timeout or fallback or _DAY) + + def create_polling(self, model_timeout: Optional[float] = None) -> Retry: + return DEFAULT_POLLING.with_timeout(model_timeout or self._job_execution_timeout or _DAY) + + def create_reopen_with_deadline(self, connection: Connection) -> Retry: + """ + This strategy mimics what was accomplished with _retry_and_handle + """ + return Retry( + predicate=_DeferredException(self._retries), + initial=_DEFAULT_INITIAL_DELAY, + maximum=_DEFAULT_MAXIMUM_DELAY, + deadline=self._job_deadline, + on_error=_create_reopen_on_error(connection), + ) + + +class _DeferredException: + """ + Count ALL errors, not just retryable errors, up to a threshold. + Raise the next error, regardless of whether it is retryable. 
+ """ + + def __init__(self, retries: int) -> None: + self._retries: int = retries + self._error_count = 0 + + def __call__(self, error: Exception) -> bool: + # exit immediately if the user does not want retries + if self._retries == 0: + return False + + # count all errors + self._error_count += 1 + + # if the error is retryable, and we haven't breached the threshold, log and continue + if _is_retryable(error) and self._error_count <= self._retries: + _logger.debug( + f"Retry attempt {self._error_count} of {self._retries} after error: {repr(error)}" + ) + return True + + # otherwise raise + return False + + +def _create_reopen_on_error(connection: Connection) -> Callable[[Exception], None]: + + def on_error(error: Exception): + if isinstance(error, (ConnectionResetError, ConnectionError)): + _logger.warning("Reopening connection after {!r}".format(error)) + connection.handle.close() + + try: + connection.handle = create_bigquery_client(connection.credentials) + connection.state = ConnectionState.OPEN + + except Exception as e: + _logger.debug( + f"""Got an error when attempting to create a bigquery " "client: '{e}'""" + ) + connection.handle = None + connection.state = ConnectionState.FAIL + raise FailedToConnectError(str(e)) + + return on_error + + +def _is_retryable(error: Exception) -> bool: + """Return true for errors that are unlikely to occur again if retried.""" + if isinstance( + error, (BadGateway, BadRequest, ConnectionError, ConnectionResetError, ServerError) + ): + return True + elif isinstance(error, Forbidden) and any( + e["reason"] == "rateLimitExceeded" for e in error.errors + ): + return True + return False diff --git a/dbt/adapters/bigquery/utility.py b/dbt/adapters/bigquery/utility.py index 557986b38..5914280a3 100644 --- a/dbt/adapters/bigquery/utility.py +++ b/dbt/adapters/bigquery/utility.py @@ -1,7 +1,5 @@ -import base64 -import binascii import json -from typing import Any, Optional, Union +from typing import Any, Optional import dbt_common.exceptions @@ -45,39 +43,3 @@ def sql_escape(string): if not isinstance(string, str): raise dbt_common.exceptions.CompilationError(f"cannot escape a non-string: {string}") return json.dumps(string)[1:-1] - - -def is_base64(s: Union[str, bytes]) -> bool: - """ - Checks if the given string or bytes object is valid Base64 encoded. - - Args: - s: The string or bytes object to check. - - Returns: - True if the input is valid Base64, False otherwise. 
- """ - - if isinstance(s, str): - # For strings, ensure they consist only of valid Base64 characters - if not s.isascii(): - return False - # Convert to bytes for decoding - s = s.encode("ascii") - - try: - # Use the 'validate' parameter to enforce strict Base64 decoding rules - base64.b64decode(s, validate=True) - return True - except TypeError: - return False - except binascii.Error: # Catch specific errors from the base64 module - return False - - -def base64_to_string(b): - return base64.b64decode(b).decode("utf-8") - - -def string_to_base64(s): - return base64.b64encode(s.encode("utf-8")) diff --git a/tests/conftest.py b/tests/conftest.py index 6dc9e6443..33f7f9d17 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,9 @@ import pytest import os import json -from dbt.adapters.bigquery.utility import is_base64, base64_to_string +from dbt.adapters.bigquery.credentials import _is_base64, _base64_to_string -# Import the fuctional fixtures as a plugin +# Import the functional fixtures as a plugin # Note: fixtures with session scope need to be local pytest_plugins = ["dbt.tests.fixtures.project"] @@ -39,8 +39,8 @@ def oauth_target(): def service_account_target(): credentials_json_str = os.getenv("BIGQUERY_TEST_SERVICE_ACCOUNT_JSON").replace("'", "") - if is_base64(credentials_json_str): - credentials_json_str = base64_to_string(credentials_json_str) + if _is_base64(credentials_json_str): + credentials_json_str = _base64_to_string(credentials_json_str) credentials = json.loads(credentials_json_str) project_id = credentials.get("project_id") return { diff --git a/tests/functional/adapter/test_json_keyfile.py b/tests/functional/adapter/test_json_keyfile.py index 91e41a3f1..a5caaebdf 100644 --- a/tests/functional/adapter/test_json_keyfile.py +++ b/tests/functional/adapter/test_json_keyfile.py @@ -1,6 +1,11 @@ +import base64 import json import pytest -from dbt.adapters.bigquery.utility import string_to_base64, is_base64 +from dbt.adapters.bigquery.credentials import _is_base64 + + +def string_to_base64(s): + return base64.b64encode(s.encode("utf-8")) @pytest.fixture @@ -53,7 +58,7 @@ def test_valid_base64_strings(example_json_keyfile_b64): ] for s in valid_strings: - assert is_base64(s) is True + assert _is_base64(s) is True def test_valid_base64_bytes(example_json_keyfile_b64): @@ -65,7 +70,7 @@ def test_valid_base64_bytes(example_json_keyfile_b64): example_json_keyfile_b64, ] for s in valid_bytes: - assert is_base64(s) is True + assert _is_base64(s) is True def test_invalid_base64(example_json_keyfile): @@ -79,4 +84,4 @@ def test_invalid_base64(example_json_keyfile): example_json_keyfile, ] for s in invalid_inputs: - assert is_base64(s) is False + assert _is_base64(s) is False diff --git a/tests/unit/test_bigquery_adapter.py b/tests/unit/test_bigquery_adapter.py index ca3bfc24c..e57db9a62 100644 --- a/tests/unit/test_bigquery_adapter.py +++ b/tests/unit/test_bigquery_adapter.py @@ -203,7 +203,7 @@ def get_adapter(self, target) -> BigQueryAdapter: class TestBigQueryAdapterAcquire(BaseTestBigQueryAdapter): @patch( - "dbt.adapters.bigquery.credentials.get_bigquery_defaults", + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", return_value=("credentials", "project_id"), ) @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) @@ -244,10 +244,12 @@ def test_acquire_connection_oauth_validations(self, mock_open_connection): mock_open_connection.assert_called_once() @patch( - "dbt.adapters.bigquery.credentials.get_bigquery_defaults", + 
"dbt.adapters.bigquery.credentials._create_bigquery_defaults", return_value=("credentials", "project_id"), ) - @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + @patch( + "dbt.adapters.bigquery.connections.BigQueryConnectionManager.open", return_value=_bq_conn() + ) def test_acquire_connection_dataproc_serverless( self, mock_open_connection, mock_get_bigquery_defaults ): @@ -386,21 +388,20 @@ def test_cancel_open_connections_single(self): adapter.connections.thread_connections.update({key: master, 1: model}) self.assertEqual(len(list(adapter.cancel_open_connections())), 1) - @patch("dbt.adapters.bigquery.impl.google.api_core.client_options.ClientOptions") - @patch("dbt.adapters.bigquery.impl.google.auth.default") - @patch("dbt.adapters.bigquery.impl.google.cloud.bigquery") - def test_location_user_agent(self, mock_bq, mock_auth_default, MockClientOptions): + @patch("dbt.adapters.bigquery.clients.ClientOptions") + @patch("dbt.adapters.bigquery.credentials.default") + @patch("dbt.adapters.bigquery.clients.BigQueryClient") + def test_location_user_agent(self, MockClient, mock_auth_default, MockClientOptions): creds = MagicMock() mock_auth_default.return_value = (creds, MagicMock()) adapter = self.get_adapter("loc") connection = adapter.acquire_connection("dummy") - mock_client = mock_bq.Client mock_client_options = MockClientOptions.return_value - mock_client.assert_not_called() + MockClient.assert_not_called() connection.handle - mock_client.assert_called_once_with( + MockClient.assert_called_once_with( "dbt-unit-000000", creds, location="Luna Station", diff --git a/tests/unit/test_bigquery_connection_manager.py b/tests/unit/test_bigquery_connection_manager.py index 1c14100f6..d4c95792e 100644 --- a/tests/unit/test_bigquery_connection_manager.py +++ b/tests/unit/test_bigquery_connection_manager.py @@ -1,81 +1,59 @@ import json import unittest -from contextlib import contextmanager from requests.exceptions import ConnectionError from unittest.mock import patch, MagicMock, Mock, ANY import dbt.adapters +import google.cloud.bigquery from dbt.adapters.bigquery import BigQueryCredentials from dbt.adapters.bigquery import BigQueryRelation from dbt.adapters.bigquery.connections import BigQueryConnectionManager +from dbt.adapters.bigquery.retry import RetryFactory class TestBigQueryConnectionManager(unittest.TestCase): def setUp(self): - credentials = Mock(BigQueryCredentials) - profile = Mock(query_comment=None, credentials=credentials) - self.connections = BigQueryConnectionManager(profile=profile, mp_context=Mock()) + self.credentials = Mock(BigQueryCredentials) + self.credentials.method = "oauth" + self.credentials.job_retries = 1 + self.credentials.job_retry_deadline_seconds = 1 + self.credentials.scopes = tuple() - self.mock_client = Mock(dbt.adapters.bigquery.impl.google.cloud.bigquery.Client) - self.mock_connection = MagicMock() + self.mock_client = Mock(google.cloud.bigquery.Client) + self.mock_connection = MagicMock() self.mock_connection.handle = self.mock_client + self.mock_connection.credentials = self.credentials + self.connections = BigQueryConnectionManager( + profile=Mock(credentials=self.credentials, query_comment=None), + mp_context=Mock(), + ) self.connections.get_thread_connection = lambda: self.mock_connection - self.connections.get_job_retry_deadline_seconds = lambda x: None - self.connections.get_job_retries = lambda x: 1 - - @patch("dbt.adapters.bigquery.connections._is_retryable", return_value=True) - def test_retry_and_handle(self, 
is_retryable): - self.connections.DEFAULT_MAXIMUM_DELAY = 2.0 - - @contextmanager - def dummy_handler(msg): - yield - - self.connections.exception_handler = dummy_handler - - class DummyException(Exception): - """Count how many times this exception is raised""" - - count = 0 - def __init__(self): - DummyException.count += 1 + @patch( + "dbt.adapters.bigquery.retry.create_bigquery_client", + return_value=Mock(google.cloud.bigquery.Client), + ) + def test_retry_connection_reset(self, mock_client_factory): + new_mock_client = mock_client_factory.return_value - def raiseDummyException(): - raise DummyException() + @self.connections._retry.create_reopen_with_deadline(self.mock_connection) + def generate_connection_reset_error(): + raise ConnectionResetError - with self.assertRaises(DummyException): - self.connections._retry_and_handle( - "some sql", Mock(credentials=Mock(retries=8)), raiseDummyException - ) - self.assertEqual(DummyException.count, 9) + assert self.mock_connection.handle is self.mock_client - @patch("dbt.adapters.bigquery.connections._is_retryable", return_value=True) - def test_retry_connection_reset(self, is_retryable): - self.connections.open = MagicMock() - self.connections.close = MagicMock() - self.connections.DEFAULT_MAXIMUM_DELAY = 2.0 - - @contextmanager - def dummy_handler(msg): - yield - - self.connections.exception_handler = dummy_handler - - def raiseConnectionResetError(): - raise ConnectionResetError("Connection broke") - - mock_conn = Mock(credentials=Mock(retries=1)) with self.assertRaises(ConnectionResetError): - self.connections._retry_and_handle("some sql", mock_conn, raiseConnectionResetError) - self.connections.close.assert_called_once_with(mock_conn) - self.connections.open.assert_called_once_with(mock_conn) + # this will always raise the error, we just want to test that the connection was reopening in between + generate_connection_reset_error() + + assert self.mock_connection.handle is new_mock_client + assert new_mock_client is not self.mock_client def test_is_retryable(self): - _is_retryable = dbt.adapters.bigquery.connections._is_retryable + _is_retryable = dbt.adapters.bigquery.retry._is_retryable exceptions = dbt.adapters.bigquery.impl.google.cloud.exceptions internal_server_error = exceptions.InternalServerError("code broke") bad_request_error = exceptions.BadRequest("code broke") @@ -104,29 +82,30 @@ def test_drop_dataset(self): self.mock_client.delete_table.assert_not_called() self.mock_client.delete_dataset.assert_called_once() - @patch("dbt.adapters.bigquery.impl.google.cloud.bigquery") - def test_query_and_results(self, mock_bq): + @patch("dbt.adapters.bigquery.connections.QueryJobConfig") + def test_query_and_results(self, MockQueryJobConfig): self.connections._query_and_results( - self.mock_client, + self.mock_connection, "sql", - {"job_param_1": "blah"}, + {"dry_run": True}, job_id=1, - job_creation_timeout=15, - job_execution_timeout=100, ) - mock_bq.QueryJobConfig.assert_called_once() + MockQueryJobConfig.assert_called_once() self.mock_client.query.assert_called_once_with( - query="sql", job_config=mock_bq.QueryJobConfig(), job_id=1, timeout=15 + query="sql", + job_config=MockQueryJobConfig(), + job_id=1, + timeout=self.credentials.job_creation_timeout_seconds, ) def test_copy_bq_table_appends(self): self._copy_table(write_disposition=dbt.adapters.bigquery.impl.WRITE_APPEND) - args, kwargs = self.mock_client.copy_table.call_args self.mock_client.copy_table.assert_called_once_with( [self._table_ref("project", "dataset", "table1")], 
self._table_ref("project", "dataset", "table2"), job_config=ANY, + retry=ANY, ) args, kwargs = self.mock_client.copy_table.call_args self.assertEqual( @@ -140,6 +119,7 @@ def test_copy_bq_table_truncates(self): [self._table_ref("project", "dataset", "table1")], self._table_ref("project", "dataset", "table2"), job_config=ANY, + retry=ANY, ) args, kwargs = self.mock_client.copy_table.call_args self.assertEqual( @@ -161,7 +141,7 @@ def test_list_dataset_correctly_calls_lists_datasets(self): self.mock_client.list_datasets = mock_list_dataset result = self.connections.list_dataset("project") self.mock_client.list_datasets.assert_called_once_with( - project="project", max_results=10000 + project="project", max_results=10000, retry=ANY ) assert result == ["d1"] diff --git a/tests/unit/test_configure_dataproc_batch.py b/tests/unit/test_configure_dataproc_batch.py index f56aee129..6e5757589 100644 --- a/tests/unit/test_configure_dataproc_batch.py +++ b/tests/unit/test_configure_dataproc_batch.py @@ -1,6 +1,6 @@ from unittest.mock import patch -from dbt.adapters.bigquery.dataproc.batch import update_batch_from_config +from dbt.adapters.bigquery.python_submissions import _update_batch_from_config from google.cloud import dataproc_v1 from .test_bigquery_adapter import BaseTestBigQueryAdapter @@ -12,7 +12,7 @@ # parsed credentials class TestConfigureDataprocBatch(BaseTestBigQueryAdapter): @patch( - "dbt.adapters.bigquery.credentials.get_bigquery_defaults", + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", return_value=("credentials", "project_id"), ) def test_update_dataproc_serverless_batch(self, mock_get_bigquery_defaults): @@ -39,7 +39,7 @@ def test_update_dataproc_serverless_batch(self, mock_get_bigquery_defaults): batch = dataproc_v1.Batch() - batch = update_batch_from_config(raw_batch_config, batch) + batch = _update_batch_from_config(raw_batch_config, batch) def to_str_values(d): """google's protobuf types expose maps as dict[str, str]""" @@ -64,7 +64,7 @@ def to_str_values(d): ) @patch( - "dbt.adapters.bigquery.credentials.get_bigquery_defaults", + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", return_value=("credentials", "project_id"), ) def test_default_dataproc_serverless_batch(self, mock_get_bigquery_defaults): From 62695c9322f71339fd0cc839761643244d4d7ad1 Mon Sep 17 00:00:00 2001 From: FishtownBuildBot <77737458+FishtownBuildBot@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:58:59 -0500 Subject: [PATCH 04/11] Cleanup main after cutting new 1.9.latest branch (#1419) * Clean up changelog on main * Bumping version to 1.10.0a1 * Code quality cleanup --- .bumpversion.cfg | 2 +- .changes/1.9.0-b1.md | 44 ------------------ .../1.9.0/Dependencies-20231211-001048.yaml | 6 --- .../1.9.0/Dependencies-20231220-002130.yaml | 6 --- .../1.9.0/Dependencies-20231222-002351.yaml | 6 --- .../1.9.0/Dependencies-20240105-004800.yaml | 6 --- .../1.9.0/Dependencies-20240429-005158.yaml | 6 --- .../1.9.0/Dependencies-20240429-005159.yaml | 6 --- .../1.9.0/Dependencies-20240520-230208.yaml | 6 --- .../1.9.0/Dependencies-20240718-005755.yaml | 6 --- .../1.9.0/Dependencies-20240718-005756.yaml | 6 --- .../1.9.0/Dependencies-20240718-005757.yaml | 6 --- .../1.9.0/Dependencies-20240719-003740.yaml | 6 --- .changes/1.9.0/Features-20240426-105319.yaml | 7 --- .changes/1.9.0/Features-20240430-185650.yaml | 6 --- .changes/1.9.0/Features-20240501-151902.yaml | 6 --- .changes/1.9.0/Features-20240516-125735.yaml | 6 --- .changes/1.9.0/Features-20240730-135911.yaml | 6 --- 
.changes/1.9.0/Features-20240925-232238.yaml | 6 --- .changes/1.9.0/Fixes-20240120-180818.yaml | 6 --- .changes/1.9.0/Fixes-20240201-145323.yaml | 6 --- .changes/1.9.0/Fixes-20240226-233024.yaml | 6 --- .changes/1.9.0/Fixes-20240426-105224.yaml | 7 --- .changes/1.9.0/Fixes-20241001-193207.yaml | 7 --- .../1.9.0/Under the Hood-20240331-101418.yaml | 6 --- .../1.9.0/Under the Hood-20240718-193206.yaml | 6 --- .../Breaking Changes-20241016-185117.yaml | 6 --- .../Dependencies-20240724-040744.yaml | 6 --- .../unreleased/Features-20240505-011838.yaml | 6 --- .../unreleased/Features-20240911-234859.yaml | 6 --- .../unreleased/Fixes-20241028-172719.yaml | 6 --- .../Under the Hood-20240910-212052.yaml | 6 --- .../Under the Hood-20241104-173815.yaml | 7 --- .../Under the Hood-20241107-143856.yaml | 6 --- CHANGELOG.md | 46 ------------------- dbt/adapters/bigquery/__version__.py | 2 +- 36 files changed, 2 insertions(+), 288 deletions(-) delete mode 100644 .changes/1.9.0-b1.md delete mode 100644 .changes/1.9.0/Dependencies-20231211-001048.yaml delete mode 100644 .changes/1.9.0/Dependencies-20231220-002130.yaml delete mode 100644 .changes/1.9.0/Dependencies-20231222-002351.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240105-004800.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240429-005158.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240429-005159.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240520-230208.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240718-005755.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240718-005756.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240718-005757.yaml delete mode 100644 .changes/1.9.0/Dependencies-20240719-003740.yaml delete mode 100644 .changes/1.9.0/Features-20240426-105319.yaml delete mode 100644 .changes/1.9.0/Features-20240430-185650.yaml delete mode 100644 .changes/1.9.0/Features-20240501-151902.yaml delete mode 100644 .changes/1.9.0/Features-20240516-125735.yaml delete mode 100644 .changes/1.9.0/Features-20240730-135911.yaml delete mode 100644 .changes/1.9.0/Features-20240925-232238.yaml delete mode 100644 .changes/1.9.0/Fixes-20240120-180818.yaml delete mode 100644 .changes/1.9.0/Fixes-20240201-145323.yaml delete mode 100644 .changes/1.9.0/Fixes-20240226-233024.yaml delete mode 100644 .changes/1.9.0/Fixes-20240426-105224.yaml delete mode 100644 .changes/1.9.0/Fixes-20241001-193207.yaml delete mode 100644 .changes/1.9.0/Under the Hood-20240331-101418.yaml delete mode 100644 .changes/1.9.0/Under the Hood-20240718-193206.yaml delete mode 100644 .changes/unreleased/Breaking Changes-20241016-185117.yaml delete mode 100644 .changes/unreleased/Dependencies-20240724-040744.yaml delete mode 100644 .changes/unreleased/Features-20240505-011838.yaml delete mode 100644 .changes/unreleased/Features-20240911-234859.yaml delete mode 100644 .changes/unreleased/Fixes-20241028-172719.yaml delete mode 100644 .changes/unreleased/Under the Hood-20240910-212052.yaml delete mode 100644 .changes/unreleased/Under the Hood-20241104-173815.yaml delete mode 100644 .changes/unreleased/Under the Hood-20241107-143856.yaml diff --git a/.bumpversion.cfg b/.bumpversion.cfg index bd9430cbe..b56a8ee4f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.9.0b1 +current_version = 1.10.0a1 parse = (?P[\d]+) # major version number \.(?P[\d]+) # minor version number \.(?P[\d]+) # patch version number diff --git a/.changes/1.9.0-b1.md b/.changes/1.9.0-b1.md deleted file mode 100644 index 
7d0dd2c8f..000000000 --- a/.changes/1.9.0-b1.md +++ /dev/null @@ -1,44 +0,0 @@ -## dbt-bigquery 1.9.0-b1 - October 02, 2024 - -### Features - -- Add configuration options `enable_list_inference` and `intermediate_format` for python models ([#1047](https://github.com/dbt-labs/dbt-bigquery/issues/1047), [#1114](https://github.com/dbt-labs/dbt-bigquery/issues/1114)) -- Add tests for cross-database `cast` macro ([#1214](https://github.com/dbt-labs/dbt-bigquery/issues/1214)) -- Cross-database `date` macro ([#1221](https://github.com/dbt-labs/dbt-bigquery/issues/1221)) -- Add support for base 64 encoded json keyfile credentials ([#923](https://github.com/dbt-labs/dbt-bigquery/issues/923)) -- Add support for cancelling queries on keyboard interrupt ([#917](https://github.com/dbt-labs/dbt-bigquery/issues/917)) -- Add Microbatch Strategy to dbt-spark ([#1354](https://github.com/dbt-labs/dbt-bigquery/issues/1354)) - -### Fixes - -- Drop intermediate objects created in BigQuery for incremental models ([#1036](https://github.com/dbt-labs/dbt-bigquery/issues/1036)) -- Fix null column index issue during `dbt docs generate` for external tables ([#1079](https://github.com/dbt-labs/dbt-bigquery/issues/1079)) -- make seed delimiter configurable via `field_delimeter` in model config ([#1119](https://github.com/dbt-labs/dbt-bigquery/issues/1119)) -- Default `enableListInference` to `True` for python models to support nested lists ([#1047](https://github.com/dbt-labs/dbt-bigquery/issues/1047), [#1114](https://github.com/dbt-labs/dbt-bigquery/issues/1114)) -- Catch additional database error exception, NotFound, as a DbtDatabaseError instead of defaulting to a DbtRuntimeError ([#1360](https://github.com/dbt-labs/dbt-bigquery/issues/1360)) - -### Under the Hood - -- Lazy load `agate` ([#1162](https://github.com/dbt-labs/dbt-bigquery/issues/1162)) -- Simplify linting environment and dev dependencies ([#1291](https://github.com/dbt-labs/dbt-bigquery/issues/1291)) - -### Dependencies - -- Update pre-commit requirement from ~=3.5 to ~=3.7 ([#1052](https://github.com/dbt-labs/dbt-bigquery/pull/1052)) -- Update freezegun requirement from ~=1.3 to ~=1.4 ([#1062](https://github.com/dbt-labs/dbt-bigquery/pull/1062)) -- Bump mypy from 1.7.1 to 1.8.0 ([#1064](https://github.com/dbt-labs/dbt-bigquery/pull/1064)) -- Update flake8 requirement from ~=6.1 to ~=7.0 ([#1069](https://github.com/dbt-labs/dbt-bigquery/pull/1069)) -- Bump actions/download-artifact from 3 to 4 ([#1209](https://github.com/dbt-labs/dbt-bigquery/pull/1209)) -- Bump actions/upload-artifact from 3 to 4 ([#1210](https://github.com/dbt-labs/dbt-bigquery/pull/1210)) -- Bump ubuntu from 22.04 to 24.04 in /docker ([#1247](https://github.com/dbt-labs/dbt-bigquery/pull/1247)) -- Update pre-commit-hooks requirement from ~=4.5 to ~=4.6 ([#1281](https://github.com/dbt-labs/dbt-bigquery/pull/1281)) -- Update pytest-xdist requirement from ~=3.5 to ~=3.6 ([#1282](https://github.com/dbt-labs/dbt-bigquery/pull/1282)) -- Update flaky requirement from ~=3.7 to ~=3.8 ([#1283](https://github.com/dbt-labs/dbt-bigquery/pull/1283)) -- Update twine requirement from ~=4.0 to ~=5.1 ([#1293](https://github.com/dbt-labs/dbt-bigquery/pull/1293)) - -### Contributors -- [@d-cole](https://github.com/d-cole) ([#917](https://github.com/dbt-labs/dbt-bigquery/issues/917)) -- [@dwreeves](https://github.com/dwreeves) ([#1162](https://github.com/dbt-labs/dbt-bigquery/issues/1162)) -- [@robeleb1](https://github.com/robeleb1) ([#923](https://github.com/dbt-labs/dbt-bigquery/issues/923)) -- 
[@salimmoulouel](https://github.com/salimmoulouel) ([#1119](https://github.com/dbt-labs/dbt-bigquery/issues/1119)) -- [@vinit2107](https://github.com/vinit2107) ([#1036](https://github.com/dbt-labs/dbt-bigquery/issues/1036)) diff --git a/.changes/1.9.0/Dependencies-20231211-001048.yaml b/.changes/1.9.0/Dependencies-20231211-001048.yaml deleted file mode 100644 index 6f2bfada4..000000000 --- a/.changes/1.9.0/Dependencies-20231211-001048.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update pre-commit requirement from ~=3.5 to ~=3.7" -time: 2023-12-11T00:10:48.00000Z -custom: - Author: dependabot[bot] - PR: 1052 diff --git a/.changes/1.9.0/Dependencies-20231220-002130.yaml b/.changes/1.9.0/Dependencies-20231220-002130.yaml deleted file mode 100644 index d62e50bf2..000000000 --- a/.changes/1.9.0/Dependencies-20231220-002130.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update freezegun requirement from ~=1.3 to ~=1.4" -time: 2023-12-20T00:21:30.00000Z -custom: - Author: dependabot[bot] - PR: 1062 diff --git a/.changes/1.9.0/Dependencies-20231222-002351.yaml b/.changes/1.9.0/Dependencies-20231222-002351.yaml deleted file mode 100644 index 76591de93..000000000 --- a/.changes/1.9.0/Dependencies-20231222-002351.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Bump mypy from 1.7.1 to 1.8.0" -time: 2023-12-22T00:23:51.00000Z -custom: - Author: dependabot[bot] - PR: 1064 diff --git a/.changes/1.9.0/Dependencies-20240105-004800.yaml b/.changes/1.9.0/Dependencies-20240105-004800.yaml deleted file mode 100644 index b0d33ceed..000000000 --- a/.changes/1.9.0/Dependencies-20240105-004800.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update flake8 requirement from ~=6.1 to ~=7.0" -time: 2024-01-05T00:48:00.00000Z -custom: - Author: dependabot[bot] - PR: 1069 diff --git a/.changes/1.9.0/Dependencies-20240429-005158.yaml b/.changes/1.9.0/Dependencies-20240429-005158.yaml deleted file mode 100644 index 5d380952c..000000000 --- a/.changes/1.9.0/Dependencies-20240429-005158.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Bump actions/download-artifact from 3 to 4" -time: 2024-04-29T00:51:58.00000Z -custom: - Author: dependabot[bot] - PR: 1209 diff --git a/.changes/1.9.0/Dependencies-20240429-005159.yaml b/.changes/1.9.0/Dependencies-20240429-005159.yaml deleted file mode 100644 index adf2cae65..000000000 --- a/.changes/1.9.0/Dependencies-20240429-005159.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Bump actions/upload-artifact from 3 to 4" -time: 2024-04-29T00:51:59.00000Z -custom: - Author: dependabot[bot] - PR: 1210 diff --git a/.changes/1.9.0/Dependencies-20240520-230208.yaml b/.changes/1.9.0/Dependencies-20240520-230208.yaml deleted file mode 100644 index f89057233..000000000 --- a/.changes/1.9.0/Dependencies-20240520-230208.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Bump ubuntu from 22.04 to 24.04 in /docker" -time: 2024-05-20T23:02:08.00000Z -custom: - Author: dependabot[bot] - PR: 1247 diff --git a/.changes/1.9.0/Dependencies-20240718-005755.yaml b/.changes/1.9.0/Dependencies-20240718-005755.yaml deleted file mode 100644 index 3d2cca66c..000000000 --- a/.changes/1.9.0/Dependencies-20240718-005755.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update pre-commit-hooks requirement from ~=4.5 to ~=4.6" -time: 2024-07-18T00:57:55.00000Z -custom: - Author: dependabot[bot] - PR: 1281 diff --git a/.changes/1.9.0/Dependencies-20240718-005756.yaml 
b/.changes/1.9.0/Dependencies-20240718-005756.yaml deleted file mode 100644 index ac6b791a8..000000000 --- a/.changes/1.9.0/Dependencies-20240718-005756.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update pytest-xdist requirement from ~=3.5 to ~=3.6" -time: 2024-07-18T00:57:56.00000Z -custom: - Author: dependabot[bot] - PR: 1282 diff --git a/.changes/1.9.0/Dependencies-20240718-005757.yaml b/.changes/1.9.0/Dependencies-20240718-005757.yaml deleted file mode 100644 index 29e12d68e..000000000 --- a/.changes/1.9.0/Dependencies-20240718-005757.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update flaky requirement from ~=3.7 to ~=3.8" -time: 2024-07-18T00:57:57.00000Z -custom: - Author: dependabot[bot] - PR: 1283 diff --git a/.changes/1.9.0/Dependencies-20240719-003740.yaml b/.changes/1.9.0/Dependencies-20240719-003740.yaml deleted file mode 100644 index feb483a60..000000000 --- a/.changes/1.9.0/Dependencies-20240719-003740.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update twine requirement from ~=4.0 to ~=5.1" -time: 2024-07-19T00:37:40.00000Z -custom: - Author: dependabot[bot] - PR: 1293 diff --git a/.changes/1.9.0/Features-20240426-105319.yaml b/.changes/1.9.0/Features-20240426-105319.yaml deleted file mode 100644 index 0af2f9aa8..000000000 --- a/.changes/1.9.0/Features-20240426-105319.yaml +++ /dev/null @@ -1,7 +0,0 @@ -kind: Features -body: Add configuration options `enable_list_inference` and `intermediate_format` for python - models -time: 2024-04-26T10:53:19.874239-04:00 -custom: - Author: mikealfare - Issue: 1047 1114 diff --git a/.changes/1.9.0/Features-20240430-185650.yaml b/.changes/1.9.0/Features-20240430-185650.yaml deleted file mode 100644 index 0c0eef567..000000000 --- a/.changes/1.9.0/Features-20240430-185650.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Add tests for cross-database `cast` macro -time: 2024-04-30T18:56:50.238027-06:00 -custom: - Author: dbeatty10 - Issue: "1214" diff --git a/.changes/1.9.0/Features-20240501-151902.yaml b/.changes/1.9.0/Features-20240501-151902.yaml deleted file mode 100644 index 1522e9775..000000000 --- a/.changes/1.9.0/Features-20240501-151902.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Cross-database `date` macro -time: 2024-05-01T15:19:02.929676-06:00 -custom: - Author: dbeatty10 - Issue: 1221 diff --git a/.changes/1.9.0/Features-20240516-125735.yaml b/.changes/1.9.0/Features-20240516-125735.yaml deleted file mode 100644 index d84b098b2..000000000 --- a/.changes/1.9.0/Features-20240516-125735.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Add support for base 64 encoded json keyfile credentials -time: 2024-05-16T12:57:35.383416-07:00 -custom: - Author: robeleb1 - Issue: "923" diff --git a/.changes/1.9.0/Features-20240730-135911.yaml b/.changes/1.9.0/Features-20240730-135911.yaml deleted file mode 100644 index 52868c2ee..000000000 --- a/.changes/1.9.0/Features-20240730-135911.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Add support for cancelling queries on keyboard interrupt -time: 2024-07-30T13:59:11.585452-07:00 -custom: - Author: d-cole MichelleArk colin-rogers-dbt - Issue: "917" diff --git a/.changes/1.9.0/Features-20240925-232238.yaml b/.changes/1.9.0/Features-20240925-232238.yaml deleted file mode 100644 index 903884196..000000000 --- a/.changes/1.9.0/Features-20240925-232238.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Add Microbatch Strategy to dbt-spark -time: 2024-09-25T23:22:38.216277+01:00 -custom: - Author: 
michelleark - Issue: "1354" diff --git a/.changes/1.9.0/Fixes-20240120-180818.yaml b/.changes/1.9.0/Fixes-20240120-180818.yaml deleted file mode 100644 index 0d0740361..000000000 --- a/.changes/1.9.0/Fixes-20240120-180818.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Fixes -body: Drop intermediate objects created in BigQuery for incremental models -time: 2024-01-20T18:08:18.817915-06:00 -custom: - Author: vinit2107 - Issue: "1036" diff --git a/.changes/1.9.0/Fixes-20240201-145323.yaml b/.changes/1.9.0/Fixes-20240201-145323.yaml deleted file mode 100644 index ea198e54a..000000000 --- a/.changes/1.9.0/Fixes-20240201-145323.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Fixes -body: Fix null column index issue during `dbt docs generate` for external tables -time: 2024-02-01T14:53:23.434624-05:00 -custom: - Author: mikealfare - Issue: "1079" diff --git a/.changes/1.9.0/Fixes-20240226-233024.yaml b/.changes/1.9.0/Fixes-20240226-233024.yaml deleted file mode 100644 index efb1b077c..000000000 --- a/.changes/1.9.0/Fixes-20240226-233024.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Fixes -body: make seed delimiter configurable via `field_delimeter` in model config -time: 2024-02-26T23:30:24.141213+01:00 -custom: - Author: salimmoulouel - Issue: "1119" diff --git a/.changes/1.9.0/Fixes-20240426-105224.yaml b/.changes/1.9.0/Fixes-20240426-105224.yaml deleted file mode 100644 index 624006ba5..000000000 --- a/.changes/1.9.0/Fixes-20240426-105224.yaml +++ /dev/null @@ -1,7 +0,0 @@ -kind: Fixes -body: Default `enableListInference` to `True` for python models to support nested - lists -time: 2024-04-26T10:52:24.827314-04:00 -custom: - Author: mikealfare - Issue: 1047 1114 diff --git a/.changes/1.9.0/Fixes-20241001-193207.yaml b/.changes/1.9.0/Fixes-20241001-193207.yaml deleted file mode 100644 index 584445a5b..000000000 --- a/.changes/1.9.0/Fixes-20241001-193207.yaml +++ /dev/null @@ -1,7 +0,0 @@ -kind: Fixes -body: Catch additional database error exception, NotFound, as a DbtDatabaseError instead - of defaulting to a DbtRuntimeError -time: 2024-10-01T19:32:07.304353-04:00 -custom: - Author: mikealfare - Issue: "1360" diff --git a/.changes/1.9.0/Under the Hood-20240331-101418.yaml b/.changes/1.9.0/Under the Hood-20240331-101418.yaml deleted file mode 100644 index baea00248..000000000 --- a/.changes/1.9.0/Under the Hood-20240331-101418.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Under the Hood -body: Lazy load `agate` -time: 2024-03-31T10:14:18.260074-04:00 -custom: - Author: dwreeves - Issue: "1162" diff --git a/.changes/1.9.0/Under the Hood-20240718-193206.yaml b/.changes/1.9.0/Under the Hood-20240718-193206.yaml deleted file mode 100644 index 32b3084f5..000000000 --- a/.changes/1.9.0/Under the Hood-20240718-193206.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Under the Hood -body: Simplify linting environment and dev dependencies -time: 2024-07-18T19:32:06.044016-04:00 -custom: - Author: mikealfare - Issue: "1291" diff --git a/.changes/unreleased/Breaking Changes-20241016-185117.yaml b/.changes/unreleased/Breaking Changes-20241016-185117.yaml deleted file mode 100644 index 55bb37461..000000000 --- a/.changes/unreleased/Breaking Changes-20241016-185117.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Breaking Changes -body: Drop support for Python 3.8 -time: 2024-10-16T18:51:17.581547-04:00 -custom: - Author: mikealfare - Issue: "1373" diff --git a/.changes/unreleased/Dependencies-20240724-040744.yaml b/.changes/unreleased/Dependencies-20240724-040744.yaml deleted file mode 100644 index fd713788e..000000000 --- 
a/.changes/unreleased/Dependencies-20240724-040744.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: "Dependencies" -body: "Update wheel requirement from ~=0.42 to ~=0.43" -time: 2024-07-24T04:07:44.00000Z -custom: - Author: dependabot[bot] - PR: 1304 diff --git a/.changes/unreleased/Features-20240505-011838.yaml b/.changes/unreleased/Features-20240505-011838.yaml deleted file mode 100644 index 66411853f..000000000 --- a/.changes/unreleased/Features-20240505-011838.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: add is_retryable test case when raise ServiceUnavailable -time: 2024-05-05T01:18:38.737882+09:00 -custom: - Author: jx2lee - Issue: "682" diff --git a/.changes/unreleased/Features-20240911-234859.yaml b/.changes/unreleased/Features-20240911-234859.yaml deleted file mode 100644 index 5351c3315..000000000 --- a/.changes/unreleased/Features-20240911-234859.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Features -body: Adds the ability to set optional `quota_project` in profile -time: 2024-09-11T23:48:59.767649+01:00 -custom: - Author: jcarpenter12 - Issue: 1343 1344 diff --git a/.changes/unreleased/Fixes-20241028-172719.yaml b/.changes/unreleased/Fixes-20241028-172719.yaml deleted file mode 100644 index 87ee2c25d..000000000 --- a/.changes/unreleased/Fixes-20241028-172719.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Fixes -body: use "direct" write for non-partitioned python model materializations -time: 2024-10-28T17:27:19.306348-07:00 -custom: - Author: colin-rogers-dbt - Issue: "1318" diff --git a/.changes/unreleased/Under the Hood-20240910-212052.yaml b/.changes/unreleased/Under the Hood-20240910-212052.yaml deleted file mode 100644 index 3e4885dcd..000000000 --- a/.changes/unreleased/Under the Hood-20240910-212052.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Under the Hood -body: Isolating distribution testing -time: 2024-09-10T21:20:52.574204-04:00 -custom: - Author: leahwicz - Issue: "1290" diff --git a/.changes/unreleased/Under the Hood-20241104-173815.yaml b/.changes/unreleased/Under the Hood-20241104-173815.yaml deleted file mode 100644 index e3e81dec1..000000000 --- a/.changes/unreleased/Under the Hood-20241104-173815.yaml +++ /dev/null @@ -1,7 +0,0 @@ -kind: Under the Hood -body: Separate credentials functionality into its own module for reuse in retry and - python submissions -time: 2024-11-04T17:38:15.940962-05:00 -custom: - Author: mikealfare - Issue: "1391" diff --git a/.changes/unreleased/Under the Hood-20241107-143856.yaml b/.changes/unreleased/Under the Hood-20241107-143856.yaml deleted file mode 100644 index db8557bf0..000000000 --- a/.changes/unreleased/Under the Hood-20241107-143856.yaml +++ /dev/null @@ -1,6 +0,0 @@ -kind: Under the Hood -body: Create a retry factory to simplify retry strategies across dbt-bigquery -time: 2024-11-07T14:38:56.210445-05:00 -custom: - Author: mikealfare osalama - Issue: "1395" diff --git a/CHANGELOG.md b/CHANGELOG.md index b9bda350a..ade60b8f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,52 +5,6 @@ - "Breaking changes" listed under a version may require action from end users or external maintainers when upgrading to that version. - Do not edit this file directly. This file is auto-generated using [changie](https://github.com/miniscruff/changie). 
For details on how to document a change, see [the contributing guide](https://github.com/dbt-labs/dbt-bigquery/blob/main/CONTRIBUTING.md#adding-changelog-entry) -## dbt-bigquery 1.9.0-b1 - October 02, 2024 - -### Features - -- Add configuration options `enable_list_inference` and `intermediate_format` for python models ([#1047](https://github.com/dbt-labs/dbt-bigquery/issues/1047), [#1114](https://github.com/dbt-labs/dbt-bigquery/issues/1114)) -- Add tests for cross-database `cast` macro ([#1214](https://github.com/dbt-labs/dbt-bigquery/issues/1214)) -- Cross-database `date` macro ([#1221](https://github.com/dbt-labs/dbt-bigquery/issues/1221)) -- Add support for base 64 encoded json keyfile credentials ([#923](https://github.com/dbt-labs/dbt-bigquery/issues/923)) -- Add support for cancelling queries on keyboard interrupt ([#917](https://github.com/dbt-labs/dbt-bigquery/issues/917)) -- Add Microbatch Strategy to dbt-spark ([#1354](https://github.com/dbt-labs/dbt-bigquery/issues/1354)) - -### Fixes - -- Drop intermediate objects created in BigQuery for incremental models ([#1036](https://github.com/dbt-labs/dbt-bigquery/issues/1036)) -- Fix null column index issue during `dbt docs generate` for external tables ([#1079](https://github.com/dbt-labs/dbt-bigquery/issues/1079)) -- make seed delimiter configurable via `field_delimeter` in model config ([#1119](https://github.com/dbt-labs/dbt-bigquery/issues/1119)) -- Default `enableListInference` to `True` for python models to support nested lists ([#1047](https://github.com/dbt-labs/dbt-bigquery/issues/1047), [#1114](https://github.com/dbt-labs/dbt-bigquery/issues/1114)) -- Catch additional database error exception, NotFound, as a DbtDatabaseError instead of defaulting to a DbtRuntimeError ([#1360](https://github.com/dbt-labs/dbt-bigquery/issues/1360)) - -### Under the Hood - -- Lazy load `agate` ([#1162](https://github.com/dbt-labs/dbt-bigquery/issues/1162)) -- Simplify linting environment and dev dependencies ([#1291](https://github.com/dbt-labs/dbt-bigquery/issues/1291)) - -### Dependencies - -- Update pre-commit requirement from ~=3.5 to ~=3.7 ([#1052](https://github.com/dbt-labs/dbt-bigquery/pull/1052)) -- Update freezegun requirement from ~=1.3 to ~=1.4 ([#1062](https://github.com/dbt-labs/dbt-bigquery/pull/1062)) -- Bump mypy from 1.7.1 to 1.8.0 ([#1064](https://github.com/dbt-labs/dbt-bigquery/pull/1064)) -- Update flake8 requirement from ~=6.1 to ~=7.0 ([#1069](https://github.com/dbt-labs/dbt-bigquery/pull/1069)) -- Bump actions/download-artifact from 3 to 4 ([#1209](https://github.com/dbt-labs/dbt-bigquery/pull/1209)) -- Bump actions/upload-artifact from 3 to 4 ([#1210](https://github.com/dbt-labs/dbt-bigquery/pull/1210)) -- Bump ubuntu from 22.04 to 24.04 in /docker ([#1247](https://github.com/dbt-labs/dbt-bigquery/pull/1247)) -- Update pre-commit-hooks requirement from ~=4.5 to ~=4.6 ([#1281](https://github.com/dbt-labs/dbt-bigquery/pull/1281)) -- Update pytest-xdist requirement from ~=3.5 to ~=3.6 ([#1282](https://github.com/dbt-labs/dbt-bigquery/pull/1282)) -- Update flaky requirement from ~=3.7 to ~=3.8 ([#1283](https://github.com/dbt-labs/dbt-bigquery/pull/1283)) -- Update twine requirement from ~=4.0 to ~=5.1 ([#1293](https://github.com/dbt-labs/dbt-bigquery/pull/1293)) - -### Contributors -- [@d-cole](https://github.com/d-cole) ([#917](https://github.com/dbt-labs/dbt-bigquery/issues/917)) -- [@dwreeves](https://github.com/dwreeves) ([#1162](https://github.com/dbt-labs/dbt-bigquery/issues/1162)) -- 
[@robeleb1](https://github.com/robeleb1) ([#923](https://github.com/dbt-labs/dbt-bigquery/issues/923)) -- [@salimmoulouel](https://github.com/salimmoulouel) ([#1119](https://github.com/dbt-labs/dbt-bigquery/issues/1119)) -- [@vinit2107](https://github.com/vinit2107) ([#1036](https://github.com/dbt-labs/dbt-bigquery/issues/1036)) - - ## Previous Releases For information on prior major and minor releases, see their changelogs: - [1.6](https://github.com/dbt-labs/dbt-bigquery/blob/1.6.latest/CHANGELOG.md) diff --git a/dbt/adapters/bigquery/__version__.py b/dbt/adapters/bigquery/__version__.py index a4077fff2..1af777a62 100644 --- a/dbt/adapters/bigquery/__version__.py +++ b/dbt/adapters/bigquery/__version__.py @@ -1 +1 @@ -version = "1.9.0b1" +version = "1.10.0a1" From 1798601e7ec04ea9211a53fc7b4bdbb9f235a7de Mon Sep 17 00:00:00 2001 From: Colin Rogers <111200756+colin-rogers-dbt@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:05:26 -0800 Subject: [PATCH 05/11] update libpq-dev dependency to 13.18-0+deb11u1 (#1420) * use dynamic schema in test_grant_access_to.py * use dynamic schema in test_grant_access_to.py * revert setup * update libpq-dev dependency to 13.18-0+deb11u1 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index bda507dc5..8f371d6b4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,7 +9,7 @@ RUN apt-get update \ build-essential=12.9 \ ca-certificates=20210119 \ git=1:2.30.2-1+deb11u2 \ - libpq-dev=13.14-0+deb11u1 \ + libpq-dev=13.18-0+deb11u1 \ make=4.3-4.1 \ openssh-client=1:8.4p1-5+deb11u3 \ software-properties-common=0.96.20.2-2.1 \ From 26c19e911f6d493397578c05dd218496de03a334 Mon Sep 17 00:00:00 2001 From: Mike Alfare <13974384+mikealfare@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:47:35 -0500 Subject: [PATCH 06/11] Remove custom retry in get_table call (#1423) * remove custom retry in get_table call * changelog --- .changes/unreleased/Fixes-20241205-133606.yaml | 6 ++++++ dbt/adapters/bigquery/connections.py | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 .changes/unreleased/Fixes-20241205-133606.yaml diff --git a/.changes/unreleased/Fixes-20241205-133606.yaml b/.changes/unreleased/Fixes-20241205-133606.yaml new file mode 100644 index 000000000..b88a0981c --- /dev/null +++ b/.changes/unreleased/Fixes-20241205-133606.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Fix issue where rate limit errors on table service calls are not retried +time: 2024-12-05T13:36:06.436005-05:00 +custom: + Author: mikealfare + Issue: "1423" diff --git a/dbt/adapters/bigquery/connections.py b/dbt/adapters/bigquery/connections.py index 61fa87d40..bb062f330 100644 --- a/dbt/adapters/bigquery/connections.py +++ b/dbt/adapters/bigquery/connections.py @@ -519,10 +519,7 @@ def get_bq_table(self, database, schema, identifier) -> Table: # backwards compatibility: fill in with defaults if not specified database = database or conn.credentials.database schema = schema or conn.credentials.schema - return client.get_table( - table=self.table_ref(database, schema, identifier), - retry=self._retry.create_reopen_with_deadline(conn), - ) + return client.get_table(self.table_ref(database, schema, identifier)) def drop_dataset(self, database, schema) -> None: conn = self.get_thread_connection() From 4d255b2f854d21d5d8871bdaa8d7ab47e7e863a3 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 5 Dec 2024 16:26:52 -0500 Subject: [PATCH 07/11] Fix: Cast to timestamp prior to event time comparison 
(#1422) --- .../unreleased/Fixes-20241204-105846.yaml | 7 +++++ dbt/adapters/bigquery/relation.py | 25 +++++++++++++++- .../incremental_strategy_fixtures.py | 20 ++++++++++++- .../test_incremental_microbatch.py | 30 +++++++++++++++++-- 4 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 .changes/unreleased/Fixes-20241204-105846.yaml diff --git a/.changes/unreleased/Fixes-20241204-105846.yaml b/.changes/unreleased/Fixes-20241204-105846.yaml new file mode 100644 index 000000000..2693e4513 --- /dev/null +++ b/.changes/unreleased/Fixes-20241204-105846.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: Cast `event_time` to a timestamp prior to comparing against microbatch start/end + time +time: 2024-12-04T10:58:46.573608-05:00 +custom: + Author: michelleark + Issue: "1422" diff --git a/dbt/adapters/bigquery/relation.py b/dbt/adapters/bigquery/relation.py index 4edc8d7ac..037761918 100644 --- a/dbt/adapters/bigquery/relation.py +++ b/dbt/adapters/bigquery/relation.py @@ -4,7 +4,12 @@ from dbt_common.exceptions import CompilationError from dbt_common.utils.dict import filter_null_values -from dbt.adapters.base.relation import BaseRelation, ComponentName, InformationSchema +from dbt.adapters.base.relation import ( + BaseRelation, + ComponentName, + InformationSchema, + EventTimeFilter, +) from dbt.adapters.contracts.relation import RelationConfig, RelationType from dbt.adapters.relation_configs import RelationConfigChangeAction @@ -116,6 +121,24 @@ def materialized_view_config_changeset( def information_schema(self, identifier: Optional[str] = None) -> "BigQueryInformationSchema": return BigQueryInformationSchema.from_relation(self, identifier) + def _render_event_time_filtered(self, event_time_filter: EventTimeFilter) -> str: + """ + Returns "" if start and end are both None + """ + filter = "" + if event_time_filter.start and event_time_filter.end: + filter = f"cast({event_time_filter.field_name} as timestamp) >= '{event_time_filter.start}' and cast({event_time_filter.field_name} as timestamp) < '{event_time_filter.end}'" + elif event_time_filter.start: + filter = ( + f"cast({event_time_filter.field_name} as timestamp) >= '{event_time_filter.start}'" + ) + elif event_time_filter.end: + filter = ( + f"cast({event_time_filter.field_name} as timestamp) < '{event_time_filter.end}'" + ) + + return filter + @dataclass(frozen=True, eq=False, repr=False) class BigQueryInformationSchema(InformationSchema): diff --git a/tests/functional/adapter/incremental/incremental_strategy_fixtures.py b/tests/functional/adapter/incremental/incremental_strategy_fixtures.py index 02efbb6c2..365aba8c8 100644 --- a/tests/functional/adapter/incremental/incremental_strategy_fixtures.py +++ b/tests/functional/adapter/incremental/incremental_strategy_fixtures.py @@ -570,7 +570,7 @@ begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0) ) }} -select * from {{ ref('input_model') }} +select id, cast(event_time as timestamp) as event_time from {{ ref('input_model') }} """ microbatch_input_sql = """ @@ -582,6 +582,24 @@ select 3 as id, TIMESTAMP '2020-01-03 00:00:00-0' as event_time """ +microbatch_input_event_time_date_sql = """ +{{ config(materialized='table', event_time='event_time') }} +select 1 as id, DATE '2020-01-01' as event_time +union all +select 2 as id, DATE '2020-01-02' as event_time +union all +select 3 as id, DATE '2020-01-03' as event_time +""" + +microbatch_input_event_time_datetime_sql = """ +{{ config(materialized='table', event_time='event_time') }} +select 1 as id, DATETIME '2020-01-01' as event_time 
+union all +select 2 as id, DATETIME '2020-01-02' as event_time +union all +select 3 as id, DATETIME '2020-01-03' as event_time +""" + microbatch_model_no_partition_by_sql = """ {{ config( materialized='incremental', diff --git a/tests/functional/adapter/incremental/test_incremental_microbatch.py b/tests/functional/adapter/incremental/test_incremental_microbatch.py index d1bbbcea3..d0f8b62b7 100644 --- a/tests/functional/adapter/incremental/test_incremental_microbatch.py +++ b/tests/functional/adapter/incremental/test_incremental_microbatch.py @@ -13,6 +13,8 @@ microbatch_input_sql, microbatch_model_no_partition_by_sql, microbatch_model_invalid_partition_by_sql, + microbatch_input_event_time_date_sql, + microbatch_input_event_time_datetime_sql, ) @@ -22,6 +24,32 @@ def microbatch_model_sql(self) -> str: return microbatch_model_no_unique_id_sql +class TestBigQueryMicrobatchInputWithDate(TestBigQueryMicrobatch): + @pytest.fixture(scope="class") + def input_model_sql(self) -> str: + return microbatch_input_event_time_date_sql + + @pytest.fixture(scope="class") + def insert_two_rows_sql(self, project) -> str: + test_schema_relation = project.adapter.Relation.create( + database=project.database, schema=project.test_schema + ) + return f"insert into {test_schema_relation}.input_model (id, event_time) values (4, DATE '2020-01-04'), (5, DATE '2020-01-05')" + + +class TestBigQueryMicrobatchInputWithDatetime(TestBigQueryMicrobatch): + @pytest.fixture(scope="class") + def input_model_sql(self) -> str: + return microbatch_input_event_time_datetime_sql + + @pytest.fixture(scope="class") + def insert_two_rows_sql(self, project) -> str: + test_schema_relation = project.adapter.Relation.create( + database=project.database, schema=project.test_schema + ) + return f"insert into {test_schema_relation}.input_model (id, event_time) values (4, DATETIME '2020-01-04'), (5, DATETIME '2020-01-05')" + + class TestBigQueryMicrobatchMissingPartitionBy: @pytest.fixture(scope="class") def models(self) -> str: @@ -30,7 +58,6 @@ def models(self) -> str: "input_model.sql": microbatch_input_sql, } - @mock.patch.dict(os.environ, {"DBT_EXPERIMENTAL_MICROBATCH": "True"}) def test_execution_failure_no_partition_by(self, project): with patch_microbatch_end_time("2020-01-03 13:57:00"): _, stdout = run_dbt_and_capture(["run"], expect_pass=False) @@ -45,7 +72,6 @@ def models(self) -> str: "input_model.sql": microbatch_input_sql, } - @mock.patch.dict(os.environ, {"DBT_EXPERIMENTAL_MICROBATCH": "True"}) def test_execution_failure_no_partition_by(self, project): with patch_microbatch_end_time("2020-01-03 13:57:00"): _, stdout = run_dbt_and_capture(["run"], expect_pass=False) From 827fc78af56803ede19a990daa432a94c27108fc Mon Sep 17 00:00:00 2001 From: Borja Vazquez-Barreiros <75082703+borjavb@users.noreply.github.com> Date: Tue, 10 Dec 2024 21:51:03 +0000 Subject: [PATCH 08/11] [Feature] Allow `copy_partitions` when using `microbatch` (#1421) --- .../unreleased/Features-20241202-223835.yaml | 6 ++++++ .../macros/materializations/incremental.sql | 4 ++-- .../incremental_strategy_fixtures.py | 18 ++++++++++++++++++ .../incremental/test_incremental_microbatch.py | 7 +++++++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 .changes/unreleased/Features-20241202-223835.yaml diff --git a/.changes/unreleased/Features-20241202-223835.yaml b/.changes/unreleased/Features-20241202-223835.yaml new file mode 100644 index 000000000..ab59abd99 --- /dev/null +++ b/.changes/unreleased/Features-20241202-223835.yaml @@ -0,0 +1,6 @@ 
+kind: Features +body: Allow copy_partitions in microbatch +time: 2024-12-02T22:38:35.479052Z +custom: + Author: borjavb + Issue: "1414" diff --git a/dbt/include/bigquery/macros/materializations/incremental.sql b/dbt/include/bigquery/macros/materializations/incremental.sql index 935280d63..25a83b0c6 100644 --- a/dbt/include/bigquery/macros/materializations/incremental.sql +++ b/dbt/include/bigquery/macros/materializations/incremental.sql @@ -95,9 +95,9 @@ {{ run_hooks(pre_hooks) }} - {% if partition_by.copy_partitions is true and strategy != 'insert_overwrite' %} {#-- We can't copy partitions with merge strategy --#} + {% if partition_by.copy_partitions is true and strategy not in ['insert_overwrite', 'microbatch'] %} {#-- We can't copy partitions with merge strategy --#} {% set wrong_strategy_msg -%} - The 'copy_partitions' option requires the 'incremental_strategy' option to be set to 'insert_overwrite'. + The 'copy_partitions' option requires the 'incremental_strategy' option to be set to 'insert_overwrite' or 'microbatch'. {%- endset %} {% do exceptions.raise_compiler_error(wrong_strategy_msg) %} diff --git a/tests/functional/adapter/incremental/incremental_strategy_fixtures.py b/tests/functional/adapter/incremental/incremental_strategy_fixtures.py index 365aba8c8..21d5f15b6 100644 --- a/tests/functional/adapter/incremental/incremental_strategy_fixtures.py +++ b/tests/functional/adapter/incremental/incremental_strategy_fixtures.py @@ -629,3 +629,21 @@ }} select * from {{ ref('input_model') }} """ + +microbatch_model_no_unique_id_copy_partitions_sql = """ +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + partition_by={ + 'field': 'event_time', + 'data_type': 'timestamp', + 'granularity': 'day', + 'copy_partitions': true + }, + event_time='event_time', + batch_size='day', + begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0) + ) +}} +select * from {{ ref('input_model') }} +""" diff --git a/tests/functional/adapter/incremental/test_incremental_microbatch.py b/tests/functional/adapter/incremental/test_incremental_microbatch.py index d0f8b62b7..912f96eec 100644 --- a/tests/functional/adapter/incremental/test_incremental_microbatch.py +++ b/tests/functional/adapter/incremental/test_incremental_microbatch.py @@ -13,6 +13,7 @@ microbatch_input_sql, microbatch_model_no_partition_by_sql, microbatch_model_invalid_partition_by_sql, + microbatch_model_no_unique_id_copy_partitions_sql, microbatch_input_event_time_date_sql, microbatch_input_event_time_datetime_sql, ) @@ -79,3 +80,9 @@ def test_execution_failure_no_partition_by(self, project): "The 'microbatch' strategy requires a `partition_by` config with the same granularity as its configured `batch_size`" in stdout ) + + +class TestBigQueryMicrobatchWithCopyPartitions(BaseMicrobatch): + @pytest.fixture(scope="class") + def microbatch_model_sql(self) -> str: + return microbatch_model_no_unique_id_copy_partitions_sql From e1b6e74974f1cddf4100f82f2b92a3baa22a37cf Mon Sep 17 00:00:00 2001 From: Mike Alfare <13974384+mikealfare@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:01:49 -0500 Subject: [PATCH 09/11] ADAP-1116: Move to `hatch` and `pyproject.toml` (#1407) * move setup.py to pyproject.toml * move dev tool config to pyproject.toml * update integration.yml to use hatch commands * update main.yml to use hatch commands * update scripts for pyproject.toml and hatch.toml references * update release workflow to use hatch --------- Co-authored-by: Colin Rogers <111200756+colin-rogers-dbt@users.noreply.github.com> --- 
.bumpversion.cfg | 35 -- .../Under the Hood-20241117-194746.yaml | 6 + {scripts => .github/scripts}/env-setup.sh | 0 .github/scripts/update_dependencies.sh | 6 +- .../scripts/update_dev_dependency_branches.sh | 4 +- .github/workflows/integration.yml | 52 +- .github/workflows/main.yml | 60 +-- .github/workflows/nightly-release.yml | 19 +- .github/workflows/release.yml | 112 ++--- .github/workflows/release_prep_hatch.yml | 455 ++++++++++++++++++ .github/workflows/version-bump.yml | 28 -- MANIFEST.in | 1 - Makefile | 21 - dev-requirements.txt | 20 - hatch.toml | 61 +++ mypy.ini | 2 - pyproject.toml | 57 +++ pytest.ini | 10 - scripts/build-dist.sh | 20 - setup.py | 78 --- tox.ini | 54 --- 21 files changed, 657 insertions(+), 444 deletions(-) delete mode 100644 .bumpversion.cfg create mode 100644 .changes/unreleased/Under the Hood-20241117-194746.yaml rename {scripts => .github/scripts}/env-setup.sh (100%) create mode 100644 .github/workflows/release_prep_hatch.yml delete mode 100644 .github/workflows/version-bump.yml delete mode 100644 MANIFEST.in delete mode 100644 Makefile delete mode 100644 dev-requirements.txt create mode 100644 hatch.toml delete mode 100644 mypy.ini create mode 100644 pyproject.toml delete mode 100644 pytest.ini delete mode 100755 scripts/build-dist.sh delete mode 100644 setup.py delete mode 100644 tox.ini diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index b56a8ee4f..000000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,35 +0,0 @@ -[bumpversion] -current_version = 1.10.0a1 -parse = (?P[\d]+) # major version number - \.(?P[\d]+) # minor version number - \.(?P[\d]+) # patch version number - (?P # optional pre-release - ex: a1, b2, rc25 - (?Pa|b|rc) # pre-release type - (?P[\d]+) # pre-release version number - )? - ( # optional nightly release indicator - \.(?Pdev[0-9]+) # ex: .dev02142023 - )? 
# expected matches: `1.15.0`, `1.5.0a11`, `1.5.0a1.dev123`, `1.5.0.dev123457`, expected failures: `1`, `1.5`, `1.5.2-a1`, `text1.5.0` -serialize = - {major}.{minor}.{patch}{prekind}{num}.{nightly} - {major}.{minor}.{patch}.{nightly} - {major}.{minor}.{patch}{prekind}{num} - {major}.{minor}.{patch} -commit = False -tag = False - -[bumpversion:part:prekind] -first_value = a -optional_value = final -values = - a - b - rc - final - -[bumpversion:part:num] -first_value = 1 - -[bumpversion:part:nightly] - -[bumpversion:file:dbt/adapters/bigquery/__version__.py] diff --git a/.changes/unreleased/Under the Hood-20241117-194746.yaml b/.changes/unreleased/Under the Hood-20241117-194746.yaml new file mode 100644 index 000000000..e8658ee20 --- /dev/null +++ b/.changes/unreleased/Under the Hood-20241117-194746.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Move from setup.py to pyproject.toml and to hatch as a dev tool +time: 2024-11-17T19:47:46.341-05:00 +custom: + Author: mikealfare + Issue: "1407" diff --git a/scripts/env-setup.sh b/.github/scripts/env-setup.sh similarity index 100% rename from scripts/env-setup.sh rename to .github/scripts/env-setup.sh diff --git a/.github/scripts/update_dependencies.sh b/.github/scripts/update_dependencies.sh index c3df48e52..fabdadff2 100644 --- a/.github/scripts/update_dependencies.sh +++ b/.github/scripts/update_dependencies.sh @@ -2,9 +2,9 @@ set -e git_branch=$1 -target_req_file="dev-requirements.txt" -core_req_sed_pattern="s|dbt-core.git.*#egg=dbt-core|dbt-core.git@${git_branch}#egg=dbt-core|g" -tests_req_sed_pattern="s|dbt-core.git.*#egg=dbt-tests|dbt-core.git@${git_branch}#egg=dbt-tests|g" +target_req_file="hatch.toml" +core_req_sed_pattern="s|dbt-core.git.*#subdirectory=core|dbt-core.git@${git_branch}#subdirectory=core|g" +tests_req_sed_pattern="s|dbt-adapters.git.*#subdirectory=dbt-tests-adapter|dbt-adapters.git@${git_branch}#subdirectory=dbt-tests-adapter|g" if [[ "$OSTYPE" == darwin* ]]; then # mac ships with a different version of sed that requires a delimiter arg sed -i "" "$core_req_sed_pattern" $target_req_file diff --git a/.github/scripts/update_dev_dependency_branches.sh b/.github/scripts/update_dev_dependency_branches.sh index 022df6a8a..9385cf885 100755 --- a/.github/scripts/update_dev_dependency_branches.sh +++ b/.github/scripts/update_dev_dependency_branches.sh @@ -5,8 +5,8 @@ set -e dbt_adapters_branch=$1 dbt_core_branch=$2 dbt_common_branch=$3 -target_req_file="dev-requirements.txt" -core_req_sed_pattern="s|dbt-core.git.*#egg=dbt-core|dbt-core.git@${dbt_core_branch}#egg=dbt-core|g" +target_req_file="hatch.toml" +core_req_sed_pattern="s|dbt-core.git.*#subdirectory=core|dbt-core.git@${dbt_core_branch}#subdirectory=core|g" adapters_req_sed_pattern="s|dbt-adapters.git|dbt-adapters.git@${dbt_adapters_branch}|g" common_req_sed_pattern="s|dbt-common.git|dbt-common.git@${dbt_common_branch}|g" if [[ "$OSTYPE" == darwin* ]]; then diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a9179f9ce..32d937ef8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -127,7 +127,8 @@ jobs: bigquery: - 'dbt/**' - 'tests/**' - - 'dev-requirements.txt' + - 'hatch.toml' + - 'pyproject.toml' - '.github/**' - '*.py' @@ -164,8 +165,6 @@ jobs: matrix: ${{ fromJSON(needs.test-metadata.outputs.matrix) }} env: - TOXENV: integration-${{ matrix.adapter }} - PYTEST_ADDOPTS: "-v --color=yes -n4 --csv integration_results.csv" DBT_INVOCATION_ENV: github-actions DD_CIVISIBILITY_AGENTLESS_ENABLED: true 
DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} @@ -201,12 +200,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install tox - python -m pip --version - tox --version + - uses: pypa/hatch@install - name: Update Adapters and Core branches (update dev_requirements.txt) if: github.event_name == 'workflow_dispatch' @@ -215,10 +209,9 @@ jobs: ${{ inputs.dbt_adapters_branch }} \ ${{ inputs.dbt_core_branch }} \ ${{ inputs.dbt_common_branch }} - cat dev-requirements.txt + cat hatch.toml - - name: Run tox (bigquery) - if: matrix.adapter == 'bigquery' + - run: hatch run integration-tests tests/functional -k "not TestPython" env: BIGQUERY_TEST_SERVICE_ACCOUNT_JSON: ${{ secrets.BIGQUERY_TEST_SERVICE_ACCOUNT_JSON }} BIGQUERY_TEST_ALT_DATABASE: ${{ secrets.BIGQUERY_TEST_ALT_DATABASE }} @@ -229,36 +222,13 @@ jobs: DATAPROC_REGION: us-central1 DATAPROC_CLUSTER_NAME: dbt-test-1 GCS_BUCKET: dbt-ci - run: tox -- --ddtrace - - - name: Get current date - if: always() - id: date - run: | - echo "date=$(date +'%Y-%m-%dT%H_%M_%S')" >> $GITHUB_OUTPUT #no colons allowed for artifacts - - - uses: actions/upload-artifact@v4 - if: always() - with: - name: logs_${{ matrix.python-version }}_${{ matrix.os }}_${{ matrix.adapter }}-${{ steps.date.outputs.date }} - path: ./logs - overwrite: true - - - uses: actions/upload-artifact@v4 - if: always() - with: - name: integration_results_${{ matrix.python-version }}_${{ matrix.os }}_${{ matrix.adapter }}-${{ steps.date.outputs.date }}.csv - path: integration_results.csv - overwrite: true # python integration tests are slow so we only run them seperately and for a single OS / python version test-python: name: "test-python" - needs: test-metadata + needs: test runs-on: ubuntu-latest if: >- - needs.test-metadata.outputs.matrix && - fromJSON( needs.test-metadata.outputs.matrix ).include[0] && ( github.event_name != 'pull_request_target' || github.event.pull_request.head.repo.full_name == github.repository || @@ -286,14 +256,9 @@ jobs: with: python-version: "3.9" - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install tox - python -m pip --version - tox --version + - uses: pypa/hatch@install - - name: Run tox (python models) + - run: hatch run integration-tests tests/functional -n1 -k "TestPython" --ddtrace env: BIGQUERY_TEST_SERVICE_ACCOUNT_JSON: ${{ secrets.BIGQUERY_TEST_SERVICE_ACCOUNT_JSON }} BIGQUERY_TEST_ALT_DATABASE: ${{ secrets.BIGQUERY_TEST_ALT_DATABASE }} @@ -304,7 +269,6 @@ jobs: DATAPROC_REGION: us-central1 DATAPROC_CLUSTER_NAME: dbt-test-1 GCS_BUCKET: dbt-ci - run: tox -e python-tests -- --ddtrace require-label-comment: runs-on: ubuntu-latest diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7b82f3e0f..9ad70ce5c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -52,15 +52,8 @@ jobs: with: python-version: '3.9' - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install -r dev-requirements.txt - python -m pip --version - pre-commit --version - dbt --version - - name: Run pre-comit hooks - run: pre-commit run --all-files --show-diff-on-failure + - name: Run pre-commit hooks + uses: pre-commit/action@v3.0.1 unit: name: unit test / python ${{ matrix.python-version }} @@ -72,10 +65,6 @@ jobs: matrix: python-version: ['3.9', '3.10', '3.11', '3.12'] - env: - TOXENV: "unit" - PYTEST_ADDOPTS: "-v 
--color=yes --csv unit_results.csv" - steps: - name: Check out the repository uses: actions/checkout@v4 @@ -88,27 +77,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install tox - python -m pip --version - tox --version - - - name: Run tox - run: tox + uses: pypa/hatch@install - - name: Get current date - if: always() - id: date - run: | - echo "date=$(date +'%Y-%m-%dT%H_%M_%S')" >> $GITHUB_OUTPUT #no colons allowed for artifacts - - - uses: actions/upload-artifact@v4 - if: always() - with: - name: unit_results_${{ matrix.python-version }}-${{ steps.date.outputs.date }}.csv - path: unit_results.csv - overwrite: true + - run: hatch run unit-tests build: name: build packages @@ -129,25 +100,16 @@ jobs: with: python-version: '3.9' - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install --upgrade setuptools wheel twine check-wheel-contents - python -m pip --version + - uses: pypa/hatch@install - name: Build distributions - run: ./scripts/build-dist.sh + run: hatch build - name: Show distributions run: ls -lh dist/ - name: Check distribution descriptions - run: | - twine check dist/* - - - name: Check wheel contents - run: | - check-wheel-contents dist/*.whl --ignore W007,W008 + run: hatch run build:check-all - name: Check if this is an alpha version id: check-is-alpha @@ -174,7 +136,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-14, windows-latest] + os: [ubuntu-22.04, macos-14, windows-2022] python-version: ['3.9', '3.10', '3.11', '3.12'] dist-type: ["whl", "gz"] @@ -184,12 +146,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install --upgrade wheel - python -m pip --version - - uses: actions/download-artifact@v4 with: name: dist diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml index 1dc396154..c986a4d0f 100644 --- a/.github/workflows/nightly-release.yml +++ b/.github/workflows/nightly-release.yml @@ -50,11 +50,15 @@ jobs: commit_sha=$(git rev-parse HEAD) echo "release_commit=$commit_sha" >> $GITHUB_OUTPUT - - name: "Get Current Version Number" - id: version-number-sources - run: | - current_version=`awk -F"current_version = " '{print $2}' .bumpversion.cfg | tr '\n' ' '` - echo "current_version=$current_version" >> $GITHUB_OUTPUT + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: pypa/hatch@install + + - id: version-number-sources + run: echo "current_version=$(hatch version)" >> $GITHUB_OUTPUT - name: "Audit Version And Parse Into Parts" id: semver @@ -108,10 +112,5 @@ jobs: sha: ${{ needs.aggregate-release-data.outputs.commit_sha }} target_branch: ${{ needs.aggregate-release-data.outputs.release_branch }} version_number: ${{ needs.aggregate-release-data.outputs.version_number }} - build_script_path: "scripts/build-dist.sh" - env_setup_script_path: "scripts/env-setup.sh" - s3_bucket_name: "core-team-artifacts" - package_test_command: "dbt -h" - test_run: true nightly_release: true secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ad7cf76b4..15840e5ed 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -39,7 +39,7 @@ on: env_setup_script_path: description: "Environment setup script path" type: string - 
default: "scripts/env-setup.sh" + default: "./.github/scripts/env-setup.sh" required: false s3_bucket_name: description: "AWS S3 bucket name" @@ -88,7 +88,7 @@ on: env_setup_script_path: description: "Environment setup script path" type: string - default: "scripts/env-setup.sh" + default: "./.github/scripts/env-setup.sh" required: false s3_bucket_name: description: "AWS S3 bucket name" @@ -119,86 +119,70 @@ defaults: shell: bash jobs: - log-inputs: - name: Log Inputs - runs-on: ubuntu-latest - steps: - - name: "[DEBUG] Print Variables" - run: | - echo The last commit sha in the release: ${{ inputs.sha }} - echo The branch to release from: ${{ inputs.target_branch }} - echo The release version number: ${{ inputs.version_number }} - echo Build script path: ${{ inputs.build_script_path }} - echo Environment setup script path: ${{ inputs.env_setup_script_path }} - echo AWS S3 bucket name: ${{ inputs.s3_bucket_name }} - echo Package test command: ${{ inputs.package_test_command }} - echo Test run: ${{ inputs.test_run }} - echo Nightly release: ${{ inputs.nightly_release }} - echo Only Docker: ${{ inputs.only_docker }} - - bump-version-generate-changelog: - name: Bump package version, Generate changelog - uses: dbt-labs/dbt-release/.github/workflows/release-prep.yml@main + release-prep: + name: "Release prep: generate changelog, bump version" + uses: ./.github/workflows/release_prep_hatch.yml with: - sha: ${{ inputs.sha }} - version_number: ${{ inputs.version_number }} - target_branch: ${{ inputs.target_branch }} - env_setup_script_path: ${{ inputs.env_setup_script_path }} - test_run: ${{ inputs.test_run }} - nightly_release: ${{ inputs.nightly_release }} + branch: ${{ inputs.branch }} + version: ${{ inputs.version }} + deploy-to: ${{ inputs.deploy-to }} secrets: inherit - log-outputs-bump-version-generate-changelog: - name: "[Log output] Bump package version, Generate changelog" - if: ${{ !failure() && !cancelled() && !inputs.only_docker }} - needs: [bump-version-generate-changelog] + build-release: + name: "Build release" + needs: release-prep runs-on: ubuntu-latest + outputs: + archive-name: ${{ steps.archive.outputs.name }} steps: - - name: Print variables + - uses: actions/checkout@v4 + with: + ref: ${{ needs.release-prep.outputs.release-branch }} + persist-credentials: false + - uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python_version }} + - uses: pypa/hatch@install + - id: archive run: | - echo Final SHA : ${{ needs.bump-version-generate-changelog.outputs.final_sha }} - echo Changelog path: ${{ needs.bump-version-generate-changelog.outputs.changelog_path }} - - build-test-package: - name: Build, Test, Package - if: ${{ !failure() && !cancelled() && !inputs.only_docker }} - needs: [bump-version-generate-changelog] - uses: dbt-labs/dbt-release/.github/workflows/build.yml@main - with: - sha: ${{ needs.bump-version-generate-changelog.outputs.final_sha }} - version_number: ${{ inputs.version_number }} - changelog_path: ${{ needs.bump-version-generate-changelog.outputs.changelog_path }} - build_script_path: ${{ inputs.build_script_path }} - s3_bucket_name: ${{ inputs.s3_bucket_name }} - package_test_command: ${{ inputs.package_test_command }} - test_run: ${{ inputs.test_run }} - nightly_release: ${{ inputs.nightly_release }} - secrets: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + archive_name=${{ github.event.repository.name }}-${{ inputs.version }}-${{ inputs.deploy-to }} + echo 
"name=$archive_name" >> $GITHUB_OUTPUT + - run: hatch build && hatch run build:check-all + - uses: actions/upload-artifact@v4 + with: + name: ${{ steps.archive.outputs.name }} + path: dist/ + retention-days: 3 github-release: name: GitHub Release if: ${{ !failure() && !cancelled() && !inputs.only_docker }} - needs: [bump-version-generate-changelog, build-test-package] - uses: dbt-labs/dbt-release/.github/workflows/github-release.yml@main + needs: [build-release, release-prep] + uses: dbt-labs/dbt-adapters/.github/workflows/github-release.yml@main with: sha: ${{ needs.bump-version-generate-changelog.outputs.final_sha }} version_number: ${{ inputs.version_number }} changelog_path: ${{ needs.bump-version-generate-changelog.outputs.changelog_path }} test_run: ${{ inputs.test_run }} + archive_name: ${{ needs.build-release.outputs.archive-name }} pypi-release: name: PyPI Release if: ${{ !failure() && !cancelled() && !inputs.only_docker }} - needs: [bump-version-generate-changelog, build-test-package] - uses: dbt-labs/dbt-release/.github/workflows/pypi-release.yml@main - with: - version_number: ${{ inputs.version_number }} - test_run: ${{ inputs.test_run }} - secrets: - PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} - TEST_PYPI_API_TOKEN: ${{ secrets.TEST_PYPI_API_TOKEN }} + needs: build-release + runs-on: ubuntu-latest + environment: + name: ${{ inputs.deploy-to }} + url: ${{ vars.PYPI_PROJECT_URL }} + permissions: + # this permission is required for trusted publishing + # see https://github.com/marketplace/actions/pypi-publish + id-token: write + steps: + - uses: dbt-labs/dbt-adapters/.github/actions/publish-pypi@main + with: + repository-url: ${{ vars.PYPI_REPOSITORY_URL }} + archive-name: ${{ needs.build-release.outputs.archive-name }} docker-release: name: "Docker Release" @@ -206,7 +190,7 @@ jobs: # what we need to release but draft releases don't actually tag the commit so it # finds nothing to release if: ${{ !failure() && !cancelled() && (!inputs.test_run || inputs.only_docker) }} - needs: [bump-version-generate-changelog, build-test-package, github-release] + needs: github-release permissions: packages: write uses: dbt-labs/dbt-release/.github/workflows/release-docker.yml@main diff --git a/.github/workflows/release_prep_hatch.yml b/.github/workflows/release_prep_hatch.yml new file mode 100644 index 000000000..ee092600d --- /dev/null +++ b/.github/workflows/release_prep_hatch.yml @@ -0,0 +1,455 @@ +# **what?** +# Perform the version bump, generate the changelog and run tests. +# +# Inputs: +# branch: The branch that we will release from +# version: The release version number (i.e. 1.0.0b1, 1.2.3rc2, 1.0.0) +# deploy-to: If we are deploying to prod or test, if test then release from branch +# is-nightly-release: Identifier that this is nightly release +# +# Outputs: +# release-sha: The sha that will actually be released. This can differ from the +# input sha if adding a version bump and/or changelog +# changelog-path: Path to the changelog file (ex .changes/1.2.3-rc1.md) +# +# Branching strategy: +# - During execution workflow execution the temp branch will be generated. 
+# - For normal runs the temp branch will be removed once changes were merged to target branch; +# - For test runs we will keep temp branch and will use it for release; +# Naming strategy: +# - For normal runs: prep-release/${{ inputs.deploy-to}}/${{ inputs.version }}_$GITHUB_RUN_ID +# - For nightly releases: prep-release/nightly-release/${{ inputs.version }}_$GITHUB_RUN_ID +# +# **why?** +# Reusable and consistent GitHub release process. +# +# **when?** +# Call when ready to kick off a build and release +# +# Validation Checks +# +# 1. Bump the version if it has not been bumped +# 2. Generate the changelog (via changie) if there is no markdown file for this version +name: "Release prep" +run-name: "Release prep: Generate changelog and bump to ${{ inputs.version }} for release to ${{ inputs.deploy-to }}" +on: + workflow_call: + inputs: + branch: + description: "The branch to release from" + type: string + default: "main" + version: + description: "The version to release" + required: true + type: string + deploy-to: + description: "Deploy to test or prod" + type: string + default: "prod" + is-nightly-release: + description: "Identify if this is a nightly release" + type: boolean + default: false + outputs: + release-branch: + description: "The branch to be released from" + value: ${{ jobs.release.outputs.branch }} + release-sha: + description: "The SHA to be released" + value: ${{ jobs.release.outputs.sha }} + changelog-path: + description: "The path to the changelog from the repo root for this version, e.g. .changes/1.8.0-b1.md" + value: ${{ jobs.release-inputs.outputs.changelog-path }} + secrets: + FISHTOWN_BOT_PAT: + description: "Token to commit/merge changes into branches" + required: true + IT_TEAM_MEMBERSHIP: + description: "Token that can view org level teams" + required: true + +permissions: + contents: write + +defaults: + run: + shell: bash + +env: + PYTHON_DEFAULT_VERSION: 3.9 + NOTIFICATION_PREFIX: "[Release Prep]" + +jobs: + release-inputs: + runs-on: ubuntu-latest + outputs: + changelog-path: ${{ steps.changelog.outputs.path }} + changelog-exists: ${{ steps.changelog.outputs.exists }} + base-version: ${{ steps.semver.outputs.base-version }} + pre-release: ${{ steps.semver.outputs.pre-release }} + is-pre-release: ${{ steps.semver.outputs.is-pre-release }} + version-is-current: ${{ steps.version.outputs.is-current }} + + steps: + - name: "[DEBUG] Log inputs" + run: | + # WORKFLOW INPUTS + echo Branch: ${{ inputs.branch }} + echo Release version: ${{ inputs.version }} + echo Deploy to: ${{ inputs.deploy-to }} + echo Nightly release: ${{ inputs.is-nightly-release }} + # ENVIRONMENT VARIABLES + echo Python version: ${{ env.PYTHON_DEFAULT_VERSION }} + echo Notification prefix: ${{ env.NOTIFICATION_PREFIX }} + + - name: "Checkout ${{ github.event.repository.name }}@${{ inputs.branch }}" + uses: actions/checkout@v4 + with: + ref: ${{ inputs.branch }} + + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_DEFAULT_VERSION }} + + - uses: pypa/hatch@install + + - name: "Parse input version" + id: semver + uses: dbt-labs/actions/parse-semver@v1.1.1 + with: + version: ${{ inputs.version }} + + - name: "Audit version" + id: version + run: | + is_current=false + current_version=$(hatch version) + if test "$current_version" = "${{ inputs.version }}" + then + is_current=true + fi + echo "is-current=$is_current" >> $GITHUB_OUTPUT + + - name: "[INFO] Skip version bump" + if: steps.version.outputs.is-current == 'true' + run: | + title="Skip version bump" + message="The 
version matches the input version ${{ inputs.version }}, skipping version bump" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + + - name: "Audit changelog" + id: changelog + run: | + path=".changes/" + if [[ ${{ steps.semver.outputs.is-pre-release }} -eq 1 ]] + then + path+="${{ steps.semver.outputs.base-version }}-${{ steps.semver.outputs.pre-release }}.md" + else + path+="${{ steps.semver.outputs.base-version }}.md" + fi + echo "path=$path" >> $GITHUB_OUTPUT + + does_exist=false + if test -f $path + then + does_exist=true + fi + echo "exists=$does_exist">> $GITHUB_OUTPUT + + - name: "[INFO] Skip changelog generation" + if: steps.changelog.outputs.exists == 'true' + run: | + title="Skip changelog generation" + message="A changelog already exists at ${{ steps.changelog.outputs.path }}, skipping generating changelog" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + + release-branch: + runs-on: ubuntu-latest + needs: release-inputs + if: | + needs.release-inputs.outputs.changelog-exists == 'false' || + needs.release-inputs.outputs.version-is-current == 'false' + outputs: + name: ${{ steps.release-branch.outputs.name }} + + steps: + - name: "Checkout ${{ github.event.repository.name }}@${{ inputs.branch }}" + uses: actions/checkout@v4 + with: + ref: ${{ inputs.branch }} + + - name: "Set release branch" + id: release-branch + run: | + name="prep-release/" + if [[ ${{ inputs.is-nightly-release }} == true ]] + then + name+="nightly-release/" + else + name+="${{ inputs.deploy-to }}/" + fi + name+="${{ inputs.version }}_$GITHUB_RUN_ID" + echo "name=$name" >> $GITHUB_OUTPUT + + - name: "Create release branch ${{ steps.release-branch.outputs.name }}" + run: | + git checkout -b ${{ steps.release-branch.outputs.name }} + git push -u origin ${{ steps.release-branch.outputs.name }} + + - name: "[INFO] Create release branch" + run: | + title="Create release branch" + message="Create release branch: ${{ steps.release-branch.outputs.name }}" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + + core-team: + if: needs.release-inputs.outputs.changelog-exists == 'false' + needs: release-inputs + uses: dbt-labs/actions/.github/workflows/determine-team-membership.yml@main + with: + github_team: "core-group" + secrets: inherit + + generate-changelog: + runs-on: ubuntu-latest + if: needs.release-inputs.outputs.changelog-exists == 'false' + # only runs if we need to make changes, determined by not skipping release-branch + needs: + - release-inputs + - release-branch + - core-team + + steps: + - name: "Checkout ${{ github.event.repository.name }}@${{ needs.release-branch.outputs.name }}" + uses: actions/checkout@v4 + with: + ref: ${{ needs.release-branch.outputs.name }} + + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_DEFAULT_VERSION }} + + - uses: pypa/hatch@install + + - name: "Install `changie`" + run: | + brew tap miniscruff/changie https://github.com/miniscruff/changie + brew install changie + + - name: "Generate changelog at ${{ needs.release-inputs.outputs.changelog-path }}" + run: | + if [[ ${{ needs.release-inputs.outputs.is-pre-release }} -eq 1 ]] + then + changie batch ${{ needs.release-inputs.outputs.base-version }} \ + --move-dir '${{ needs.release-inputs.outputs.base-version }}' \ + --prerelease ${{ needs.release-inputs.outputs.pre-release }} + elif [[ -d ".changes/${{ needs.release-inputs.outputs.base-version }}" ]] + then + changie batch ${{ needs.release-inputs.outputs.base-version }} \ + 
--include '${{ needs.release-inputs.outputs.base-version }}' \ + --remove-prereleases + else # releasing a final patch with no pre-releases + changie batch ${{ needs.release-inputs.outputs.base-version }} + fi + changie merge + env: + CHANGIE_CORE_TEAM: ${{ needs.core-team.outputs.team_membership }} + + - name: "Remove trailing whitespace and missing new lines" + # this step will fail on whitespace errors but also correct them + continue-on-error: true + run: hatch run code-quality + + - name: "Commit & push changes" + run: | + git config user.name "$USER" + git config user.email "$EMAIL" + git pull + git add . + git commit -m "$COMMIT_MESSAGE" + git push + env: + USER: "GitHub Build Bot" + EMAIL: "buildbot@fishtownanalytics.com" + COMMIT_MESSAGE: "Generate changelog at ${{ needs.release-inputs.outputs.changelog-path }}" + + - name: "[INFO] Generated changelog at ${{ needs.release-inputs.outputs.changelog-path }}" + run: | + title="Changelog generation" + if [[ -f ${{ needs.release-inputs.outputs.changelog-path }} ]] + then + message="Generated changelog file successfully" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + else + message="Failed to generate changelog file" + echo "::error title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + exit 1 + fi + + bump-version: + runs-on: ubuntu-latest + if: needs.release-inputs.outputs.version-is-current == 'false' + # only runs if we need to make changes, determined by not skipping release-branch + needs: + - release-inputs + - release-branch + - generate-changelog + + steps: + - name: "Checkout ${{ github.event.repository.name }}@${{ needs.release-branch.outputs.name }}" + uses: actions/checkout@v4 + with: + ref: ${{ needs.release-branch.outputs.name }} + + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_DEFAULT_VERSION }} + + - uses: pypa/hatch@install + + - name: "Bump version to ${{ inputs.version }}" + run: hatch version ${{ inputs.version }} + + - name: "Commit & push changes" + run: | + git config user.name "$USER" + git config user.email "$EMAIL" + git pull + git add . 
+ git commit -m "$COMMIT_MESSAGE" + git push + env: + USER: "GitHub Build Bot" + EMAIL: "buildbot@fishtownanalytics.com" + COMMIT_MESSAGE: "Bump version to ${{ inputs.version }}" + + - name: "[INFO] Bumped version to ${{ inputs.version }}" + run: | + title="Version bump" + message="Bumped version to ${{ inputs.version }}" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + + unit-tests: + runs-on: ubuntu-latest + # only run unit tests if we created a release branch and already bumped the version and generated the changelog + if: | + !failure() && !cancelled() && + needs.release-branch.outputs.name != '' + needs: + - release-branch + - generate-changelog + - bump-version + + steps: + - name: "Checkout ${{ github.event.repository.name }}@${{ needs.release-branch.outputs.name }}" + uses: actions/checkout@v4 + with: + ref: ${{ needs.release-branch.outputs.name }} + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - uses: pypa/hatch@install + + - name: "Run unit tests" + run: hatch run unit-tests + + integration-tests: + runs-on: ubuntu-latest + # only run integration tests if we created a release branch and already bumped the version and generated the changelog + if: | + !failure() && !cancelled() && + needs.release-branch.outputs.name != '' + needs: + - release-branch + - generate-changelog + - bump-version + + steps: + - name: "Checkout ${{ github.event.repository.name }}@${{ needs.release-branch.outputs.name }}" + uses: actions/checkout@v4 + with: + ref: ${{ needs.release-branch.outputs.name }} + + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_DEFAULT_VERSION }} + + - uses: pypa/hatch@install + + - name: "Run integration tests" + run: hatch run integration-tests + + merge-release-branch: + runs-on: ubuntu-latest + needs: + - unit-tests + - integration-tests + - release-branch + - release-inputs + if: | + !failure() && !cancelled() && + needs.release-branch.result == 'success' && + inputs.deploy-to == 'prod' + + steps: + - name: "Checkout ${{ github.event.repository.name }}" + uses: actions/checkout@v4 + + - name: "Merge changes into ${{ inputs.branch }}" + uses: everlytic/branch-merge@1.1.5 + with: + source_ref: ${{ needs.release-branch.outputs.name }} + target_branch: ${{ inputs.branch }} + github_token: ${{ secrets.FISHTOWN_BOT_PAT }} + commit_message_template: "[Automated] Merged {source_ref} into target {target_branch} during release process" + + - name: "[INFO] Merge changes into ${{ inputs.branch }}" + run: | + title="Merge changes" + message="Merge ${{ needs.release-branch.outputs.name }} into ${{ inputs.branch }}" + echo "::notice title=${{ env.NOTIFICATION_PREFIX }}: $title::$message" + + release: + runs-on: ubuntu-latest + needs: + - release-branch + - merge-release-branch + if: ${{ !failure() && !cancelled() }} + + # Get the SHA that will be released. + # If the changelog already exists and the version was already current on the input branch, then release from there. + # Otherwise, we generated a changelog and/or did the version bump in this workflow and there is a + # new sha to use from the merge we just did. Grab that here instead. 
+ outputs: + branch: ${{ steps.branch.outputs.name }} + sha: ${{ steps.sha.outputs.sha }} + + steps: + - name: "Set release branch" + id: branch + # If a release branch was created and not merged, use the release branch + # Otherwise, use the input branch because either nothing was done, or the changes were merged back in + run: | + if [[ ${{ needs.release-branch.result == 'success' }} && ${{ needs.merge-release-branch.result == 'skipped' }} ]]; then + branch="${{ needs.release-branch.outputs.name }}" + else + branch="${{ inputs.branch }}" + fi + echo "name=$branch" >> $GITHUB_OUTPUT + + - name: "Checkout ${{ github.event.repository.name }}@${{ steps.branch.outputs.name }}" + uses: actions/checkout@v4 + with: + ref: ${{ steps.branch.outputs.name }} + + - name: "Set release SHA" + id: sha + run: echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + + # if this is a real release and a release branch was created, delete it + - name: "Delete release branch: ${{ needs.release-branch.outputs.name }}" + if: ${{ inputs.deploy-to == 'prod' && inputs.is-nightly-release == 'false' && needs.release-branch.outputs.name != '' }} + run: git push origin -d ${{ needs.release-branch.outputs.name }} diff --git a/.github/workflows/version-bump.yml b/.github/workflows/version-bump.yml deleted file mode 100644 index bde34d683..000000000 --- a/.github/workflows/version-bump.yml +++ /dev/null @@ -1,28 +0,0 @@ -# **what?** -# This workflow will take the new version number to bump to. With that -# it will run versionbump to update the version number everywhere in the -# code base and then run changie to create the corresponding changelog. -# A PR will be created with the changes that can be reviewed before committing. - -# **why?** -# This is to aid in releasing dbt and making sure we have updated -# the version in all places and generated the changelog. - -# **when?** -# This is triggered manually - -name: Version Bump - -on: - workflow_dispatch: - inputs: - version_number: - description: 'The version number to bump to (ex. 1.2.0, 1.3.0b1)' - required: true - -jobs: - version_bump_and_changie: - uses: dbt-labs/actions/.github/workflows/version-bump.yml@main - with: - version_number: ${{ inputs.version_number }} - secrets: inherit # ok since what we are calling is internally maintained diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index cfbc714ed..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include dbt/include *.sql *.yml *.md diff --git a/Makefile b/Makefile deleted file mode 100644 index bdacb538b..000000000 --- a/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -.DEFAULT_GOAL:=help - -.PHONY: dev -dev: ## Installs adapter in develop mode along with development dependencies - @\ - pip install -e . -r dev-requirements.txt && pre-commit install - -.PHONY: dev-uninstall -dev-uninstall: ## Uninstalls all packages while maintaining the virtual environment - ## Useful when updating versions, or if you accidentally installed into the system interpreter - pip freeze | grep -v "^-e" | cut -d "@" -f1 | xargs pip uninstall -y - pip uninstall -y dbt-bigquery - -.PHONY: docker-dev -docker-dev: - docker build -f docker/dev.Dockerfile -t dbt-bigquery-dev . - docker run --rm -it --name dbt-bigquery-dev -v $(shell pwd):/opt/code dbt-bigquery-dev - -.PHONY: docker-prod -docker-prod: - docker build -f docker/Dockerfile -t dbt-bigquery . 
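
The release_prep_hatch.yml workflow added above replaces the deleted version-bump.yml workflow: instead of bumpversion, it audits and bumps the version with hatch and generates the changelog with changie. A rough local equivalent of those steps, assuming both tools are installed and using 1.10.0b1 purely as an illustrative version rather than a value taken from this patch:

    hatch version                         # print the current version from dbt/adapters/bigquery/__version__.py
    hatch version 1.10.0b1                # bump it in place, as the bump-version job does (illustrative value)
    changie batch 1.10.0 --prerelease b1  # write .changes/1.10.0-b1.md, as the generate-changelog job does
    changie merge                         # roll the batched notes into the main changelog

The workflow commits the same changes to a temporary prep-release/* branch, so running these commands locally is mainly useful as a dry run.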
diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index 2c0134110..000000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,20 +0,0 @@ -# install latest changes in dbt-core -git+https://github.com/dbt-labs/dbt-adapters.git -git+https://github.com/dbt-labs/dbt-adapters.git#subdirectory=dbt-tests-adapter -git+https://github.com/dbt-labs/dbt-common.git -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core - -# dev -ddtrace==2.3.0 -pre-commit~=3.7.0 -pytest~=7.4 -pytest-csv~=3.0 -pytest-dotenv~=0.5.2 -pytest-logbook~=1.2 -pytest-xdist~=3.6 -tox~=4.11 - -# build -bumpversion~=0.6.0 -twine~=5.1 -wheel~=0.43 diff --git a/hatch.toml b/hatch.toml new file mode 100644 index 000000000..34ba6d2a3 --- /dev/null +++ b/hatch.toml @@ -0,0 +1,61 @@ +[version] +path = "dbt/adapters/bigquery/__version__.py" + +[build.targets.sdist] +packages = ["dbt"] + +[build.targets.wheel] +packages = ["dbt"] + +[envs.default] +dependencies = [ + "dbt-adapters @ git+https://github.com/dbt-labs/dbt-adapters.git", + "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git", + "dbt-tests-adapter @ git+https://github.com/dbt-labs/dbt-adapters.git#subdirectory=dbt-tests-adapter", + "dbt-core @ git+https://github.com/dbt-labs/dbt-core.git#subdirectory=core", + "ddtrace==2.3.0", + "ipdb~=0.13.13", + "pre-commit==3.7.0", + "freezegun", + "pytest>=7.0,<8.0", + "pytest-csv~=3.0", + "pytest-dotenv", + "pytest-logbook~=1.2", + "pytest-mock", + "pytest-xdist", +] + +[envs.default.scripts] +setup = "pre-commit install" +code-quality = "pre-commit run --all-files" +unit-tests = "python -m pytest {args:tests/unit}" +integration-tests = "python -m pytest --profile service_account {args:tests/functional}" +docker-dev = [ + "docker build -f docker/dev.Dockerfile -t dbt-bigquery-dev .", + "docker run --rm -it --name dbt-bigquery-dev -v $(shell pwd):/opt/code dbt-bigquery-dev", +] + +[envs.build] +detached = true +dependencies = [ + "wheel", + "twine", + "check-wheel-contents", +] + +[envs.build.scripts] +check-all = [ + "- check-wheel", + "- check-sdist", +] +check-wheel = [ + "twine check dist/*", + "find ./dist/dbt_bigquery-*.whl -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/", + "pip freeze | grep dbt-bigquery", +] +check-sdist = [ + "check-wheel-contents dist/*.whl --ignore W007,W008", + "find ./dist/dbt_bigquery-*.gz -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/", + "pip freeze | grep dbt-bigquery", +] +docker-prod = "docker build -f docker/Dockerfile -t dbt-bigquery ." 
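
The hatch.toml environment above absorbs what the deleted Makefile, dev-requirements.txt and tox.ini used to provide. A sketch of the equivalent local developer loop, assuming hatch is installed and using the same script names and test filters the updated CI workflows invoke:

    pip install hatch                                   # one-time, outside the project environment
    hatch run setup                                     # install the pre-commit hooks (roughly the old `make dev`)
    hatch run code-quality                              # pre-commit run --all-files
    hatch run unit-tests                                # pytest tests/unit (replaces the old tox unit env)
    hatch run integration-tests tests/functional -k "not TestPython"
    hatch build                                         # sdist + wheel into dist/ (replaces scripts/build-dist.sh)
    hatch run build:check-all                           # twine and check-wheel-contents checks from the build env

The dev dependency pins that used to live in dev-requirements.txt are now the [envs.default] dependencies above, which is why .github/scripts/update_dependencies.sh rewrites hatch.toml instead of dev-requirements.txt.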
diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 247a47fec..000000000 --- a/mypy.ini +++ /dev/null @@ -1,2 +0,0 @@ -[mypy] -mypy_path = third-party-stubs/ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..b2d55b25f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +dynamic = ["version"] +name = "dbt-bigquery" +description = "The BigQuery adapter plugin for dbt" +readme = "README.md" +keywords = ["dbt", "adapter", "adapters", "database", "elt", "dbt-core", "dbt Core", "dbt Cloud", "dbt Labs", "bigquery", "google"] +requires-python = ">=3.9.0" +authors = [{ name = "dbt Labs", email = "info@dbtlabs.com" }] +maintainers = [{ name = "dbt Labs", email = "info@dbtlabs.com" }] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "dbt-common>=1.10,<2.0", + "dbt-adapters>=1.7,<2.0", + # 3.20 introduced pyarrow>=3.0 under the `pandas` extra + "google-cloud-bigquery[pandas]>=3.0,<4.0", + "google-cloud-storage~=2.4", + "google-cloud-dataproc~=5.0", + # ---- + # Expect compatibility with all new versions of these packages, so lower bounds only. + "google-api-core>=2.11.0", + # add dbt-core to ensure backwards compatibility of installation, this is not a functional dependency + "dbt-core>=1.8.0", +] + +[project.urls] +Homepage = "https://github.com/dbt-labs/dbt-bigquery" +Documentation = "https://docs.getdbt.com" +Repository = "https://github.com/dbt-labs/dbt-bigquery.git" +Issues = "https://github.com/dbt-labs/dbt-bigquery/issues" +Changelog = "https://github.com/dbt-labs/dbt-bigquery/blob/main/CHANGELOG.md" + +[tool.mypy] +mypy_path = "third-party-stubs/" + +[tool.pytest.ini_options] +testpaths = ["tests/functional", "tests/unit"] +env_files = ["test.env"] +addopts = "-v -n auto" +color = true +filterwarnings = [ + "ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning", + "ignore:unclosed file .*:ResourceWarning", +] diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index b04a6ccf3..000000000 --- a/pytest.ini +++ /dev/null @@ -1,10 +0,0 @@ -[pytest] -filterwarnings = - ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning - ignore:unclosed file .*:ResourceWarning -env_files = - test.env -testpaths = - tests/unit - tests/integration - tests/functional diff --git a/scripts/build-dist.sh b/scripts/build-dist.sh deleted file mode 100755 index 3c3808399..000000000 --- a/scripts/build-dist.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -eo pipefail - -DBT_PATH="$( cd "$(dirname "$0")/.." 
; pwd -P )" - -PYTHON_BIN=${PYTHON_BIN:-python} - -echo "$PYTHON_BIN" - -set -x - -rm -rf "$DBT_PATH"/dist -rm -rf "$DBT_PATH"/build -mkdir -p "$DBT_PATH"/dist - -cd "$DBT_PATH" -$PYTHON_BIN setup.py sdist bdist_wheel - -set +x diff --git a/setup.py b/setup.py deleted file mode 100644 index 79f6025ea..000000000 --- a/setup.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -import sys - -# require a supported version of Python -if sys.version_info < (3, 9): - print("Error: dbt does not support this version of Python.") - print("Please upgrade to Python 3.9 or higher.") - sys.exit(1) - -try: - from setuptools import find_namespace_packages -except ImportError: - # the user has a downlevel version of setuptools. - print("Error: dbt requires setuptools v40.1.0 or higher.") - print('Please upgrade setuptools with "pip install --upgrade setuptools" and try again') - sys.exit(1) - -from pathlib import Path -from setuptools import setup - - -# pull the long description from the README -README = Path(__file__).parent / "README.md" - -# used for this adapter's version and in determining the compatible dbt-core version -VERSION = Path(__file__).parent / "dbt/adapters/bigquery/__version__.py" - - -def _dbt_bigquery_version() -> str: - """ - Pull the package version from the main package version file - """ - attributes = {} - exec(VERSION.read_text(), attributes) - return attributes["version"] - - -package_name = "dbt-bigquery" -description = """The BigQuery adapter plugin for dbt""" - -setup( - name="dbt-bigquery", - version=_dbt_bigquery_version(), - description="The Bigquery adapter plugin for dbt", - long_description=README.read_text(), - long_description_content_type="text/markdown", - author="dbt Labs", - author_email="info@dbtlabs.com", - url="https://github.com/dbt-labs/dbt-bigquery", - packages=find_namespace_packages(include=["dbt", "dbt.*"]), - include_package_data=True, - install_requires=[ - "dbt-common>=1.10,<2.0", - "dbt-adapters>=1.7,<2.0", - # 3.20 introduced pyarrow>=3.0 under the `pandas` extra - "google-cloud-bigquery[pandas]>=3.0,<4.0", - "google-cloud-storage~=2.4", - "google-cloud-dataproc~=5.0", - # ---- - # Expect compatibility with all new versions of these packages, so lower bounds only. - "google-api-core>=2.11.0", - # add dbt-core to ensure backwards compatibility of installation, this is not a functional dependency - "dbt-core>=1.8.0", - ], - zip_safe=False, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "License :: OSI Approved :: Apache Software License", - "Operating System :: Microsoft :: Windows", - "Operating System :: MacOS :: MacOS X", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - ], - python_requires=">=3.9", -) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 240d85e34..000000000 --- a/tox.ini +++ /dev/null @@ -1,54 +0,0 @@ -[tox] -skipsdist = True -envlist = py39,py310,py311,py312 - -[testenv:{unit,py39,py310,py311,py312,py}] -description = unit testing -skip_install = true -passenv = - DBT_* - PYTEST_ADDOPTS -commands = {envpython} -m pytest {posargs} tests/unit -deps = - -rdev-requirements.txt - -e. 
- -[testenv:{integration,py39,py310,py311,py312,py}-{bigquery}] -description = adapter plugin integration testing -skip_install = true -passenv = - DBT_* - BIGQUERY_TEST_* - PYTEST_ADDOPTS - DATAPROC_* - GCS_BUCKET - DD_CIVISIBILITY_AGENTLESS_ENABLED - DD_API_KEY - DD_SITE - DD_ENV - DD_SERVICE -commands = - bigquery: {envpython} -m pytest -n auto {posargs} -vv tests/functional -k "not TestPython" --profile service_account -deps = - -rdev-requirements.txt - . - -[testenv:{python-tests,py39,py310,py311,py312,py}] -description = python integration testing -skip_install = true -passenv = - DBT_* - BIGQUERY_TEST_* - PYTEST_ADDOPTS - DATAPROC_* - GCS_BUCKET - DD_CIVISIBILITY_AGENTLESS_ENABLED - DD_API_KEY - DD_SITE - DD_ENV - DD_SERVICE -commands = - {envpython} -m pytest {posargs} -vv tests/functional -k "TestPython" --profile service_account -deps = - -rdev-requirements.txt - -e. From 2e1a5fdcc7b9cd978660e6e9f030e03075f6fdc5 Mon Sep 17 00:00:00 2001 From: Mike Alfare <13974384+mikealfare@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:18:02 -0500 Subject: [PATCH 10/11] [Bug] Use bigquery default retryable exceptions (#1431) * replace our custom list of retryable exceptions with BigQuery's defaults * remove BadRequest as a retryable error --- .../unreleased/Fixes-20241211-144752.yaml | 6 ++++++ dbt/adapters/bigquery/retry.py | 19 ++----------------- .../unit/test_bigquery_connection_manager.py | 6 ++++-- 3 files changed, 12 insertions(+), 19 deletions(-) create mode 100644 .changes/unreleased/Fixes-20241211-144752.yaml diff --git a/.changes/unreleased/Fixes-20241211-144752.yaml b/.changes/unreleased/Fixes-20241211-144752.yaml new file mode 100644 index 000000000..e666d5c31 --- /dev/null +++ b/.changes/unreleased/Fixes-20241211-144752.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Fix retry scenarios so that dbt always retries when BigQuery recommends a retry +time: 2024-12-11T14:47:52.36905-05:00 +custom: + Author: mikealfare + Issue: "263" diff --git a/dbt/adapters/bigquery/retry.py b/dbt/adapters/bigquery/retry.py index 391c00e46..2cbdaa245 100644 --- a/dbt/adapters/bigquery/retry.py +++ b/dbt/adapters/bigquery/retry.py @@ -1,10 +1,8 @@ from typing import Callable, Optional -from google.api_core.exceptions import Forbidden from google.api_core.future.polling import DEFAULT_POLLING from google.api_core.retry import Retry -from google.cloud.bigquery.retry import DEFAULT_RETRY -from google.cloud.exceptions import BadGateway, BadRequest, ServerError +from google.cloud.bigquery.retry import DEFAULT_RETRY, _job_should_retry from requests.exceptions import ConnectionError from dbt.adapters.contracts.connection import Connection, ConnectionState @@ -83,7 +81,7 @@ def __call__(self, error: Exception) -> bool: self._error_count += 1 # if the error is retryable, and we haven't breached the threshold, log and continue - if _is_retryable(error) and self._error_count <= self._retries: + if _job_should_retry(error) and self._error_count <= self._retries: _logger.debug( f"Retry attempt {self._error_count} of {self._retries} after error: {repr(error)}" ) @@ -113,16 +111,3 @@ def on_error(error: Exception): raise FailedToConnectError(str(e)) return on_error - - -def _is_retryable(error: Exception) -> bool: - """Return true for errors that are unlikely to occur again if retried.""" - if isinstance( - error, (BadGateway, BadRequest, ConnectionError, ConnectionResetError, ServerError) - ): - return True - elif isinstance(error, Forbidden) and any( - e["reason"] == "rateLimitExceeded" for e in error.errors - ): - 
return True - return False diff --git a/tests/unit/test_bigquery_connection_manager.py b/tests/unit/test_bigquery_connection_manager.py index d4c95792e..e7afd692f 100644 --- a/tests/unit/test_bigquery_connection_manager.py +++ b/tests/unit/test_bigquery_connection_manager.py @@ -53,7 +53,7 @@ def generate_connection_reset_error(): assert new_mock_client is not self.mock_client def test_is_retryable(self): - _is_retryable = dbt.adapters.bigquery.retry._is_retryable + _is_retryable = google.cloud.bigquery.retry._job_should_retry exceptions = dbt.adapters.bigquery.impl.google.cloud.exceptions internal_server_error = exceptions.InternalServerError("code broke") bad_request_error = exceptions.BadRequest("code broke") @@ -65,7 +65,9 @@ def test_is_retryable(self): service_unavailable_error = exceptions.ServiceUnavailable("service is unavailable") self.assertTrue(_is_retryable(internal_server_error)) - self.assertTrue(_is_retryable(bad_request_error)) + self.assertFalse( + _is_retryable(bad_request_error) + ) # this was removed after initially being included self.assertTrue(_is_retryable(connection_error)) self.assertFalse(_is_retryable(client_error)) self.assertTrue(_is_retryable(rate_limit_error)) From a219818c5a38339568bfb4e561405cfe8f6732eb Mon Sep 17 00:00:00 2001 From: Mike Alfare <13974384+mikealfare@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:19:56 -0500 Subject: [PATCH 11/11] Retry on 503 (#1408) * add default retry on all client factories, which includes 502 and 503 errors * update retries to use defaults and ensure that a timeout or deadline is set --- .../unreleased/Fixes-20241120-163101.yaml | 7 ++++ dbt/adapters/bigquery/clients.py | 12 +++---- dbt/adapters/bigquery/retry.py | 33 ++++++++++--------- hatch.toml | 1 + 4 files changed, 31 insertions(+), 22 deletions(-) create mode 100644 .changes/unreleased/Fixes-20241120-163101.yaml diff --git a/.changes/unreleased/Fixes-20241120-163101.yaml b/.changes/unreleased/Fixes-20241120-163101.yaml new file mode 100644 index 000000000..ba1f4e937 --- /dev/null +++ b/.changes/unreleased/Fixes-20241120-163101.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: Fix issue where dbt-bigquery was not retrying in certain retryable scenarios, + e.g. 503's +time: 2024-11-20T16:31:01.60689-05:00 +custom: + Author: mikealfare + Issue: "682" diff --git a/dbt/adapters/bigquery/clients.py b/dbt/adapters/bigquery/clients.py index 18c59fc12..722266240 100644 --- a/dbt/adapters/bigquery/clients.py +++ b/dbt/adapters/bigquery/clients.py @@ -1,10 +1,10 @@ from google.api_core.client_info import ClientInfo from google.api_core.client_options import ClientOptions -from google.api_core.retry import Retry from google.auth.exceptions import DefaultCredentialsError -from google.cloud.bigquery import Client as BigQueryClient +from google.cloud.bigquery import Client as BigQueryClient, DEFAULT_RETRY as BQ_DEFAULT_RETRY from google.cloud.dataproc_v1 import BatchControllerClient, JobControllerClient from google.cloud.storage import Client as StorageClient +from google.cloud.storage.retry import DEFAULT_RETRY as GCS_DEFAULT_RETRY from dbt.adapters.events.logging import AdapterLogger @@ -28,7 +28,7 @@ def create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: return _create_bigquery_client(credentials) -@Retry() # google decorator. 
retries on transient errors with exponential backoff +@GCS_DEFAULT_RETRY def create_gcs_client(credentials: BigQueryCredentials) -> StorageClient: return StorageClient( project=credentials.execution_project, @@ -36,7 +36,7 @@ def create_gcs_client(credentials: BigQueryCredentials) -> StorageClient: ) -@Retry() # google decorator. retries on transient errors with exponential backoff +# dataproc does not appear to have a default retry like BQ and GCS def create_dataproc_job_controller_client(credentials: BigQueryCredentials) -> JobControllerClient: return JobControllerClient( credentials=create_google_credentials(credentials), @@ -44,7 +44,7 @@ def create_dataproc_job_controller_client(credentials: BigQueryCredentials) -> J ) -@Retry() # google decorator. retries on transient errors with exponential backoff +# dataproc does not appear to have a default retry like BQ and GCS def create_dataproc_batch_controller_client( credentials: BigQueryCredentials, ) -> BatchControllerClient: @@ -54,7 +54,7 @@ def create_dataproc_batch_controller_client( ) -@Retry() # google decorator. retries on transient errors with exponential backoff +@BQ_DEFAULT_RETRY def _create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: return BigQueryClient( credentials.execution_project, diff --git a/dbt/adapters/bigquery/retry.py b/dbt/adapters/bigquery/retry.py index 2cbdaa245..cc197a7d3 100644 --- a/dbt/adapters/bigquery/retry.py +++ b/dbt/adapters/bigquery/retry.py @@ -2,7 +2,7 @@ from google.api_core.future.polling import DEFAULT_POLLING from google.api_core.retry import Retry -from google.cloud.bigquery.retry import DEFAULT_RETRY, _job_should_retry +from google.cloud.bigquery.retry import DEFAULT_JOB_RETRY, _job_should_retry from requests.exceptions import ConnectionError from dbt.adapters.contracts.connection import Connection, ConnectionState @@ -15,14 +15,8 @@ _logger = AdapterLogger("BigQuery") - -_SECOND = 1.0 -_MINUTE = 60 * _SECOND -_HOUR = 60 * _MINUTE -_DAY = 24 * _HOUR -_DEFAULT_INITIAL_DELAY = _SECOND -_DEFAULT_MAXIMUM_DELAY = 3 * _SECOND -_DEFAULT_POLLING_MAXIMUM_DELAY = 10 * _SECOND +_MINUTE = 60.0 +_DAY = 24 * 60 * 60.0 class RetryFactory: @@ -44,7 +38,7 @@ def create_job_execution_timeout(self, fallback: float = _DAY) -> float: ) # keep _DAY here so it's not overridden by passing fallback=None def create_retry(self, fallback: Optional[float] = None) -> Retry: - return DEFAULT_RETRY.with_timeout(self._job_execution_timeout or fallback or _DAY) + return DEFAULT_JOB_RETRY.with_timeout(self._job_execution_timeout or fallback or _DAY) def create_polling(self, model_timeout: Optional[float] = None) -> Retry: return DEFAULT_POLLING.with_timeout(model_timeout or self._job_execution_timeout or _DAY) @@ -53,14 +47,21 @@ def create_reopen_with_deadline(self, connection: Connection) -> Retry: """ This strategy mimics what was accomplished with _retry_and_handle """ - return Retry( - predicate=_DeferredException(self._retries), - initial=_DEFAULT_INITIAL_DELAY, - maximum=_DEFAULT_MAXIMUM_DELAY, - deadline=self._job_deadline, - on_error=_create_reopen_on_error(connection), + + retry = DEFAULT_JOB_RETRY.with_delay(maximum=3.0).with_predicate( + _DeferredException(self._retries) ) + # there is no `with_on_error` method, but we want to retain the defaults on `DEFAULT_JOB_RETRY + retry._on_error = _create_reopen_on_error(connection) + + # don't override the default deadline to None if the user did not provide one, + # the process will never end + if deadline := self._job_deadline: + return 
retry.with_deadline(deadline) + + return retry + class _DeferredException: """ diff --git a/hatch.toml b/hatch.toml index 34ba6d2a3..4e0baf9f4 100644 --- a/hatch.toml +++ b/hatch.toml @@ -8,6 +8,7 @@ packages = ["dbt"] packages = ["dbt"] [envs.default] +python = "3.9" dependencies = [ "dbt-adapters @ git+https://github.com/dbt-labs/dbt-adapters.git", "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git",