diff --git a/.changes/unreleased/Features-20230915-091507.yaml b/.changes/unreleased/Features-20230915-091507.yaml new file mode 100644 index 000000000..7a5fc3a54 --- /dev/null +++ b/.changes/unreleased/Features-20230915-091507.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Redact cases where raw data can be leaked logs +time: 2023-09-15T09:15:07.430443+10:00 +custom: + Author: jaypeedevlin + Issue: "772" diff --git a/dbt/adapters/snowflake/connections.py b/dbt/adapters/snowflake/connections.py index 1078b9f73..cf2253f09 100644 --- a/dbt/adapters/snowflake/connections.py +++ b/dbt/adapters/snowflake/connections.py @@ -48,7 +48,11 @@ logger = AdapterLogger("Snowflake") _TOKEN_REQUEST_URL = "https://{}.snowflakecomputing.com/oauth/token-request" -ROW_VALUE_REGEX = re.compile(r"Row Values: \[(.|\n)*\]") + +ERROR_REDACTION_PATTERNS = { + re.compile(r"Row Values: \[(.|\n)*\]"): "Row Values: [redacted]", + re.compile(r"Duplicate field key '(.|\n)*'"): "Duplicate field key '[redacted]'", +} @dataclass @@ -271,13 +275,14 @@ def exception_handler(self, sql): try: yield except snowflake.connector.errors.ProgrammingError as e: - unscrubbed_msg = str(e) + msg = str(e) # A class of Snowflake errors -- such as a failure from attempting to merge # duplicate rows -- includes row values in the error message, i.e. # [12345, "col_a_value", "col_b_value", etc...]. We don't want to log potentially # sensitive user data. - msg = re.sub(ROW_VALUE_REGEX, "Row Values: [redacted]", unscrubbed_msg) + for regex_pattern, replacement_message in ERROR_REDACTION_PATTERNS.items(): + msg = re.sub(regex_pattern, replacement_message, msg) logger.debug("Snowflake query id: {}".format(e.sfqid)) logger.debug("Snowflake error: {}".format(msg)) diff --git a/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py b/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py new file mode 100644 index 000000000..e32f08c15 --- /dev/null +++ b/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py @@ -0,0 +1,33 @@ +import pytest + +from dbt.tests.util import ( + run_dbt, +) + +_MODELS__view = """ +{{ config( + materialized='table', +) }} + +with dupes as ( + select 'foo' as key, 1 as value + union all + select 'foo' as key, 2 as value +) + +select + object_agg(key, value) as agg +from dupes +""" + + +class TestDuplicateKeyNotInExceptions: + @pytest.fixture(scope="class") + def models(self): + return {"model.sql": _MODELS__view} + + def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project): + result = run_dbt(["run", "-s", "model"], expect_pass=False) + assert len(result) == 1 + assert "Duplicate field key '[redacted]'" in result[0].message + assert "'foo'" not in result[0].message diff --git a/tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py b/tests/functional/redact_log_values/test_row_values_not_in_exceptions.py similarity index 94% rename from tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py rename to tests/functional/redact_log_values/test_row_values_not_in_exceptions.py index 409b18ae3..ec4fe2348 100644 --- a/tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py +++ b/tests/functional/redact_log_values/test_row_values_not_in_exceptions.py @@ -31,3 +31,4 @@ def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project): result = run_dbt(["run", "-s", "model"], expect_pass=False) assert len(result) == 1 assert "Row Values: [redacted]" in result[0].message + assert "'one'" not in result[0].message