Merge branch 'copy-code-from-dbt-core' of https://github.com/dbt-labs…

…/dbt-common into er/ci-workflow
dbt-labs · Jan 5, 2024 · 73f0a35 · 73f0a35
2 parents f3cfc0b + 0443b52
commit 73f0a35
Show file tree

Hide file tree

Showing 13 changed files with 1,423 additions and 10 deletions.
diff --git a/Makefile b/Makefile
@@ -1,14 +1,24 @@
 .DEFAULT_GOAL:=help
 
-.PHONY: dev_req
-dev_req: ## Installs dbt-* packages in develop mode along with only development dependencies.
-	@\
-	pip install -r dev-requirements.txt
 
-.PHONY: dev
-dev: dev_req ## Installs dbt-* packages in develop mode along with development dependencies and pre-commit.
-	@\
-	pre-commit install
+# NOTE(review): `install` and `json_schema` are declared phony here, but no rule
+# with either name is visible in this hunk -- confirm they are defined elsewhere.
+.PHONY: run install-hatch overwrite-pre-commit install test lint json_schema
+
+# NOTE(review): make runs each recipe line in its own shell (unless .ONESHELL is
+# set elsewhere), so this export is the only thing the target does and the
+# variable is gone once the line finishes -- confirm what `run` should launch.
+run:
+	export FORMAT_JSON_LOGS="1"
+
+# Installs Hatch with pip3.
+install-hatch:
+	pip3 install hatch
+
+# This edits your local pre-commit hook file to use Hatch when executing.
+overwrite-pre-commit:
+	hatch run dev-env:pre-commit install
+	hatch run dev-env:sed -i -e "s/exec /exec hatch run dev-env:/g" .git/hooks/pre-commit
+
+# Runs pytest on the `tests` directory inside the Hatch `dev-env` environment,
+# with FORMAT_JSON_LOGS=1 exported in the same shell command.
+test:
+	export FORMAT_JSON_LOGS="1" && hatch -v run dev-env:pytest -n auto tests
+
+# Runs the pre-commit hooks over all files, printing a diff when a hook fails.
+lint:
+	hatch run dev-env:pre-commit run --show-diff-on-failure --color=always --all-files
 
 .PHONY: proto_types
 proto_types:  ## generates google protobuf python file from types.proto
@@ -20,4 +30,3 @@ help: ## Show this help message.
 	@echo
 	@echo 'targets:'
 	@grep -E '^[8+a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
-
diff --git a/dbt/common/ui.py b/dbt/common/ui.py
@@ -1,8 +1,20 @@
+import os
+import sys
 import textwrap
 from typing import Dict
 
 import colorama
 
+# Colorama is needed for colored logs on Windows because we're using logger.info
+# instead of print(). If the Windows env doesn't have a TERM var set or it is set to None
+# (i.e. in the case of Git Bash on Windows, which emulates Unix), then it's safe to initialize
+# Colorama with wrapping turned on which allows us to strip ANSI sequences from stdout.
+# You can safely initialize Colorama for any OS and the coloring stays the same except
+# when piped to another process for Linux and MacOS, then it loses the coloring. To combat
+# that, we will just initialize Colorama when needed on Windows using a non-Unix terminal.
+
+# `os` must be imported for this check: on win32 the left operand is true, so the
+# os.getenv calls are evaluated as soon as the module is imported.
+if sys.platform == "win32" and (not os.getenv("TERM") or os.getenv("TERM") == "None"):
+    colorama.init(wrap=True)
+
 COLORS: Dict[str, str] = {
     "red": colorama.Fore.RED,
     "green": colorama.Fore.GREEN,

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,10 +21,13 @@ classifiers = [
 ]
 dependencies = [
   "agate~=1.7.0",
+  "colorama>=0.3.9,<0.5",  # TODO: major version 0 - should we use it?
   "jsonschema~=4.0",
   "Jinja2~=3.0",
   "mashumaro[msgpack]~=3.9",
+  "protobuf>=4.0.0",
   "python-dateutil~=2.0",
+  "requests<3.0.0",
   "typing-extensions~=4.4",
 ]
 
@@ -106,4 +109,4 @@ disallow_untyped_defs = false
 profile = "black"
 
 [tool.black]
-line-length = 120
+line-length = 120
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
diff --git a/tests/unit/test_agate_helper.py b/tests/unit/test_agate_helper.py
@@ -0,0 +1,227 @@
+import unittest
+
+import agate
+
+from datetime import datetime
+from decimal import Decimal
+from isodate import tzinfo
+import os
+from shutil import rmtree
+from tempfile import mkdtemp
+from dbt.common.clients import agate_helper
+
+# CSV fixture: a header row (columns a-g) followed by two data rows.
+SAMPLE_CSV_DATA = """a,b,c,d,e,f,g
+1,n,test,3.2,20180806T11:33:29.320Z,True,NULL
+2,y,asdf,900,20180806T11:35:29.320Z,False,a string"""
+
+# The same CSV text with a leading U+FEFF byte order mark.
+SAMPLE_CSV_BOM_DATA = "\ufeff" + SAMPLE_CSV_DATA
+
+
+# Rows the tests expect back when the fixture values are converted to typed
+# values: numbers, text, UTC datetimes, booleans, and None for "NULL".
+EXPECTED = [
+    [
+        1,
+        "n",
+        "test",
+        Decimal("3.2"),
+        datetime(2018, 8, 6, 11, 33, 29, 320000, tzinfo=tzinfo.Utc()),
+        True,
+        None,
+    ],
+    [
+        2,
+        "y",
+        "asdf",
+        900,
+        datetime(2018, 8, 6, 11, 35, 29, 320000, tzinfo=tzinfo.Utc()),
+        False,
+        "a string",
+    ],
+]
+
+
+# Rows expected by test_from_csv_all_reserved: every value stays a string,
+# except that "NULL" is still expected to come back as None.
+EXPECTED_STRINGS = [
+    ["1", "n", "test", "3.2", "20180806T11:33:29.320Z", "True", None],
+    ["2", "y", "asdf", "900", "20180806T11:35:29.320Z", "False", "a string"],
+]
+
+
+class TestAgateHelper(unittest.TestCase):
+    """Tests for the CSV/data loading and table-merging helpers in ``agate_helper``."""
+
+    def setUp(self):
+        # Each test gets its own scratch directory for the CSV it writes.
+        self.tempdir = mkdtemp()
+
+    def tearDown(self):
+        rmtree(self.tempdir)
+
+    def _write_input(self, text):
+        """Write ``text`` as UTF-8 bytes to ``input.csv`` in the scratch directory; return the path."""
+        csv_path = os.path.join(self.tempdir, "input.csv")
+        with open(csv_path, "wb") as handle:
+            handle.write(text.encode("utf-8"))
+        return csv_path
+
+    def _assert_rows(self, table, expected_rows):
+        """Assert that ``table`` holds exactly ``expected_rows``, in order."""
+        self.assertEqual(len(table), len(expected_rows))
+        for position, actual in enumerate(table):
+            self.assertEqual(list(actual), expected_rows[position])
+
+    def _assert_merge(self, tables, names, types, row_count):
+        """Merge ``tables`` and check the resulting column names, column types and row count."""
+        merged = agate_helper.merge_tables(tables)
+        self.assertEqual(merged.column_names, names)
+        for position, wanted in enumerate(types):
+            assert isinstance(merged.column_types[position], wanted)
+        self.assertEqual(len(merged), row_count)
+
+    def test_from_csv(self):
+        """The sample CSV loads into the typed rows in EXPECTED."""
+        table = agate_helper.from_csv(self._write_input(SAMPLE_CSV_DATA), ())
+        self._assert_rows(table, EXPECTED)
+
+    def test_bom_from_csv(self):
+        """A leading byte order mark does not change the rows that are loaded."""
+        table = agate_helper.from_csv(self._write_input(SAMPLE_CSV_BOM_DATA), ())
+        self._assert_rows(table, EXPECTED)
+
+    def test_from_csv_all_reserved(self):
+        """Naming all seven columns in the second argument yields the string rows."""
+        # NOTE(review): presumably the second argument lists columns to keep as
+        # text -- confirm against agate_helper.from_csv.
+        table = agate_helper.from_csv(self._write_input(SAMPLE_CSV_DATA), tuple("abcdefg"))
+        self._assert_rows(table, EXPECTED_STRINGS)
+
+    def test_from_data(self):
+        """Dict records with string values produce the same typed rows as the CSV."""
+        column_names = ["a", "b", "c", "d", "e", "f", "g"]
+        raw_rows = [
+            ("1", "n", "test", "3.2", "20180806T11:33:29.320Z", "True", "NULL"),
+            ("2", "y", "asdf", "900", "20180806T11:35:29.320Z", "False", "a string"),
+        ]
+        records = [dict(zip(column_names, values)) for values in raw_rows]
+        table = agate_helper.table_from_data(records, column_names)
+        self._assert_rows(table, EXPECTED)
+
+    def test_datetime_formats(self):
+        """Each accepted timestamp spelling parses to the same UTC datetime."""
+        wanted = datetime(2018, 8, 6, 11, 33, 29, 0, tzinfo=tzinfo.Utc())
+        spellings = (
+            "20180806T11:33:29.000Z",
+            "20180806T11:33:29Z",
+            "20180806T113329Z",
+        )
+        for stamp in spellings:
+            # The file is rewritten on every pass: a one-column CSV with header "a".
+            csv_path = self._write_input("a\n{}".format(stamp))
+            table = agate_helper.from_csv(csv_path, ())
+            self.assertEqual(table[0][0], wanted)
+
+    def test_merge_allnull(self):
+        """A column that is None in every merged table is expected to be Integer."""
+        names = ("a", "b", "c")
+        first = agate_helper.table_from_rows([(1, "a", None), (2, "b", None)], names)
+        second = agate_helper.table_from_rows([(3, "c", None), (4, "d", None)], names)
+        self._assert_merge(
+            [first, second],
+            ("a", "b", "c"),
+            (agate_helper.Integer, agate.data_types.Text, agate_helper.Integer),
+            4,
+        )
+
+    def test_merge_mixed(self):
+        """Merged column types follow the non-null values, whatever the table order."""
+        names = ("a", "b", "c", "d")
+        all_null = agate_helper.table_from_rows([(1, "a", None, None), (2, "b", None, None)], names)
+        text_and_int = agate_helper.table_from_rows([(3, "c", "dog", 1), (4, "d", "cat", 5)], names)
+        null_and_float = agate_helper.table_from_rows([(3, "c", None, 1.5), (4, "d", None, 3.5)], names)
+
+        integer = agate_helper.Integer
+        text = agate.data_types.Text
+        number = agate.data_types.Number
+
+        # (tables to merge, expected column types, expected row count)
+        scenarios = [
+            ([all_null, text_and_int], (integer, text, text, integer), 4),
+            ([all_null, null_and_float], (integer, text, integer, number), 4),
+            ([text_and_int, null_and_float], (integer, text, text, number), 4),
+            ([null_and_float, text_and_int], (integer, text, text, number), 4),
+            ([all_null, text_and_int, null_and_float], (integer, text, text, number), 6),
+        ]
+        for tables, types, row_count in scenarios:
+            self._assert_merge(tables, ("a", "b", "c", "d"), types, row_count)
+
+    def test_nocast_string_types(self):
+        """String fields keep their string values instead of being coerced."""
+        # String fields should not be coerced into a representative type
+        # See: https://github.com/dbt-labs/dbt-core/issues/2984
+
+        column_names = ["a", "b", "c", "d", "e"]
+        records = [
+            {"a": "0005", "b": "01T00000aabbccdd", "c": "true", "d": 10, "e": False},
+            {"a": "0006", "b": "01T00000aabbccde", "c": "false", "d": 11, "e": True},
+        ]
+        wanted_rows = [
+            ["0005", "01T00000aabbccdd", "true", Decimal(10), False],
+            ["0006", "01T00000aabbccde", "false", Decimal(11), True],
+        ]
+
+        table = agate_helper.table_from_data_flat(data=records, column_names=column_names)
+        self.assertEqual(len(table), len(records))
+
+        for position, actual in enumerate(table):
+            self.assertEqual(list(actual), wanted_rows[position])
+
+    def test_nocast_bool_01(self):
+        """Booleans stay Boolean and 1/0 stay Integer; neither is cast to the other."""
+        # True and False values should not be cast to 1 and 0, and vice versa
+        # See: https://github.com/dbt-labs/dbt-core/issues/4511
+
+        column_names = ["a", "b"]
+        records = [
+            {"a": True, "b": 1},
+            {"a": False, "b": 0},
+        ]
+        wanted_rows = [
+            [True, Decimal(1)],
+            [False, Decimal(0)],
+        ]
+
+        table = agate_helper.table_from_data_flat(data=records, column_names=column_names)
+        self.assertEqual(len(table), len(records))
+
+        bool_type, int_type = table.column_types[0], table.column_types[1]
+        assert isinstance(bool_type, agate.data_types.Boolean)
+        assert isinstance(int_type, agate_helper.Integer)
+
+        for position, actual in enumerate(table):
+            self.assertEqual(list(actual), wanted_rows[position])
diff --git a/tests/unit/test_connection_retries.py b/tests/unit/test_connection_retries.py
@@ -0,0 +1,59 @@
+import functools
+import pytest
+from requests.exceptions import RequestException
+from dbt.common.exceptions import ConnectionError
+from dbt.common.utils.connection import connection_exception_retry
+
+
+def no_retry_fn():
+    """Return "success" immediately; used as the callable that never fails."""
+    return "success"
+
+
+class TestNoRetries:
+    """A callable that succeeds on the first call has its return value passed through."""
+
+    def test_no_retry(self):
+        # 3 is the second positional argument (presumably the retry limit -- confirm
+        # against connection_exception_retry).
+        always_ok = functools.partial(no_retry_fn)
+        outcome = connection_exception_retry(always_ok, 3)
+        assert outcome == "success"
+
+
+def no_success_fn():
+    """Always raise RequestException; used as the callable that never succeeds.
+
+    Raises:
+        RequestException: on every call.
+    """
+    raise RequestException("You'll never pass")
+
+
+class TestMaxRetries:
+    """A callable that fails on every call must end in ConnectionError."""
+
+    # NOTE(review): the method name is the same as in TestNoRetries even though
+    # this case covers a callable that never succeeds; kept to preserve test IDs.
+    def test_no_retry(self):
+        always_failing = functools.partial(no_success_fn)
+        with pytest.raises(ConnectionError):
+            connection_exception_retry(always_failing, 3)
+
+
+def single_retry_fn():
+    """Fail on the first call and succeed on the second, tracked by the global ``counter``.
+
+    With ``counter == 0`` the counter is incremented and RequestException is raised;
+    with ``counter == 1`` it is incremented and "success on 2" is returned. For any
+    other value the counter is left alone and a fallback string is returned.
+    The caller must set the module-level ``counter`` before the first call.
+    """
+    global counter
+    if counter not in (0, 1):
+        return "How did we get here?"
+
+    is_first_call = counter == 0
+    counter += 1
+    if is_first_call:
+        raise RequestException("You won't pass this one time")
+    return "success on 2"
+
+
+class TestSingleRetry:
+    """One failure followed by a success: the value of the successful call must come back."""
+
+    # NOTE(review): the method name is the same as in TestNoRetries even though
+    # this case covers exactly one retry; kept to preserve test IDs.
+    def test_no_retry(self):
+        # Reset the module-level call counter that single_retry_fn reads and updates.
+        global counter
+        counter = 0
+
+        flaky = functools.partial(single_retry_fn)
+        outcome = connection_exception_retry(flaky, 3)
+
+        # We need to test the return value here, not just that it did not throw an error.
+        # If the value is not being passed it causes cryptic errors
+        assert outcome == "success on 2"
+        assert counter == 2