feature/decouple adapters from core (#972)
* Add Github action for integration test

* Update tox

* Fetch spark from https link

* Use Spark version 3.1.2

* Separate running Spark session and thrift

* Use Spark 3.1.2 and Hadoop 3.2

* Reset tox.ini

* Remove base pythons in tox.ini

* Fix reference to Docker compose file

* Remove timeout

* Remove artifact steps

* Bump Spark and Hadoop versions

* Reset Spark and Hadoop version

* Update comment

* Add changie

* add databricks and PR execution protections

* use single quotes

* remove `_target` suffix

* add comment to test

* specify container user as root

* formatting

* remove python setup for pre-existing container

* download simba

* fix curl call

* fix curl call

* fix curl call

* fix curl call

* fix curl call

* fix curl call

* fix db test naming

* confirm ODBC driver installed

* add odbc driver env var

* add odbc driver env var

* specify platform

* check odbc driver integrity

* add dbt user env var

* add dbt user env var

* fix host_name env var

* try removing architecture arg

* swap back to pull_request_target

* try running on host instead of container

* Update .github/workflows/integration.yml

Co-authored-by: Emily Rockman <[email protected]>

* try running odbcinst -j

* remove bash

* add sudo

* add sudo

* update odbc.ini

* install libsasl2-modules-gssapi-mit

* install libsasl2-modules-gssapi-mit

* set -e on odbc install

* set -e on odbc install

* set -e on odbc install

* sudo echo odbc.inst

* remove postgres components

* remove release related items

* remove irrelevant output

* move long bash script into its own file

* update integration.yml to align with other adapters

* revert name change

* revert name change

* combine databricks and spark tests

* combine databricks and spark tests

* Add dagger

* remove platform

* add dagger setup

* add dagger setup

* set env vars

* install requirements

* install requirements

* add DEFAULT_ENV_VARS and test_path arg

* remove circle ci

* formatting

* update changie

* Update .changes/unreleased/Under the Hood-20230929-161218.yaml

Co-authored-by: Emily Rockman <[email protected]>

* formatting fixes and simplify env_var handling

* remove tox, update CONTRIBUTING.md and cleanup GHA workflows

* remove tox, update CONTRIBUTING.md and cleanup GHA workflows

* install test reqs in main.yml

* install test reqs in main.yml

* formatting

* remove tox from dev-requirements.txt and Makefile

* clarify spark ctr instantiation

* add comments on python-version

* initial migration changes

* unpin

* implement core / adapters decoupling

* fix list_relations

* fix typing and exception imports

* fix typing and exception imports

* add changie

* replace dbt.common with dbt_common

* update setup.py

* add dbt-adapters

* update setup.py

* fix credentials import

* fix dev-requirements.txt

* dagger improvements to caching and installing package under test

* update requirements

* add cluster start fixture

* update conftest.py

* re-order dagger setup to reduce cache invalidation

* remove dbt-core version dependency check

---------

Co-authored-by: Cor Zuurmond <[email protected]>
Co-authored-by: Florian Eiden <[email protected]>
Co-authored-by: Emily Rockman <[email protected]>
Co-authored-by: Mike Alfare <[email protected]>
Co-authored-by: Mike Alfare <[email protected]>
6 people authored Jan 25, 2024
1 parent e97918b commit 5d90ff9
Showing 16 changed files with 188 additions and 132 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Under the Hood-20240111-114806.yaml
@@ -0,0 +1,6 @@
kind: Under the Hood
body: Update import paths and list_relations to support decoupling adapters/core
time: 2024-01-11T11:48:06.120111-08:00
custom:
Author: colin-rogers-dbt
Issue: "972"
2 changes: 1 addition & 1 deletion dagger/requirements.txt
@@ -1,2 +1,2 @@
dagger-io~=0.8.0
dagger-io~=0.9.7
python-dotenv
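The bump from dagger-io ~=0.8.0 to ~=0.9.7 is not cosmetic: in the 0.9 SDK a long-running container has to be turned into an explicit `Service` via `.as_service()` before it can be bound to another container, which is exactly what the `run_dbt_spark_tests.py` changes below do. A minimal sketch of that pattern, assuming dagger-io ~=0.9 and a local Docker engine; the Postgres image and alias mirror the diff below:

```python
import sys

import anyio
import dagger


async def main() -> None:
    async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client:
        # In dagger 0.9 a long-running container becomes an explicit Service.
        pg_service = (
            client.container()
            .from_("postgres:13")
            .with_env_variable("POSTGRES_PASSWORD", "postgres")
            .with_exposed_port(5432)
            .as_service()
        )

        # Other containers reach it through a service binding, addressed by alias.
        out = await (
            client.container()
            .from_("python:3.8-slim")
            .with_service_binding(alias="postgres_db", service=pg_service)
            .with_exec(["getent", "hosts", "postgres_db"])
            .stdout()
        )
        print(out)


anyio.run(main)
```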
47 changes: 35 additions & 12 deletions dagger/run_dbt_spark_tests.py
@@ -2,6 +2,7 @@

import argparse
import sys
from typing import Dict

import anyio as anyio
import dagger as dagger
@@ -19,7 +20,7 @@
TESTING_ENV_VARS.update({"ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so"})


def env_variables(envs: dict[str, str]):
def env_variables(envs: Dict[str, str]):
def env_variables_inner(ctr: dagger.Container):
for key, value in envs.items():
ctr = ctr.with_env_variable(key, value)
@@ -28,18 +29,19 @@ def env_variables_inner(ctr: dagger.Container):
return env_variables_inner


async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str):
ctr = await (
def get_postgres_container(client: dagger.Client) -> (dagger.Container, str):
ctr = (
client.container()
.from_("postgres:13")
.with_env_variable("POSTGRES_PASSWORD", "postgres")
.with_exposed_port(PG_PORT)
.as_service()
)

return ctr, "postgres_db"


async def get_spark_container(client: dagger.Client) -> (dagger.Container, str):
def get_spark_container(client: dagger.Client) -> (dagger.Service, str):
spark_dir = client.host().directory("./dagger/spark-container")
spark_ctr_base = (
client.container()
@@ -63,7 +65,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str):
)

# postgres is the metastore here
pg_ctr, pg_host = await get_postgres_container(client)
pg_ctr, pg_host = get_postgres_container(client)

spark_ctr = (
spark_ctr_base.with_service_binding(alias=pg_host, service=pg_ctr)
@@ -77,6 +79,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str):
]
)
.with_exposed_port(10000)
.as_service()
)

return spark_ctr, "spark_db"
@@ -85,29 +88,49 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str):
async def test_spark(test_args):
async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client:
test_profile = test_args.profile
req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini"])

# create cache volumes, these are persisted between runs saving time when developing locally
os_reqs_cache = client.cache_volume("os_reqs")
pip_cache = client.cache_volume("pip")

# setup directories as we don't want to copy the whole repo into the container
req_files = client.host().directory(
"./", include=["*.txt", "*.env", "*.ini", "*.md", "setup.py"]
)
dbt_spark_dir = client.host().directory("./dbt")
test_dir = client.host().directory("./tests")
scripts = client.host().directory("./dagger/scripts")

platform = dagger.Platform("linux/amd64")
tst_container = (
client.container(platform=platform)
.from_("python:3.8-slim")
.with_directory("/.", req_files)
.with_directory("/dbt", dbt_spark_dir)
.with_directory("/tests", test_dir)
.with_mounted_cache("/var/cache/apt/archives", os_reqs_cache)
.with_mounted_cache("/root/.cache/pip", pip_cache)
# install OS deps first so any local changes don't invalidate the cache
.with_directory("/scripts", scripts)
.with_exec("./scripts/install_os_reqs.sh")
.with_exec(["./scripts/install_os_reqs.sh"])
# install dbt-spark + python deps
.with_directory("/src", req_files)
.with_directory("src/dbt", dbt_spark_dir)
.with_directory("src/tests", test_dir)
.with_workdir("/src")
.with_exec(["pip", "install", "-U", "pip"])
.with_exec(["pip", "install", "-r", "requirements.txt"])
.with_exec(["pip", "install", "-r", "dev-requirements.txt"])
.with_exec(["pip", "install", "-e", "."])
)

if test_profile == "apache_spark":
spark_ctr, spark_host = await get_spark_container(client)
spark_ctr, spark_host = get_spark_container(client)
tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr)

elif test_profile in ["databricks_cluster", "databricks_sql_endpoint"]:
tst_container = tst_container.with_exec("./scripts/configure_odbc.sh")
tst_container = (
tst_container.with_workdir("/")
.with_exec(["./scripts/configure_odbc.sh"])
.with_workdir("/src")
)

elif test_profile == "spark_session":
tst_container = tst_container.with_exec(["pip", "install", "pyspark"])
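The `env_variables()` helper above returns a function instead of mutating a container directly, so a whole dict of variables can be applied in a single chained call. A usage sketch, assuming dagger's `Container.with_()` hook (which applies a callable to the current container); the variable values here are placeholders:

```python
from typing import Dict

import dagger


def env_variables(envs: Dict[str, str]):
    def env_variables_inner(ctr: dagger.Container) -> dagger.Container:
        for key, value in envs.items():
            ctr = ctr.with_env_variable(key, value)
        return ctr

    return env_variables_inner


def with_testing_env(ctr: dagger.Container) -> dagger.Container:
    # Apply every test env var in one call instead of repeating
    # .with_env_variable() for each key.
    testing_env_vars = {
        "ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so",
        "DBT_TEST_USER_1": "buildbot",  # placeholder value
    }
    return ctr.with_(env_variables(testing_env_vars))
```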
2 changes: 1 addition & 1 deletion dbt/adapters/spark/column.py
@@ -2,7 +2,7 @@
from typing import Any, Dict, Optional, TypeVar, Union

from dbt.adapters.base.column import Column
from dbt.dataclass_schema import dbtClassMixin
from dbt_common.dataclass_schema import dbtClassMixin

Self = TypeVar("Self", bound="SparkColumn")

57 changes: 29 additions & 28 deletions dbt/adapters/spark/connections.py
@@ -1,11 +1,17 @@
from contextlib import contextmanager

import dbt.exceptions
from dbt.adapters.base import Credentials
from dbt.adapters.contracts.connection import (
AdapterResponse,
ConnectionState,
Connection,
Credentials,
)
from dbt.adapters.events.logging import AdapterLogger
from dbt.adapters.exceptions import FailedToConnectError
from dbt.adapters.sql import SQLConnectionManager
from dbt.contracts.connection import ConnectionState, AdapterResponse
from dbt.events import AdapterLogger
from dbt.utils import DECIMALS
from dbt_common.exceptions import DbtConfigError, DbtRuntimeError, DbtDatabaseError

from dbt_common.utils.encoding import DECIMALS
from dbt.adapters.spark import __version__

try:
@@ -22,8 +28,7 @@
pyodbc = None
from datetime import datetime
import sqlparams
from dbt.contracts.connection import Connection
from dbt.dataclass_schema import StrEnum
from dbt_common.dataclass_schema import StrEnum
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Union, Tuple, List, Generator, Iterable, Sequence

@@ -92,15 +97,15 @@ def cluster_id(self) -> Optional[str]:

def __post_init__(self) -> None:
if self.method is None:
raise dbt.exceptions.DbtRuntimeError("Must specify `method` in profile")
raise DbtRuntimeError("Must specify `method` in profile")
if self.host is None:
raise dbt.exceptions.DbtRuntimeError("Must specify `host` in profile")
raise DbtRuntimeError("Must specify `host` in profile")
if self.schema is None:
raise dbt.exceptions.DbtRuntimeError("Must specify `schema` in profile")
raise DbtRuntimeError("Must specify `schema` in profile")

# spark classifies database and schema as the same thing
if self.database is not None and self.database != self.schema:
raise dbt.exceptions.DbtRuntimeError(
raise DbtRuntimeError(
f" schema: {self.schema} \n"
f" database: {self.database} \n"
f"On Spark, database must be omitted or have the same value as"
@@ -112,7 +117,7 @@ def __post_init__(self) -> None:
try:
import pyodbc # noqa: F401
except ImportError as e:
raise dbt.exceptions.DbtRuntimeError(
raise DbtRuntimeError(
f"{self.method} connection method requires "
"additional dependencies. \n"
"Install the additional required dependencies with "
@@ -121,7 +126,7 @@ def __post_init__(self) -> None:
) from e

if self.method == SparkConnectionMethod.ODBC and self.cluster and self.endpoint:
raise dbt.exceptions.DbtRuntimeError(
raise DbtRuntimeError(
"`cluster` and `endpoint` cannot both be set when"
f" using {self.method} method to connect to Spark"
)
Expand All @@ -130,7 +135,7 @@ def __post_init__(self) -> None:
self.method == SparkConnectionMethod.HTTP
or self.method == SparkConnectionMethod.THRIFT
) and not (ThriftState and THttpClient and hive):
raise dbt.exceptions.DbtRuntimeError(
raise DbtRuntimeError(
f"{self.method} connection method requires "
"additional dependencies. \n"
"Install the additional required dependencies with "
@@ -141,7 +146,7 @@ def __post_init__(self) -> None:
try:
import pyspark # noqa: F401
except ImportError as e:
raise dbt.exceptions.DbtRuntimeError(
raise DbtRuntimeError(
f"{self.method} connection method requires "
"additional dependencies. \n"
"Install the additional required dependencies with "
@@ -291,13 +296,11 @@ def execute(self, sql: str, bindings: Optional[List[Any]] = None) -> None:
if poll_state.errorMessage:
logger.debug("Poll response: {}".format(poll_state))
logger.debug("Poll status: {}".format(state))
raise dbt.exceptions.DbtDatabaseError(poll_state.errorMessage)
raise DbtDatabaseError(poll_state.errorMessage)

elif state not in STATE_SUCCESS:
status_type = ThriftState._VALUES_TO_NAMES.get(state, "Unknown<{!r}>".format(state))
raise dbt.exceptions.DbtDatabaseError(
"Query failed with status: {}".format(status_type)
)
raise DbtDatabaseError("Query failed with status: {}".format(status_type))

logger.debug("Poll status: {}, query complete".format(state))

@@ -358,9 +361,9 @@ def exception_handler(self, sql: str) -> Generator[None, None, None]:
thrift_resp = exc.args[0]
if hasattr(thrift_resp, "status"):
msg = thrift_resp.status.errorMessage
raise dbt.exceptions.DbtRuntimeError(msg)
raise DbtRuntimeError(msg)
else:
raise dbt.exceptions.DbtRuntimeError(str(exc))
raise DbtRuntimeError(str(exc))

def cancel(self, connection: Connection) -> None:
connection.handle.cancel()
@@ -390,7 +393,7 @@ def validate_creds(cls, creds: Any, required: Iterable[str]) -> None:

for key in required:
if not hasattr(creds, key):
raise dbt.exceptions.DbtProfileError(
raise DbtConfigError(
"The config '{}' is required when using the {} method"
" to connect to Spark".format(key, method)
)
@@ -481,7 +484,7 @@ def open(cls, connection: Connection) -> Connection:
endpoint=creds.endpoint
)
else:
raise dbt.exceptions.DbtProfileError(
raise DbtConfigError(
"Either `cluster` or `endpoint` must set when"
" using the odbc method to connect to Spark"
)
@@ -525,9 +528,7 @@ def open(cls, connection: Connection) -> Connection:
Connection(server_side_parameters=creds.server_side_parameters)
)
else:
raise dbt.exceptions.DbtProfileError(
f"invalid credential method: {creds.method}"
)
raise DbtConfigError(f"invalid credential method: {creds.method}")
break
except Exception as e:
exc = e
@@ -537,7 +538,7 @@ def open(cls, connection: Connection) -> Connection:
msg = "Failed to connect"
if creds.token is not None:
msg += ", is your token valid?"
raise dbt.exceptions.FailedToConnectError(msg) from e
raise FailedToConnectError(msg) from e
retryable_message = _is_retryable_error(e)
if retryable_message and creds.connect_retries > 0:
msg = (
@@ -558,7 +559,7 @@ def open(cls, connection: Connection) -> Connection:
logger.warning(msg)
time.sleep(creds.connect_timeout)
else:
raise dbt.exceptions.FailedToConnectError("failed to connect") from e
raise FailedToConnectError("failed to connect") from e
else:
raise exc # type: ignore

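Taken together, the connections.py changes are a mechanical import-path migration: exception, logging, and connection-contract types that previously came from dbt-core now come from the dbt-adapters and dbt-common packages, with `DbtProfileError` replaced by `DbtConfigError`. A consolidated before/after sketch of the mapping applied in this commit; the diff above remains the canonical list:

```python
# Before: everything came from dbt-core.
# import dbt.exceptions  # DbtRuntimeError, DbtDatabaseError, DbtProfileError, FailedToConnectError
# from dbt.adapters.base import Credentials
# from dbt.contracts.connection import AdapterResponse, Connection, ConnectionState
# from dbt.events import AdapterLogger
# from dbt.utils import DECIMALS
# from dbt.dataclass_schema import StrEnum, dbtClassMixin

# After: split between dbt-adapters and dbt-common.
from dbt.adapters.contracts.connection import (
    AdapterResponse,
    Connection,
    ConnectionState,
    Credentials,
)
from dbt.adapters.events.logging import AdapterLogger
from dbt.adapters.exceptions import FailedToConnectError
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin
from dbt_common.exceptions import DbtConfigError, DbtDatabaseError, DbtRuntimeError
from dbt_common.utils.encoding import DECIMALS
```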