From cdc25822467a95036d7e1743f1306c6b4f4297bd Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Wed, 18 Oct 2023 15:05:37 +0200
Subject: [PATCH 01/52] Add support for virtual env directory flag
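
With this flag a DAG author can point all virtualenv tasks at one shared,
cached directory instead of paying the venv-creation cost on every task.
Rough usage sketch (not part of this patch; the directory path is
illustrative only):

    from pathlib import Path
    from cosmos import DbtDag, ExecutionConfig, ExecutionMode

    example = DbtDag(
        # ... project_config / profile_config as usual ...
        execution_config=ExecutionConfig(
            execution_mode=ExecutionMode.VIRTUALENV,
            # reuse (and lazily create) this venv across all tasks in the DAG
            virtualenv_dir=Path("/opt/airflow/persistent-venv"),
        ),
    )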
---
 cosmos/config.py               |  4 +++-
 cosmos/converter.py            | 15 ++++++++++++-
 cosmos/operators/virtualenv.py | 41 ++++++++++++++++++++++++++--------
 dev/dags/example_virtualenv.py |  4 ++++
 4 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/cosmos/config.py b/cosmos/config.py
index fd9d09e18..8efefbb76 100644
--- a/cosmos/config.py
+++ b/cosmos/config.py
@@ -356,14 +356,15 @@ class ExecutionConfig:
     :param test_indirect_selection: The mode to configure the test behavior when performing indirect selection.
     :param dbt_executable_path: The path to the dbt executable for runtime execution. Defaults to dbt if available on the path.
     :param dbt_project_path: Configures the DBT project location accessible at runtime for dag execution. This is the project path in a docker container for ExecutionMode.DOCKER or ExecutionMode.KUBERNETES. Mutually Exclusive with ProjectConfig.dbt_project_path
+    :param virtualenv_dir: Directory path to locate the (cached) virtual env that should be used for execution when execution mode is set to `ExecutionMode.VIRTUALENV`
     """
-
     execution_mode: ExecutionMode = ExecutionMode.LOCAL
     invocation_mode: InvocationMode | None = None
     test_indirect_selection: TestIndirectSelection = TestIndirectSelection.EAGER
     dbt_executable_path: str | Path = field(default_factory=get_system_dbt)
     dbt_project_path: InitVar[str | Path | None] = None
+    virtualenv_dir: str | Path | None = None

     project_path: Path | None = field(init=False)

     def __post_init__(self, dbt_project_path: str | Path | None) -> None:
@@ -382,3 +383,4 @@ def __post_init__(self, dbt_project_path: str | Path | None) -> None:
             )
             self.invocation_mode = InvocationMode.SUBPROCESS
         self.project_path = Path(dbt_project_path) if dbt_project_path else None
+

diff --git a/cosmos/converter.py b/cosmos/converter.py
index 40929ef55..9bd722fd0 100644
--- a/cosmos/converter.py
+++ b/cosmos/converter.py
@@ -16,7 +16,7 @@

 from cosmos import cache, settings
 from cosmos.airflow.graph import build_airflow_graph
-from cosmos.config import ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
+from cosmos.config import ProjectConfig, ExecutionConfig, RenderConfig, ProfileConfig
 from cosmos.constants import ExecutionMode
 from cosmos.dbt.graph import DbtGraph
 from cosmos.dbt.selector import retrieve_by_label
@@ -227,6 +227,17 @@ def __init__(
         env_vars = project_config.env_vars or operator_args.get("env")
         dbt_vars = project_config.dbt_vars or operator_args.get("vars")

+        if (execution_config.execution_mode != ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None):
+            logger.warning("`ExecutionConfig.virtualenv_dir` is only supported when \
+                ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV.")
+
+        profile_args = {}
+        if profile_config.profile_mapping:
+            profile_args = profile_config.profile_mapping.profile_args
+
+        if not operator_args:
+            operator_args = {}
+
         cache_dir = None
         cache_identifier = None
         if settings.enable_cache:
@@ -275,6 +286,8 @@ def __init__(
             task_args,
             execution_mode=execution_config.execution_mode,
         )
+        if (execution_mode == ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None):
+            task_args["virtualenv_dir"] = execution_config.virtualenv_dir

         build_airflow_graph(
             nodes=self.dbt_graph.filtered_nodes,

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index cbe8c67e9..1014dd358 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -48,12 +48,14 @@ def __init__(
         py_requirements: list[str] | None = None,
         pip_install_options: list[str] | None = None,
         py_system_site_packages: bool = False,
+        virtualenv_dir: Path | str | None = None,
         **kwargs: Any,
     ) -> None:
         self.py_requirements = py_requirements or []
         self.pip_install_options = pip_install_options or []
         self.py_system_site_packages = py_system_site_packages
         super().__init__(**kwargs)
+        self._venv_dir = virtualenv_dir
         self._venv_tmp_dir: None | TemporaryDirectory[str] = None

     @cached_property
@@ -63,20 +65,15 @@ def venv_dbt_path(
         """
         Path to the dbt binary within a Python virtualenv.

-        The first time this property is called, it creates a virtualenv and installs the dependencies based on the
-        self.py_requirements, self.pip_install_options, and self.py_system_site_packages. This value is cached for future calls.
+        The first time this property is called, it creates a new/temporary virtualenv and installs the dependencies
+        based on the self.py_requirements, self.pip_install_options, and self.py_system_site_packages, or retrieves an existing virtualenv.
+        This value is cached for future calls.
         """
         # We are reusing the virtualenv directory for all subprocess calls within this task/operator.
         # For this reason, we are not using contexts at this point.
         # The deletion of this directory is done explicitly at the end of the `execute` method.
         self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv")
-        py_interpreter = prepare_virtualenv(
-            venv_directory=self._venv_tmp_dir.name,
-            python_bin=PY_INTERPRETER,
-            system_site_packages=self.py_system_site_packages,
-            requirements=self.py_requirements,
-            pip_install_options=self.pip_install_options,
-        )
+        py_interpreter = self._get_or_create_venv_py_interpreter()
         dbt_binary = Path(py_interpreter).parent / "dbt"
         cmd_output = self.subprocess_hook.run_command(
             [
@@ -107,6 +104,32 @@ def execute(self, context: Context) -> None:
             self._venv_tmp_dir.cleanup()
         logger.info(output)

+    def _get_or_create_venv_py_interpreter(self) -> str:
+        if self._venv_dir is not None:
+            py_interpreter_path = Path(f"{self._venv_dir}/bin/python")
+
+            self.log.info(f"Checking for venv interpreter: {py_interpreter_path} : {py_interpreter_path.is_file()}")
+            if py_interpreter_path.is_file():
+
+                self.log.info(f"Found Python interpreter in cached virtualenv: `{str(py_interpreter_path)}`")
+                return str(py_interpreter_path)
+
+            self.log.info(f"Creating virtualenv at `{self._venv_dir}")
+            venv_directory = str(self._venv_dir)
+
+        else:
+            self.log.info(f"Creating temporary virtualenv")
+            self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv")
+            venv_directory = self._venv_tmp_dir.name
+
+        return prepare_virtualenv(
+            venv_directory=venv_directory,
+            python_bin=PY_INTERPRETER,
+            system_site_packages=self.py_system_site_packages,
+            requirements=self.py_requirements,
+            pip_install_options=self.pip_install_options,
+        )
+

 class DbtBuildVirtualenvOperator(DbtVirtualenvBaseOperator, DbtBuildLocalOperator):  # type: ignore[misc]
     """

diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py
index cd38cba9e..cee80f7f8 100644
--- a/dev/dags/example_virtualenv.py
+++ b/dev/dags/example_virtualenv.py
@@ -5,6 +5,7 @@
 import os
 from datetime import datetime
 from pathlib import Path
+from airflow.configuration import get_airflow_home

 from cosmos import DbtDag, ExecutionConfig, ExecutionMode, ProfileConfig, ProjectConfig
 from cosmos.profiles import PostgresUserPasswordProfileMapping
@@ -31,6 +32,9 @@
     profile_config=profile_config,
     execution_config=ExecutionConfig(
         execution_mode=ExecutionMode.VIRTUALENV,
+        # We can enable this flag if we want Airflow to create one virtualenv
+        # and reuse that within the whole DAG.
+        # virtualenv_dir=f"{get_airflow_home()}/persistent-venv",
     ),
     operator_args={
         "py_system_site_packages": False,

From a2e40220d337ebfe4a80ac56ff3cf86e647e4233 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Wed, 18 Oct 2023 15:58:00 +0200
Subject: [PATCH 02/52] Add basic test
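
The behaviour under test, roughly (sketch only; "mock-venv" is a
placeholder directory, not a real venv):

    venv_operator = DbtVirtualenvBaseOperator(
        # ... profile_config, task_id, py_requirements, etc. ...
        virtualenv_dir="mock-venv",
    )
    # instead of creating a TemporaryDirectory, the operator resolves
    # the dbt binary inside the supplied directory
    assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt"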
---
 tests/operators/test_virtualenv.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index deb7151e5..cc2a2eab4 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -74,10 +74,36 @@ def test_run_command(
     assert dbt_cmd["command"][1] == "do-something"
     assert mock_execute.call_count == 2

+<<<<<<< HEAD

 def test_virtualenv_operator_append_env_is_true_by_default():
     venv_operator = ConcreteDbtVirtualenvBaseOperator(
         dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)),
+=======
+@patch("airflow.utils.python_virtualenv.execute_in_subprocess")
+@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes")
+@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql")
+@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.exception_handling")
+@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook")
+@patch("airflow.hooks.base.BaseHook.get_connection")
+def test_supply_virtualenv_dir_flag(
+    mock_get_connection,
+    mock_subprocess_hook,
+    mock_exception_handling,
+    mock_store_compiled_sql,
+    mock_calculate_openlineage_events_completes,
+    mock_execute,
+):
+    mock_get_connection.return_value = Connection(
+        conn_id="fake_conn",
+        conn_type="postgres",
+        host="fake_host",
+        port=5432,
+        login="fake_login",
+        password="fake_password",
+        schema="fake_schema",
+    )
+    venv_operator = DbtVirtualenvBaseOperator(
         profile_config=profile_config,
         task_id="fake_task",
         install_deps=True,
@@ -87,6 +113,8 @@ def test_virtualenv_operator_append_env_is_true_by_default():
         py_requirements=["dbt-postgres==1.6.0b1"],
         emit_datasets=False,
         invocation_mode=InvocationMode.SUBPROCESS,
+        virtualenv_dir = "mock-venv",
     )

     assert venv_operator.append_env is True
+    assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt"

From 6ac0225a0d5cfadb0516d9401cab9df4d3062619 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 18 Oct 2023 14:10:06 +0000
Subject: =?UTF-8?q?=F0=9F=8E=A8=20[pre-commit.ci]=20Auto=20f?=
 =?UTF-8?q?ormat=20from=20pre-commit.com=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosmos/config.py                   |  4 +++-
 cosmos/converter.py                |  8 +++++---
 cosmos/operators/virtualenv.py     | 11 +++++------
 dev/dags/example_virtualenv.py     |  5 ++---
 tests/operators/test_virtualenv.py |  7 +++++--
 5 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/cosmos/config.py b/cosmos/config.py
index 8efefbb76..ea40f3cd5 100644
--- a/cosmos/config.py
+++ b/cosmos/config.py
@@ -356,8 +356,10 @@ class ExecutionConfig:
     :param test_indirect_selection: The mode to configure the test behavior when performing indirect selection.
     :param dbt_executable_path: The path to the dbt executable for runtime execution. Defaults to dbt if available on the path.
     :param dbt_project_path: Configures the DBT project location accessible at runtime for dag execution. This is the project path in a docker container for ExecutionMode.DOCKER or ExecutionMode.KUBERNETES. Mutually Exclusive with ProjectConfig.dbt_project_path
-    :param virtualenv_dir: Directory path to locate the (cached) virtual env that should be used for execution when execution mode is set to `ExecutionMode.VIRTUALENV`
+    :param virtualenv_dir: Directory path to locate the (cached) virtual env that
+        should be used for execution when execution mode is set to `ExecutionMode.VIRTUALENV`
     """
+
     execution_mode: ExecutionMode = ExecutionMode.LOCAL
     invocation_mode: InvocationMode | None = None
     test_indirect_selection: TestIndirectSelection = TestIndirectSelection.EAGER

diff --git a/cosmos/converter.py b/cosmos/converter.py
index 9bd722fd0..7852a445c 100644
--- a/cosmos/converter.py
+++ b/cosmos/converter.py
@@ -227,9 +227,11 @@ def __init__(
         env_vars = project_config.env_vars or operator_args.get("env")
         dbt_vars = project_config.dbt_vars or operator_args.get("vars")

-        if (execution_config.execution_mode != ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None):
-            logger.warning("`ExecutionConfig.virtualenv_dir` is only supported when \
-                ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV.")
+        if execution_mode != ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None:
+            logger.warning(
+                "`ExecutionConfig.virtualenv_dir` is only supported when \
+                ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV."
+            )

         profile_args = {}
         if profile_config.profile_mapping:

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 1014dd358..4c71a3ddd 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -65,8 +65,8 @@ def venv_dbt_path(
         """
         Path to the dbt binary within a Python virtualenv.

-        The first time this property is called, it creates a new/temporary virtualenv and installs the dependencies 
-        based on the self.py_requirements, self.pip_install_options, and self.py_system_site_packages, or retrieves an existing virtualenv. 
+        The first time this property is called, it creates a new/temporary virtualenv and installs the dependencies
+        based on the self.py_requirements, self.pip_install_options, and self.py_system_site_packages, or retrieves an existing virtualenv.
         This value is cached for future calls.
         """
         # We are reusing the virtualenv directory for all subprocess calls within this task/operator.
@@ -110,15 +110,14 @@ def _get_or_create_venv_py_interpreter(self) -> str:

             self.log.info(f"Checking for venv interpreter: {py_interpreter_path} : {py_interpreter_path.is_file()}")
             if py_interpreter_path.is_file():
-
                 self.log.info(f"Found Python interpreter in cached virtualenv: `{str(py_interpreter_path)}`")
                 return str(py_interpreter_path)
-
+
             self.log.info(f"Creating virtualenv at `{self._venv_dir}")
             venv_directory = str(self._venv_dir)
-
+
         else:
-            self.log.info(f"Creating temporary virtualenv")
+            self.log.info("Creating temporary virtualenv")
             self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv")
             venv_directory = self._venv_tmp_dir.name

diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py
index cee80f7f8..8dcef53af 100644
--- a/dev/dags/example_virtualenv.py
+++ b/dev/dags/example_virtualenv.py
@@ -5,7 +5,6 @@
 import os
 from datetime import datetime
 from pathlib import Path
-from airflow.configuration import get_airflow_home

 from cosmos import DbtDag, ExecutionConfig, ExecutionMode, ProfileConfig, ProjectConfig
 from cosmos.profiles import PostgresUserPasswordProfileMapping
@@ -31,9 +31,9 @@
     profile_config=profile_config,
     execution_config=ExecutionConfig(
         execution_mode=ExecutionMode.VIRTUALENV,
-        # We can enable this flag if we want Airflow to create one virtualenv 
+        # We can enable this flag if we want Airflow to create one virtualenv
         # and reuse that within the whole DAG.
-        # virtualenv_dir=f"{get_airflow_home()}/persistent-venv", 
+        # virtualenv_dir=f"{get_airflow_home()}/persistent-venv",
     ),
     operator_args={
         "py_system_site_packages": False,

diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index cc2a2eab4..4205f5b23 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -75,18 +75,22 @@ def test_run_command(
     assert mock_execute.call_count == 2

 <<<<<<< HEAD
+<<<<<<< HEAD

 def test_virtualenv_operator_append_env_is_true_by_default():
     venv_operator = ConcreteDbtVirtualenvBaseOperator(
         dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)),
 =======
+=======
+
+>>>>>>> 7e8f82d (🎨 [pre-commit.ci] Auto format from pre-commit.com hooks)
 @patch("airflow.utils.python_virtualenv.execute_in_subprocess")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.exception_handling")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook")
 @patch("airflow.hooks.base.BaseHook.get_connection")
-def test_supply_virtualenv_dir_flag( 
+def test_supply_virtualenv_dir_flag(
     mock_get_connection,
     mock_subprocess_hook,
     mock_exception_handling,
     mock_store_compiled_sql,
     mock_calculate_openlineage_events_completes,
     mock_execute,
@@ -115,6 +119,5 @@ def test_supply_virtualenv_dir_flag(
         invocation_mode=InvocationMode.SUBPROCESS,
         virtualenv_dir = "mock-venv",
     )
-
     assert venv_operator.append_env is True
     assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt"

From bf3d1f1dd40f3a7f7ef2360f7e9e00f8da62c476 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Mon, 23 Oct 2023 15:23:26 +0200
Subject: [PATCH 04/52] Only accept Path(...)
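
Typing the flag as `Path` catches malformed values at construction time
instead of deep inside the venv-resolution logic. Sketch of the intended
call style (test-only; "mock-venv" is a placeholder):

    venv_operator = DbtVirtualenvBaseOperator(
        # ... other arguments unchanged ...
        virtualenv_dir=Path("mock-venv"),
    )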
---
 tests/operators/test_virtualenv.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index 4205f5b23..e44d7d52c 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from unittest.mock import MagicMock, patch
+from pathlib import Path

 from airflow.models import DAG
 from airflow.models.connection import Connection
@@ -74,16 +75,12 @@ def test_run_command(
     assert dbt_cmd["command"][1] == "do-something"
     assert mock_execute.call_count == 2

-<<<<<<< HEAD
-<<<<<<< HEAD

 def test_virtualenv_operator_append_env_is_true_by_default():
     venv_operator = ConcreteDbtVirtualenvBaseOperator(
         dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)),
-=======
-=======
-
->>>>>>> 7e8f82d (🎨 [pre-commit.ci] Auto format from pre-commit.com hooks)
+
 @patch("airflow.utils.python_virtualenv.execute_in_subprocess")
@@ -114,7 +111,6 @@ def test_supply_virtualenv_dir_flag(
         py_requirements=["dbt-postgres==1.6.0b1"],
         emit_datasets=False,
         invocation_mode=InvocationMode.SUBPROCESS,
-        virtualenv_dir = "mock-venv",
+        virtualenv_dir=Path("mock-venv"),
     )

     assert venv_operator.append_env is True
     assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt"

From 5715d16f4e075017b0e44946097859afc2e599eb Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Mon, 23 Oct 2023 15:23:51 +0200
Subject: [PATCH 05/52] Check if venv_dir needs to be created + document helper

---
 cosmos/operators/virtualenv.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 4c71a3ddd..a5d05bae1 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

 from functools import cached_property
+import os
+
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any
@@ -105,13 +107,27 @@ def execute(self, context: Context) -> None:
         logger.info(output)

     def _get_or_create_venv_py_interpreter(self) -> str:
+        """
+        Helper method that parses virtual env configuration and returns a DBT binary within the resulting virtualenv:
+        Do we have a persistent virtual env dir set in `self._venv_dir`?
+            1. Yes: Does a directory at that path exist?
+                1. No: Create it, and create a virtual env inside it
+                2. Yes: Does the directory have a virtual env inside it?
+                    1. No: Create one in this directory and return it
+                    2. Yes: Return this virtual env
+            2. No: Create a temporary virtual env and return it
+
+        """
         if self._venv_dir is not None:
-            py_interpreter_path = Path(f"{self._venv_dir}/bin/python")
-
-            self.log.info(f"Checking for venv interpreter: {py_interpreter_path} : {py_interpreter_path.is_file()}")
-            if py_interpreter_path.is_file():
-                self.log.info(f"Found Python interpreter in cached virtualenv: `{str(py_interpreter_path)}`")
-                return str(py_interpreter_path)
+            if self._venv_dir.is_dir():
+                py_interpreter_path = Path(f"{self._venv_dir}/bin/python")
+
+                self.log.info(f"Checking for venv interpreter: {py_interpreter_path} : {py_interpreter_path.is_file()}")
+                if py_interpreter_path.is_file():
+                    self.log.info(f"Found Python interpreter in cached virtualenv: `{str(py_interpreter_path)}`")
+                    return str(py_interpreter_path)
+            else:
+                os.mkdir(self._venv_dir)

             self.log.info(f"Creating virtualenv at `{self._venv_dir}")
             venv_directory = str(self._venv_dir)

From 4971c4ef858bcf68883afe5f0c8cb9f55112d232 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Mon, 23 Oct 2023 15:24:06 +0200
Subject: [PATCH 06/52] Assert warning
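
The converter should warn, not fail, when `virtualenv_dir` is combined
with a non-virtualenv execution mode. The assertion style used below
relies on pytest's built-in `caplog` fixture, roughly:

    def test_converter_raises_warning(..., caplog):
        # build the converter with ExecutionMode.KUBERNETES + virtualenv_dir
        assert "`ExecutionConfig.virtualenv_dir` is only supported" in caplog.text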
---
 tests/test_converter.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/tests/test_converter.py b/tests/test_converter.py
index bef2dc06d..be1efe381 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -168,8 +168,7 @@ def test_converter_creates_dag_with_seed(mock_load_dbt_graph, execution_mode, op
     "execution_mode,operator_args",
     [
         (ExecutionMode.KUBERNETES, {}),
-        # (ExecutionMode.DOCKER, {"image": "sample-image"}),
-    ],
+    ]
 )
 @patch("cosmos.converter.DbtGraph.filtered_nodes", nodes)
 @patch("cosmos.converter.DbtGraph.load")
@@ -197,6 +196,41 @@ def test_converter_creates_dag_with_project_path_str(mock_load_dbt_graph, execut
     )
     assert converter

+@pytest.mark.parametrize(
+    "execution_mode,virtualenv_dir,operator_args",
+    [
+        (ExecutionMode.KUBERNETES, Path("/some/virtualenv/dir"), {}),
+        # (ExecutionMode.DOCKER, {"image": "sample-image"}),
+    ],
+)
+def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualenv_dir, operator_args, caplog):
+    """
+    This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv` andm still pass a defined `virtualenv_dir`
+    """
+    project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT)
+    execution_config = ExecutionConfig(execution_mode=execution_mode, virtualenv_dir=virtualenv_dir)
+    render_config = RenderConfig(emit_datasets=True)
+    profile_config = ProfileConfig(
+        profile_name="my_profile_name",
+        target_name="my_target_name",
+        profiles_yml_filepath=SAMPLE_PROFILE_YML,
+    )
+
+    converter = DbtToAirflowConverter(
+        nodes=nodes,
+        project_config=project_config,
+        profile_config=profile_config,
+        execution_config=execution_config,
+        render_config=render_config,
+        operator_args=operator_args,
+    )
+
+    assert converter
+
+    assert "`ExecutionConfig.virtualenv_dir` is only supported when \
+        ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV." in caplog.text
+
+
 @pytest.mark.parametrize(
     "execution_mode,operator_args",

From 522e7ac7f7527e325318323ecffc8c0abee474c0 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Mon, 23 Oct 2023 15:24:38 +0200
Subject: [PATCH 07/52] Support both virtualenv use-cases in two task groups in
 the example DAG
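
Design note: the example DAG now demonstrates both modes side by side,
roughly:

    start_task >> [tmp_venv_task_group, cached_venv_task_group] >> end_task

where the first group creates and discards a virtualenv per task, and the
second points every task at a single `virtualenv_dir`, so dependencies are
installed once per worker rather than once per task.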
---
 dev/dags/example_virtualenv.py | 78 ++++++++++++++++++++++++----------
 tests/test_converter.py        |  3 +-
 2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py
index 8dcef53af..985e7925a 100644
--- a/dev/dags/example_virtualenv.py
+++ b/dev/dags/example_virtualenv.py
@@ -6,7 +6,12 @@
 from datetime import datetime
 from pathlib import Path

-from cosmos import DbtDag, ExecutionConfig, ExecutionMode, ProfileConfig, ProjectConfig
+from airflow.decorators import dag
+from airflow.configuration import get_airflow_home
+from airflow.operators.empty import EmptyOperator
+
+from cosmos import DbtTaskGroup, ExecutionConfig, ExecutionMode, ProfileConfig, ProjectConfig
+
 from cosmos.profiles import PostgresUserPasswordProfileMapping

 DEFAULT_DBT_ROOT_PATH = Path(__file__).parent / "dbt"
@@ -23,29 +28,58 @@
 )

 # [START virtualenv_example]
-example_virtualenv = DbtDag(
-    # dbt/cosmos-specific parameters
-    project_config=ProjectConfig(
-        DBT_ROOT_PATH / "jaffle_shop",
-    ),
-    profile_config=profile_config,
-    execution_config=ExecutionConfig(
-        execution_mode=ExecutionMode.VIRTUALENV,
-        # We can enable this flag if we want Airflow to create one virtualenv
-        # and reuse that within the whole DAG.
-        # virtualenv_dir=f"{get_airflow_home()}/persistent-venv",
-    ),
-    operator_args={
-        "py_system_site_packages": False,
-        "py_requirements": ["dbt-postgres==1.6.0b1"],
-        "install_deps": True,
-        "emit_datasets": False,  # Example of how to not set inlets and outlets
-    },
-    # normal dag parameters
+@dag(
     schedule_interval="@daily",
     start_date=datetime(2023, 1, 1),
     catchup=False,
-    dag_id="example_virtualenv",
-    default_args={"retries": 2},
 )
+def example_virtualenv() -> None:
+    start_task = EmptyOperator(task_id='start-venv-examples')
+    end_task = EmptyOperator(task_id='end-venv-examples')
+
+    tmp_venv_task_group = DbtTaskGroup(
+        group_id='tmp-venv-group',
+        # dbt/cosmos-specific parameters
+        project_config=ProjectConfig(
+            DBT_ROOT_PATH / "jaffle_shop",
+        ),
+        profile_config=profile_config,
+        execution_config=ExecutionConfig(
+            execution_mode=ExecutionMode.VIRTUALENV,
+            # We can enable this flag if we want Airflow to create one virtualenv
+            # and reuse that within the whole DAG.
+            # virtualenv_dir=f"{get_airflow_home()}/persistent-venv",
+        ),
+        operator_args={
+            "py_system_site_packages": False,
+            "py_requirements": ["dbt-postgres==1.6.0b1"],
+            "install_deps": True,
+            "emit_datasets": False,  # Example of how to not set inlets and outlets
+        },
+    )
+
+    cached_venv_task_group = DbtTaskGroup(
+        group_id='cached-venv-group',
+        # dbt/cosmos-specific parameters
+        project_config=ProjectConfig(
+            DBT_ROOT_PATH / "jaffle_shop",
+        ),
+        profile_config=profile_config,
+        execution_config=ExecutionConfig(
+            execution_mode=ExecutionMode.VIRTUALENV,
+            # We can enable this flag if we want Airflow to create one virtualenv
+            # and reuse that within the whole DAG.
+            virtualenv_dir=Path(f"{get_airflow_home()}/persistent-venv"),
+        ),
+        operator_args={
+            "py_system_site_packages": False,
+            "py_requirements": ["dbt-postgres==1.6.0b1"],
+            "install_deps": True,
+        },
+    )
+
+    start_task >> [tmp_venv_task_group, cached_venv_task_group] >> end_task
+
+example_virtualenv()
 # [END virtualenv_example]
-

diff --git a/tests/test_converter.py b/tests/test_converter.py
index be1efe381..6d27c011d 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -205,7 +205,8 @@ def test_converter_creates_dag_with_project_path_str(mock_load_dbt_graph, execut
 )
 def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualenv_dir, operator_args, caplog):
     """
-    This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv` andm still pass a defined `virtualenv_dir`
+    This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv`
+    and still pass a defined `virtualenv_dir`
     """
     project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT)
     execution_config = ExecutionConfig(execution_mode=execution_mode, virtualenv_dir=virtualenv_dir)

From 905f602d6f1020ee67c4efac2c217333e42a25e2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 23 Oct 2023 13:26:42 +0000
Subject: =?UTF-8?q?=F0=9F=8E=A8=20[pre-commit.ci]=20Auto=20f?=
 =?UTF-8?q?ormat=20from=20pre-commit.com=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosmos/operators/virtualenv.py |  2 +-
 dev/dags/example_virtualenv.py | 10 +++++-----
 tests/test_converter.py        |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index a5d05bae1..f4a90f999 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -116,7 +116,7 @@ def _get_or_create_venv_py_interpreter(self) -> str:
                     1. No: Create one in this directory and return it
                     2. Yes: Return this virtual env
             2. No: Create a temporary virtual env and return it
-        
+
         """
         if self._venv_dir is not None:

diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py
index 985e7925a..d13cf5099 100644
--- a/dev/dags/example_virtualenv.py
+++ b/dev/dags/example_virtualenv.py
@@ -34,11 +34,11 @@
     catchup=False,
 )
 def example_virtualenv() -> None:
-    start_task = EmptyOperator(task_id='start-venv-examples')
-    end_task = EmptyOperator(task_id='end-venv-examples')
+    start_task = EmptyOperator(task_id="start-venv-examples")
+    end_task = EmptyOperator(task_id="end-venv-examples")

     tmp_venv_task_group = DbtTaskGroup(
-        group_id='tmp-venv-group',
+        group_id="tmp-venv-group",
         # dbt/cosmos-specific parameters
         project_config=ProjectConfig(
             DBT_ROOT_PATH / "jaffle_shop",
@@ -59,7 +59,7 @@ def example_virtualenv() -> None:
     cached_venv_task_group = DbtTaskGroup(
-        group_id='cached-venv-group',
+        group_id="cached-venv-group",
         # dbt/cosmos-specific parameters
         project_config=ProjectConfig(
             DBT_ROOT_PATH / "jaffle_shop",
@@ -80,6 +80,6 @@ def example_virtualenv() -> None:

     start_task >> [tmp_venv_task_group, cached_venv_task_group] >> end_task

+
 example_virtualenv()
 # [END virtualenv_example]

diff --git a/tests/test_converter.py b/tests/test_converter.py
index 6d27c011d..02203e135 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -205,7 +205,7 @@ def test_converter_creates_dag_with_project_path_str(mock_load_dbt_graph, execut
 )
 def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualenv_dir, operator_args, caplog):
     """
-    This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv` 
+    This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv`
     and still pass a defined `virtualenv_dir`
     """
     project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT)

From ef245bad9ae30283b1ac5edc6e0477f632805ece Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Mon, 6 Nov 2023 13:28:58 +0100
Subject: [PATCH 09/52] Implement locking mechanism [WIP]
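
Rationale: several tasks can target the same `virtualenv_dir` concurrently,
so venv creation is guarded by a best-effort lockfile that stores the
creator's PID and uses psutil to test whether that process is still alive.
The protocol, roughly (sketch only, simplified from the implementation
below):

    lock = Path(virtualenv_dir) / "LOCK"

    # wait until no live process holds the lock
    while lock.is_file() and psutil.Process(int(lock.read_text())).is_running():
        time.sleep(1)
    lock.write_text(str(os.getpid()))  # acquire
    try:
        prepare_virtualenv(...)        # install dbt + requirements
    finally:
        lock.unlink()                  # release

Note this is advisory only: the check-then-write is not atomic, so two
workers may still race between the liveness check and the write.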
""" + template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",) def __init__( self, @@ -57,7 +59,7 @@ def __init__( self.pip_install_options = pip_install_options or [] self.py_system_site_packages = py_system_site_packages super().__init__(**kwargs) - self._venv_dir = virtualenv_dir + self.virtualenv_dir = virtualenv_dir self._venv_tmp_dir: None | TemporaryDirectory[str] = None @cached_property @@ -107,44 +109,72 @@ def execute(self, context: Context) -> None: logger.info(output) def _get_or_create_venv_py_interpreter(self) -> str: - """ - Helper method that parses virtual env configuration and returns a DBT binary within the resulting virtualenv: - Do we have a persistent virtual env dir set in `self._venv_dir`? - 1. Yes: Does a directory at that path exist? - 1. No: Create it, and create a virtual env inside it - 2. Yes: Does the directory have a virtual env inside it? - 1. No: Create one in this directory and return it - 2. Yes: Return this virtual env - 2. No: Create a temporary virtual env and return it + """Helper method that parses virtual env configuration + and returns a DBT binary within the resulting virtualenv""" - """ - if self._venv_dir is not None: - if self._venv_dir.is_dir(): - py_interpreter_path = Path(f"{self._venv_dir}/bin/python") + # No virtualenv_dir set, so revert to making a temporary virtualenv + if self.virtualenv_dir is None: + self.log.info("Creating temporary virtualenv") + self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv") - self.log.info(f"Checking for venv interpreter: {py_interpreter_path} : {py_interpreter_path.is_file()}") - if py_interpreter_path.is_file(): - self.log.info(f"Found Python interpreter in cached virtualenv: `{str(py_interpreter_path)}`") - return str(py_interpreter_path) - else: - os.mkdir(self._venv_dir) + return prepare_virtualenv( + venv_directory=self._venv_tmp_dir.name, + python_bin=PY_INTERPRETER, + system_site_packages=self.py_system_site_packages, + requirements=self.py_requirements, + ) - self.log.info(f"Creating virtualenv at `{self._venv_dir}") - venv_directory = str(self._venv_dir) + self.log.info(f"Checking if {str(self._lock_file)} exists") + while not self._is_lock_available(): + self.log.info("Waiting for lock to release") + time.sleep(1) - else: - self.log.info("Creating temporary virtualenv") - self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv") - venv_directory = self._venv_tmp_dir.name + self.log.info(f"Creating virtualenv at `{self.virtualenv_dir}") + self.log.info(f"Acquiring available lock") + self._acquire_venv_lock() - return prepare_virtualenv( - venv_directory=venv_directory, + py_bin = prepare_virtualenv( + venv_directory=str(self.virtualenv_dir), python_bin=PY_INTERPRETER, system_site_packages=self.py_system_site_packages, requirements=self.py_requirements, pip_install_options=self.pip_install_options, ) + self.log.info("Releasing lock") + self._release_venv_lock() + + return py_bin + + @property + def _lock_file(self) -> Path: + return Path(f"{self.virtualenv_dir}/LOCK") + + def _is_lock_available(self) -> bool: + if self._lock_file.is_file(): + with open(self._lock_file, "r") as lf: + pid = int(lf.read()) + + self.log.info(f"Checking for running process with PID {pid}") + _process_running = psutil.Process(pid).is_running() + + self.log.info(f"Process {pid} running: {_process_running}") + return not _process_running + + return True + + def _acquire_venv_lock(self) -> None: + if not self.virtualenv_dir.is_dir(): + os.mkdir(str(self.virtualenv_dir)) + + with 
+        with open(self._lock_file, "w") as lf:
+            pid = str(os.getpid())
+            self.log.info(f"Acquiring lock at {self._lock_file} with pid {pid}")
+            lf.write(pid)
+
+    def _release_venv_lock(self) -> None:
+        self._lock_file.unlink()
+

 class DbtBuildVirtualenvOperator(DbtVirtualenvBaseOperator, DbtBuildLocalOperator):  # type: ignore[misc]
     """
@@ -200,3 +230,4 @@ class DbtDocsVirtualenvOperator(DbtVirtualenvBaseOperator, DbtDocsLocalOperator)
     Executes `dbt docs generate` command within a Python Virtual Environment, that is created before running the dbt
     command and deleted just after.
     """
+

From 2fcc7079830b35f8758b95ccefecfd957f80a328 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Wed, 8 Nov 2023 09:27:13 +0100
Subject: [PATCH 10/52] Return types and private methods+attrs

---
 cosmos/operators/virtualenv.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index b6024b2a5..76f405f7c 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -45,7 +45,7 @@ class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
         within the virtual environment (if py_requirements argument is specified).
         Avoid using unless the dbt job requires it.
     """
-    template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",)
+    template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",)  # type: ignore[operator]

     def __init__(
         self,
@@ -124,14 +124,14 @@ def _get_or_create_venv_py_interpreter(self) -> str:
             requirements=self.py_requirements,
         )

-        self.log.info(f"Checking if {str(self._lock_file)} exists")
-        while not self._is_lock_available():
+        self.log.info(f"Checking if {str(self.__lock_file)} exists")
+        while not self._is_lock_available():
             self.log.info("Waiting for lock to release")
             time.sleep(1)

         self.log.info(f"Creating virtualenv at `{self.virtualenv_dir}")
         self.log.info(f"Acquiring available lock")
-        self._acquire_venv_lock()
+        self.__acquire_venv_lock()

         py_bin = prepare_virtualenv(
             venv_directory=str(self.virtualenv_dir),
@@ -142,17 +142,17 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         )

         self.log.info("Releasing lock")
-        self._release_venv_lock()
+        self.__release_venv_lock()

         return py_bin

     @property
-    def _lock_file(self) -> Path:
+    def __lock_file(self) -> Path:
         return Path(f"{self.virtualenv_dir}/LOCK")

     def _is_lock_available(self) -> bool:
-        if self._lock_file.is_file():
-            with open(self._lock_file, "r") as lf:
+        if self.__lock_file.is_file():
+            with open(self.__lock_file, "r") as lf:
                 pid = int(lf.read())

                 self.log.info(f"Checking for running process with PID {pid}")
@@ -163,17 +163,17 @@ def _is_lock_available(self) -> bool:

         return True

-    def _acquire_venv_lock(self) -> None:
+    def __acquire_venv_lock(self) -> None:
         if not self.virtualenv_dir.is_dir():
             os.mkdir(str(self.virtualenv_dir))

-        with open(self._lock_file, "w") as lf:
+        with open(self.__lock_file, "w") as lf:
             pid = str(os.getpid())
-            self.log.info(f"Acquiring lock at {self._lock_file} with pid {pid}")
+            self.log.info(f"Acquiring lock at {self.__lock_file} with pid {pid}")
             lf.write(pid)

-    def _release_venv_lock(self) -> None:
-        self._lock_file.unlink()
+    def __release_venv_lock(self) -> None:
+        self.__lock_file.unlink()

From 0a58f348e5b4d4d4af08a796c36ea40a65c64d60 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Thu, 9 Nov 2023 19:43:13 +0100
Subject: [PATCH 11/52] Add better checks around releasing lock file
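
Release is now owner-checked: the lockfile is only unlinked if it still
contains this process's PID, otherwise a warning is logged and the file is
left for its owner. In essence (sketch of the logic added below):

    with open(lock_file, "r") as lf:
        lock_file_pid = int(lf.read())
    if lock_file_pid == os.getpid():
        lock_file.unlink()   # we own the lock, safe to release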
---
 cosmos/operators/virtualenv.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 76f405f7c..3fcbf63b3 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -125,7 +125,7 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         self.log.info(f"Checking if {str(self.__lock_file)} exists")
-        while not self._is_lock_available():
+        while not self.__is_lock_available():
             self.log.info("Waiting for lock to release")
             time.sleep(1)

@@ -150,7 +150,11 @@ def _get_or_create_venv_py_interpreter(self) -> str:
     def __lock_file(self) -> Path:
         return Path(f"{self.virtualenv_dir}/LOCK")

-    def _is_lock_available(self) -> bool:
+    @property
+    def _pid(self) -> int:
+        return os.getpid()
+
+    def __is_lock_available(self) -> bool:
         if self.__lock_file.is_file():
             with open(self.__lock_file, "r") as lf:
                 pid = int(lf.read())
@@ -168,12 +172,20 @@ def __acquire_venv_lock(self) -> None:
             os.mkdir(str(self.virtualenv_dir))

         with open(self.__lock_file, "w") as lf:
-            pid = str(os.getpid())
-            self.log.info(f"Acquiring lock at {self.__lock_file} with pid {pid}")
-            lf.write(pid)
+            self.log.info(f"Acquiring lock at {self.__lock_file} with pid {str(self._pid)}")
+            lf.write(str(self._pid))

     def __release_venv_lock(self) -> None:
-        self.__lock_file.unlink()
+        if not self.__lock_file.is_file():
+            raise FileNotFoundError(f"Lockfile {self.__lock_file} not found")
+
+        with open(self.__lock_file, "r") as lf:
+            lock_file_pid = int(lf.read())
+
+        if lock_file_pid == self._pid:
+            return self.__lock_file.unlink()
+
+        self.log.warn(f"Lockfile owned by process of pid {lock_file_pid}, while operator has pid {self._pid}")

From 55f785e0f49e2d94681761de4c656aac9b666cd2 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Thu, 9 Nov 2023 20:49:35 +0100
Subject: [PATCH 12/52] Add a validation decorator
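
The decorator guards the lock helpers so that calling any of them with
`virtualenv_dir=None` raises a `CosmosValueError` up front, instead of
failing later with an `AttributeError` on `NoneType`. Usage sketch:

    @depends_on_virtualenv_dir
    def __acquire_venv_lock(self) -> None:
        ...  # may safely assume self.virtualenv_dir is set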
---
 cosmos/operators/virtualenv.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 3fcbf63b3..a0eff313e 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -11,6 +11,7 @@

 from airflow.utils.python_virtualenv import prepare_virtualenv

+from cosmos.exceptions import CosmosValueError
 from cosmos.hooks.subprocess import FullOutputSubprocessResult
 from cosmos.log import get_logger
 from cosmos.operators.local import (
@@ -33,6 +34,16 @@

 PY_INTERPRETER = "python3"

+def depends_on_virtualenv_dir(method):
+    def wrapper(operator: DbtVirtualenvBaseOperator, *args):
+        if operator.virtualenv_dir is None:
+            raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
+
+        logger.info(f"Operator: {operator}")
+        logger.info(f"Args: {args}")
+        method(operator, *args)
+    return wrapper
+
 class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
     """
     Executes a dbt core cli command within a Python Virtual Environment, that is created before running the dbt command
@@ -154,6 +165,7 @@ def _pid(self) -> int:
         return os.getpid()

+    @depends_on_virtualenv_dir
     def __is_lock_available(self) -> bool:
         if self.__lock_file.is_file():
             with open(self.__lock_file, "r") as lf:
                 pid = int(lf.read())
@@ -167,6 +179,7 @@ def __is_lock_available(self) -> bool:

         return True

+    @depends_on_virtualenv_dir
     def __acquire_venv_lock(self) -> None:
         if not self.virtualenv_dir.is_dir():
             os.mkdir(str(self.virtualenv_dir))
@@ -175,6 +188,7 @@ def __acquire_venv_lock(self) -> None:
             lf.write(str(self._pid))

+    @depends_on_virtualenv_dir
     def __release_venv_lock(self) -> None:
         if not self.__lock_file.is_file():
             raise FileNotFoundError(f"Lockfile {self.__lock_file} not found")

From cd47ddc1bfe1020912740d7f29c77e5d9fca008c Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Thu, 16 Nov 2023 11:51:02 +0100
Subject: [PATCH 13/52] Mock + rebase

---
 cosmos/operators/virtualenv.py     | 4 ++--
 tests/operators/test_virtualenv.py | 3 +++
 tests/test_converter.py            | 6 +++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index a0eff313e..07de3f54d 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -136,7 +136,7 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         self.log.info(f"Checking if {str(self.__lock_file)} exists")
-        while not self.__is_lock_available():
+        while not self._is_lock_available():
             self.log.info("Waiting for lock to release")
             time.sleep(1)

@@ -166,7 +166,7 @@ def _pid(self) -> int:

     @depends_on_virtualenv_dir
-    def __is_lock_available(self) -> bool:
+    def _is_lock_available(self) -> bool:
         if self.__lock_file.is_file():
             with open(self.__lock_file, "r") as lf:
                 pid = int(lf.read())

diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index e44d7d52c..aa4db9988 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -86,15 +86,18 @@ def test_virtualenv_operator_append_env_is_true_by_default():
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.exception_handling")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook")
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available")
 @patch("airflow.hooks.base.BaseHook.get_connection")
 def test_supply_virtualenv_dir_flag(
     mock_get_connection,
+    mock_lock_available,
     mock_subprocess_hook,
     mock_exception_handling,
     mock_store_compiled_sql,
     mock_calculate_openlineage_events_completes,
     mock_execute,
 ):
+    mock_lock_available.return_value = True
     mock_get_connection.return_value = Connection(
         conn_id="fake_conn",
         conn_type="postgres",

diff --git a/tests/test_converter.py b/tests/test_converter.py
index 02203e135..4c5af77b4 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -203,12 +203,14 @@ def test_converter_creates_dag_with_project_path_str(mock_load_dbt_graph, execut
         # (ExecutionMode.DOCKER, {"image": "sample-image"}),
     ],
 )
+@patch("cosmos.converter.DbtGraph.filtered_nodes", nodes)
+@patch("cosmos.converter.DbtGraph.load")
 def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualenv_dir, operator_args, caplog):
     """
     This test will raise a warning if we are trying to pass ExecutionMode != `VirtualEnv`
     and still pass a defined `virtualenv_dir`
     """
-    project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT)
+    project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT.as_posix())
     execution_config = ExecutionConfig(execution_mode=execution_mode, virtualenv_dir=virtualenv_dir)
     render_config = RenderConfig(emit_datasets=True)
     profile_config = ProfileConfig(
         profile_name="my_profile_name",
         target_name="my_target_name",
         profiles_yml_filepath=SAMPLE_PROFILE_YML,
     )

     converter = DbtToAirflowConverter(
         nodes=nodes,
         project_config=project_config,
         profile_config=profile_config,
         execution_config=execution_config,
         render_config=render_config,
         operator_args=operator_args,
     )

-    assert converter
-
     assert "`ExecutionConfig.virtualenv_dir` is only supported when \
         ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV." in caplog.text

From ad54b59df3c3f47444eeb2a1416952b18973f0a2 Mon Sep 17 00:00:00 2001
From: MrBones757
Date: Sat, 4 Nov 2023 00:32:11 +0800
Subject: [PATCH 14/52] Support `ProjectConfig.dbt_project_path = None` &
 different paths for Rendering and Execution (#634)

This MR finishes the work that was started in #605 to add full support for
ProjectConfig.dbt_project_path = None, and implements #568.

Within this PR, several things have been updated:
1 - Added project_path fields to RenderConfig and ExecutionConfig
2 - Simplified the consumption of RenderConfig in the dbtGraph class
3 - added option to configure different dbt executables for Rendering vs Execution.

Closes: #568
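
In practice this allows layouts where parsing and execution see the project
(and dbt itself) at different locations, roughly (sketch; both paths are
illustrative):

    render_config = RenderConfig(
        dbt_project_path="/usr/local/airflow/dbt/jaffle_shop",
        dbt_executable_path="/usr/local/bin/dbt",  # used only to parse the project
    )
    execution_config = ExecutionConfig(
        dbt_project_path="/app/dbt/jaffle_shop",   # path inside the worker/container
    )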
---
 cosmos/config.py        |  2 +-
 cosmos/converter.py     |  8 ++---
 cosmos/dbt/graph.py     | 27 +++++++++++++++-
 tests/dbt/test_graph.py | 13 ++++----
 tests/test_converter.py | 68 -----------------------------------------
 5 files changed, 37 insertions(+), 81 deletions(-)

diff --git a/cosmos/config.py b/cosmos/config.py
index ea40f3cd5..fb7052a17 100644
--- a/cosmos/config.py
+++ b/cosmos/config.py
@@ -367,6 +367,7 @@ class ExecutionConfig:
     dbt_project_path: InitVar[str | Path | None] = None

     virtualenv_dir: str | Path | None = None
+
     project_path: Path | None = field(init=False)

     def __post_init__(self, dbt_project_path: str | Path | None) -> None:
@@ -385,4 +386,3 @@ def __post_init__(self, dbt_project_path: str | Path | None) -> None:
         )
         self.invocation_mode = InvocationMode.SUBPROCESS
         self.project_path = Path(dbt_project_path) if dbt_project_path else None
-

diff --git a/cosmos/converter.py b/cosmos/converter.py
index 7852a445c..77086e5f4 100644
--- a/cosmos/converter.py
+++ b/cosmos/converter.py
@@ -224,10 +224,10 @@ def __init__(

         validate_adapted_user_config(execution_config, project_config, render_config)

-        env_vars = project_config.env_vars or operator_args.get("env")
-        dbt_vars = project_config.dbt_vars or operator_args.get("vars")
+        env_vars = copy.deepcopy(project_config.env_vars or operator_args.get("env"))
+        dbt_vars = copy.deepcopy(project_config.dbt_vars or operator_args.get("vars"))

-        if execution_mode != ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None:
+        if execution_config.execution_mode != ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None:
             logger.warning(
                 "`ExecutionConfig.virtualenv_dir` is only supported when \
                 ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV."
@@ -288,7 +288,7 @@ def __init__(
             task_args,
             execution_mode=execution_config.execution_mode,
         )
-        if (execution_mode == ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None):
+        if execution_config.execution_mode == ExecutionMode.VIRTUALENV and execution_config.virtualenv_dir is not None:
             task_args["virtualenv_dir"] = execution_config.virtualenv_dir

         build_airflow_graph(

diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py
index 68105eb21..d15c3c716 100644
--- a/cosmos/dbt/graph.py
+++ b/cosmos/dbt/graph.py
@@ -29,6 +29,7 @@
     ExecutionMode,
     LoadMode,
 )
+from cosmos.dbt.executable import get_system_dbt
 from cosmos.dbt.parser.project import LegacyDbtProject
 from cosmos.dbt.project import create_symlinks, environ, get_partial_parse_path, has_non_empty_dependencies_file
 from cosmos.dbt.selector import select_nodes
@@ -99,6 +100,14 @@ def context_dict(self) -> dict[str, Any]:
     }

+def create_symlinks(project_path: Path, tmp_dir: Path) -> None:
+    """Helper function to create symlinks to the dbt project files."""
+    ignore_paths = (DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, "dbt_packages", "profiles.yml")
+    for child_name in os.listdir(project_path):
+        if child_name not in ignore_paths:
+            os.symlink(project_path / child_name, tmp_dir / child_name)
+
+
 def run_command(command: list[str], tmp_dir: Path, env_vars: dict[str, str]) -> str:
     """Run a command in a subprocess, returning the stdout."""
     logger.info("Running command: `%s`", " ".join(command))
@@ -154,6 +163,15 @@ class DbtGraph:

     Supports different ways of loading the `dbt` project into this representation.

     Different loading methods can result in different `nodes` and `filtered_nodes`.
+
+    Example of how to use:
+
+        dbt_graph = DbtGraph(
+            project=ProjectConfig(dbt_project_path=DBT_PROJECT_PATH),
+            render_config=RenderConfig(exclude=["*orders*"], select=[]),
+            dbt_cmd="/usr/local/bin/dbt"
+        )
+        dbt_graph.load(method=LoadMode.DBT_LS, execution_mode=ExecutionMode.LOCAL)
     """

     nodes: dict[str, DbtNode] = dict()
@@ -170,6 +188,8 @@ def __init__(
         cache_identifier: str = "",
         dbt_vars: dict[str, str] | None = None,
         airflow_metadata: dict[str, str] | None = None,
+        dbt_cmd: str = get_system_dbt(),
+        operator_args: dict[str, Any] | None = None,
     ):
         self.project = project
         self.render_config = render_config
@@ -182,6 +202,8 @@ def __init__(
         else:
             self.dbt_ls_cache_key = ""
         self.dbt_vars = dbt_vars or {}
+        self.operator_args = operator_args or {}
+        self.dbt_cmd = dbt_cmd

     @cached_property
     def env_vars(self) -> dict[str, str]:
@@ -347,6 +369,7 @@ def load(
     def run_dbt_ls(
         self, dbt_cmd: str, project_path: Path, tmp_dir: Path, env_vars: dict[str, str]
     ) -> dict[str, DbtNode]:
+        """Runs dbt ls command and returns the parsed nodes."""

         ls_command = [dbt_cmd, "ls", "--output", "json"]

@@ -446,6 +469,9 @@ def load_via_dbt_ls_without_cache(self) -> None:
         if not self.profile_config:
             raise CosmosLoadDbtException("Unable to load project via dbt ls without a profile config.")

+        if not self.profile_config:
+            raise CosmosLoadDbtException("Unable to load project via dbt ls without a profile config.")
+
         with tempfile.TemporaryDirectory() as tmpdir:
             logger.debug(f"Content of the dbt project dir {project_path}: `{os.listdir(project_path)}`")
             tmpdir_path = Path(tmpdir)
@@ -490,7 +516,6 @@ def load_via_dbt_ls_without_cache(self) -> None:
                 self.run_dbt_deps(dbt_cmd, tmpdir_path, env)

             nodes = self.run_dbt_ls(dbt_cmd, self.project_path, tmpdir_path, env)
-
         self.nodes = nodes
         self.filtered_nodes = nodes

diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py
index 7483c768c..e236e6629 100644
--- a/tests/dbt/test_graph.py
+++ b/tests/dbt/test_graph.py
@@ -15,6 +15,7 @@
 from cosmos import settings
 from cosmos.config import CosmosConfigException, ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
 from cosmos.constants import DBT_TARGET_DIR_NAME, DbtResourceType, ExecutionMode
+
 from cosmos.dbt.graph import (
     CosmosLoadDbtException,
     DbtGraph,
@@ -498,8 +499,9 @@ def test_load_via_dbt_ls_without_exclude(project_name, postgres_profile_config):
 def test_load_via_custom_without_project_path():
     project_config = ProjectConfig(manifest_path=SAMPLE_MANIFEST, project_name="test")
     execution_config = ExecutionConfig()
-    render_config = RenderConfig(dbt_executable_path="/inexistent/dbt")
+    render_config = RenderConfig()
     dbt_graph = DbtGraph(
+        dbt_cmd="/inexistent/dbt",
         project=project_config,
         execution_config=execution_config,
         render_config=render_config,
@@ -515,10 +517,9 @@ def test_load_via_dbt_ls_without_profile(mock_validate_dbt_command):
     project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
     execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
-    render_config = RenderConfig(
-        dbt_executable_path="existing-dbt-cmd", dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME
-    )
+    render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
     dbt_graph = DbtGraph(
+        dbt_cmd="/inexistent/dbt",
         project=project_config,
         execution_config=execution_config,
         render_config=render_config,
@@ -534,9 +535,7 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which):
     project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
     execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
-    render_config = RenderConfig(
-        dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, dbt_executable_path="/inexistent/dbt"
-    )
+    render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
     with patch("pathlib.Path.exists", return_value=True):
         dbt_graph = DbtGraph(
             project=project_config,

diff --git a/tests/test_converter.py b/tests/test_converter.py
index 4c5af77b4..4404c5235 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -270,74 +270,6 @@ def test_converter_fails_execution_config_no_project_dir(mock_load_dbt_graph, ex
     )

-def test_converter_fails_render_config_invalid_dbt_path_with_dbt_ls():
-    """
-    Validate that a dbt project fails to be rendered to Airflow with DBT_LS if
-    the dbt command is invalid.
- """ - project_config = ProjectConfig(dbt_project_path=SAMPLE_DBT_PROJECT.as_posix(), project_name="sample") - execution_config = ExecutionConfig( - execution_mode=ExecutionMode.LOCAL, - dbt_executable_path="invalid-execution-dbt", - ) - render_config = RenderConfig( - emit_datasets=True, - dbt_executable_path="invalid-render-dbt", - ) - profile_config = ProfileConfig( - profile_name="my_profile_name", - target_name="my_target_name", - profiles_yml_filepath=SAMPLE_PROFILE_YML, - ) - with pytest.raises(CosmosConfigException) as err_info: - with DAG("test-id", start_date=datetime(2022, 1, 1)) as dag: - DbtToAirflowConverter( - dag=dag, - nodes=nodes, - project_config=project_config, - profile_config=profile_config, - execution_config=execution_config, - render_config=render_config, - ) - assert ( - err_info.value.args[0] - == "Unable to find the dbt executable, attempted: and ." - ) - - -def test_converter_fails_render_config_invalid_dbt_path_with_manifest(): - """ - Validate that a dbt project succeeds to be rendered to Airflow with DBT_MANIFEST even when - the dbt command is invalid. - """ - project_config = ProjectConfig(manifest_path=SAMPLE_DBT_MANIFEST.as_posix(), project_name="sample") - - execution_config = ExecutionConfig( - execution_mode=ExecutionMode.LOCAL, - dbt_executable_path="invalid-execution-dbt", - dbt_project_path=SAMPLE_DBT_PROJECT.as_posix(), - ) - render_config = RenderConfig( - emit_datasets=True, - dbt_executable_path="invalid-render-dbt", - ) - profile_config = ProfileConfig( - profile_name="my_profile_name", - target_name="my_target_name", - profiles_yml_filepath=SAMPLE_PROFILE_YML, - ) - with DAG("test-id", start_date=datetime(2022, 1, 1)) as dag: - converter = DbtToAirflowConverter( - dag=dag, - nodes=nodes, - project_config=project_config, - profile_config=profile_config, - execution_config=execution_config, - render_config=render_config, - ) - assert converter - - @pytest.mark.parametrize( "execution_mode,operator_args", [ From 8e86ce251a8dce84456a911c61568fb9422ac2b9 Mon Sep 17 00:00:00 2001 From: Lennart Kloppenburg Date: Fri, 17 Nov 2023 10:43:56 +0100 Subject: [PATCH 15/52] Fix rebase conflicts --- cosmos/dbt/graph.py | 15 ++++----------- cosmos/operators/virtualenv.py | 8 ++++---- tests/dbt/test_graph.py | 6 ++++-- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py index d15c3c716..cb50e9bfe 100644 --- a/cosmos/dbt/graph.py +++ b/cosmos/dbt/graph.py @@ -7,6 +7,7 @@ import json import os import platform +import shutil import tempfile import zlib from dataclasses import dataclass, field @@ -100,14 +101,6 @@ def context_dict(self) -> dict[str, Any]: } -def create_symlinks(project_path: Path, tmp_dir: Path) -> None: - """Helper function to create symlinks to the dbt project files.""" - ignore_paths = (DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, "dbt_packages", "profiles.yml") - for child_name in os.listdir(project_path): - if child_name not in ignore_paths: - os.symlink(project_path / child_name, tmp_dir / child_name) - - def run_command(command: list[str], tmp_dir: Path, env_vars: dict[str, str]) -> str: """Run a command in a subprocess, returning the stdout.""" logger.info("Running command: `%s`", " ".join(command)) @@ -371,7 +364,7 @@ def run_dbt_ls( ) -> dict[str, DbtNode]: """Runs dbt ls command and returns the parsed nodes.""" - ls_command = [dbt_cmd, "ls", "--output", "json"] + ls_command = [self.dbt_cmd, "ls", "--output", "json"] ls_args = self.dbt_ls_args ls_command.extend(self.local_flags) @@ -469,8 
+462,8 @@ def load_via_dbt_ls_without_cache(self) -> None: if not self.profile_config: raise CosmosLoadDbtException("Unable to load project via dbt ls without a profile config.") - if not self.profile_config: - raise CosmosLoadDbtException("Unable to load project via dbt ls without a profile config.") + if not shutil.which(self.dbt_cmd): + raise CosmosLoadDbtException(f"Unable to find the dbt executable: {self.dbt_cmd}") with tempfile.TemporaryDirectory() as tmpdir: logger.debug(f"Content of the dbt project dir {project_path}: `{os.listdir(project_path)}`") diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 07de3f54d..5798dca7b 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -7,7 +7,7 @@ from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable from airflow.utils.python_virtualenv import prepare_virtualenv @@ -34,8 +34,8 @@ PY_INTERPRETER = "python3" -def depends_on_virtualenv_dir(method): - def wrapper(operator: DbtVirtualenvBaseOperator, *args): +def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]: + def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None: if operator.virtualenv_dir is None: raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.") @@ -181,7 +181,7 @@ def _is_lock_available(self) -> bool: @depends_on_virtualenv_dir def __acquire_venv_lock(self) -> None: - if not self.virtualenv_dir.is_dir(): + if not self.virtualenv_dir.is_dir(): # type: ignore os.mkdir(str(self.virtualenv_dir)) with open(self.__lock_file, "w") as lf: diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py index e236e6629..1f7a4ec1b 100644 --- a/tests/dbt/test_graph.py +++ b/tests/dbt/test_graph.py @@ -16,6 +16,7 @@ from cosmos.config import CosmosConfigException, ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig from cosmos.constants import DBT_TARGET_DIR_NAME, DbtResourceType, ExecutionMode + from cosmos.dbt.graph import ( CosmosLoadDbtException, DbtGraph, @@ -538,6 +539,7 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which): render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) with patch("pathlib.Path.exists", return_value=True): dbt_graph = DbtGraph( + dbt_cmd="/inexistent/dbt", project=project_config, execution_config=execution_config, render_config=render_config, @@ -547,10 +549,10 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which): profiles_yml_filepath=Path(__file__).parent.parent / "sample/profiles.yml", ), ) - with pytest.raises(CosmosConfigException) as err_info: + with pytest.raises(CosmosLoadDbtException) as err_info: dbt_graph.load_via_dbt_ls() - expected = "Unable to find the dbt executable, attempted: and ." 
+    expected = "Unable to find the dbt executable: /inexistent/dbt"
     assert err_info.value.args[0] == expected
 

From e384177e779248e14b3325bb88b9c3e6ddc91ea9 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 9 Nov 2023 16:09:54 +0000
Subject: [PATCH 16/52] Fix reusing config across TaskGroups/DAGs (#664)

If execution_config was reused, Cosmos 1.2.2 would raise:

```
astronomer-cosmos/dags/basic_cosmos_task_group.py
Traceback (most recent call last):
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/venv-38/lib/python3.8/site-packages/airflow/models/dagbag.py", line 343, in parse
    loader.exec_module(new_module)
  File "", line 848, in exec_module
  File "", line 219, in _call_with_frames_removed
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/dags/basic_cosmos_task_group.py", line 74, in
    basic_cosmos_task_group()
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/venv-38/lib/python3.8/site-packages/airflow/models/dag.py", line 3817, in factory
    f(**f_kwargs)
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/dags/basic_cosmos_task_group.py", line 54, in basic_cosmos_task_group
    orders = DbtTaskGroup(
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/cosmos/airflow/task_group.py", line 26, in __init__
    DbtToAirflowConverter.__init__(self, *args, **specific_kwargs(**kwargs))
  File "/Users/tati/Code/cosmos-clean/astronomer-cosmos/cosmos/converter.py", line 113, in __init__
    raise CosmosValueError(
cosmos.exceptions.CosmosValueError: ProjectConfig.dbt_project_path is mutually exclusive with RenderConfig.dbt_project_path and ExecutionConfig.dbt_project_path.If using RenderConfig.dbt_project_path or ExecutionConfig.dbt_project_path, ProjectConfig.dbt_project_path should be None
```

This has been raised by an Astro customer and our field engineer, who tried to run:
https://github.com/astronomer/cosmos-demo
---
 cosmos/converter.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cosmos/converter.py b/cosmos/converter.py
index 77086e5f4..1a6dd6f56 100644
--- a/cosmos/converter.py
+++ b/cosmos/converter.py
@@ -220,7 +220,12 @@ def __init__(
         validate_initial_user_config(execution_config, profile_config, project_config, render_config, operator_args)
 
         if project_config.dbt_project_path:
-            execution_config, render_config = migrate_to_new_interface(execution_config, project_config, render_config)
+            # We copy the configuration so the change does not affect other DAGs or TaskGroups
+            # that may reuse the same original configuration
+            render_config = copy.deepcopy(render_config)
+            execution_config = copy.deepcopy(execution_config)
+            render_config.project_path = project_config.dbt_project_path
+            execution_config.project_path = project_config.dbt_project_path
 
         validate_adapted_user_config(execution_config, project_config, render_config)
 

From d0eeae0564e51781cd9a9e9b41d2ba0e65fa36d6 Mon Sep 17 00:00:00 2001
From: Lennart Kloppenburg
Date: Wed, 20 Dec 2023 11:46:28 +0100
Subject: [PATCH 17/52] Iron out locking flow

---
 cosmos/operators/virtualenv.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 5798dca7b..8b4437078 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -38,9 +38,7 @@ def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], A
     def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None:
         if operator.virtualenv_dir is None:
             raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
-
-        logger.info(f"Operator: {operator}")
-        logger.info(f"Args: {args}")
+
         method(operator, *args)
 
     return wrapper
@@ -165,14 +163,17 @@ def __lock_file(self) -> Path:
     def _pid(self) -> int:
         return os.getpid()
 
-    @depends_on_virtualenv_dir
+    #@depends_on_virtualenv_dir
     def _is_lock_available(self) -> bool:
         if self.__lock_file.is_file():
             with open(self.__lock_file, "r") as lf:
                 pid = int(lf.read())
 
                 self.log.info(f"Checking for running process with PID {pid}")
-                _process_running = psutil.Process(pid).is_running()
+                try:
+                    _process_running = psutil.Process(pid).is_running()
+                except psutil.NoSuchProcess:
+                    _process_running = False
 
                 self.log.info(f"Process {pid} running: {_process_running}")
                 return not _process_running
@@ -191,7 +192,9 @@ def __acquire_venv_lock(self) -> None:
     @depends_on_virtualenv_dir
     def __release_venv_lock(self) -> None:
         if not self.__lock_file.is_file():
-            raise FileNotFoundError(f"Lockfile {self.__lock_file} not found")
+            self.log.warn(f"Lockfile {self.__lock_file} not found, perhaps deleted by other concurrent operator?")
+
+            return
 
         with open(self.__lock_file, "r") as lf:
             lock_file_pid = int(lf.read())

From bebe6d49858d86ef980d660900911223b516edc7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 20 Dec 2023 10:46:48 +0000
Subject: [PATCH 18/52] =?UTF-8?q?=F0=9F=8E=A8=20[pre-commit.ci]=20Auto=20f?=
 =?UTF-8?q?ormat=20from=20pre-commit.com=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosmos/operators/virtualenv.py | 25 ++++++++++++++-----------
 tests/test_converter.py       | 11 +++++++----
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 8b4437078..49af9c7c1 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -34,14 +34,17 @@
 PY_INTERPRETER = "python3"
 
+
 def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
     def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None:
         if operator.virtualenv_dir is None:
             raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
         method(operator, *args)
+
     return wrapper
 
+
 class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
     """
     Executes a dbt core cli command within a Python Virtual Environment, that is created before running the dbt command
@@ -54,7 +57,8 @@ class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
     within the virtual environment (if py_requirements argument is specified). Avoid using unless the dbt job requires it.
""" - template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",) # type: ignore[operator] + + template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",) # type: ignore[operator] def __init__( self, @@ -118,7 +122,7 @@ def execute(self, context: Context) -> None: logger.info(output) def _get_or_create_venv_py_interpreter(self) -> str: - """Helper method that parses virtual env configuration + """Helper method that parses virtual env configuration and returns a DBT binary within the resulting virtualenv""" # No virtualenv_dir set, so revert to making a temporary virtualenv @@ -154,19 +158,19 @@ def _get_or_create_venv_py_interpreter(self) -> str: self.__release_venv_lock() return py_bin - + @property def __lock_file(self) -> Path: return Path(f"{self.virtualenv_dir}/LOCK") - + @property def _pid(self) -> int: return os.getpid() - - #@depends_on_virtualenv_dir + + # @depends_on_virtualenv_dir def _is_lock_available(self) -> bool: if self.__lock_file.is_file(): - with open(self.__lock_file, "r") as lf: + with open(self.__lock_file) as lf: pid = int(lf.read()) self.log.info(f"Checking for running process with PID {pid}") @@ -182,13 +186,13 @@ def _is_lock_available(self) -> bool: @depends_on_virtualenv_dir def __acquire_venv_lock(self) -> None: - if not self.virtualenv_dir.is_dir(): # type: ignore + if not self.virtualenv_dir.is_dir(): # type: ignore os.mkdir(str(self.virtualenv_dir)) with open(self.__lock_file, "w") as lf: self.log.info(f"Acquiring lock at {self.__lock_file} with pid {str(self._pid)}") lf.write(str(self._pid)) - + @depends_on_virtualenv_dir def __release_venv_lock(self) -> None: if not self.__lock_file.is_file(): @@ -196,7 +200,7 @@ def __release_venv_lock(self) -> None: return - with open(self.__lock_file, "r") as lf: + with open(self.__lock_file) as lf: lock_file_pid = int(lf.read()) if lock_file_pid == self._pid: @@ -259,4 +263,3 @@ class DbtDocsVirtualenvOperator(DbtVirtualenvBaseOperator, DbtDocsLocalOperator) Executes `dbt docs generate` command within a Python Virtual Environment, that is created before running the dbt command and deleted just after. """ - diff --git a/tests/test_converter.py b/tests/test_converter.py index 4404c5235..06b9be755 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -168,7 +168,7 @@ def test_converter_creates_dag_with_seed(mock_load_dbt_graph, execution_mode, op "execution_mode,operator_args", [ (ExecutionMode.KUBERNETES, {}), - ] + ], ) @patch("cosmos.converter.DbtGraph.filtered_nodes", nodes) @patch("cosmos.converter.DbtGraph.load") @@ -196,6 +196,7 @@ def test_converter_creates_dag_with_project_path_str(mock_load_dbt_graph, execut ) assert converter + @pytest.mark.parametrize( "execution_mode,virtualenv_dir,operator_args", [ @@ -228,9 +229,11 @@ def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualen operator_args=operator_args, ) - assert "`ExecutionConfig.virtualenv_dir` is only supported when \ - ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV." in caplog.text - + assert ( + "`ExecutionConfig.virtualenv_dir` is only supported when \ + ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV." 
+        in caplog.text
+    )
 
 
 @pytest.mark.parametrize(

From 801a6dc0e63a1419c3087c325f55c73b01a02d27 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 4 Jul 2024 13:48:25 +0100
Subject: [PATCH 19/52] Temporarily enable tests on this branch

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f19831676..337064d03 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: test
 
 on:
   push: # Run on pushes to the default branch
-    branches: [main]
+    branches: [main, feature/cache-virtualenv]
   pull_request_target: # Also run on pull requests originated from forks
     branches: [main]
 

From 521ce5a5020aea9de261f547e3dc9b22b9b4350c Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 4 Jul 2024 13:59:19 +0100
Subject: [PATCH 20/52] Fix tests in main

---
 tests/dbt/test_graph.py            | 11 +++++------
 tests/operators/test_virtualenv.py | 18 ++++++++++++++----
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py
index 1f7a4ec1b..0ba3332e6 100644
--- a/tests/dbt/test_graph.py
+++ b/tests/dbt/test_graph.py
@@ -15,8 +15,6 @@
 from cosmos import settings
 from cosmos.config import CosmosConfigException, ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
 from cosmos.constants import DBT_TARGET_DIR_NAME, DbtResourceType, ExecutionMode
-
-
 from cosmos.dbt.graph import (
     CosmosLoadDbtException,
     DbtGraph,
@@ -536,10 +534,11 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which):
     project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
     execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
-    render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME)
+    render_config = RenderConfig(
+        dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, dbt_executable_path="/inexistent/dbt"
+    )
     with patch("pathlib.Path.exists", return_value=True):
         dbt_graph = DbtGraph(
-            dbt_cmd="/inexistent/dbt",
             project=project_config,
             execution_config=execution_config,
             render_config=render_config,
@@ -549,10 +548,10 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which):
                 profiles_yml_filepath=Path(__file__).parent.parent / "sample/profiles.yml",
             ),
         )
-    with pytest.raises(CosmosLoadDbtException) as err_info:
+    with pytest.raises(CosmosConfigException) as err_info:
         dbt_graph.load_via_dbt_ls()
-    expected = "Unable to find the dbt executable: /inexistent/dbt"
+    expected = "Unable to find the dbt executable, attempted: and ."
     assert err_info.value.args[0] == expected
 
 
diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index aa4db9988..129430215 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -1,6 +1,6 @@
 from datetime import datetime
-from unittest.mock import MagicMock, patch
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 from airflow.models import DAG
 from airflow.models.connection import Connection
@@ -79,6 +79,18 @@ def test_run_command(
 def test_virtualenv_operator_append_env_is_true_by_default():
     venv_operator = ConcreteDbtVirtualenvBaseOperator(
         dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)),
+        profile_config=profile_config,
+        task_id="fake_task",
+        install_deps=True,
+        project_dir="./dev/dags/dbt/jaffle_shop",
+        py_system_site_packages=False,
+        pip_install_options=["--test-flag"],
+        py_requirements=["dbt-postgres==1.6.0b1"],
+        emit_datasets=False,
+        invocation_mode=InvocationMode.SUBPROCESS,
+    )
+
+    assert venv_operator.append_env is True
 
 
 @patch("airflow.utils.python_virtualenv.execute_in_subprocess")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook")
 @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available")
 @patch("airflow.hooks.base.BaseHook.get_connection")
 def test_supply_virtualenv_dir_flag(
     mock_get_connection,
     mock_lock_available,
     mock_subprocess_hook,
     mock_store_compiled_sql,
     mock_calculate_openlineage_events_completes,
     mock_execute,
@@ -113,10 +125,8 @@ def test_supply_virtualenv_dir_flag(
         install_deps=True,
         project_dir="./dev/dags/dbt/jaffle_shop",
         py_system_site_packages=False,
-        pip_install_options=["--test-flag"],
         py_requirements=["dbt-postgres==1.6.0b1"],
         emit_datasets=False,
-        invocation_mode=InvocationMode.SUBPROCESS,
         virtualenv_dir=Path("mock-venv"),
-    assert venv_operator.append_env is True
+    )
     assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt"

From bd5aa76e5f4f8e1b2e977a8d1948bf5c62c9b8fa Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 4 Jul 2024 17:52:10 +0100
Subject: [PATCH 21/52] Fix tests

---
 tests/operators/test_virtualenv.py | 11 +++++++----
 tests/test_converter.py            |  5 +++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index 129430215..d80e4334c 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from datetime import datetime
 from pathlib import Path
 from unittest.mock import MagicMock, patch
@@ -21,7 +23,10 @@
 
 
 class ConcreteDbtVirtualenvBaseOperator(DbtVirtualenvBaseOperator):
-    base_cmd = ["cmd"]
+
+    @property
+    def base_cmd(self) -> list[str]:
+        return ["cmd"]
 
 
 @patch("airflow.utils.python_virtualenv.execute_in_subprocess")
@@ -96,7 +101,6 @@ def test_virtualenv_operator_append_env_is_true_by_default():
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql")
-@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.exception_handling")
 @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook")
 @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available")
 @patch("airflow.hooks.base.BaseHook.get_connection")
 def test_supply_virtualenv_dir_flag(
     mock_get_connection,
     mock_lock_available,
     mock_subprocess_hook,
-    mock_exception_handling,
     mock_store_compiled_sql,
     mock_calculate_openlineage_events_completes,
     mock_execute,
@@ -119,7 +122,7 @@ def test_supply_virtualenv_dir_flag(
         password="fake_password",
         schema="fake_schema",
     )
-    venv_operator = DbtVirtualenvBaseOperator(
+    venv_operator = ConcreteDbtVirtualenvBaseOperator(
         profile_config=profile_config,
         task_id="fake_task",
         install_deps=True,
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 06b9be755..573eba138 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -6,7 +6,7 @@
 import pytest
 from airflow.models import DAG
 
-from cosmos.config import CosmosConfigException, ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
+from cosmos.config import ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
 from cosmos.constants import DbtResourceType, ExecutionMode, InvocationMode, LoadMode
 from cosmos.converter import DbtToAirflowConverter, validate_arguments, validate_initial_user_config
 from cosmos.dbt.graph import DbtGraph, DbtNode
@@ -220,7 +220,8 @@ def test_converter_raises_warning(mock_load_dbt_graph, execution_mode, virtualen
         profiles_yml_filepath=SAMPLE_PROFILE_YML,
     )
 
-    converter = DbtToAirflowConverter(
+    DbtToAirflowConverter(
+        dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)),
         nodes=nodes,
         project_config=project_config,
         profile_config=profile_config,

From 376616b47ddebed86d4351c0f9f86b0c78606302 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Fri, 5 Jul 2024 09:53:25 +0100
Subject: [PATCH 22/52] Remove check for dbt binary, that is handled elsewhere

---
 cosmos/dbt/graph.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py
index cb50e9bfe..087fef5a4 100644
--- a/cosmos/dbt/graph.py
+++ b/cosmos/dbt/graph.py
@@ -7,7 +7,6 @@
 import json
 import os
 import platform
-import shutil
 import tempfile
 import zlib
 from dataclasses import dataclass, field
@@ -362,7 +361,6 @@ def load(
     def run_dbt_ls(
         self, dbt_cmd: str, project_path: Path, tmp_dir: Path, env_vars: dict[str, str]
     ) -> dict[str, DbtNode]:
-
         """Runs dbt ls command and returns the parsed nodes."""
         ls_command = [self.dbt_cmd, "ls", "--output", "json"]
 
@@ -462,9 +460,6 @@ def load_via_dbt_ls_without_cache(self) -> None:
         if not self.profile_config:
             raise CosmosLoadDbtException("Unable to load project via dbt ls without a profile config.")
 
-        if not shutil.which(self.dbt_cmd):
-            raise CosmosLoadDbtException(f"Unable to find the dbt executable: {self.dbt_cmd}")
-
         with tempfile.TemporaryDirectory() as tmpdir:
             logger.debug(f"Content of the dbt project dir {project_path}: `{os.listdir(project_path)}`")
             tmpdir_path = Path(tmpdir)

From 738ee73bec20e3f3b7ad7cc27e6cdcfe481ef0bb Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Fri, 5 Jul 2024 10:13:49 +0100
Subject: [PATCH 23/52] Fix pre-commit hook check

---
 cosmos/converter.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/cosmos/converter.py b/cosmos/converter.py
index 1a6dd6f56..553b7b49e 100644
--- a/cosmos/converter.py
+++ b/cosmos/converter.py
@@ -16,7 +16,7 @@
 
 from cosmos import cache, settings
 from cosmos.airflow.graph import build_airflow_graph
-from cosmos.config import ProjectConfig, ExecutionConfig, RenderConfig, ProfileConfig
+from cosmos.config import ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
 from cosmos.constants import ExecutionMode
 from cosmos.dbt.graph import DbtGraph
 from cosmos.dbt.selector import retrieve_by_label
@@ -238,10 +238,6 @@ def __init__(
             ExecutionConfig.execution_mode is set to ExecutionMode.VIRTUALENV."
         )
 
-        profile_args = {}
-        if profile_config.profile_mapping:
-            profile_args = profile_config.profile_mapping.profile_args
-
         if not operator_args:
             operator_args = {}

From 610e5a46ae6d9ca6da1486fbe950d48e41f114f5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 5 Jul 2024 09:17:00 +0000
Subject: [PATCH 24/52] =?UTF-8?q?=F0=9F=8E=A8=20[pre-commit.ci]=20Auto=20f?=
 =?UTF-8?q?ormat=20from=20pre-commit.com=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cosmos/operators/virtualenv.py | 5 ++---
 dev/dags/example_virtualenv.py | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 49af9c7c1..5ca408baf 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -1,14 +1,13 @@
 from __future__ import annotations
 
-from functools import cached_property
 import os
-import psutil
 import time
-
+from functools import cached_property
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any, Callable
 
+import psutil
 from airflow.utils.python_virtualenv import prepare_virtualenv
 
 from cosmos.exceptions import CosmosValueError
diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py
index d13cf5099..3f4be60d5 100644
--- a/dev/dags/example_virtualenv.py
+++ b/dev/dags/example_virtualenv.py
@@ -6,12 +6,11 @@
 from datetime import datetime
 from pathlib import Path
 
-from airflow.decorators import dag
 from airflow.configuration import get_airflow_home
+from airflow.decorators import dag
 from airflow.operators.empty import EmptyOperator
 
 from cosmos import DbtTaskGroup, ExecutionConfig, ExecutionMode, ProfileConfig, ProjectConfig
-
 from cosmos.profiles import PostgresUserPasswordProfileMapping
 
 DEFAULT_DBT_ROOT_PATH = Path(__file__).parent / "dbt"
@@ -27,6 +26,7 @@
     ),
 )
 
+
 # [START virtualenv_example]
 @dag(
     schedule_interval="@daily",

From 7aabbf628227aa7db34d1bbba74ecc1160e67562 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 11 Jul 2024 10:09:43 +0100
Subject: [PATCH 25/52] Update .github/workflows/test.yml

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 337064d03..f19831676 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: test
 
 on:
   push: # Run on pushes to the default branch
-    branches: [main, feature/cache-virtualenv]
+    branches: [main]
   pull_request_target: # Also run on pull requests originated from forks
     branches: [main]
 

From c33dc5b19037e2e5c2a07c8884e2a1ee97d780a5 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 11 Jul 2024 11:52:12 +0100
Subject: [PATCH 26/52] Solve issue in deleting virtualenv temp dir if task
 fails

---
 cosmos/operators/virtualenv.py     | 69 ++++++++++++++++--------------
 dev/dags/example_virtualenv.py     |  7 ++++-
 tests/operators/test_virtualenv.py |  2 +-
 3 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 5ca408baf..98f4ec41a 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 import os
+import shutil
 import time
+import traceback
 from functools import cached_property
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -12,7 +14,6 @@
 from cosmos.exceptions import CosmosValueError
 from cosmos.hooks.subprocess import FullOutputSubprocessResult
-from cosmos.log import get_logger
 from cosmos.operators.local import (
     DbtBuildLocalOperator,
     DbtDocsLocalOperator,
@@ -28,8 +29,6 @@
 if TYPE_CHECKING:
     from airflow.utils.context import Context
 
-logger = get_logger(__name__)
-
 PY_INTERPRETER = "python3"
 
@@ -57,7 +56,7 @@ class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
     Avoid using unless the dbt job requires it.
     """
 
-    template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir",)  # type: ignore[operator]
+    template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir", "is_virtualenv_dir_temporary")  # type: ignore[operator]
 
     def __init__(
         self,
@@ -65,14 +64,15 @@ def __init__(
         pip_install_options: list[str] | None = None,
         py_system_site_packages: bool = False,
         virtualenv_dir: Path | None = None,
+        is_virtualenv_dir_temporary: bool = False,
         **kwargs: Any,
     ) -> None:
         self.py_requirements = py_requirements or []
         self.pip_install_options = pip_install_options or []
         self.py_system_site_packages = py_system_site_packages
-        super().__init__(**kwargs)
         self.virtualenv_dir = virtualenv_dir
-        self._venv_tmp_dir: None | TemporaryDirectory[str] = None
+        self.is_virtualenv_dir_temporary = is_virtualenv_dir_temporary
+        super().__init__(**kwargs)
 
     @cached_property
     def venv_dbt_path(
@@ -88,7 +88,6 @@ def venv_dbt_path(
         # We are reusing the virtualenv directory for all subprocess calls within this task/operator.
         # For this reason, we are not using contexts at this point.
         # The deletion of this directory is done explicitly at the end of the `execute` method.
-        self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv")
         py_interpreter = self._get_or_create_venv_py_interpreter()
         dbt_binary = Path(py_interpreter).parent / "dbt"
         cmd_output = self.subprocess_hook.run_command(
@@ -114,37 +113,45 @@ def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> F
         )
         return subprocess_result
 
+    def clean_dir_if_temporary(self) -> None:
+        """
+        Delete the virtualenv directory if it is temporary.
+ """ + if self.is_virtualenv_dir_temporary and self.virtualenv_dir: + self.log.info(f"Deleting the Python virtualenv {self.virtualenv_dir}") + shutil.rmtree(str(self.virtualenv_dir), ignore_errors=True) + def execute(self, context: Context) -> None: - output = super().execute(context) - if self._venv_tmp_dir: - self._venv_tmp_dir.cleanup() - logger.info(output) + try: + output = super().execute(context) + except Exception: + self.log.error(traceback.format_exc()) + else: + self.log.info(output) + finally: + self.clean_dir_if_temporary() + + def on_kill(self) -> None: + self.clean_dir_if_temporary() def _get_or_create_venv_py_interpreter(self) -> str: """Helper method that parses virtual env configuration and returns a DBT binary within the resulting virtualenv""" # No virtualenv_dir set, so revert to making a temporary virtualenv - if self.virtualenv_dir is None: + if self.virtualenv_dir is None or self.is_virtualenv_dir_temporary: self.log.info("Creating temporary virtualenv") - self._venv_tmp_dir = TemporaryDirectory(prefix="cosmos-venv") - - return prepare_virtualenv( - venv_directory=self._venv_tmp_dir.name, - python_bin=PY_INTERPRETER, - system_site_packages=self.py_system_site_packages, - requirements=self.py_requirements, - ) + self.virtualenv_dir = Path(TemporaryDirectory(prefix="cosmos-venv").name) - self.log.info(f"Checking if {str(self.__lock_file)} exists") - while not self._is_lock_available(): - self.log.info("Waiting for lock to release") - time.sleep(1) - - self.log.info(f"Creating virtualenv at `{self.virtualenv_dir}") - self.log.info(f"Acquiring available lock") - self.__acquire_venv_lock() + if not self.is_virtualenv_dir_temporary: + self.log.info(f"Checking if {str(self.__lock_file)} exists") + while not self._is_lock_available(): + self.log.info("Waiting for lock to release") + time.sleep(1) + self.log.info(f"Acquiring available lock") + self.__acquire_venv_lock() + self.log.info(f"Creating or updating the virtualenv at `{self.virtualenv_dir}") py_bin = prepare_virtualenv( venv_directory=str(self.virtualenv_dir), python_bin=PY_INTERPRETER, @@ -153,8 +160,9 @@ def _get_or_create_venv_py_interpreter(self) -> str: pip_install_options=self.pip_install_options, ) - self.log.info("Releasing lock") - self.__release_venv_lock() + if not self.is_virtualenv_dir_temporary: + self.log.info("Releasing lock") + self.__release_venv_lock() return py_bin @@ -196,7 +204,6 @@ def __acquire_venv_lock(self) -> None: def __release_venv_lock(self) -> None: if not self.__lock_file.is_file(): self.log.warn(f"Lockfile {self.__lock_file} not found, perhaps deleted by other concurrent operator?") - return with open(self.__lock_file) as lf: diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py index 3f4be60d5..067f09b9d 100644 --- a/dev/dags/example_virtualenv.py +++ b/dev/dags/example_virtualenv.py @@ -37,6 +37,9 @@ def example_virtualenv() -> None: start_task = EmptyOperator(task_id="start-venv-examples") end_task = EmptyOperator(task_id="end-venv-examples") + # This first task group creates a new Cosmos virtualenv every time a task is run + # and deletes it afterwards + # It is much slower than if the user sets the `virtualenv_dir` tmp_venv_task_group = DbtTaskGroup( group_id="tmp-venv-group", # dbt/cosmos-specific parameters @@ -58,6 +61,8 @@ def example_virtualenv() -> None: }, ) + # The following task group reuses the Cosmos-managed Python virtualenv across multiple tasks. + # It runs approximately 70% faster than the previous TaskGroup. 
     cached_venv_task_group = DbtTaskGroup(
         group_id="cached-venv-group",
         # dbt/cosmos-specific parameters
@@ -73,7 +78,7 @@ def example_virtualenv() -> None:
         ),
         operator_args={
             "py_system_site_packages": False,
-            "py_requirements": ["dbt-postgres==1.6.0b1"],
+            "py_requirements": ["dbt-postgres==1.8"],
             "install_deps": True,
         },
     )
diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index d80e4334c..b79e77b66 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -64,7 +64,7 @@ def test_run_command(
         emit_datasets=False,
         invocation_mode=InvocationMode.SUBPROCESS,
     )
-    assert venv_operator._venv_tmp_dir is None  # Otherwise we are creating empty directories during DAG parsing time
+    assert venv_operator.virtualenv_dir is None  # Otherwise we are creating empty directories during DAG parsing time
     # and not deleting them
     venv_operator.run_command(cmd=["fake-dbt", "do-something"], env={}, context={"task_instance": MagicMock()})
     run_command_args = mock_subprocess_hook.run_command.call_args_list

From 294986233f8307e66d1eb612ecef4258045900ac Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Thu, 11 Jul 2024 12:08:19 +0100
Subject: [PATCH 27/52] Solve issue in deleting virtualenv temp dir if task
 fails

---
 cosmos/operators/virtualenv.py | 45 ++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 98f4ec41a..049c9a1d5 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -117,7 +117,7 @@ def clean_dir_if_temporary(self) -> None:
         """
         Delete the virtualenv directory if it is temporary.
         """
-        if self.is_virtualenv_dir_temporary and self.virtualenv_dir:
+        if self.is_virtualenv_dir_temporary and self.virtualenv_dir and self.virtualenv_dir.exists():
             self.log.info(f"Deleting the Python virtualenv {self.virtualenv_dir}")
             shutil.rmtree(str(self.virtualenv_dir), ignore_errors=True)
 
@@ -134,23 +134,7 @@ def execute(self, context: Context) -> None:
     def on_kill(self) -> None:
         self.clean_dir_if_temporary()
 
-    def _get_or_create_venv_py_interpreter(self) -> str:
-        """Helper method that parses virtual env configuration
-        and returns a DBT binary within the resulting virtualenv"""
-
-        # No virtualenv_dir set, so revert to making a temporary virtualenv
-        if self.virtualenv_dir is None or self.is_virtualenv_dir_temporary:
-            self.log.info("Creating temporary virtualenv")
-            self.virtualenv_dir = Path(TemporaryDirectory(prefix="cosmos-venv").name)
-
-        if not self.is_virtualenv_dir_temporary:
-            self.log.info(f"Checking if {str(self.__lock_file)} exists")
-            while not self._is_lock_available():
-                self.log.info("Waiting for lock to release")
-                time.sleep(1)
-            self.log.info(f"Acquiring available lock")
-            self.__acquire_venv_lock()
-
+    def prepare_virtualenv(self) -> str:
         self.log.info(f"Creating or updating the virtualenv at `{self.virtualenv_dir}")
         py_bin = prepare_virtualenv(
             venv_directory=str(self.virtualenv_dir),
@@ -159,9 +143,32 @@ def _get_or_create_venv_py_interpreter(self) -> str:
             pip_install_options=self.pip_install_options,
         )
+        return py_bin
+
+    def _get_or_create_venv_py_interpreter(self) -> str:
+        """Helper method that parses virtual env configuration
+        and returns a DBT binary within the resulting virtualenv"""
+
+        # No virtualenv_dir set, so create a temporary virtualenv
+        if self.virtualenv_dir is None or self.is_virtualenv_dir_temporary:
+            self.log.info("Creating temporary virtualenv")
+            with TemporaryDirectory(prefix="cosmos-venv") as tempdir:
+                self.virtualenv_dir = Path(tempdir)
+                py_bin = self.prepare_virtualenv()
+                return py_bin
+
+        # Use a reusable virtualenv
+        self.log.info(f"Checking if the virtualenv lock {str(self.__lock_file)} exists")
+        while not self._is_lock_available():
+            self.log.info("Waiting for virtualenv lock to be released")
+            time.sleep(1)
+
+        self.log.info(f"Acquiring the virtualenv lock")
+        self.__acquire_venv_lock()
+        py_bin = self.prepare_virtualenv()
 
         if not self.is_virtualenv_dir_temporary:
-            self.log.info("Releasing lock")
+            self.log.info("Releasing virtualenv lock")
             self.__release_venv_lock()
 
         return py_bin

From 490855a4c3a8df3dd665472025e702421c03626f Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Mon, 15 Jul 2024 11:16:44 +0100
Subject: [PATCH 28/52] Add comments to improve the coverage

---
 cosmos/operators/virtualenv.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 049c9a1d5..95bc6a28a 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -36,6 +36,7 @@ def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
     def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None:
         if operator.virtualenv_dir is None:
+            # TODO: test
             raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
 
         method(operator, *args)
@@ -118,6 +119,7 @@ def clean_dir_if_temporary(self) -> None:
         Delete the virtualenv directory if it is temporary.
         """
         if self.is_virtualenv_dir_temporary and self.virtualenv_dir and self.virtualenv_dir.exists():
+            # TODO: test
             self.log.info(f"Deleting the Python virtualenv {self.virtualenv_dir}")
             shutil.rmtree(str(self.virtualenv_dir), ignore_errors=True)
 
@@ -132,6 +134,7 @@ def execute(self, context: Context) -> None:
             self.clean_dir_if_temporary()
 
     def on_kill(self) -> None:
+        # TODO: test
         self.clean_dir_if_temporary()
 
@@ -160,6 +163,7 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         # Use a reusable virtualenv
         self.log.info(f"Checking if the virtualenv lock {str(self.__lock_file)} exists")
         while not self._is_lock_available():
+            # TODO: test
             self.log.info("Waiting for virtualenv lock to be released")
             time.sleep(1)
 
@@ -185,9 +195,11 @@ def _is_lock_available(self) -> bool:
             try:
                 _process_running = psutil.Process(pid).is_running()
             except psutil.NoSuchProcess:
+                # TODO: test
                 _process_running = False
 
             self.log.info(f"Process {pid} running: {_process_running}")
+            # TODO: test
             return not _process_running
 
         return True
@@ -214,11 +220,14 @@ def __release_venv_lock(self) -> None:
             return
 
         with open(self.__lock_file) as lf:
+            # TODO: test
             lock_file_pid = int(lf.read())
 
         if lock_file_pid == self._pid:
             return self.__lock_file.unlink()
+        # TODO: test
+        # TODO: release lock if other process is not running
         self.log.warn(f"Lockfile owned by process of pid {lock_file_pid}, while operator has pid {self._pid}")

From 551fd28cc59df762aad6f5aa8871288162462a0b Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Mon, 29 Jul 2024 15:44:15 +0100
Subject: [PATCH 29/52] Add tests

---
 cosmos/operators/virtualenv.py     |  6 +-----
 tests/operators/test_virtualenv.py | 42 ++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 95bc6a28a..d93279f9c 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -36,9 +36,7 @@ def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
     def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None:
         if operator.virtualenv_dir is None:
-            # TODO: test
             raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
-
         method(operator, *args)
 
     return wrapper
@@ -119,7 +117,6 @@ def clean_dir_if_temporary(self) -> None:
         Delete the virtualenv directory if it is temporary.
         """
         if self.is_virtualenv_dir_temporary and self.virtualenv_dir and self.virtualenv_dir.exists():
-            # TODO: test
             self.log.info(f"Deleting the Python virtualenv {self.virtualenv_dir}")
             shutil.rmtree(str(self.virtualenv_dir), ignore_errors=True)
 
@@ -134,7 +131,6 @@ def execute(self, context: Context) -> None:
             self.clean_dir_if_temporary()
 
     def on_kill(self) -> None:
-        # TODO: test
         self.clean_dir_if_temporary()
 
@@ -185,7 +181,7 @@ def __lock_file(self) -> Path:
     def _pid(self) -> int:
         return os.getpid()
 
-    # @depends_on_virtualenv_dir
+    @depends_on_virtualenv_dir
     def _is_lock_available(self) -> bool:
         if self.__lock_file.is_file():
             with open(self.__lock_file) as lf:
diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index b79e77b66..3340facd6 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -4,11 +4,13 @@
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
+import pytest
 from airflow.models import DAG
 from airflow.models.connection import Connection
 
 from cosmos.config import ProfileConfig
 from cosmos.constants import InvocationMode
+from cosmos.exceptions import CosmosValueError
 from cosmos.operators.virtualenv import DbtVirtualenvBaseOperator
 from cosmos.profiles import PostgresUserPasswordProfileMapping
 
@@ -98,6 +100,46 @@ def test_virtualenv_operator_append_env_is_true_by_default():
     assert venv_operator.append_env is True
 
 
+def test_depends_on_virtualenv_dir_raises_exception():
+    venv_operator = ConcreteDbtVirtualenvBaseOperator(
+        profile_config=profile_config,
+        project_dir="./dev/dags/dbt/jaffle_shop",
+        task_id="buggy_task",
+    )
+    venv_operator.virtualenv_dir = None
+    with pytest.raises(CosmosValueError) as exception_info:
+        venv_operator._is_lock_available()
+    assert str(exception_info.value) == "Method relies on value of parameter `virtualenv_dir` which is None."
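Aside: the test above, and the lock-related changes in the surrounding patches, exercise a conventional PID-lockfile scheme: the lock holder writes its process ID into a file, and a lock is considered free when no file exists or the recorded process is no longer running. As a rough standalone sketch of that pattern, assuming `psutil` is installed (function names here are illustrative, not Cosmos's API):

```python
import os
from pathlib import Path

import psutil


def is_lock_available(lock_file: Path) -> bool:
    """The lock is free if no lockfile exists or its recorded PID is no longer running."""
    if not lock_file.is_file():
        return True
    pid = int(lock_file.read_text())
    try:
        return not psutil.Process(pid).is_running()
    except psutil.NoSuchProcess:
        return True  # stale lockfile left behind by a process that already exited


def acquire_lock(lock_file: Path) -> None:
    """Record the current PID so concurrent tasks can tell who owns the virtualenv."""
    lock_file.parent.mkdir(parents=True, exist_ok=True)
    lock_file.write_text(str(os.getpid()))
```

Handling `psutil.NoSuchProcess` is what makes the scheme robust against workers that crash while holding the lock, which is exactly the case the `Iron out locking flow` and later patches keep refining.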
+
+
+def test_clean_dir_if_temporary(tmpdir):
+    tmp_filepath = Path(tmpdir / "tmpfile.txt")
+    tmp_filepath.touch()
+    assert tmp_filepath.exists()
+
+    venv_operator = ConcreteDbtVirtualenvBaseOperator(
+        profile_config=profile_config,
+        project_dir="./dev/dags/dbt/jaffle_shop",
+        task_id="okay_task",
+        is_virtualenv_dir_temporary=True,
+        virtualenv_dir=tmpdir,
+    )
+    venv_operator.clean_dir_if_temporary()
+    assert not tmp_filepath.exists()
+    assert not tmpdir.exists()
+
+
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.clean_dir_if_temporary")
+def test_on_kill(mock_clean_dir_if_temporary):
+    venv_operator = ConcreteDbtVirtualenvBaseOperator(
+        profile_config=profile_config,
+        project_dir="./dev/dags/dbt/jaffle_shop",
+        task_id="okay_task",
+    )
+    venv_operator.on_kill()
+    assert mock_clean_dir_if_temporary.called

From 16b83bb34f22e189bfaa961d8a5d875580f1c9a9 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Mon, 29 Jul 2024 16:15:24 +0100
Subject: [PATCH 30/52] Add tests

---
 cosmos/operators/virtualenv.py     | 23 +++++++++++------------
 tests/operators/test_virtualenv.py | 18 ++++++++++++++++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index d93279f9c..288f53b17 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -159,17 +159,15 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         # Use a reusable virtualenv
         self.log.info(f"Checking if the virtualenv lock {str(self.__lock_file)} exists")
         while not self._is_lock_available():
-            # TODO: test
             self.log.info("Waiting for virtualenv lock to be released")
             time.sleep(1)
 
         self.log.info(f"Acquiring the virtualenv lock")
-        self.__acquire_venv_lock()
+        self._acquire_venv_lock()
         py_bin = self.prepare_virtualenv()
 
-        if not self.is_virtualenv_dir_temporary:
-            self.log.info("Releasing virtualenv lock")
-            self.__release_venv_lock()
+        self.log.info("Releasing virtualenv lock")
+        self._release_venv_lock()
 
         return py_bin
 
@@ -183,25 +181,26 @@ def _pid(self) -> int:
 
     @depends_on_virtualenv_dir
     def _is_lock_available(self) -> bool:
+        is_available = True
         if self.__lock_file.is_file():
+            _process_running = False
             with open(self.__lock_file) as lf:
                 pid = int(lf.read())
 
-                self.log.info(f"Checking for running process with PID {pid}")
                 try:
                     _process_running = psutil.Process(pid).is_running()
+                    self.log.info(f"Process {pid} running: {_process_running}. ")
                 except psutil.NoSuchProcess:
-                    # TODO: test
                     _process_running = False
+                    self.log.info(f"Process {pid} is not running: {_process_running}. Releasing the lock.")
 
-            self.log.info(f"Process {pid} running: {_process_running}")
-            # TODO: test
-            return not _process_running
+            is_available = not _process_running
 
-        return True
+        return is_available
 
     @depends_on_virtualenv_dir
-    def __acquire_venv_lock(self) -> None:
+    def _acquire_venv_lock(self) -> None:
         if not self.virtualenv_dir.is_dir():  # type: ignore
             os.mkdir(str(self.virtualenv_dir))
 
@@ -212,7 +211,7 @@ def _acquire_venv_lock(self) -> None:
     @depends_on_virtualenv_dir
-    def __release_venv_lock(self) -> None:
+    def _release_venv_lock(self) -> None:
         if not self.__lock_file.is_file():
             self.log.warn(f"Lockfile {self.__lock_file} not found, perhaps deleted by other concurrent operator?")
             return
diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index 3340facd6..6843491be 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -140,6 +140,24 @@ def test_on_kill(mock_clean_dir_if_temporary):
     assert mock_clean_dir_if_temporary.called
 
 
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._release_venv_lock")
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.prepare_virtualenv")
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._acquire_venv_lock")
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available", side_effect=[False, False, True])
+def test__get_or_create_venv_py_interpreter_waits_for_lock(
+    mock_is_lock_available, mock_acquire, mock_prepare, mock_release, tmpdir, caplog
+):
+    venv_operator = ConcreteDbtVirtualenvBaseOperator(
+        profile_config=profile_config,
+        project_dir="./dev/dags/dbt/jaffle_shop",
+        task_id="okay_task",
+        is_virtualenv_dir_temporary=False,
+        virtualenv_dir=tmpdir,
+    )
+    venv_operator._get_or_create_venv_py_interpreter()
+    assert caplog.text.count("Waiting for virtualenv lock to be released") == 2

From 4fe457e020d865c9dd62895960fa0705040a0fc0 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Mon, 29 Jul 2024 19:39:40 +0100
Subject: [PATCH 31/52] Add more tests

---
 cosmos/operators/virtualenv.py     | 49 +++++++++++++++--------------
 tests/operators/test_virtualenv.py | 53 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 27 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 288f53b17..5467a47a2 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -3,7 +3,6 @@
 import os
 import shutil
 import time
-import traceback
 from functools import cached_property
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -31,13 +30,14 @@
 PY_INTERPRETER = "python3"
+LOCK_FILENAME = "cosmos_virtualenv.lock"
 
 
 def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]:
-    def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> None:
+    def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> Any | None:
         if operator.virtualenv_dir is None:
             raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.")
-        method(operator, *args)
+        return method(operator, *args)
 
     return wrapper
@@ -122,11 +122,7 @@ def clean_dir_if_temporary(self) -> None:
 
     def execute(self, context: Context) -> None:
         try:
-            output = super().execute(context)
-        except Exception:
-            self.log.error(traceback.format_exc())
-        else:
-            self.log.info(output)
+            super().execute(context)
         finally:
             self.clean_dir_if_temporary()
@@ -157,7 +153,7 @@ def _get_or_create_venv_py_interpreter(self) -> str:
             return py_bin
 
         # Use a reusable virtualenv
-        self.log.info(f"Checking if the virtualenv lock {str(self.__lock_file)} exists")
+        self.log.info(f"Checking if the virtualenv lock {str(self._lock_file)} exists")
         while not self._is_lock_available():
             self.log.info("Waiting for virtualenv lock to be released")
             time.sleep(1)
@@ -172,31 +168,30 @@ def _get_or_create_venv_py_interpreter(self) -> str:
         return py_bin
 
     @property
-    def __lock_file(self) -> Path:
-        return Path(f"{self.virtualenv_dir}/LOCK")
+    def _lock_file(self) -> Path:
+        filepath = Path(f"{self.virtualenv_dir}/{LOCK_FILENAME}")
+        return filepath
 
     @property
     def _pid(self) -> int:
         return os.getpid()
 
+    # TODO: test
     @depends_on_virtualenv_dir
     def _is_lock_available(self) -> bool:
         is_available = True
-        if self.__lock_file.is_file():
-            with open(self.__lock_file) as lf:
+        if self._lock_file.is_file():
+            with open(self._lock_file) as lf:
                 pid = int(lf.read())
 
                 self.log.info(f"Checking for running process with PID {pid}")
                 try:
                     _process_running = psutil.Process(pid).is_running()
-                    self.log.info(f"Process {pid} running: {_process_running}. ")
+                    self.log.info(f"Process {pid} running: {_process_running} and has the lock {self._lock_file}.")
                 except psutil.NoSuchProcess:
-                    self.log.info(f"Process {pid} is not running: {_process_running}. Releasing the lock.")
-
-            is_available = not _process_running
+                    self.log.info(f"Process {pid} is not running. Lock {self._lock_file} was outdated.")
+                    is_available = True
+                else:
+                    is_available = not _process_running
         return is_available
diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py
index 6843491be..fc7236a15 100644
--- a/tests/operators/test_virtualenv.py
+++ b/tests/operators/test_virtualenv.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 from datetime import datetime
 from pathlib import Path
 from unittest.mock import MagicMock, patch
@@ -158,6 +159,58 @@ def test__get_or_create_venv_py_interpreter_waits_for_lock(
     assert caplog.text.count("Waiting for virtualenv lock to be released") == 2
+@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.clean_dir_if_temporary") +def test__execute_cleans_dir(mock_clean_dir_if_temporary, mock_execute, caplog): + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + ) + with pytest.raises(ValueError): + venv_operator.execute(None) + assert mock_clean_dir_if_temporary.called + + +def test__is_lock_available_returns_false(tmpdir): + parent_pid = os.getppid() + lockfile = tmpdir / "cosmos_virtualenv.lock" + lockfile.write_text(str(parent_pid), encoding="utf-8") + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=tmpdir, + ) + assert not venv_operator._is_lock_available() + + +def test__is_lock_available_returns_true_pid_no_longer_running(tmpdir): + non_existent_pid = "74717471" + lockfile = tmpdir / "cosmos_virtualenv.lock" + lockfile.write_text(str(non_existent_pid), encoding="utf-8") + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=tmpdir, + ) + assert venv_operator._is_lock_available() + + +def test__is_lock_available_returns_true_pid_no_lockfile(tmpdir): + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=tmpdir, + ) + assert venv_operator._is_lock_available() + + @patch("airflow.utils.python_virtualenv.execute_in_subprocess") @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes") @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql") From 4068edcddfae06c09b739d6c3b95b24e9b6b305f Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Mon, 29 Jul 2024 19:39:49 +0100 Subject: [PATCH 32/52] Add more tests --- cosmos/operators/virtualenv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 5467a47a2..3646a262d 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -34,7 +34,7 @@ def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]: - def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> Any | None: + def wrapper(operator: DbtVirtualenvBaseOperator, *args: Any) -> Any: if operator.virtualenv_dir is None: raise CosmosValueError(f"Method relies on value of parameter `virtualenv_dir` which is None.") return method(operator, *args) From 0e17d3058324e5fbb33263159868da326701cd38 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Mon, 29 Jul 2024 20:00:59 +0100 Subject: [PATCH 33/52] Finish covering virtualenv wuth tests --- cosmos/operators/virtualenv.py | 9 ++-- cosmos/settings.py | 1 + tests/operators/test_virtualenv.py | 70 ++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 3646a262d..8ebee6bfa 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -11,6 +11,7 @@ import psutil from airflow.utils.python_virtualenv import prepare_virtualenv +from cosmos import settings from cosmos.exceptions import CosmosValueError from 
cosmos.hooks.subprocess import FullOutputSubprocessResult from cosmos.operators.local import ( @@ -71,6 +72,7 @@ def __init__( self.py_system_site_packages = py_system_site_packages self.virtualenv_dir = virtualenv_dir self.is_virtualenv_dir_temporary = is_virtualenv_dir_temporary + self.max_retries_lock = settings.virtualenv_max_retries_lock super().__init__(**kwargs) @cached_property @@ -154,9 +156,10 @@ def _get_or_create_venv_py_interpreter(self) -> str: # Use a reusable virtualenv self.log.info(f"Checking if the virtualenv lock {str(self._lock_file)} exists") - while not self._is_lock_available(): + while not self._is_lock_available() and self.max_retries_lock: self.log.info("Waiting for virtualenv lock to be released") time.sleep(1) + self.max_retries_lock -= 1 self.log.info(f"Acquiring the virtualenv lock") self._acquire_venv_lock() @@ -176,7 +179,6 @@ def _lock_file(self) -> Path: def _pid(self) -> int: return os.getpid() - # TODO: test @depends_on_virtualenv_dir def _is_lock_available(self) -> bool: is_available = True @@ -210,14 +212,11 @@ def _release_venv_lock(self) -> None: return with open(self._lock_file) as lf: - # TODO: test lock_file_pid = int(lf.read()) if lock_file_pid == self._pid: return self._lock_file.unlink() - # TODO: test - # TODO: release lock if other process is not running self.log.warn(f"Lockfile owned by process of pid {lock_file_pid}, while operator has pid {self._pid}") diff --git a/cosmos/settings.py b/cosmos/settings.py index 67d5928d9..b9996717a 100644 --- a/cosmos/settings.py +++ b/cosmos/settings.py @@ -21,6 +21,7 @@ dbt_docs_index_file_name = conf.get("cosmos", "dbt_docs_index_file_name", fallback="index.html") enable_cache_profile = conf.getboolean("cosmos", "enable_cache_profile", fallback=True) dbt_profile_cache_dir_name = conf.get("cosmos", "profile_cache_dir_name", fallback="profile") +virtualenv_max_retries_lock = conf.getint("cosmos", "virtualenv_max_retries_lock", fallback=120) try: LINEAGE_NAMESPACE = conf.get("openlineage", "namespace") diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py index fc7236a15..26f6659a2 100644 --- a/tests/operators/test_virtualenv.py +++ b/tests/operators/test_virtualenv.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os from datetime import datetime from pathlib import Path @@ -211,6 +212,75 @@ def test__is_lock_available_returns_true_pid_no_lockfile(tmpdir): assert venv_operator._is_lock_available() +def test__acquire_venv_lock_existing_dir(tmpdir, caplog): + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=Path(tmpdir), + ) + assert venv_operator._acquire_venv_lock() is None + assert "Acquiring lock at" in caplog.text + + +def test__acquire_venv_lock_new_subdir(tmpdir, caplog): + subdir = Path(tmpdir / "subdir") + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=subdir, + ) + assert venv_operator._acquire_venv_lock() is None + assert "Acquiring lock at" in caplog.text + + +def test__release_venv_lock_inexistent(tmpdir, caplog): + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=tmpdir, + ) 
+ assert venv_operator._release_venv_lock() is None + assert "not found, perhaps deleted by other concurrent operator?" in caplog.text + + +def test__release_venv_lock_another_process(tmpdir, caplog): + caplog.set_level(logging.WARNING) + non_existent_pid = "747174" + lockfile = tmpdir / "cosmos_virtualenv.lock" + lockfile.write_text(str(non_existent_pid), encoding="utf-8") + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=Path(tmpdir), + ) + assert venv_operator._release_venv_lock() is None + assert lockfile.exists() + assert "Lockfile owned by process of pid 747174, while operator has pid" in caplog.text + + +def test__release_venv_lock_current_process(tmpdir): + parent_pid = os.getpid() + lockfile = tmpdir / "cosmos_virtualenv.lock" + lockfile.write_text(str(parent_pid), encoding="utf-8") + venv_operator = ConcreteDbtVirtualenvBaseOperator( + profile_config=profile_config, + project_dir="./dev/dags/dbt/jaffle_shop", + task_id="okay_task", + is_virtualenv_dir_temporary=False, + virtualenv_dir=Path(tmpdir), + ) + assert venv_operator._release_venv_lock() is None + assert not lockfile.exists() + + @patch("airflow.utils.python_virtualenv.execute_in_subprocess") @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes") @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql") From d9c7cc45b839ab91b4d96e763c041de0afa86893 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Wed, 14 Aug 2024 08:52:37 -0300 Subject: [PATCH 34/52] Workaround issue of caplog not capturing Airflow Operator self.log.info --- cosmos/operators/virtualenv.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 8ebee6bfa..576db037e 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -14,6 +14,7 @@ from cosmos import settings from cosmos.exceptions import CosmosValueError from cosmos.hooks.subprocess import FullOutputSubprocessResult +from cosmos.log import get_logger from cosmos.operators.local import ( DbtBuildLocalOperator, DbtDocsLocalOperator, @@ -32,6 +33,7 @@ PY_INTERPRETER = "python3" LOCK_FILENAME = "cosmos_virtualenv.lock" +logger = get_logger(__name__) def depends_on_virtualenv_dir(method: Callable[[Any], Any]) -> Callable[[Any], Any]: @@ -157,7 +159,7 @@ def _get_or_create_venv_py_interpreter(self) -> str: # Use a reusable virtualenv self.log.info(f"Checking if the virtualenv lock {str(self._lock_file)} exists") while not self._is_lock_available() and self.max_retries_lock: - self.log.info("Waiting for virtualenv lock to be released") + logger.info("Waiting for virtualenv lock to be released") time.sleep(1) self.max_retries_lock -= 1 @@ -202,13 +204,13 @@ def _acquire_venv_lock(self) -> None: os.mkdir(str(self.virtualenv_dir)) with open(self._lock_file, "w") as lf: - self.log.info(f"Acquiring lock at {self._lock_file} with pid {str(self._pid)}") + logger.info(f"Acquiring lock at {self._lock_file} with pid {str(self._pid)}") lf.write(str(self._pid)) @depends_on_virtualenv_dir def _release_venv_lock(self) -> None: if not self._lock_file.is_file(): - self.log.warn(f"Lockfile {self._lock_file} not found, perhaps deleted by other concurrent operator?") + logger.warning(f"Lockfile {self._lock_file} not found, perhaps deleted by other concurrent operator?") return with 
open(self._lock_file) as lf: @@ -217,7 +219,7 @@ def _release_venv_lock(self) -> None: if lock_file_pid == self._pid: return self._lock_file.unlink() - self.log.warn(f"Lockfile owned by process of pid {lock_file_pid}, while operator has pid {self._pid}") + logger.warning(f"Lockfile owned by process of pid {lock_file_pid}, while operator has pid {self._pid}") class DbtBuildVirtualenvOperator(DbtVirtualenvBaseOperator, DbtBuildLocalOperator): # type: ignore[misc] From 0042e7630f978140c97f75061eaa62b23285b382 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 14:26:33 +0530 Subject: [PATCH 35/52] =?UTF-8?q?=E2=AC=86=20[pre-commit.ci]=20pre-commit?= =?UTF-8?q?=20autoupdate=20(#1125)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.16.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.16.0...v3.17.0) - [github.com/astral-sh/ruff-pre-commit: v0.5.4 → v0.5.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.4...v0.5.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 53b081faf..ea3b0172e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,14 +47,14 @@ repos: - id: remove-tabs exclude: ^docs/make.bat$|^docs/Makefile$|^dev/dags/dbt/jaffle_shop/seeds/raw_orders.csv$ - repo: https://github.com/asottile/pyupgrade - rev: v3.16.0 + rev: v3.17.0 hooks: - id: pyupgrade args: - --py37-plus - --keep-runtime-typing - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.5.5 hooks: - id: ruff args: From 63a2bf4570a66198394e1b3a8b5cffb2052ffa9c Mon Sep 17 00:00:00 2001 From: Daniel Reeves <31971762+dwreeves@users.noreply.github.com> Date: Fri, 2 Aug 2024 10:53:53 -0400 Subject: [PATCH 36/52] Fix import handling by lazy loading hooks introduced in PR #1109 (#1132) Making an update to #1109, which introduced module-level imports of optional dependencies. This is inappropriate as it will break if the user does not have them installed, and indeed the user really does not need them installed if they are not relying on them directly. This PR lazy-loads the imports so that it does not impact users who do not need them. In the upath library, `az:`, `adl:`, `abfs:` and `abfss:` are also all valid schemes, albeit Airflow only references the latter 3 in the code: https://github.com/apache/airflow/blob/e3824eaaba7eada9a807f7a2f9f89d977a210e15/airflow/providers/microsoft/azure/fs/adls.py#L29, so `adl:`, `abfs:` and `abfss:` also have been added to the list of schemes supported. --- cosmos/config.py | 2 +- cosmos/constants.py | 34 ++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/cosmos/config.py b/cosmos/config.py index fb7052a17..0ffb726e0 100644 --- a/cosmos/config.py +++ b/cosmos/config.py @@ -184,7 +184,7 @@ def __init__( if not manifest_conn_id: manifest_scheme = manifest_path_str.split("://")[0] # Use the default Airflow connection ID for the scheme if it is not provided. 
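            # The map values are now zero-argument callables (see cosmos/constants.py below),
            # so the provider hook behind a scheme is only imported when that scheme is used.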
- manifest_conn_id = FILE_SCHEME_AIRFLOW_DEFAULT_CONN_ID_MAP.get(manifest_scheme, None) + manifest_conn_id = FILE_SCHEME_AIRFLOW_DEFAULT_CONN_ID_MAP.get(manifest_scheme, lambda: None)() if manifest_conn_id is not None and not AIRFLOW_IO_AVAILABLE: raise CosmosValueError( diff --git a/cosmos/constants.py b/cosmos/constants.py index d512faf16..cc9841ed6 100644 --- a/cosmos/constants.py +++ b/cosmos/constants.py @@ -1,11 +1,9 @@ import os from enum import Enum from pathlib import Path +from typing import Callable, Dict import aenum -from airflow.providers.amazon.aws.hooks.s3 import S3Hook -from airflow.providers.google.cloud.hooks.gcs import GCSHook -from airflow.providers.microsoft.azure.hooks.wasb import WasbHook from packaging.version import Version DBT_PROFILE_PATH = Path(os.path.expanduser("~")).joinpath(".dbt/profiles.yml") @@ -31,14 +29,30 @@ PARTIALLY_SUPPORTED_AIRFLOW_VERSIONS = [Version("2.9.0"), Version("2.9.1")] -S3_FILE_SCHEME = "s3" -GS_FILE_SCHEME = "gs" -ABFS_FILE_SCHEME = "abfs" +def _default_s3_conn() -> str: + from airflow.providers.amazon.aws.hooks.s3 import S3Hook -FILE_SCHEME_AIRFLOW_DEFAULT_CONN_ID_MAP = { - S3_FILE_SCHEME: S3Hook.default_conn_name, - GS_FILE_SCHEME: GCSHook.default_conn_name, - ABFS_FILE_SCHEME: WasbHook.default_conn_name, + return S3Hook.default_conn_name # type: ignore[no-any-return] + + +def _default_gcs_conn() -> str: + from airflow.providers.google.cloud.hooks.gcs import GCSHook + + return GCSHook.default_conn_name # type: ignore[no-any-return] + + +def _default_wasb_conn() -> str: + from airflow.providers.microsoft.azure.hooks.wasb import WasbHook + + return WasbHook.default_conn_name # type: ignore[no-any-return] + + +FILE_SCHEME_AIRFLOW_DEFAULT_CONN_ID_MAP: Dict[str, Callable[[], str]] = { + "s3": _default_s3_conn, + "gs": _default_gcs_conn, + "adl": _default_wasb_conn, + "abfs": _default_wasb_conn, + "abfss": _default_wasb_conn, } From 31041fe33827ed0a6bdaff9838c738420f8f6a95 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 20:49:08 +0530 Subject: [PATCH 37/52] =?UTF-8?q?=E2=AC=86=20[pre-commit.ci]=20pre-commit?= =?UTF-8?q?=20autoupdate=20(#1144)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.5 → v0.5.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.5...v0.5.6) - [github.com/psf/black: 24.4.2 → 24.8.0](https://github.com/psf/black/compare/24.4.2...24.8.0) - [github.com/pre-commit/mirrors-mypy: v1.11.0 → v1.11.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.0...v1.11.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea3b0172e..2daabe4dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,13 +54,13 @@ repos: - --py37-plus - --keep-runtime-typing - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.6 hooks: - id: ruff args: - --fix - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black args: ["--config", "./pyproject.toml"] @@ -71,7 +71,7 @@ repos: alias: black additional_dependencies: [black>=22.10.0] - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.11.0" + rev: "v1.11.1" hooks: - id: mypy From 0f0d93a96cbbe42c7138c57a3fba6f1240daa5e0 Mon Sep 17 00:00:00 2001 
From: Pankaj Singh <98807258+pankajastro@users.noreply.github.com>
Date: Fri, 9 Aug 2024 04:23:30 +0530
Subject: [PATCH 38/52] Cache package-lock.yml file (#1086)

This PR aims to cache the package-lock.yml in `cache_dir/dbt_project`.

Since dbt version 1.7.0, executing the dbt deps command results in the generation of a package-lock.yml file. This file pins the dependencies and their versions for the dbt project. dbt uses this file to install packages, ensuring predictable and consistent package installations across environments.

- This feature is enabled only if the user checks in package-lock.yml in their dbt project. Also, I'm assuming that if `package-lock.yml` is checked in, their dbt-core version is >= 1.7.0, since this feature is only available for dbt >= 1.7.0
- package-lock.yml also contains the sha1_hash of the packages. This PR uses that hash to check whether the cached package-lock.yml is outdated
- The cached `package-lock.yml` is finally copied from the cached path to the tmp project and used
- To update dependencies or versions, it is expected that the user will manually update their package-lock.yml in the dbt project using the dbt deps command.

closes: https://github.com/astronomer/astronomer-cosmos/issues/930
---
 cosmos/cache.py                           | 83 ++++++++++++++++++++-
 cosmos/constants.py                       |  1 +
 cosmos/dbt/graph.py                       |  9 +++
 cosmos/dbt/project.py                     |  3 +-
 cosmos/operators/local.py                 | 13 ++++
 cosmos/settings.py                        |  1 +
 dev/dags/dbt/jaffle_shop/package-lock.yml |  4 +
 docs/configuration/cosmos-conf.rst        |  8 ++
 tests/dbt/test_graph.py                   |  4 +-
 tests/test_cache.py                       | 89 +++++++++++++++++++++++
 10 files changed, 210 insertions(+), 5 deletions(-)
 create mode 100644 dev/dags/dbt/jaffle_shop/package-lock.yml

diff --git a/cosmos/cache.py b/cosmos/cache.py
index 32a21d30d..102186423 100644
--- a/cosmos/cache.py
+++ b/cosmos/cache.py
@@ -5,12 +5,14 @@
 import json
 import os
 import shutil
+import tempfile
 import time
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 from pathlib import Path

 import msgpack
+import yaml
 from airflow.models import DagRun, Variable
 from airflow.models.dag import DAG
 from airflow.utils.session import provide_session
@@ -19,10 +21,21 @@
 from sqlalchemy.orm import Session

 from cosmos import settings
-from cosmos.constants import DBT_MANIFEST_FILE_NAME, DBT_TARGET_DIR_NAME, DEFAULT_PROFILES_FILE_NAME
+from cosmos.constants import (
+    DBT_MANIFEST_FILE_NAME,
+    DBT_TARGET_DIR_NAME,
+    DEFAULT_PROFILES_FILE_NAME,
+    PACKAGE_LOCKFILE_YML,
+)
 from cosmos.dbt.project import get_partial_parse_path
 from cosmos.log import get_logger
-from cosmos.settings import cache_dir, dbt_profile_cache_dir_name, enable_cache, enable_cache_profile
+from cosmos.settings import (
+    cache_dir,
+    dbt_profile_cache_dir_name,
+    enable_cache,
+    enable_cache_package_lockfile,
+    enable_cache_profile,
+)

 logger = get_logger(__name__)
 VAR_KEY_CACHE_PREFIX = "cosmos_cache__"
@@ -400,3 +413,69 @@ def create_cache_profile(version: str, profile_content: str) -> Path:
     profile_yml_path = profile_yml_dir / DEFAULT_PROFILES_FILE_NAME
     profile_yml_path.write_text(profile_content)
     return profile_yml_path
+
+
+def is_cache_package_lockfile_enabled(project_dir: Path) -> bool:
+    if not enable_cache_package_lockfile:
+        return False
+    package_lockfile = project_dir / PACKAGE_LOCKFILE_YML
+    return package_lockfile.is_file()
+
+
+def _get_sha1_hash(yaml_file: Path) -> str:
+    """Read package-lock.yml file and return sha1_hash"""
+    with open(yaml_file) as file:
+        yaml_content = file.read()
+    data =
yaml.safe_load(yaml_content)
+    sha1_hash: str = data.get("sha1_hash", "")
+    return sha1_hash
+
+
+def _get_latest_cached_package_lockfile(project_dir: Path) -> Path | None:
+    """
+    Retrieve the latest cached package-lock.yml for the specified project directory,
+    refreshing the cache from the project's lockfile when the cached copy is missing
+    or its sha1_hash no longer matches the project's.
+    """
+    cache_identifier = project_dir.name
+    package_lockfile = project_dir / PACKAGE_LOCKFILE_YML
+    cached_package_lockfile = cache_dir / cache_identifier / PACKAGE_LOCKFILE_YML
+
+    if cached_package_lockfile.exists() and cached_package_lockfile.is_file():
+        project_sha1_hash = _get_sha1_hash(package_lockfile)
+        cached_sha1_hash = _get_sha1_hash(cached_package_lockfile)
+        if project_sha1_hash == cached_sha1_hash:
+            return cached_package_lockfile
+    cached_lockfile_dir = cache_dir / cache_identifier
+    cached_lockfile_dir.mkdir(parents=True, exist_ok=True)
+    _safe_copy(package_lockfile, cached_package_lockfile)
+    return cached_package_lockfile
+
+
+def _copy_cached_package_lockfile_to_project(cached_package_lockfile: Path, project_dir: Path) -> None:
+    """Copy the cached package-lock.yml to the tmp project dir"""
+    package_lockfile = project_dir / PACKAGE_LOCKFILE_YML
+    _safe_copy(cached_package_lockfile, package_lockfile)
+
+
+# TODO: Move this function to a different location
+def _safe_copy(src: Path, dst: Path) -> None:
+    """
+    Safely copies a file from a source path to a destination path.
+
+    This function ensures that the copy operation is atomic by first
+    copying the file to a temporary file in the same directory as the
+    destination and then renaming the temporary file to the destination
+    file. This approach minimizes the risk of file corruption or partial
+    writes in case of a failure or interruption during the copy process.
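+
+    Note: os.rename is only atomic when the temporary file and the destination
+    are on the same filesystem, which holds here because the temporary file is
+    created inside the destination's own directory.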
+
+    See the blog for atomic file operations:
+    https://alexwlchan.net/2019/atomic-cross-filesystem-moves-in-python/
+    """
+    # Create a temporary file in the same directory as the destination
+    dir_name, base_name = os.path.split(dst)
+    temp_fd, temp_path = tempfile.mkstemp(dir=dir_name)
+    # Close the descriptor returned by mkstemp straight away: only the path is
+    # used below, and an unclosed descriptor would leak on every copy.
+    os.close(temp_fd)
+
+    shutil.copyfile(src, temp_path)
+
+    # Rename the temporary file to the destination file
+    os.rename(temp_path, dst)
diff --git a/cosmos/constants.py b/cosmos/constants.py
index cc9841ed6..25b33f28a 100644
--- a/cosmos/constants.py
+++ b/cosmos/constants.py
@@ -20,6 +20,7 @@
 DBT_LOG_FILENAME = "dbt.log"
 DBT_BINARY_NAME = "dbt"
 DEFAULT_PROFILES_FILE_NAME = "profiles.yml"
+PACKAGE_LOCKFILE_YML = "package-lock.yml"

 DEFAULT_OPENLINEAGE_NAMESPACE = "cosmos"
 OPENLINEAGE_PRODUCER = "https://github.com/astronomer/astronomer-cosmos/"
diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py
index 087fef5a4..889968982 100644
--- a/cosmos/dbt/graph.py
+++ b/cosmos/dbt/graph.py
@@ -18,6 +18,11 @@
 from airflow.models import Variable

 from cosmos import cache, settings
+from cosmos.cache import (
+    _copy_cached_package_lockfile_to_project,
+    _get_latest_cached_package_lockfile,
+    is_cache_package_lockfile_enabled,
+)
 from cosmos.config import ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig
 from cosmos.constants import (
     DBT_LOG_DIR_NAME,
@@ -501,6 +506,10 @@ def load_via_dbt_ls_without_cache(self) -> None:
             env[DBT_TARGET_PATH_ENVVAR] = str(self.target_dir)

             if self.render_config.dbt_deps and has_non_empty_dependencies_file(self.project_path):
+                if is_cache_package_lockfile_enabled(project_path):
+                    latest_package_lockfile = _get_latest_cached_package_lockfile(project_path)
+                    if latest_package_lockfile:
+                        _copy_cached_package_lockfile_to_project(latest_package_lockfile, tmpdir_path)
                 self.run_dbt_deps(dbt_cmd, tmpdir_path, env)

             nodes = self.run_dbt_ls(dbt_cmd, self.project_path, tmpdir_path, env)
diff --git a/cosmos/dbt/project.py b/cosmos/dbt/project.py
index 987d10f3a..2c9f9743a 100644
--- a/cosmos/dbt/project.py
+++ b/cosmos/dbt/project.py
@@ -10,6 +10,7 @@
     DBT_LOG_DIR_NAME,
     DBT_PARTIAL_PARSE_FILE_NAME,
     DBT_TARGET_DIR_NAME,
+    PACKAGE_LOCKFILE_YML,
 )
 from cosmos.log import get_logger

@@ -38,7 +39,7 @@ def has_non_empty_dependencies_file(project_path: Path) -> bool:

 def create_symlinks(project_path: Path, tmp_dir: Path, ignore_dbt_packages: bool) -> None:
     """Helper function to create symlinks to the dbt project files."""
-    ignore_paths = [DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, "profiles.yml"]
+    ignore_paths = [DBT_LOG_DIR_NAME, DBT_TARGET_DIR_NAME, PACKAGE_LOCKFILE_YML, "profiles.yml"]
     if ignore_dbt_packages:
         # this is linked to dbt deps so if dbt deps is true then ignore existing dbt_packages folder
         ignore_paths.append("dbt_packages")
diff --git a/cosmos/operators/local.py b/cosmos/operators/local.py
index 4acaa9453..8a4ea1ba2 100644
--- a/cosmos/operators/local.py
+++ b/cosmos/operators/local.py
@@ -17,6 +17,11 @@
 from attr import define

 from cosmos import cache
+from cosmos.cache import (
+    _copy_cached_package_lockfile_to_project,
+    _get_latest_cached_package_lockfile,
+    is_cache_package_lockfile_enabled,
+)
 from cosmos.constants import InvocationMode
 from cosmos.dbt.project import get_partial_parse_path, has_non_empty_dependencies_file
 from cosmos.exceptions import AirflowCompatibilityError
@@ -275,6 +280,13 @@ def run_dbt_runner(self, command: list[str], env: dict[str, str], cwd: str) -> d

         return result

+    def _cache_package_lockfile(self, tmp_project_dir: Path) -> None:
+        project_dir =
Path(self.project_dir) + if is_cache_package_lockfile_enabled(project_dir): + latest_package_lockfile = _get_latest_cached_package_lockfile(project_dir) + if latest_package_lockfile: + _copy_cached_package_lockfile_to_project(latest_package_lockfile, tmp_project_dir) + def run_command( self, cmd: list[str], @@ -320,6 +332,7 @@ def run_command( ] if self.install_deps: + self._cache_package_lockfile(tmp_dir_path) deps_command = [self.dbt_executable_path, "deps"] deps_command.extend(flags) self.invoke_dbt( diff --git a/cosmos/settings.py b/cosmos/settings.py index b9996717a..fde58fde2 100644 --- a/cosmos/settings.py +++ b/cosmos/settings.py @@ -14,6 +14,7 @@ cache_dir = Path(conf.get("cosmos", "cache_dir", fallback=DEFAULT_CACHE_DIR) or DEFAULT_CACHE_DIR) enable_cache = conf.getboolean("cosmos", "enable_cache", fallback=True) enable_cache_partial_parse = conf.getboolean("cosmos", "enable_cache_partial_parse", fallback=True) +enable_cache_package_lockfile = conf.getboolean("cosmos", "enable_cache_package_lockfile", fallback=True) enable_cache_dbt_ls = conf.getboolean("cosmos", "enable_cache_dbt_ls", fallback=True) propagate_logs = conf.getboolean("cosmos", "propagate_logs", fallback=True) dbt_docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None) diff --git a/dev/dags/dbt/jaffle_shop/package-lock.yml b/dev/dags/dbt/jaffle_shop/package-lock.yml new file mode 100644 index 000000000..669f9637d --- /dev/null +++ b/dev/dags/dbt/jaffle_shop/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 +sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 diff --git a/docs/configuration/cosmos-conf.rst b/docs/configuration/cosmos-conf.rst index 8dc90a5c1..037a43d3b 100644 --- a/docs/configuration/cosmos-conf.rst +++ b/docs/configuration/cosmos-conf.rst @@ -46,6 +46,14 @@ This page lists all available Airflow configurations that affect ``astronomer-co - Default: ``True`` - Environment Variable: ``AIRFLOW__COSMOS__ENABLE_CACHE_PARTIAL_PARSE`` +.. _enable_cache_package_lockfile: + +`enable_cache_package_lockfile`_: + Enable or disable caching of dbt project package lockfile. + + - Default: ``True`` + - Environment Variable: ``AIRFLOW__COSMOS__ENABLE_CACHE_PACKAGE_LOCKFILE`` + .. 
_propagate_logs: `propagate_logs`_: diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py index 0ba3332e6..bcce46b87 100644 --- a/tests/dbt/test_graph.py +++ b/tests/dbt/test_graph.py @@ -1422,9 +1422,9 @@ def test_save_dbt_ls_cache(mock_variable_set, mock_datetime, tmp_dbt_project_dir hash_dir, hash_args = version.split(",") assert hash_args == "d41d8cd98f00b204e9800998ecf8427e" if sys.platform == "darwin": - assert hash_dir == "18b97e2bff2684161f71db817f1f50e2" + assert hash_dir == "65595448aded2c2b52878a801c1d9c59" else: - assert hash_dir == "6c662da10b64a8390c469c884af88321" + assert hash_dir == "4c826c84a94b0f1f5508c4e425170677" @pytest.mark.integration diff --git a/tests/test_cache.py b/tests/test_cache.py index cc1d11b21..b9ab087a7 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -16,12 +16,15 @@ _copy_partial_parse_to_project, _create_cache_identifier, _create_folder_version_hash, + _get_latest_cached_package_lockfile, _get_latest_partial_parse, _get_or_create_profile_cache_dir, + _get_sha1_hash, _update_partial_parse_cache, create_cache_profile, delete_unused_dbt_ls_cache, get_cached_profile, + is_cache_package_lockfile_enabled, is_profile_cache_enabled, ) from cosmos.constants import DBT_PARTIAL_PARSE_FILE_NAME, DBT_TARGET_DIR_NAME, DEFAULT_PROFILES_FILE_NAME @@ -317,3 +320,89 @@ def test_create_cache_profile(): # Check content of the created file assert expected_path.read_text() == profile_content assert profile_yml_path == expected_path + + +@patch("pathlib.Path.is_file") +def test_cache_package_lockfile_enabled(mock_path_is_file): + # Mocking the return value of Path.is_file() + mock_path_is_file.return_value = True + + # Test case where lockfile exists + project_dir = Path("/path/to/your/project") + result = is_cache_package_lockfile_enabled(project_dir) + assert result is True + + # Test case where lockfile doesn't exist + mock_path_is_file.return_value = False + result = is_cache_package_lockfile_enabled(project_dir) + assert result is False + + +@pytest.fixture +def mock_package_lockfile(): + # Create a temporary YAML file with test data + yaml_data = """ + packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 + sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 + """ + tmp_file = Path("test_package-lock.yml") + with open(tmp_file, "w") as file: + file.write(yaml_data) + yield tmp_file + # Clean up: delete the temporary file after the test + tmp_file.unlink() + + +def test_get_sha1_hash(): + profile_lock_content = """ + packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 + sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 + """ + tmp_file = Path("package-lock.yml") + with open(tmp_file, "w") as file: + file.write(profile_lock_content) + + sha1_hash = _get_sha1_hash(tmp_file) + assert sha1_hash == "a158c48c59c2bb7d729d2a4e215aabe5bb4f3353" + tmp_file.unlink() + + +def _test_tmp_dir(dir_name: str): + # Create a temporary directory for cache_dir + with tempfile.TemporaryDirectory() as temp_dir: + cache_dir = Path(temp_dir) / "test_cache" + cache_dir.mkdir() + return cache_dir + + +@patch("cosmos.cache.cache_dir") +@patch("cosmos.cache._get_sha1_hash") +def test_get_latest_cached_package_lockfile_with_cache(mock_get_sha, cache_dir): + # Create a fake cached lockfile + project_dir = _test_tmp_dir("test_project") + cache_dir.return_value = _test_tmp_dir("test_cache") + cache_identifier = project_dir.name + cached_profile_lockfile = cache_dir / cache_identifier / "package-lock.yml" + cached_profile_lockfile.parent.mkdir(parents=True, 
exist_ok=True)
+    cached_profile_lockfile.touch()
+
+    # Test case where there is a cached file
+    result = _get_latest_cached_package_lockfile(project_dir)
+    assert result == cached_profile_lockfile
+    assert cached_profile_lockfile.exists()
+
+
+@patch("cosmos.cache._get_sha1_hash")
+def test_get_latest_cached_lockfile_with_no_cache(mock_get_sha):
+    project_dir = _test_tmp_dir("test_project")
+    project_package_lockfile = project_dir / "package-lock.yml"
+    project_package_lockfile.parent.mkdir(parents=True, exist_ok=True)
+    project_package_lockfile.touch()
+
+    # Test case where there is no cached file yet
+    result = _get_latest_cached_package_lockfile(project_dir)
+    assert result.exists()

From f333714e717594296c9dfe948b020bcc18b9382e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:52:08 +0530
Subject: [PATCH 39/52] =?UTF-8?q?=E2=AC=86=20[pre-commit.ci]=20pre-commit?=
 =?UTF-8?q?=20autoupdate=20(#1154)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.5.6 → v0.5.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.6...v0.5.7)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2daabe4dd..f8f972831 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -54,7 +54,7 @@ repos:
       - --py37-plus
       - --keep-runtime-typing
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.6
+    rev: v0.5.7
     hooks:
       - id: ruff
         args:

From f1a1273ad7d843b66e7022fe939d255724bcbb80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Rojas=20Ben=C3=ADtez?=
Date: Wed, 14 Aug 2024 03:53:16 -0500
Subject: [PATCH 40/52] Add default source nodes rendering (#1107)

Re-Opening of PR #661

This PR features a new way of rendering source nodes:
- Check freshness for sources with freshness checks
- Source tests
- Empty operators for nodes without tests or freshness.

One of the main limitations I found while using the `custom_callback` functions on source nodes to check freshness is that nodes were being created on 100% of sources, but not all of them required freshness checks, which made workers waste compute time.

I'm adding a new variable to the DbtNode class called has_freshness, which is True for sources with freshness checks and False for any other resource type.

If this feature is enabled with the option `ALL`, all sources with `has_freshness == False` will be rendered as Empty Operators, to keep dbt's behavior of showing sources, as suggested in issue #630.

A new rendered template field is included too: `freshness`, which is the sources.json generated by dbt when running `dbt source freshness`.

This adds a new node type (source), which changes some tests' behavior. This PR also updates the dev dbt project jaffle_shop to include source nodes when enabled.

![image](https://github.com/user-attachments/assets/e972ac58-8741-4c13-9905-e78775f9cc80)

As seen in the image, source nodes with freshness checks are rendered with a blue color, while the ones rendered as EmptyOperator show a white/light green color.

Closes: #630
Closes: #572
Closes: https://github.com/astronomer/astronomer-cosmos/issues/875

This won't be a breaking change, since the default behavior will still be ignoring this new feature.
That can be changed with the new RenderConfig variable called `source_rendering_behavior`. Co-authored-by: Pankaj Co-authored-by: Pankaj Singh <98807258+pankajastro@users.noreply.github.com> --- .github/workflows/test.yml | 3 + cosmos/airflow/graph.py | 34 +++- cosmos/config.py | 3 + cosmos/constants.py | 10 + cosmos/core/airflow.py | 2 +- cosmos/dbt/graph.py | 59 +++++- cosmos/operators/aws_eks.py | 7 + cosmos/operators/azure_container_instance.py | 7 + cosmos/operators/base.py | 9 + cosmos/operators/docker.py | 7 + cosmos/operators/kubernetes.py | 7 + cosmos/operators/local.py | 36 +++- cosmos/operators/virtualenv.py | 8 + .../jaffle_shop/models/staging/sources.yml | 31 ++++ .../models/staging/stg_customers.sql | 9 +- .../jaffle_shop/models/staging/stg_orders.sql | 9 +- .../models/staging/stg_payments.sql | 11 +- docs/configuration/cosmos-conf.rst | 1 - docs/configuration/index.rst | 1 + docs/configuration/source-nodes-rendering.rst | 36 ++++ scripts/test/integration-dbt-1-5-4.sh | 1 + scripts/test/integration-expensive.sh | 1 + scripts/test/integration.sh | 2 + tests/airflow/test_graph.py | 75 +++++++- tests/dbt/test_graph.py | 174 ++++++++++++++---- tests/operators/test_local.py | 84 ++++++++- 26 files changed, 566 insertions(+), 61 deletions(-) create mode 100644 dev/dags/dbt/jaffle_shop/models/staging/sources.yml create mode 100644 docs/configuration/source-nodes-rendering.rst diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f19831676..219ea2019 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -163,6 +163,7 @@ jobs: POSTGRES_DB: postgres POSTGRES_SCHEMA: public POSTGRES_PORT: 5432 + SOURCE_RENDERING_BEHAVIOR: all - name: Upload coverage to Github uses: actions/upload-artifact@v2 @@ -234,6 +235,7 @@ jobs: POSTGRES_DB: postgres POSTGRES_SCHEMA: public POSTGRES_PORT: 5432 + SOURCE_RENDERING_BEHAVIOR: all - name: Upload coverage to Github uses: actions/upload-artifact@v2 @@ -377,6 +379,7 @@ jobs: POSTGRES_DB: postgres POSTGRES_SCHEMA: public POSTGRES_PORT: 5432 + SOURCE_RENDERING_BEHAVIOR: all - name: Upload coverage to Github uses: actions/upload-artifact@v2 diff --git a/cosmos/airflow/graph.py b/cosmos/airflow/graph.py index 5639d3bcb..17ee22c95 100644 --- a/cosmos/airflow/graph.py +++ b/cosmos/airflow/graph.py @@ -12,6 +12,7 @@ TESTABLE_DBT_RESOURCES, DbtResourceType, ExecutionMode, + SourceRenderingBehavior, TestBehavior, TestIndirectSelection, ) @@ -127,7 +128,11 @@ def create_test_task_metadata( def create_task_metadata( - node: DbtNode, execution_mode: ExecutionMode, args: dict[str, Any], use_task_group: bool = False + node: DbtNode, + execution_mode: ExecutionMode, + args: dict[str, Any], + use_task_group: bool = False, + source_rendering_behavior: SourceRenderingBehavior = SourceRenderingBehavior.NONE, ) -> TaskMetadata | None: """ Create the metadata that will be used to instantiate the Airflow Task used to run the Dbt node. 
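For orientation, the hunks below reduce to a small decision table: the configured rendering behavior plus the node's `has_freshness`/`has_test` flags determine whether a source becomes a real task, an empty placeholder, or nothing. A minimal sketch of that logic (simplified here: task-group handling and the exact operator module paths are omitted):

```python
def sketch_source_rendering(behavior: str, has_freshness: bool, has_test: bool):
    """Return the operator choice for a source node, or None to skip it."""
    # NONE never renders sources; WITH_TESTS_OR_FRESHNESS skips sources that
    # have neither tests nor freshness checks.
    if behavior == "none" or (behavior == "with_tests_or_freshness" and not (has_freshness or has_test)):
        return None
    # ALL keeps freshness-less sources visible, but only as empty placeholders.
    if behavior == "all" and not has_freshness:
        return "EmptyOperator"
    # Remaining sources run `dbt source freshness` via a DbtSource* operator.
    return "DbtSourceOperator"

# e.g. sketch_source_rendering("all", False, True) -> "EmptyOperator"
```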
@@ -145,6 +150,7 @@ def create_task_metadata( DbtResourceType.SNAPSHOT: "DbtSnapshot", DbtResourceType.SEED: "DbtSeed", DbtResourceType.TEST: "DbtTest", + DbtResourceType.SOURCE: "DbtSource", } args = {**args, **{"models": node.resource_name}} @@ -154,6 +160,23 @@ def create_task_metadata( task_id = f"{node.name}_run" if use_task_group is True: task_id = "run" + elif node.resource_type == DbtResourceType.SOURCE: + if (source_rendering_behavior == SourceRenderingBehavior.NONE) or ( + source_rendering_behavior == SourceRenderingBehavior.WITH_TESTS_OR_FRESHNESS + and node.has_freshness is False + and node.has_test is False + ): + return None + # TODO: https://github.com/astronomer/astronomer-cosmos + # pragma: no cover + task_id = f"{node.name}_source" + args["select"] = f"source:{node.resource_name}" + args.pop("models") + if use_task_group is True: + task_id = node.resource_type.value + if node.has_freshness is False and source_rendering_behavior == SourceRenderingBehavior.ALL: + # render sources without freshness as empty operators + return TaskMetadata(id=task_id, operator_class="airflow.operators.empty.EmptyOperator") else: task_id = f"{node.name}_{node.resource_type.value}" if use_task_group is True: @@ -185,6 +208,7 @@ def generate_task_or_group( execution_mode: ExecutionMode, task_args: dict[str, Any], test_behavior: TestBehavior, + source_rendering_behavior: SourceRenderingBehavior, test_indirect_selection: TestIndirectSelection, on_warning_callback: Callable[..., Any] | None, **kwargs: Any, @@ -198,7 +222,11 @@ def generate_task_or_group( ) task_meta = create_task_metadata( - node=node, execution_mode=execution_mode, args=task_args, use_task_group=use_task_group + node=node, + execution_mode=execution_mode, + args=task_args, + use_task_group=use_task_group, + source_rendering_behavior=source_rendering_behavior, ) # In most cases, we'll map one DBT node to one Airflow task @@ -260,6 +288,7 @@ def build_airflow_graph( """ node_converters = render_config.node_converters or {} test_behavior = render_config.test_behavior + source_rendering_behavior = render_config.source_rendering_behavior tasks_map = {} task_or_group: TaskGroup | BaseOperator @@ -278,6 +307,7 @@ def build_airflow_graph( execution_mode=execution_mode, task_args=task_args, test_behavior=test_behavior, + source_rendering_behavior=source_rendering_behavior, test_indirect_selection=test_indirect_selection, on_warning_callback=on_warning_callback, node=node, diff --git a/cosmos/config.py b/cosmos/config.py index 0ffb726e0..2cebbf3cc 100644 --- a/cosmos/config.py +++ b/cosmos/config.py @@ -20,6 +20,7 @@ ExecutionMode, InvocationMode, LoadMode, + SourceRenderingBehavior, TestBehavior, TestIndirectSelection, ) @@ -59,6 +60,7 @@ class RenderConfig: :param dbt_project_path: Configures the DBT project location accessible on the airflow controller for DAG rendering. Mutually Exclusive with ProjectConfig.dbt_project_path. Required when using ``load_method=LoadMode.DBT_LS`` or ``load_method=LoadMode.CUSTOM``. :param dbt_ls_path: Configures the location of an output of ``dbt ls``. Required when using ``load_method=LoadMode.DBT_LS_FILE``. :param enable_mock_profile: Allows to enable/disable mocking profile. Enabled by default. Mock profiles are useful for parsing Cosmos DAGs in the CI, but should be disabled to benefit from partial parsing (since Cosmos 1.4). + :param source_rendering_behavior: Determines how source nodes are rendered when using cosmos default source node rendering (ALL, NONE, WITH_TESTS_OR_FRESHNESS). 
Defaults to "NONE" (since Cosmos 1.6). """ emit_datasets: bool = True @@ -75,6 +77,7 @@ class RenderConfig: dbt_ls_path: Path | None = None project_path: Path | None = field(init=False) enable_mock_profile: bool = True + source_rendering_behavior: SourceRenderingBehavior = SourceRenderingBehavior.NONE airflow_vars_to_purge_dbt_ls_cache: list[str] = field(default_factory=list) def __post_init__(self, dbt_project_path: str | Path | None) -> None: diff --git a/cosmos/constants.py b/cosmos/constants.py index 25b33f28a..6e96551e4 100644 --- a/cosmos/constants.py +++ b/cosmos/constants.py @@ -113,6 +113,16 @@ class TestIndirectSelection(Enum): EMPTY = "empty" +class SourceRenderingBehavior(Enum): + """ + Modes to configure the source rendering behavior. + """ + + NONE = "none" + ALL = "all" + WITH_TESTS_OR_FRESHNESS = "with_tests_or_freshness" + + class DbtResourceType(aenum.Enum): # type: ignore """ Type of dbt node. diff --git a/cosmos/core/airflow.py b/cosmos/core/airflow.py index 38d5113ec..acff5d012 100644 --- a/cosmos/core/airflow.py +++ b/cosmos/core/airflow.py @@ -35,7 +35,7 @@ def get_airflow_task(task: Task, dag: DAG, task_group: "TaskGroup | None" = None dag=dag, task_group=task_group, owner=task_owner, - extra_context=task.extra_context, + **({} if class_name == "EmptyOperator" else {"extra_context": task.extra_context}), **task.arguments, ) diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py index 889968982..93e9430ec 100644 --- a/cosmos/dbt/graph.py +++ b/cosmos/dbt/graph.py @@ -13,7 +13,7 @@ from functools import cached_property from pathlib import Path from subprocess import PIPE, Popen -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional from airflow.models import Variable @@ -33,6 +33,7 @@ DbtResourceType, ExecutionMode, LoadMode, + SourceRenderingBehavior, ) from cosmos.dbt.executable import get_system_dbt from cosmos.dbt.parser.project import LegacyDbtProject @@ -54,7 +55,7 @@ class CosmosLoadDbtException(Exception): @dataclass class DbtNode: """ - Metadata related to a dbt node (e.g. model, seed, snapshot). + Metadata related to a dbt node (e.g. model, seed, snapshot, source). """ unique_id: str @@ -63,6 +64,7 @@ class DbtNode: file_path: Path tags: list[str] = field(default_factory=lambda: []) config: dict[str, Any] = field(default_factory=lambda: {}) + has_freshness: bool = False has_test: bool = False @property @@ -105,6 +107,30 @@ def context_dict(self) -> dict[str, Any]: } +def is_freshness_effective(freshness: Optional[dict[str, Any]]) -> bool: + """Function to find if a source has null freshness. 
Scenarios where freshness
+    looks like:
+    "freshness": {
+        "warn_after": {
+            "count": null,
+            "period": null
+        },
+        "error_after": {
+            "count": null,
+            "period": null
+        },
+        "filter": null
+    }
+    should be considered null; this function ensures that."""
+    if freshness is None:
+        return False
+    for _, value in freshness.items():
+        if isinstance(value, dict):
+            if any(subvalue is not None for subvalue in value.values()):
+                return True
+    return False
+
+
 def run_command(command: list[str], tmp_dir: Path, env_vars: dict[str, str]) -> str:
     """Run a command in a subprocess, returning the stdout."""
     logger.info("Running command: `%s`", " ".join(command))
@@ -148,6 +174,11 @@ def parse_dbt_ls_output(project_path: Path | None, ls_stdout: str) -> dict[str,
             file_path=project_path / node_dict["original_file_path"],
             tags=node_dict.get("tags", []),
             config=node_dict.get("config", {}),
+            has_freshness=(
+                is_freshness_effective(node_dict.get("freshness"))
+                if DbtResourceType(node_dict["resource_type"]) == DbtResourceType.SOURCE
+                else False
+            ),
         )
         nodes[node.unique_id] = node
         logger.debug("Parsed dbt resource `%s` of type `%s`", node.unique_id, node.resource_type)
@@ -367,7 +398,24 @@ def run_dbt_ls(
         self, dbt_cmd: str, project_path: Path, tmp_dir: Path, env_vars: dict[str, str]
     ) -> dict[str, DbtNode]:
         """Runs dbt ls command and returns the parsed nodes."""
-        ls_command = [self.dbt_cmd, "ls", "--output", "json"]
+        if self.render_config.source_rendering_behavior != SourceRenderingBehavior.NONE:
+            ls_command = [
+                self.dbt_cmd,
+                "ls",
+                "--output",
+                "json",
+                "--output-keys",
+                "name",
+                "unique_id",
+                "resource_type",
+                "depends_on",
+                "original_file_path",
+                "tags",
+                "config",
+                "freshness",
+            ]
+        else:
+            ls_command = [self.dbt_cmd, "ls", "--output", "json"]

         ls_args = self.dbt_ls_args
         ls_command.extend(self.local_flags)
@@ -649,6 +697,11 @@ def load_from_dbt_manifest(self) -> None:
                     file_path=self.execution_config.project_path / Path(node_dict["original_file_path"]),
                     tags=node_dict["tags"],
                     config=node_dict["config"],
+                    has_freshness=(
+                        is_freshness_effective(node_dict.get("freshness"))
+                        if DbtResourceType(node_dict["resource_type"]) == DbtResourceType.SOURCE
+                        else False
+                    ),
                 )
                 nodes[node.unique_id] = node
diff --git a/cosmos/operators/aws_eks.py b/cosmos/operators/aws_eks.py
index 180028378..0d98b2c63 100644
--- a/cosmos/operators/aws_eks.py
+++ b/cosmos/operators/aws_eks.py
@@ -14,6 +14,7 @@
     DbtRunOperationKubernetesOperator,
     DbtSeedKubernetesOperator,
     DbtSnapshotKubernetesOperator,
+    DbtSourceKubernetesOperator,
     DbtTestKubernetesOperator,
 )

@@ -101,6 +102,12 @@ class DbtSnapshotAwsEksOperator(DbtAwsEksBaseOperator, DbtSnapshotKubernetesOper
     """


+class DbtSourceAwsEksOperator(DbtAwsEksBaseOperator, DbtSourceKubernetesOperator):
+    """
+    Executes a dbt source freshness command.
+    """
+
+
 class DbtRunAwsEksOperator(DbtAwsEksBaseOperator, DbtRunKubernetesOperator):
     """
     Executes a dbt core run command.
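As a quick illustration of the `is_freshness_effective` helper added above, here is how it treats the docstring's all-null freshness example versus a populated one (illustrative inputs only, assuming the helper is importable from `cosmos.dbt.graph` as defined in this patch):

```python
from cosmos.dbt.graph import is_freshness_effective

# An absent or all-null freshness block is treated as "no freshness check".
assert is_freshness_effective(None) is False
assert is_freshness_effective(
    {
        "warn_after": {"count": None, "period": None},
        "error_after": {"count": None, "period": None},
        "filter": None,
    }
) is False

# Any populated warn_after/error_after value makes the freshness effective.
assert is_freshness_effective({"warn_after": {"count": 3650, "period": "day"}}) is True
```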
diff --git a/cosmos/operators/azure_container_instance.py b/cosmos/operators/azure_container_instance.py index d8427b2fb..993d4315f 100644 --- a/cosmos/operators/azure_container_instance.py +++ b/cosmos/operators/azure_container_instance.py @@ -13,6 +13,7 @@ DbtRunOperationMixin, DbtSeedMixin, DbtSnapshotMixin, + DbtSourceMixin, DbtTestMixin, ) @@ -102,6 +103,12 @@ class DbtSnapshotAzureContainerInstanceOperator(DbtSnapshotMixin, DbtAzureContai """ +class DbtSourceAzureContainerInstanceOperator(DbtSourceMixin, DbtAzureContainerInstanceBaseOperator): + """ + Executes a dbt source freshness command. + """ + + class DbtRunAzureContainerInstanceOperator(DbtRunMixin, DbtAzureContainerInstanceBaseOperator): # type: ignore """ Executes a dbt core run command. diff --git a/cosmos/operators/base.py b/cosmos/operators/base.py index d0cbdd282..37aae7a81 100644 --- a/cosmos/operators/base.py +++ b/cosmos/operators/base.py @@ -344,6 +344,15 @@ class DbtSnapshotMixin: ui_color = "#964B00" +class DbtSourceMixin: + """ + Executes a dbt source freshness command. + """ + + base_cmd = ["source", "freshness"] + ui_color = "#34CCEB" + + class DbtRunMixin: """ Mixin for dbt run command. diff --git a/cosmos/operators/docker.py b/cosmos/operators/docker.py index 532de380e..4abf9e994 100644 --- a/cosmos/operators/docker.py +++ b/cosmos/operators/docker.py @@ -13,6 +13,7 @@ DbtRunOperationMixin, DbtSeedMixin, DbtSnapshotMixin, + DbtSourceMixin, DbtTestMixin, ) @@ -94,6 +95,12 @@ class DbtSnapshotDockerOperator(DbtSnapshotMixin, DbtDockerBaseOperator): """ +class DbtSourceDockerOperator(DbtSourceMixin, DbtDockerBaseOperator): + """ + Executes a dbt source freshness command. + """ + + class DbtRunDockerOperator(DbtRunMixin, DbtDockerBaseOperator): """ Executes a dbt core run command. diff --git a/cosmos/operators/kubernetes.py b/cosmos/operators/kubernetes.py index f84219199..ef69cd561 100644 --- a/cosmos/operators/kubernetes.py +++ b/cosmos/operators/kubernetes.py @@ -17,6 +17,7 @@ DbtRunOperationMixin, DbtSeedMixin, DbtSnapshotMixin, + DbtSourceMixin, DbtTestMixin, ) @@ -125,6 +126,12 @@ class DbtSnapshotKubernetesOperator(DbtSnapshotMixin, DbtKubernetesBaseOperator) """ +class DbtSourceKubernetesOperator(DbtSourceMixin, DbtKubernetesBaseOperator): + """ + Executes a dbt source freshness command. + """ + + class DbtRunKubernetesOperator(DbtRunMixin, DbtKubernetesBaseOperator): """ Executes a dbt core run command. diff --git a/cosmos/operators/local.py b/cosmos/operators/local.py index 8a4ea1ba2..8baa42716 100644 --- a/cosmos/operators/local.py +++ b/cosmos/operators/local.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os import tempfile import warnings @@ -67,6 +68,7 @@ DbtRunOperationMixin, DbtSeedMixin, DbtSnapshotMixin, + DbtSourceMixin, DbtTestMixin, ) @@ -115,9 +117,10 @@ class DbtLocalBaseOperator(AbstractDbtBaseOperator): and does not inherit the current process environment. 
""" - template_fields: Sequence[str] = AbstractDbtBaseOperator.template_fields + ("compiled_sql",) # type: ignore[operator] + template_fields: Sequence[str] = AbstractDbtBaseOperator.template_fields + ("compiled_sql", "freshness") # type: ignore[operator] template_fields_renderers = { "compiled_sql": "sql", + "freshness": "json", } def __init__( @@ -133,6 +136,7 @@ def __init__( self.profile_config = profile_config self.callback = callback self.compiled_sql = "" + self.freshness = "" self.should_store_compiled_sql = should_store_compiled_sql self.openlineage_events_completes: list[RunEvent] = [] self.invocation_mode = invocation_mode @@ -248,6 +252,29 @@ def store_compiled_sql(self, tmp_project_dir: str, context: Context, session: Se else: logger.info("Warning: ti is of type TaskInstancePydantic. Cannot update template_fields.") + @provide_session + def store_freshness_json(self, tmp_project_dir: str, context: Context, session: Session = NEW_SESSION) -> None: + """ + Takes the compiled sources.json file from the dbt source freshness and stores it in the freshness rendered template. + Gets called after every dbt run / source freshness. + """ + if not self.should_store_compiled_sql: + return + + sources_json_path = Path(os.path.join(tmp_project_dir, "target", "sources.json")) + + if sources_json_path.exists(): + sources_json_content = sources_json_path.read_text(encoding="utf-8").strip() + + sources_data = json.loads(sources_json_content) + + formatted_sources_json = json.dumps(sources_data, indent=4) + + self.freshness = formatted_sources_json + + else: + self.freshness = "" + def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> FullOutputSubprocessResult: logger.info("Trying to run the command:\n %s\nFrom %s", command, cwd) subprocess_result: FullOutputSubprocessResult = self.subprocess_hook.run_command( @@ -368,6 +395,7 @@ def run_command( if partial_parse_file.exists(): cache._update_partial_parse_cache(partial_parse_file, self.cache_dir) + self.store_freshness_json(tmp_project_dir, context) self.store_compiled_sql(tmp_project_dir, context) self.handle_exception(result) if self.callback: @@ -538,6 +566,12 @@ class DbtSnapshotLocalOperator(DbtSnapshotMixin, DbtLocalBaseOperator): """ +class DbtSourceLocalOperator(DbtSourceMixin, DbtLocalBaseOperator): + """ + Executes a dbt source freshness command. + """ + + class DbtRunLocalOperator(DbtRunMixin, DbtLocalBaseOperator): """ Executes a dbt core run command. diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 576db037e..4e25cb783 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -24,6 +24,7 @@ DbtRunOperationLocalOperator, DbtSeedLocalOperator, DbtSnapshotLocalOperator, + DbtSourceLocalOperator, DbtTestLocalOperator, ) @@ -250,6 +251,13 @@ class DbtSnapshotVirtualenvOperator(DbtVirtualenvBaseOperator, DbtSnapshotLocalO """ +class DbtSourceVirtualenvOperator(DbtVirtualenvBaseOperator, DbtSourceLocalOperator): + """ + Executes `dbt source freshness` command within a Python Virtual Environment, that is created before running the dbt + command and deleted just after. 
+ """ + + class DbtRunVirtualenvOperator(DbtVirtualenvBaseOperator, DbtRunLocalOperator): # type: ignore[misc] """ Executes a dbt core run command within a Python Virtual Environment, that is created before running the dbt command diff --git a/dev/dags/dbt/jaffle_shop/models/staging/sources.yml b/dev/dags/dbt/jaffle_shop/models/staging/sources.yml new file mode 100644 index 000000000..a3139b585 --- /dev/null +++ b/dev/dags/dbt/jaffle_shop/models/staging/sources.yml @@ -0,0 +1,31 @@ + +version: 2 + +sources: + - name: postgres_db + database: "{{ env_var('POSTGRES_DB') }}" + schema: "{{ env_var('POSTGRES_SCHEMA') }}" + tables: + - name: raw_customers + columns: + - name: id + tests: + - unique + - not_null + - name: raw_payments + columns: + - name: id + tests: + - unique + - not_null + - name: raw_orders + columns: + - name: id + tests: + - unique + - not_null + freshness: + warn_after: + count: 3650 + period: day + loaded_at_field: CAST(order_date AS TIMESTAMP) diff --git a/dev/dags/dbt/jaffle_shop/models/staging/stg_customers.sql b/dev/dags/dbt/jaffle_shop/models/staging/stg_customers.sql index cad047269..71b6c7c0a 100644 --- a/dev/dags/dbt/jaffle_shop/models/staging/stg_customers.sql +++ b/dev/dags/dbt/jaffle_shop/models/staging/stg_customers.sql @@ -4,8 +4,15 @@ with source as ( Normally we would select from the table here, but we are using seeds to load our data in this project #} - select * from {{ ref('raw_customers') }} + select * from {{ source('postgres_db', 'raw_customers') }} + +), +force_seed_dep as ( + {#- + This CTE is used to ensure tests wait for seeds to run if source_node_rendering = none + #} + select * from {{ ref('raw_customers') }} ), renamed as ( diff --git a/dev/dags/dbt/jaffle_shop/models/staging/stg_orders.sql b/dev/dags/dbt/jaffle_shop/models/staging/stg_orders.sql index a654dcb94..b6c13a33f 100644 --- a/dev/dags/dbt/jaffle_shop/models/staging/stg_orders.sql +++ b/dev/dags/dbt/jaffle_shop/models/staging/stg_orders.sql @@ -4,10 +4,17 @@ with source as ( Normally we would select from the table here, but we are using seeds to load our data in this project #} - select * from {{ ref('raw_orders') }} + select * from {{ source('postgres_db', 'raw_orders') }} ), +force_seed_dep as ( + {#- + This CTE is used to ensure tests wait for seeds to run if source_node_rendering = none + #} + select * from {{ ref('raw_customers') }} +), + renamed as ( select diff --git a/dev/dags/dbt/jaffle_shop/models/staging/stg_payments.sql b/dev/dags/dbt/jaffle_shop/models/staging/stg_payments.sql index f718596ad..3ff1fbece 100644 --- a/dev/dags/dbt/jaffle_shop/models/staging/stg_payments.sql +++ b/dev/dags/dbt/jaffle_shop/models/staging/stg_payments.sql @@ -1,11 +1,14 @@ with source as ( + select * from {{ source('postgres_db', 'raw_payments') }} + +), + +force_seed_dep as ( {#- - Normally we would select from the table here, but we are using seeds to load - our data in this project + This CTE is used to ensure tests wait for seeds to run if source_node_rendering = none #} - select * from {{ ref('raw_payments') }} - + select * from {{ ref('raw_customers') }} ), renamed as ( diff --git a/docs/configuration/cosmos-conf.rst b/docs/configuration/cosmos-conf.rst index 037a43d3b..5b8ee210b 100644 --- a/docs/configuration/cosmos-conf.rst +++ b/docs/configuration/cosmos-conf.rst @@ -94,7 +94,6 @@ This page lists all available Airflow configurations that affect ``astronomer-co - Default: ``profile`` - Environment Variable: ``AIRFLOW__COSMOS__PROFILE_CACHE_DIR_NAME`` - [openlineage] ~~~~~~~~~~~~~ diff 
--git a/docs/configuration/index.rst b/docs/configuration/index.rst
index 90f195938..f6e60f61b 100644
--- a/docs/configuration/index.rst
+++ b/docs/configuration/index.rst
@@ -22,6 +22,7 @@ Cosmos offers a number of configuration options to customize its behavior. For m
    Testing Behavior
    Selecting & Excluding
    Partial Parsing
+   Source Nodes Rendering
    Operator Args
    Compiled SQL
    Logging
diff --git a/docs/configuration/source-nodes-rendering.rst b/docs/configuration/source-nodes-rendering.rst
new file mode 100644
index 000000000..ae1417361
--- /dev/null
+++ b/docs/configuration/source-nodes-rendering.rst
@@ -0,0 +1,36 @@
+.. _source-nodes-rendering:
+
+Source Nodes Rendering
+======================
+
+.. note::
+    This feature is only available for dbt-core >= 1.5 and cosmos >= 1.6.0.
+
+By default, Cosmos does not render dbt sources automatically. Instead, you need to configure the rendering of sources explicitly.
+You can control this behavior using the ``source_rendering_behavior`` field in the ``RenderConfig`` object. This is how it works:
+
+- **all**:
+    When set to ``all``, Cosmos renders all sources in the dbt project. It uses three different node types for this:
+    - ``EmptyOperator``: For sources that do not have tests or freshness checks.
+    - ``DbtSourceOperator``: For sources that have freshness checks.
+    - ``DbtTestOperator``: For sources that have tests.
+
+    This approach aims to create a comprehensive DAG that aligns with dbt documentation, allowing for the rendering of both sources and models for a more detailed visual representation.
+    It also ensures that model dependencies do not run if their sources are not fresh, thus preventing runs against stale or incomplete data.
+
+- **none** (default): When set to ``none``, Cosmos does not automatically render any sources. Note that if node converters are being used for sources, they will still function as intended.
+
+- **with_tests_or_freshness**: When set to ``with_tests_or_freshness``, Cosmos only renders sources that have either tests or freshness checks.
+
+Example:
+
+..
code-block:: python + + from cosmos import DbtTaskGroup, RenderConfig + from cosmos.constants import SourceRenderingBehavior + + jaffle_shop = DbtTaskGroup( + render_config=RenderConfig( + source_rendering_behavior=SourceRenderingBehavior.WITH_TESTS_OR_FRESHNESS, + ) + ) diff --git a/scripts/test/integration-dbt-1-5-4.sh b/scripts/test/integration-dbt-1-5-4.sh index 087533082..284f60517 100644 --- a/scripts/test/integration-dbt-1-5-4.sh +++ b/scripts/test/integration-dbt-1-5-4.sh @@ -1,5 +1,6 @@ pip uninstall dbt-adapters dbt-common dbt-core dbt-extractor dbt-postgres dbt-semantic-interfaces -y pip install dbt-postgres==1.5.4 dbt-databricks==1.5.4 +export SOURCE_RENDERING_BEHAVIOR=all rm -rf airflow.*; \ airflow db init; \ pytest -vv \ diff --git a/scripts/test/integration-expensive.sh b/scripts/test/integration-expensive.sh index 24bace86d..96c2388cf 100644 --- a/scripts/test/integration-expensive.sh +++ b/scripts/test/integration-expensive.sh @@ -1,3 +1,4 @@ +export SOURCE_RENDERING_BEHAVIOR=all pytest -vv \ --cov=cosmos \ --cov-report=term-missing \ diff --git a/scripts/test/integration.sh b/scripts/test/integration.sh index 1d8264768..a39ef63b1 100644 --- a/scripts/test/integration.sh +++ b/scripts/test/integration.sh @@ -3,6 +3,8 @@ set -x set -e +export SOURCE_RENDERING_BEHAVIOR=all + pip freeze | grep airflow echo $AIRFLOW_HOME ls $AIRFLOW_HOME diff --git a/tests/airflow/test_graph.py b/tests/airflow/test_graph.py index f9a62106d..ddffa226c 100644 --- a/tests/airflow/test_graph.py +++ b/tests/airflow/test_graph.py @@ -1,3 +1,4 @@ +import os from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -21,6 +22,7 @@ from cosmos.constants import ( DbtResourceType, ExecutionMode, + SourceRenderingBehavior, TestBehavior, TestIndirectSelection, ) @@ -29,6 +31,7 @@ from cosmos.profiles import PostgresUserPasswordProfileMapping SAMPLE_PROJ_PATH = Path("/home/user/path/dbt-proj/") +SOURCE_RENDERING_BEHAVIOR = SourceRenderingBehavior(os.getenv("SOURCE_RENDERING_BEHAVIOR", "none")) parent_seed = DbtNode( unique_id=f"{DbtResourceType.SEED.value}.{SAMPLE_PROJ_PATH.stem}.seed_parent", @@ -100,6 +103,7 @@ def test_build_airflow_graph_with_after_each(): task_args=task_args, render_config=RenderConfig( test_behavior=TestBehavior.AFTER_EACH, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ), dbt_project_name="astro_shop", ) @@ -169,6 +173,7 @@ def test_create_task_group_for_after_each_supported_nodes(node_type: DbtResource }, test_behavior=TestBehavior.AFTER_EACH, on_warning_callback=None, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) assert isinstance(output, TaskGroup) assert list(output.children.keys()) == [f"dbt_node.{task_suffix}", "dbt_node.test"] @@ -196,6 +201,7 @@ def test_build_airflow_graph_with_after_all(): render_config = RenderConfig( select=["tag:some"], test_behavior=TestBehavior.AFTER_ALL, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) build_airflow_graph( nodes=sample_nodes, @@ -309,12 +315,12 @@ def test_create_task_metadata_unsupported(caplog): ( f"{DbtResourceType.SOURCE.value}.my_folder.my_source", DbtResourceType.SOURCE, - "my_source_run", - "cosmos.operators.local.DbtRunLocalOperator", - {"models": "my_source"}, + "my_source_source", + "cosmos.operators.local.DbtSourceLocalOperator", + {"select": "source:my_source"}, { "dbt_node_config": { - "unique_id": "model.my_folder.my_source", + "unique_id": "source.my_folder.my_source", "resource_type": "source", "depends_on": [], "file_path": ".", @@ -364,6 +370,7 @@ def 
test_create_task_metadata_model( file_path=Path(""), tags=[], config={}, + has_freshness=True, ) metadata = create_task_metadata(child_node, execution_mode=ExecutionMode.LOCAL, args={}) @@ -402,6 +409,64 @@ def test_create_task_metadata_model_use_task_group(caplog): assert metadata.id == "run" +@pytest.mark.parametrize( + "unique_id, resource_type, has_freshness, source_rendering_behavior, expected_id, expected_operator_class", + [ + ( + f"{DbtResourceType.SOURCE.value}.my_folder.my_source", + DbtResourceType.SOURCE, + True, + SOURCE_RENDERING_BEHAVIOR, + "my_source_source", + "cosmos.operators.local.DbtSourceLocalOperator", + ), + ( + f"{DbtResourceType.SOURCE.value}.my_folder.my_source", + DbtResourceType.SOURCE, + False, + SOURCE_RENDERING_BEHAVIOR, + "my_source_source", + "airflow.operators.empty.EmptyOperator", + ), + ( + f"{DbtResourceType.SOURCE.value}.my_folder.my_source", + DbtResourceType.SOURCE, + True, + SourceRenderingBehavior.NONE, + None, + None, + ), + ( + f"{DbtResourceType.SOURCE.value}.my_folder.my_source", + DbtResourceType.SOURCE, + False, + SourceRenderingBehavior.NONE, + None, + None, + ), + ], +) +def test_create_task_metadata_source_with_rendering_options( + unique_id, resource_type, has_freshness, source_rendering_behavior, expected_id, expected_operator_class, caplog +): + child_node = DbtNode( + unique_id=unique_id, + resource_type=resource_type, + depends_on=[], + file_path=Path(""), + tags=[], + config={}, + has_freshness=has_freshness, + ) + + metadata = create_task_metadata( + child_node, execution_mode=ExecutionMode.LOCAL, source_rendering_behavior=source_rendering_behavior, args={} + ) + if metadata: + assert metadata.id == expected_id + assert metadata.operator_class == expected_operator_class + + @pytest.mark.parametrize("use_task_group", (None, True, False)) def test_create_task_metadata_seed(caplog, use_task_group): sample_node = DbtNode( @@ -524,7 +589,7 @@ def test_airflow_kwargs_generation(): "group_id": "fake_group_id", "project_dir": SAMPLE_PROJ_PATH, "conn_id": "fake_conn", - "render_config": RenderConfig(select=["fake-render"]), + "render_config": RenderConfig(select=["fake-render"], source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR), "default_args": {"retries": 2}, "profile_config": ProfileConfig( profile_name="default", diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py index bcce46b87..4cca8015f 100644 --- a/tests/dbt/test_graph.py +++ b/tests/dbt/test_graph.py @@ -14,7 +14,7 @@ from cosmos import settings from cosmos.config import CosmosConfigException, ExecutionConfig, ProfileConfig, ProjectConfig, RenderConfig -from cosmos.constants import DBT_TARGET_DIR_NAME, DbtResourceType, ExecutionMode +from cosmos.constants import DBT_TARGET_DIR_NAME, DbtResourceType, ExecutionMode, SourceRenderingBehavior from cosmos.dbt.graph import ( CosmosLoadDbtException, DbtGraph, @@ -32,6 +32,7 @@ SAMPLE_MANIFEST_MODEL_VERSION = Path(__file__).parent.parent / "sample/manifest_model_version.json" SAMPLE_MANIFEST_SOURCE = Path(__file__).parent.parent / "sample/manifest_source.json" SAMPLE_DBT_LS_OUTPUT = Path(__file__).parent.parent / "sample/sample_dbt_ls.txt" +SOURCE_RENDERING_BEHAVIOR = SourceRenderingBehavior(os.getenv("SOURCE_RENDERING_BEHAVIOR", "none")) @pytest.fixture @@ -131,7 +132,10 @@ def test_load_via_manifest_with_exclude(project_name, manifest_filepath, model_f target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(exclude=["config.materialized:table"]) + 
render_config = RenderConfig( + exclude=["config.materialized:table"], + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=project_config.dbt_project_path) dbt_graph = DbtGraph( project=project_config, @@ -170,7 +174,7 @@ def test_load_via_manifest_with_select(project_name, manifest_filepath, model_fi target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(select=["+customers"]) + render_config = RenderConfig(select=["+customers"], source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR) execution_config = ExecutionConfig(dbt_project_path=project_config.dbt_project_path) dbt_graph = DbtGraph( project=project_config, @@ -251,7 +255,10 @@ def test_load_automatic_dbt_ls_file_is_available(mock_load_via_dbt_ls_file): target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(dbt_ls_path=SAMPLE_DBT_LS_OUTPUT) + render_config = RenderConfig( + dbt_ls_path=SAMPLE_DBT_LS_OUTPUT, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) dbt_graph = DbtGraph(project=project_config, profile_config=profile_config, render_config=render_config) dbt_graph.load(method=LoadMode.DBT_LS_FILE, execution_mode=ExecutionMode.LOCAL) assert mock_load_via_dbt_ls_file.called @@ -264,7 +271,7 @@ def test_load_dbt_ls_file_without_file(): target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(dbt_ls_path=None) + render_config = RenderConfig(dbt_ls_path=None, source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR) dbt_graph = DbtGraph(project=project_config, profile_config=profile_config, render_config=render_config) with pytest.raises(CosmosLoadDbtException) as err_info: dbt_graph.load(execution_mode=ExecutionMode.LOCAL, method=LoadMode.DBT_LS_FILE) @@ -278,7 +285,11 @@ def test_load_dbt_ls_file_without_project_path(): target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(dbt_ls_path=SAMPLE_DBT_LS_OUTPUT, dbt_project_path=None) + render_config = RenderConfig( + dbt_ls_path=SAMPLE_DBT_LS_OUTPUT, + dbt_project_path=None, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) dbt_graph = DbtGraph( project=project_config, profile_config=profile_config, @@ -419,7 +430,10 @@ def test_load_via_dbt_ls_does_not_create_target_logs_in_original_folder( assert not (tmp_dbt_project_dir / "logs").exists() project_config = ProjectConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -440,7 +454,10 @@ def test_load_via_dbt_ls_does_not_create_target_logs_in_original_folder( def test_load_via_dbt_ls_with_exclude(postgres_profile_config): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) render_config = RenderConfig( - dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, select=["*customers*"], exclude=["*orders*"] + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + select=["*customers*"], + exclude=["*orders*"], + 
source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) dbt_graph = DbtGraph( @@ -453,13 +470,15 @@ def test_load_via_dbt_ls_with_exclude(postgres_profile_config): dbt_graph.load_via_dbt_ls() assert dbt_graph.nodes == dbt_graph.filtered_nodes # This test is dependent upon dbt >= 1.5.4 - assert len(dbt_graph.nodes) == 7 + assert len(dbt_graph.nodes) == 9 expected_keys = [ "model.jaffle_shop.customers", "model.jaffle_shop.stg_customers", "seed.jaffle_shop.raw_customers", "test.jaffle_shop.not_null_customers_customer_id.5c9bf9911d", "test.jaffle_shop.not_null_stg_customers_customer_id.e2cfb1f9aa", + "test.jaffle_shop.source_not_null_postgres_db_raw_customers_id.de3e9fff76", + "test.jaffle_shop.source_unique_postgres_db_raw_customers_id.6e5ad1d707", "test.jaffle_shop.unique_customers_customer_id.c5af1ff4b1", "test.jaffle_shop.unique_stg_customers_customer_id.c7614daada", ] @@ -481,7 +500,10 @@ def test_load_via_dbt_ls_with_exclude(postgres_profile_config): @pytest.mark.parametrize("project_name", ("jaffle_shop", "jaffle_shop_python")) def test_load_via_dbt_ls_without_exclude(project_name, postgres_profile_config): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -492,13 +514,16 @@ def test_load_via_dbt_ls_without_exclude(project_name, postgres_profile_config): dbt_graph.load_via_dbt_ls() assert dbt_graph.nodes == dbt_graph.filtered_nodes - assert len(dbt_graph.nodes) == 28 + assert len(dbt_graph.nodes) == 37 def test_load_via_custom_without_project_path(): project_config = ProjectConfig(manifest_path=SAMPLE_MANIFEST, project_name="test") execution_config = ExecutionConfig() - render_config = RenderConfig() + render_config = RenderConfig( + dbt_executable_path="/inexistent/dbt", + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) dbt_graph = DbtGraph( dbt_cmd="/inexistent/dbt", project=project_config, @@ -516,7 +541,11 @@ def test_load_via_custom_without_project_path(): def test_load_via_dbt_ls_without_profile(mock_validate_dbt_command): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_executable_path="existing-dbt-cmd", + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) dbt_graph = DbtGraph( dbt_cmd="/inexistent/dbt", project=project_config, @@ -535,7 +564,9 @@ def test_load_via_dbt_ls_with_invalid_dbt_path(mock_which): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) render_config = RenderConfig( - dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, dbt_executable_path="/inexistent/dbt" + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + dbt_executable_path="/inexistent/dbt", + 
source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) with patch("pathlib.Path.exists", return_value=True): dbt_graph = DbtGraph( @@ -569,6 +600,7 @@ def test_load_via_dbt_ls_with_sources(load_method): dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name, dbt_deps=False, env_vars={"DBT_SQLITE_PATH": str(DBT_PROJECTS_ROOT_DIR / "data")}, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ), execution_config=ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name), profile_config=ProfileConfig( @@ -586,7 +618,11 @@ def test_load_via_dbt_ls_with_sources(load_method): @pytest.mark.integration def test_load_via_dbt_ls_without_dbt_deps(postgres_profile_config): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, dbt_deps=False) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + dbt_deps=False, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -629,7 +665,11 @@ def test_load_via_dbt_ls_without_dbt_deps_and_preinstalled_dbt_packages( stdout, stderr = process.communicate() project_config = ProjectConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, dbt_deps=False) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + dbt_deps=False, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -657,7 +697,10 @@ def test_load_via_dbt_ls_caching_partial_parsing( project_config = ProjectConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) render_config = RenderConfig( - dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, dbt_deps=True, enable_mock_profile=False + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + dbt_deps=True, + enable_mock_profile=False, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = DbtGraph( @@ -697,7 +740,10 @@ def test_load_via_dbt_ls_uses_partial_parse_when_cache_is_disabled( caplog.set_level(logging.DEBUG) project_config = ProjectConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) render_config = RenderConfig( - dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, dbt_deps=True, enable_mock_profile=False + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + dbt_deps=True, + enable_mock_profile=False, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = DbtGraph( @@ -738,7 +784,10 @@ def test_load_via_dbt_ls_with_zero_returncode_and_non_empty_stderr( mock_popen().returncode = 0 project_config = ProjectConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = 
DbtGraph( project=project_config, @@ -757,7 +806,10 @@ def test_load_via_dbt_ls_with_non_zero_returncode(mock_popen, postgres_profile_c mock_popen().returncode = 1 project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -775,7 +827,10 @@ def test_load_via_dbt_ls_with_non_zero_returncode(mock_popen, postgres_profile_c def test_load_via_dbt_ls_with_runtime_error_in_stdout(mock_popen_communicate, postgres_profile_config): # It may seem strange, but at least until dbt 1.6.0, there are circumstances when it outputs errors to stdout project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) dbt_graph = DbtGraph( project=project_config, @@ -794,7 +849,10 @@ def test_load_via_dbt_ls_with_runtime_error_in_stdout(mock_popen_communicate, po def test_load_via_load_via_custom_parser(project_name): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name) execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = ProfileConfig( profile_name="test", target_name="test", @@ -816,7 +874,11 @@ def test_load_via_load_via_custom_parser(project_name): def test_load_via_load_via_custom_parser_select_rendering_config(): project_config = ProjectConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / "jaffle_shop") execution_config = ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME) - render_config = RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, select=["customers"]) + render_config = RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + select=["customers"], + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = ProfileConfig( profile_name="test", target_name="test", @@ -884,7 +946,10 @@ def test_update_node_dependency_test_not_exist(): target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(exclude=["config.materialized:test"]) + render_config = RenderConfig( + exclude=["config.materialized:test"], + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) execution_config = ExecutionConfig(dbt_project_path=project_config.dbt_project_path) dbt_graph = DbtGraph( project=project_config, @@ -907,7 +972,7 @@ def test_tag_selected_node_test_exist(): target_name="test", profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) - render_config = RenderConfig(select=["tag:test_tag"]) + render_config = RenderConfig(select=["tag:test_tag"], 
source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR) execution_config = ExecutionConfig(dbt_project_path=project_config.dbt_project_path) dbt_graph = DbtGraph( project=project_config, @@ -932,7 +997,10 @@ def test_load_dbt_ls_and_manifest_with_model_version(load_method, postgres_profi dbt_project_path=DBT_PROJECTS_ROOT_DIR / "model_version", manifest_path=SAMPLE_MANIFEST_MODEL_VERSION if load_method == "load_from_dbt_manifest" else None, ), - render_config=RenderConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / "model_version"), + render_config=RenderConfig( + dbt_project_path=DBT_PROJECTS_ROOT_DIR / "model_version", + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ), execution_config=ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / "model_version"), profile_config=postgres_profile_config, ) @@ -980,7 +1048,9 @@ def test_load_via_dbt_ls_file(): profiles_yml_filepath=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME / "profiles.yml", ) render_config = RenderConfig( - dbt_ls_path=SAMPLE_DBT_LS_OUTPUT, dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME + dbt_ls_path=SAMPLE_DBT_LS_OUTPUT, + dbt_project_path=DBT_PROJECTS_ROOT_DIR / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) dbt_graph = DbtGraph( project=project_config, @@ -1078,7 +1148,10 @@ def test_load_via_dbt_ls_project_config_env_vars( mock_popen().returncode = 0 env_vars = {"MY_ENV_VAR": "my_value"} project_config = ProjectConfig(env_vars=env_vars) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = ProfileConfig( profile_name="test", target_name="test", @@ -1115,7 +1188,10 @@ def test_profile_created_correctly_with_profile_mapping( mock_popen().communicate.return_value = ("", "") mock_popen().returncode = 0 project_config = ProjectConfig(env_vars={}) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = postgres_profile_config execution_config = ExecutionConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) dbt_graph = DbtGraph( @@ -1140,7 +1216,10 @@ def test_load_via_dbt_ls_project_config_dbt_vars( mock_popen().returncode = 0 dbt_vars = {"my_var1": "my_value1", "my_var2": "my_value2"} project_config = ProjectConfig(dbt_vars=dbt_vars) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME) + render_config = RenderConfig( + dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = ProfileConfig( profile_name="test", target_name="test", @@ -1175,6 +1254,7 @@ def test_load_via_dbt_ls_render_config_selector_arg_is_used( dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, load_method=LoadMode.DBT_LS, selector=selector, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) profile_config = ProfileConfig( profile_name="test", @@ -1205,7 +1285,11 @@ def test_load_via_dbt_ls_render_config_no_partial_parse( mock_popen().communicate.return_value = ("", "") mock_popen().returncode = 0 project_config = ProjectConfig(partial_parse=False) - render_config = RenderConfig(dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, load_method=LoadMode.DBT_LS) + render_config = RenderConfig( + 
dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, + load_method=LoadMode.DBT_LS, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ) profile_config = ProfileConfig( profile_name="test", target_name="test", @@ -1231,7 +1315,11 @@ def test_load_method_with_unsupported_render_config_selector_arg(load_method): f"RenderConfig.selector is not yet supported when loading dbt projects using the {load_method} parser." ) dbt_graph = DbtGraph( - render_config=RenderConfig(load_method=load_method, selector="my_selector"), + render_config=RenderConfig( + load_method=load_method, + selector="my_selector", + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ), project=MagicMock(), ) with pytest.raises(CosmosLoadDbtException, match=expected_error_msg): @@ -1255,6 +1343,7 @@ def test_load_via_dbt_ls_with_project_config_vars(): render_config=RenderConfig( dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name, dbt_deps=False, + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ), execution_config=ExecutionConfig(dbt_project_path=DBT_PROJECTS_ROOT_DIR / project_name), profile_config=ProfileConfig( @@ -1291,6 +1380,7 @@ def test_load_via_dbt_ls_with_selector_arg(tmp_dbt_project_dir, postgres_profile render_config = RenderConfig( dbt_project_path=tmp_dbt_project_dir / DBT_PROJECT_NAME, selector="stage_customers", + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, ) dbt_graph = DbtGraph( @@ -1302,11 +1392,13 @@ def test_load_via_dbt_ls_with_selector_arg(tmp_dbt_project_dir, postgres_profile dbt_graph.load_via_dbt_ls() filtered_nodes = dbt_graph.filtered_nodes.keys() - assert len(filtered_nodes) == 4 + assert len(filtered_nodes) == 7 assert "model.jaffle_shop.stg_customers" in filtered_nodes assert "seed.jaffle_shop.raw_customers" in filtered_nodes - # Two tests should be filtered - assert sum(node.startswith("test.jaffle_shop") for node in filtered_nodes) == 2 + if SOURCE_RENDERING_BEHAVIOR == SourceRenderingBehavior.ALL: + assert "source.jaffle_shop.postgres_db.raw_customers" in filtered_nodes + # Four tests should be filtered + assert sum(node.startswith("test.jaffle_shop") for node in filtered_nodes) == 4 @pytest.mark.parametrize( @@ -1404,7 +1496,13 @@ def airflow_variable(): @pytest.mark.integration def test_dbt_ls_cache_key_args_uses_airflow_vars_to_purge_dbt_ls_cache(airflow_variable): key, value = airflow_variable - graph = DbtGraph(project=ProjectConfig(), render_config=RenderConfig(airflow_vars_to_purge_dbt_ls_cache=[key])) + graph = DbtGraph( + project=ProjectConfig(), + render_config=RenderConfig( + airflow_vars_to_purge_dbt_ls_cache=[key], + source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR, + ), + ) assert graph.dbt_ls_cache_key_args == [key, value] @@ -1422,9 +1520,9 @@ def test_save_dbt_ls_cache(mock_variable_set, mock_datetime, tmp_dbt_project_dir hash_dir, hash_args = version.split(",") assert hash_args == "d41d8cd98f00b204e9800998ecf8427e" if sys.platform == "darwin": - assert hash_dir == "65595448aded2c2b52878a801c1d9c59" + assert hash_dir == "a9879ec2ec503b0fe023d059caf50d41" else: - assert hash_dir == "4c826c84a94b0f1f5508c4e425170677" + assert hash_dir == "9001bedf4aa8a329f7b669c89f337c24" @pytest.mark.integration diff --git a/tests/operators/test_local.py b/tests/operators/test_local.py index ed46caf88..54d9d01da 100644 --- a/tests/operators/test_local.py +++ b/tests/operators/test_local.py @@ -1,3 +1,4 @@ +import json import logging import os import shutil @@ -36,6 +37,7 @@ DbtRunOperationLocalOperator, DbtSeedLocalOperator, DbtSnapshotLocalOperator, + 
DbtSourceLocalOperator, DbtTestLocalOperator, ) from cosmos.profiles import PostgresUserPasswordProfileMapping @@ -444,7 +446,7 @@ def test_run_operator_dataset_inlets_and_outlets(caplog): run_test_dag(dag) - assert run_operator.inlets == [] + assert run_operator.inlets == [Dataset(uri="postgres://0.0.0.0:5432/postgres.public.raw_customers", extra=None)] assert run_operator.outlets == [Dataset(uri="postgres://0.0.0.0:5432/postgres.public.stg_customers", extra=None)] assert test_operator.inlets == [Dataset(uri="postgres://0.0.0.0:5432/postgres.public.stg_customers", extra=None)] assert test_operator.outlets == [] @@ -770,15 +772,19 @@ def test_calculate_openlineage_events_completes_openlineage_errors(mock_processo [ ( DbtSeedLocalOperator, - ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "full_refresh"), + ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "freshness", "full_refresh"), ), ( DbtRunLocalOperator, - ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "full_refresh"), + ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "freshness", "full_refresh"), ), ( DbtBuildLocalOperator, - ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "full_refresh"), + ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "freshness", "full_refresh"), + ), + ( + DbtSourceLocalOperator, + ("env", "select", "exclude", "selector", "vars", "models", "compiled_sql", "freshness"), ), ], ) @@ -941,3 +947,73 @@ def test_handle_exception_subprocess(caplog): operator.handle_exception_subprocess(result) assert len(str(err_context.value)) < 100 # Ensure the error message is not too long assert len(caplog.text) > 1000 # Ensure the log message is not truncated + + +@pytest.fixture +def mock_context(): + return MagicMock() + + +@pytest.fixture +def mock_session(): + return MagicMock() + + +@patch("cosmos.operators.local.Path") +def test_store_freshness_json(mock_path_class, mock_context, mock_session): + instance = DbtSourceLocalOperator( + task_id="test", + profile_config=None, + project_dir="my/dir", + ) + + # Mock the behavior of Path.exists() and Path.read_text() + mock_sources_json_path = MagicMock() + mock_path_class.return_value = mock_sources_json_path + mock_sources_json_path.exists.return_value = True + mock_sources_json_path.read_text.return_value = '{"key": "value"}' + + # Expected formatted JSON content + expected_freshness = json.dumps({"key": "value"}, indent=4) + + # Call the method under test + instance.store_freshness_json(tmp_project_dir="/mock/dir", context=mock_context, session=mock_session) + + # Verify the freshness attribute is set correctly + assert instance.freshness == expected_freshness + + +@patch("cosmos.operators.local.Path") +def test_store_freshness_json_no_file(mock_path_class, mock_context, mock_session): + # Create an instance of the class that contains the method + instance = DbtSourceLocalOperator( + task_id="test", + profile_config=None, + project_dir="my/dir", + ) + + # Mock the behavior of Path.exists() and Path.read_text() + mock_sources_json_path = MagicMock() + mock_path_class.return_value = mock_sources_json_path + mock_sources_json_path.exists.return_value = False + + # Call the method under test + instance.store_freshness_json(tmp_project_dir="/mock/dir", context=mock_context, session=mock_session) + + # Verify the freshness attribute is set correctly + assert instance.freshness == "" + + +def 
test_store_freshness_not_store_compiled_sql(mock_context, mock_session):
+    instance = DbtSourceLocalOperator(
+        task_id="test",
+        profile_config=None,
+        project_dir="my/dir",
+        should_store_compiled_sql=False,
+    )
+
+    # Call the method under test
+    instance.store_freshness_json(tmp_project_dir="/mock/dir", context=mock_context, session=mock_session)
+
+    # Verify the freshness attribute is set correctly
+    assert instance.freshness == ""

From 5da43952fb1ccf4885bbdebc3d3834d835f9011f Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Wed, 14 Aug 2024 09:14:38 -0300
Subject: [PATCH 41/52] Blackify

---
 cosmos/dbt/graph.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cosmos/dbt/graph.py b/cosmos/dbt/graph.py
index 93e9430ec..cbe2741f1 100644
--- a/cosmos/dbt/graph.py
+++ b/cosmos/dbt/graph.py
@@ -35,7 +35,6 @@
     LoadMode,
     SourceRenderingBehavior,
 )
-from cosmos.dbt.executable import get_system_dbt
 from cosmos.dbt.parser.project import LegacyDbtProject
 from cosmos.dbt.project import create_symlinks, environ, get_partial_parse_path, has_non_empty_dependencies_file
 from cosmos.dbt.selector import select_nodes
@@ -216,7 +215,6 @@ def __init__(
         cache_identifier: str = "",
         dbt_vars: dict[str, str] | None = None,
         airflow_metadata: dict[str, str] | None = None,
-        dbt_cmd: str = get_system_dbt(),
         operator_args: dict[str, Any] | None = None,
     ):
         self.project = project
@@ -231,7 +229,6 @@ def __init__(
         self.dbt_ls_cache_key = ""
         self.dbt_vars = dbt_vars or {}
         self.operator_args = operator_args or {}
-        self.dbt_cmd = dbt_cmd

     @cached_property
     def env_vars(self) -> dict[str, str]:
@@ -400,7 +397,7 @@ def run_dbt_ls(
         """Runs dbt ls command and returns the parsed nodes."""
         if self.render_config.source_rendering_behavior != SourceRenderingBehavior.NONE:
             ls_command = [
-                self.dbt_cmd,
+                dbt_cmd,
                 "ls",
                 "--output",
                 "json",
@@ -415,7 +412,7 @@ def run_dbt_ls(
                 "freshness",
             ]
         else:
-            ls_command = [self.dbt_cmd, "ls", "--output", "json"]
+            ls_command = [dbt_cmd, "ls", "--output", "json"]

         ls_args = self.dbt_ls_args
         ls_command.extend(self.local_flags)

From 7d0c31a24871fb27e391fc508bb92c856dd19097 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Wed, 14 Aug 2024 09:20:28 -0300
Subject: [PATCH 42/52] Fix broken test

---
 tests/dbt/test_graph.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py
index 4cca8015f..8b526a8ab 100644
--- a/tests/dbt/test_graph.py
+++ b/tests/dbt/test_graph.py
@@ -525,7 +525,6 @@ def test_load_via_custom_without_project_path():
         source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR,
     )
     dbt_graph = DbtGraph(
-        dbt_cmd="/inexistent/dbt",
         project=project_config,
         execution_config=execution_config,
         render_config=render_config,
@@ -547,7 +546,6 @@ def test_load_via_dbt_ls_without_profile(mock_validate_dbt_command):
         source_rendering_behavior=SOURCE_RENDERING_BEHAVIOR,
     )
     dbt_graph = DbtGraph(
-        dbt_cmd="/inexistent/dbt",
         project=project_config,
         execution_config=execution_config,
         render_config=render_config,

From 31bbd192662e2ee881a4ac8f7087b6c1b5447889 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Wed, 14 Aug 2024 09:21:25 -0300
Subject: [PATCH 43/52] Fix broken test

---
 tests/dbt/test_graph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py
index 8b526a8ab..c450d8d85 100644
--- a/tests/dbt/test_graph.py
+++ b/tests/dbt/test_graph.py
@@ -1518,7 +1518,7 @@ def test_save_dbt_ls_cache(mock_variable_set, mock_datetime, tmp_dbt_project_dir
     hash_dir,
hash_args = version.split(",") assert hash_args == "d41d8cd98f00b204e9800998ecf8427e" if sys.platform == "darwin": - assert hash_dir == "a9879ec2ec503b0fe023d059caf50d41" + assert hash_dir == "6534a9963174a3adf58725f7f06a30a1" else: assert hash_dir == "9001bedf4aa8a329f7b669c89f337c24" From 8621e497c5ce5f6de74c05116fc17d48aa0935e2 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Thu, 15 Aug 2024 14:19:58 -0300 Subject: [PATCH 44/52] Refactor so integration DAG runs again --- cosmos/operators/virtualenv.py | 91 ++++++++++++---------------------- dev/dags/example_virtualenv.py | 7 ++- 2 files changed, 35 insertions(+), 63 deletions(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 4e25cb783..e816887bc 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -3,7 +3,6 @@ import os import shutil import time -from functools import cached_property from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Callable @@ -77,44 +76,46 @@ def __init__( self.is_virtualenv_dir_temporary = is_virtualenv_dir_temporary self.max_retries_lock = settings.virtualenv_max_retries_lock super().__init__(**kwargs) - - @cached_property - def venv_dbt_path( - self, - ) -> str: - """ - Path to the dbt binary within a Python virtualenv. - - The first time this property is called, it creates a new/temporary and installs the dependencies - based on the self.py_requirements, self.pip_install_options, and self.py_system_site_packages, or retrieves an existing virtualenv. - This value is cached for future calls. - """ - # We are reusing the virtualenv directory for all subprocess calls within this task/operator. - # For this reason, we are not using contexts at this point. - # The deletion of this directory is done explicitly at the end of the `execute` method. 
- py_interpreter = self._get_or_create_venv_py_interpreter() - dbt_binary = Path(py_interpreter).parent / "dbt" - cmd_output = self.subprocess_hook.run_command( - [ - py_interpreter, - "-c", - "from importlib.metadata import version; print(version('dbt-core'))", - ] - ) - dbt_version = cmd_output.output - self.log.info("Using dbt version %s available at %s", dbt_version, dbt_binary) - return str(dbt_binary) + if not self.py_requirements: + self.log.error("Cosmos virtualenv operators require the `py_requirements` parameter") def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> FullOutputSubprocessResult: - if self.py_requirements: - command[0] = self.venv_dbt_path + # No virtualenv_dir set, so create a temporary virtualenv + if self.virtualenv_dir is None or self.is_virtualenv_dir_temporary: + self.log.info("Creating temporary virtualenv") + with TemporaryDirectory(prefix="cosmos-venv") as tempdir: + self.virtualenv_dir = Path(tempdir) + py_bin = self.prepare_virtualenv() + dbt_bin = Path(py_bin).parent / "dbt" + command[0] = dbt_bin # type: ignore + subprocess_result: FullOutputSubprocessResult = self.subprocess_hook.run_command( + command=command, + env=env, + cwd=cwd, + output_encoding=self.output_encoding, + ) + return subprocess_result - subprocess_result: FullOutputSubprocessResult = self.subprocess_hook.run_command( + # Use a reusable virtualenv + self.log.info(f"Checking if the virtualenv lock {str(self._lock_file)} exists") + while not self._is_lock_available() and self.max_retries_lock: + logger.info("Waiting for virtualenv lock to be released") + time.sleep(1) + self.max_retries_lock -= 1 + + self.log.info(f"Acquiring the virtualenv lock") + self._acquire_venv_lock() + py_bin = self.prepare_virtualenv() + dbt_bin = Path(py_bin).parent / "dbt" + command[0] = dbt_bin # type: ignore + subprocess_result = self.subprocess_hook.run_command( command=command, env=env, cwd=cwd, output_encoding=self.output_encoding, ) + self.log.info("Releasing virtualenv lock") + self._release_venv_lock() return subprocess_result def clean_dir_if_temporary(self) -> None: @@ -145,34 +146,6 @@ def prepare_virtualenv(self) -> str: ) return py_bin - def _get_or_create_venv_py_interpreter(self) -> str: - """Helper method that parses virtual env configuration - and returns a DBT binary within the resulting virtualenv""" - - # No virtualenv_dir set, so create a temporary virtualenv - if self.virtualenv_dir is None or self.is_virtualenv_dir_temporary: - self.log.info("Creating temporary virtualenv") - with TemporaryDirectory(prefix="cosmos-venv") as tempdir: - self.virtualenv_dir = Path(tempdir) - py_bin = self.prepare_virtualenv() - return py_bin - - # Use a reusable virtualenv - self.log.info(f"Checking if the virtualenv lock {str(self._lock_file)} exists") - while not self._is_lock_available() and self.max_retries_lock: - logger.info("Waiting for virtualenv lock to be released") - time.sleep(1) - self.max_retries_lock -= 1 - - self.log.info(f"Acquiring the virtualenv lock") - self._acquire_venv_lock() - py_bin = self.prepare_virtualenv() - - self.log.info("Releasing virtualenv lock") - self._release_venv_lock() - - return py_bin - @property def _lock_file(self) -> Path: filepath = Path(f"{self.virtualenv_dir}/{LOCK_FILENAME}") diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py index 067f09b9d..a1cb1b932 100644 --- a/dev/dags/example_virtualenv.py +++ b/dev/dags/example_virtualenv.py @@ -6,7 +6,6 @@ from datetime import datetime from pathlib import Path -from 
airflow.configuration import get_airflow_home from airflow.decorators import dag from airflow.operators.empty import EmptyOperator @@ -55,7 +54,7 @@ def example_virtualenv() -> None: ), operator_args={ "py_system_site_packages": False, - "py_requirements": ["dbt-postgres==1.6.0b1"], + "py_requirements": ["dbt-postgres==1.7"], "install_deps": True, "emit_datasets": False, # Example of how to not set inlets and outlets }, @@ -74,11 +73,11 @@ def example_virtualenv() -> None: execution_mode=ExecutionMode.VIRTUALENV, # We can enable this flag if we want Airflow to create one virtualenv # and reuse that within the whole DAG. - virtualenv_dir=Path(f"{get_airflow_home()}/persistent-venv"), + virtualenv_dir=Path("/tmp/persistent-venv2"), ), operator_args={ "py_system_site_packages": False, - "py_requirements": ["dbt-postgres==1.8"], + "py_requirements": ["dbt-postgres"], "install_deps": True, }, ) From c48f1cae98a62757ea05e87a858bc0af4a20071d Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Thu, 15 Aug 2024 14:26:02 -0300 Subject: [PATCH 45/52] Refactor so integration DAG runs again --- dev/dags/example_virtualenv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py index a1cb1b932..1168aa4d0 100644 --- a/dev/dags/example_virtualenv.py +++ b/dev/dags/example_virtualenv.py @@ -54,7 +54,7 @@ def example_virtualenv() -> None: ), operator_args={ "py_system_site_packages": False, - "py_requirements": ["dbt-postgres==1.7"], + "py_requirements": ["dbt-postgres"], "install_deps": True, "emit_datasets": False, # Example of how to not set inlets and outlets }, From 0f7fcf462958279679d1c96f67eb1c6cbd520bb6 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Fri, 16 Aug 2024 07:08:49 -0300 Subject: [PATCH 46/52] Fix unittests --- cosmos/operators/virtualenv.py | 4 +- dev/dags/example_virtualenv.py | 6 +- tests/operators/test_virtualenv.py | 109 +++++++++++++++-------------- 3 files changed, 62 insertions(+), 57 deletions(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index e816887bc..dd351aed3 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -86,7 +86,7 @@ def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> F with TemporaryDirectory(prefix="cosmos-venv") as tempdir: self.virtualenv_dir = Path(tempdir) py_bin = self.prepare_virtualenv() - dbt_bin = Path(py_bin).parent / "dbt" + dbt_bin = str(Path(py_bin).parent / "dbt") command[0] = dbt_bin # type: ignore subprocess_result: FullOutputSubprocessResult = self.subprocess_hook.run_command( command=command, @@ -106,7 +106,7 @@ def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> F self.log.info(f"Acquiring the virtualenv lock") self._acquire_venv_lock() py_bin = self.prepare_virtualenv() - dbt_bin = Path(py_bin).parent / "dbt" + dbt_bin = str(Path(py_bin).parent / "dbt") command[0] = dbt_bin # type: ignore subprocess_result = self.subprocess_hook.run_command( command=command, diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py index 1168aa4d0..a4023c2cd 100644 --- a/dev/dags/example_virtualenv.py +++ b/dev/dags/example_virtualenv.py @@ -48,9 +48,9 @@ def example_virtualenv() -> None: profile_config=profile_config, execution_config=ExecutionConfig( execution_mode=ExecutionMode.VIRTUALENV, - # We can enable this flag if we want Airflow to create one virtualenv - # and reuse that within the whole DAG. 
- # virtualenv_dir=f"{get_airflow_home()}/persistent-venv", + # Without setting: + # virtualenv_dir="/some/path/persistent-venv", + # Cosmos creates a new virtualenv for each dbt task being executed ), operator_args={ "py_system_site_packages": False, diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py index 26f6659a2..cd5c708ea 100644 --- a/tests/operators/test_virtualenv.py +++ b/tests/operators/test_virtualenv.py @@ -39,7 +39,7 @@ def base_cmd(self) -> list[str]: @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.handle_exception_subprocess") @patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook") @patch("airflow.hooks.base.BaseHook.get_connection") -def test_run_command( +def test_run_command_without_virtualenv_dir( mock_get_connection, mock_subprocess_hook, mock_exception_handling, @@ -68,21 +68,62 @@ def test_run_command( emit_datasets=False, invocation_mode=InvocationMode.SUBPROCESS, ) - assert venv_operator.virtualenv_dir is None # Otherwise we are creating empty directories during DAG parsing time - # and not deleting them + assert venv_operator.virtualenv_dir == None venv_operator.run_command(cmd=["fake-dbt", "do-something"], env={}, context={"task_instance": MagicMock()}) run_command_args = mock_subprocess_hook.run_command.call_args_list - assert len(run_command_args) == 3 - python_cmd = run_command_args[0] - dbt_deps = run_command_args[1].kwargs - dbt_cmd = run_command_args[2].kwargs - assert python_cmd[0][0][0].endswith("/bin/python") - assert python_cmd[0][-1][-1] == "from importlib.metadata import version; print(version('dbt-core'))" - assert dbt_deps["command"][1] == "deps" - assert dbt_deps["command"][0].endswith("/bin/dbt") + assert len(run_command_args) == 2 + dbt_deps = run_command_args[0].kwargs + dbt_cmd = run_command_args[1].kwargs assert dbt_deps["command"][0] == dbt_cmd["command"][0] + assert dbt_deps["command"][1] == "deps" assert dbt_cmd["command"][1] == "do-something" - assert mock_execute.call_count == 2 + assert mock_execute.call_count == 4 + + +@patch("airflow.utils.python_virtualenv.execute_in_subprocess") +@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes") +@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql") +@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.handle_exception_subprocess") +@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook") +@patch("airflow.hooks.base.BaseHook.get_connection") +def test_run_command_with_virtualenv_dir( + mock_get_connection, + mock_subprocess_hook, + mock_exception_handling, + mock_store_compiled_sql, + mock_calculate_openlineage_events_completes, + mock_execute, +): + mock_get_connection.return_value = Connection( + conn_id="fake_conn", + conn_type="postgres", + host="fake_host", + port=5432, + login="fake_login", + password="fake_password", + schema="fake_schema", + ) + venv_operator = ConcreteDbtVirtualenvBaseOperator( + dag=DAG("sample_dag", start_date=datetime(2024, 4, 16)), + profile_config=profile_config, + task_id="fake_task", + install_deps=True, + project_dir="./dev/dags/dbt/jaffle_shop", + py_system_site_packages=False, + pip_install_options=["--test-flag"], + py_requirements=["dbt-postgres==1.6.0b1"], + emit_datasets=False, + invocation_mode=InvocationMode.SUBPROCESS, + virtualenv_dir=Path("mock-venv"), + ) + venv_operator.run_command(cmd=["fake-dbt", "do-something"], env={}, context={"task_instance": MagicMock()}) + assert 
str(venv_operator.virtualenv_dir) == "mock-venv" + run_command_args = mock_subprocess_hook.run_command.call_args_list + assert len(run_command_args) == 2 + dbt_deps = run_command_args[0].kwargs + dbt_cmd = run_command_args[1].kwargs + assert dbt_deps["command"][0] == "mock-venv/bin/dbt" + assert dbt_cmd["command"][0] == "mock-venv/bin/dbt" def test_virtualenv_operator_append_env_is_true_by_default(): @@ -142,12 +183,13 @@ def test_on_kill(mock_clean_dir_if_temporary): assert mock_clean_dir_if_temporary.called +@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.subprocess_hook") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._release_venv_lock") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.prepare_virtualenv") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._acquire_venv_lock") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available", side_effect=[False, False, True]) -def test__get_or_create_venv_py_interpreter_waits_for_lock( - mock_is_lock_available, mock_acquire, mock_prepare, mock_release, tmpdir, caplog +def test_run_subprocess_waits_for_lock( + mock_is_lock_available, mock_acquire, mock_prepare, mock_release, mock_subprocess_hook, tmpdir, caplog ): venv_operator = ConcreteDbtVirtualenvBaseOperator( profile_config=profile_config, @@ -156,7 +198,7 @@ def test__get_or_create_venv_py_interpreter_waits_for_lock( is_virtualenv_dir_temporary=False, virtualenv_dir=tmpdir, ) - venv_operator._get_or_create_venv_py_interpreter() + venv_operator.run_subprocess(["dbt", "run"], {}, "./dev/dags/dbt/jaffle_shop") assert caplog.text.count("Waiting for virtualenv lock to be released") == 2 @@ -279,40 +321,3 @@ def test__release_venv_lock_current_process(tmpdir): ) assert venv_operator._release_venv_lock() is None assert not lockfile.exists() - - -@patch("airflow.utils.python_virtualenv.execute_in_subprocess") -@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.calculate_openlineage_events_completes") -@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.store_compiled_sql") -@patch("cosmos.operators.virtualenv.DbtLocalBaseOperator.subprocess_hook") -@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available") -@patch("airflow.hooks.base.BaseHook.get_connection") -def test_supply_virtualenv_dir_flag( - mock_get_connection, - mock_lock_available, - mock_subprocess_hook, - mock_store_compiled_sql, - mock_calculate_openlineage_events_completes, - mock_execute, -): - mock_lock_available.return_value = True - mock_get_connection.return_value = Connection( - conn_id="fake_conn", - conn_type="postgres", - host="fake_host", - port=5432, - login="fake_login", - password="fake_password", - schema="fake_schema", - ) - venv_operator = ConcreteDbtVirtualenvBaseOperator( - profile_config=profile_config, - task_id="fake_task", - install_deps=True, - project_dir="./dev/dags/dbt/jaffle_shop", - py_system_site_packages=False, - py_requirements=["dbt-postgres==1.6.0b1"], - emit_datasets=False, - virtualenv_dir=Path("mock-venv"), - ) - assert venv_operator.venv_dbt_path == "mock-venv/bin/dbt" From a712c274248b75372c6f1958ddf0db5e20c203d1 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Fri, 16 Aug 2024 07:14:10 -0300 Subject: [PATCH 47/52] Add docs --- docs/configuration/cosmos-conf.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/configuration/cosmos-conf.rst b/docs/configuration/cosmos-conf.rst index 5b8ee210b..a15f3ce70 100644 --- a/docs/configuration/cosmos-conf.rst +++ 
b/docs/configuration/cosmos-conf.rst
@@ -94,6 +94,14 @@ This page lists all available Airflow configurations that affect ``astronomer-co
 - Default: ``profile``
 - Environment Variable: ``AIRFLOW__COSMOS__PROFILE_CACHE_DIR_NAME``
 
+`virtualenv_max_retries_lock`_:
+    When using ``ExecutionMode.VIRTUALENV`` and persisted virtualenv directories (`virtualenv_dir` argument),
+    users can define the maximum number of seconds Cosmos waits for the lock to be released.
+
+    - Default: 120
+    - Environment Variable: ``AIRFLOW__COSMOS__VIRTUALENV_MAX_RETRIES_LOCK``
+
+
 [openlineage]
 ~~~~~~~~~~~~~
 

From 43cb07d19c9b7882fd7642152e7e05b538ccffcd Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Fri, 16 Aug 2024 07:17:15 -0300
Subject: [PATCH 48/52] Address PR feedback

---
 cosmos/operators/virtualenv.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index dd351aed3..7ddf93ea8 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -56,6 +56,8 @@ class DbtVirtualenvBaseOperator(DbtLocalBaseOperator):
     :param py_system_site_packages: Whether or not all the Python packages from the Airflow instance will be accessible
         within the virtual environment (if py_requirements argument is specified).
         Avoid using unless the dbt job requires it.
+    :param virtualenv_dir: Directory path where Cosmos will create/update the Python virtualenv. If defined, the virtualenv is persisted on the Airflow worker node.
+    :param is_virtualenv_dir_temporary: Tells Cosmos whether the virtualenv should be discarded after the task run (temporary) or persisted.
     """

     template_fields = DbtLocalBaseOperator.template_fields + ("virtualenv_dir", "is_virtualenv_dir_temporary")  # type: ignore[operator]

From b465b2a21e7ee21a6017e379a16a607ea2259859 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Fri, 16 Aug 2024 07:20:10 -0300
Subject: [PATCH 49/52] Address PR feedback

---
 cosmos/operators/virtualenv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py
index 7ddf93ea8..8d5e3115a 100644
--- a/cosmos/operators/virtualenv.py
+++ b/cosmos/operators/virtualenv.py
@@ -87,7 +87,7 @@ def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> F
             self.log.info("Creating temporary virtualenv")
             with TemporaryDirectory(prefix="cosmos-venv") as tempdir:
                 self.virtualenv_dir = Path(tempdir)
-                py_bin = self.prepare_virtualenv()
+                py_bin = self._prepare_virtualenv()
                 dbt_bin = str(Path(py_bin).parent / "dbt")
                 command[0] = dbt_bin  # type: ignore
                 subprocess_result: FullOutputSubprocessResult = self.subprocess_hook.run_command(
@@ -107,7 +107,7 @@ def run_subprocess(self, command: list[str], env: dict[str, str], cwd: str) -> F

         self.log.info(f"Acquiring the virtualenv lock")
         self._acquire_venv_lock()
-        py_bin = self.prepare_virtualenv()
+        py_bin = self._prepare_virtualenv()
         dbt_bin = str(Path(py_bin).parent / "dbt")
         command[0] = dbt_bin  # type: ignore
         subprocess_result = self.subprocess_hook.run_command(
@@ -137,7 +137,7 @@ def execute(self, context: Context) -> None:
     def on_kill(self) -> None:
         self.clean_dir_if_temporary()

-    def prepare_virtualenv(self) -> str:
+    def _prepare_virtualenv(self) -> str:
         self.log.info(f"Creating or updating the virtualenv at `{self.virtualenv_dir}")
         py_bin = prepare_virtualenv(
             venv_directory=str(self.virtualenv_dir),

From bf3598896454c94b8bb5a520eb93d8f6d4ae3358 Mon Sep 17 00:00:00 2001
From: Tatiana Al-Chueyr
Date: Fri, 16 Aug 2024 07:27:18 -0300
Subject: [PATCH 50/52] Fix unittest mock

--- tests/operators/test_virtualenv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/operators/test_virtualenv.py b/tests/operators/test_virtualenv.py index cd5c708ea..0a7128626 100644 --- a/tests/operators/test_virtualenv.py +++ b/tests/operators/test_virtualenv.py @@ -185,7 +185,7 @@ def test_on_kill(mock_clean_dir_if_temporary): @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.subprocess_hook") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._release_venv_lock") -@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator.prepare_virtualenv") +@patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._prepare_virtualenv") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._acquire_venv_lock") @patch("cosmos.operators.virtualenv.DbtVirtualenvBaseOperator._is_lock_available", side_effect=[False, False, True]) def test_run_subprocess_waits_for_lock( From b6813aff07c8279f0fe7d844b2490a1e7205a053 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Fri, 16 Aug 2024 07:41:50 -0300 Subject: [PATCH 51/52] Improve comments --- dev/dags/example_virtualenv.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dev/dags/example_virtualenv.py b/dev/dags/example_virtualenv.py index a4023c2cd..24f4a8250 100644 --- a/dev/dags/example_virtualenv.py +++ b/dev/dags/example_virtualenv.py @@ -48,9 +48,8 @@ def example_virtualenv() -> None: profile_config=profile_config, execution_config=ExecutionConfig( execution_mode=ExecutionMode.VIRTUALENV, - # Without setting: - # virtualenv_dir="/some/path/persistent-venv", - # Cosmos creates a new virtualenv for each dbt task being executed + # Without setting virtualenv_dir="/some/path/persistent-venv", + # Cosmos creates a new Python virtualenv for each dbt task being executed ), operator_args={ "py_system_site_packages": False, @@ -71,8 +70,8 @@ def example_virtualenv() -> None: profile_config=profile_config, execution_config=ExecutionConfig( execution_mode=ExecutionMode.VIRTUALENV, - # We can enable this flag if we want Airflow to create one virtualenv - # and reuse that within the whole DAG. 
+ # We can set the argument `virtualenv_dir` if we want Cosmos to create one Python virtualenv + # and reuse that to run all the dbt tasks within the same worker node virtualenv_dir=Path("/tmp/persistent-venv2"), ), operator_args={ From 0ff5c7acd38d858f25ee665945d0f28cf8bcb071 Mon Sep 17 00:00:00 2001 From: Tatiana Al-Chueyr Date: Fri, 16 Aug 2024 07:44:36 -0300 Subject: [PATCH 52/52] Fix issue after rebase --- cosmos/operators/virtualenv.py | 3 +-- tests/dbt/test_graph.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cosmos/operators/virtualenv.py b/cosmos/operators/virtualenv.py index 146a6ea5e..60918a0a6 100644 --- a/cosmos/operators/virtualenv.py +++ b/cosmos/operators/virtualenv.py @@ -131,10 +131,9 @@ def clean_dir_if_temporary(self) -> None: def execute(self, context: Context) -> None: try: output = super().execute(context) + self.log.info(output) finally: self.clean_dir_if_temporary() - else: - self.log.info(output) def on_kill(self) -> None: self.clean_dir_if_temporary() diff --git a/tests/dbt/test_graph.py b/tests/dbt/test_graph.py index d12483ee7..0a1942270 100644 --- a/tests/dbt/test_graph.py +++ b/tests/dbt/test_graph.py @@ -1518,7 +1518,7 @@ def test_save_dbt_ls_cache(mock_variable_set, mock_datetime, tmp_dbt_project_dir hash_dir, hash_args = version.split(",") assert hash_args == "d41d8cd98f00b204e9800998ecf8427e" if sys.platform == "darwin": - assert hash_dir == "6534a9963174a3adf58725f7f06a30a1" + assert hash_dir == "c1e25b0679b5ddcb636bcc30f2f85a06" else: assert hash_dir == "6f63493009733a7be34364a6ea3ffd3c"
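Taken together, the series above gives ExecutionMode.VIRTUALENV two behaviors: by default each dbt
task creates and discards its own virtualenv in a TemporaryDirectory, while setting
ExecutionConfig.virtualenv_dir makes tasks share a lock-protected virtualenv that is created once
and reused across runs on the same worker. The sketch below shows how a DAG author would opt into
the cached behavior. It is a minimal illustration rather than part of the patch set: DbtDag and the
config classes are standard Cosmos API, but the dag_id, dbt project path, profiles.yml location and
virtualenv location are hypothetical.

from datetime import datetime
from pathlib import Path

from cosmos import DbtDag, ExecutionConfig, ProfileConfig, ProjectConfig
from cosmos.constants import ExecutionMode

cached_venv_dag = DbtDag(
    dag_id="example_cached_virtualenv",  # hypothetical name
    start_date=datetime(2024, 8, 1),
    schedule_interval="@daily",
    # Hypothetical project/profile locations, for illustration only
    project_config=ProjectConfig(dbt_project_path="/usr/local/airflow/dbt/jaffle_shop"),
    profile_config=ProfileConfig(
        profile_name="default",
        target_name="dev",
        profiles_yml_filepath=Path("/usr/local/airflow/dbt/jaffle_shop/profiles.yml"),
    ),
    execution_config=ExecutionConfig(
        execution_mode=ExecutionMode.VIRTUALENV,
        # With virtualenv_dir set, the first task acquires the lock file in this
        # directory, creates the virtualenv and releases the lock; later tasks on
        # the same worker node reuse the cached environment. Omitting virtualenv_dir
        # falls back to one temporary virtualenv per task.
        virtualenv_dir=Path("/opt/airflow/persistent-venv"),  # hypothetical location
    ),
    operator_args={
        "py_system_site_packages": False,
        "py_requirements": ["dbt-postgres"],
        "install_deps": True,
    },
)

If a task dies while holding the lock, subsequent tasks wait for it; per the documentation added in
[PATCH 47/52], that wait is bounded by the virtualenv_max_retries_lock setting (default 120, checked
roughly once per second) and can be tuned via AIRFLOW__COSMOS__VIRTUALENV_MAX_RETRIES_LOCK.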