dyvenia · m-paz · Oct 25, 2023 · Oct 25, 2023
diff --git a/tests/integration/flows/test_supermetrics_to_adls.py b/tests/integration/flows/test_supermetrics_to_adls.py
@@ -5,9 +5,10 @@
 from prefect.storage import Local
 
 from viadot.flows import SupermetricsToADLS
+from viadot.exceptions import ValidationError
 
 CWD = os.getcwd()
-adls_dir_path = "raw/supermetrics"
+adls_dir_path = "raw/tests/supermetrics"
 STORAGE = Local(path=CWD)
 
 logger = logging.getLogger(__name__)
@@ -110,3 +111,91 @@ def test_supermetrics_to_adls_file_name(expectation_suite):
     )
     result = flow.run()
     assert result.is_successful()
+
+
+def test_supermetrics_to_adls_validate_df_success(expectation_suite):
+    """Run the flow with `validate_df` tests that are expected to pass.
+
+    A `column_size` test on `Date` and a `column_list_to_match` test are passed
+    through `validate_df_dict`; the flow and each of its tasks must succeed.
+    """
+    expected_columns = [
+        "Date",
+        "All Users",
+        "M-Site_Better Space: All Landing Page Sessions",
+        "M-site_Accessories: All Landing Page Sessions",
+        "M-site_More Space: All Landing Page Sessions",
+        "M-site_Replacement: All Landing Page Sessions",
+    ]
+    validation_tests = {
+        "column_size": {"Date": 10},
+        "column_list_to_match": expected_columns,
+    }
+
+    supermetrics_flow = SupermetricsToADLS(
+        "test_supermetrics_to_adls_validate_df_success",
+        ds_id="GA",
+        ds_segments=[
+            "R1fbzFNQQ3q_GYvdpRr42w",
+            "I8lnFFvdSFKc50lP7mBKNA",
+            "Lg7jR0VWS5OqGPARtGYKrw",
+            "h8ViuGLfRX-cCL4XKk6yfQ",
+            "-1",
+        ],
+        ds_accounts=["8326007", "58338899"],
+        date_range_type="last_year_inc",
+        fields=[
+            {"id": "Date"},
+            {"id": "segment", "split": "column"},
+            {"id": "AvgPageLoadTime_calc"},
+        ],
+        settings={"avoid_sampling": "true"},
+        order_columns="alphabetic",
+        max_columns=100,
+        max_rows=10,
+        expectation_suite=expectation_suite,
+        evaluation_parameters={"previous_run_row_count": 9},
+        adls_dir_path=adls_dir_path,
+        parallel=False,
+        validate_df_dict=validation_tests,
+    )
+
+    flow_state = supermetrics_flow.run()
+    assert flow_state.is_successful()
+
+    # The flow-level state alone is not enough: check every task's state too.
+    assert all(state.is_successful() for state in flow_state.result.values())
+
+
+def test_supermetrics_to_adls_validate_df_fail(expectation_suite):
+    """Run the flow with a `column_list_to_match` test that must not pass.
+
+    The expected column list omits `Date` and repeats the same name several
+    times, so the run is required to end in a failure rather than succeed.
+    """
+    flow = SupermetricsToADLS(
+        "test_supermetrics_to_adls_validate_df_fail",
+        ds_id="GA",
+        ds_segments=[
+            "R1fbzFNQQ3q_GYvdpRr42w",
+            "I8lnFFvdSFKc50lP7mBKNA",
+            "Lg7jR0VWS5OqGPARtGYKrw",
+            "h8ViuGLfRX-cCL4XKk6yfQ",
+            "-1",
+        ],
+        ds_accounts=["8326007", "58338899"],
+        date_range_type="last_year_inc",
+        fields=[
+            {"id": "Date"},
+            {"id": "segment", "split": "column"},
+            {"id": "AvgPageLoadTime_calc"},
+        ],
+        settings={"avoid_sampling": "true"},
+        order_columns="alphabetic",
+        max_columns=100,
+        max_rows=10,
+        expectation_suite=expectation_suite,
+        evaluation_parameters=dict(previous_run_row_count=9),
+        adls_dir_path=adls_dir_path,
+        parallel=False,
+        validate_df_dict={
+            "column_list_to_match": [
+                "All Users",
+                "All Landing Page Sessions",
+                "All Landing Page Sessions",
+                "All Landing Page Sessions",
+                "All Landing Page Sessions",
+            ],
+        },
+    )
+
+    try:
+        result = flow.run()
+    except ValidationError:
+        # The error propagated out of the run (e.g. Prefect configured to
+        # re-raise task exceptions): that is the expected failure.
+        return
+
+    # Prefect normally traps task exceptions and reports them through the
+    # returned state. Without this assertion the test would pass even when
+    # validation never failed.
+    assert result.is_failed()
diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py
@@ -19,6 +19,7 @@
     union_dfs_task,
     update_dtypes_dict,
     write_to_json,
+    validate_df,
 )
 from viadot.tasks import (
     AzureDataLakeUpload,
@@ -68,6 +69,7 @@ def __init__(
         vault_name: str = None,
         check_missing_data: bool = True,
         timeout: int = 3600,
+        validate_df_dict: dict = None,
         *args: List[any],
         **kwargs: Dict[str, Any],
     ):
@@ -112,6 +114,8 @@ def __init__(
             check_missing_data (bool, optional): Whether to check missing data. Defaults to True.
             timeout(int, optional): The amount of time (in seconds) to wait while running this task before
                 a timeout occurs. Defaults to 3600.
+            validate_df_dict (dict, optional): A dictionary of tests used to verify the output DataFrame.
+                If provided, triggers the `validate_df` task from task_utils. Defaults to None.
         """
         if not ds_user:
             try:
@@ -140,6 +144,9 @@ def __init__(
         self.if_exists = if_exists
         self.output_file_extension = output_file_extension
 
+        # validate_df
+        self.validate_df_dict = validate_df_dict
+
         # RunGreatExpectationsValidation
         self.expectation_suite = expectation_suite
         self.expectations_path = "/home/viadot/tmp/expectations"
@@ -229,6 +236,13 @@ def gen_flow(self) -> Flow:
         else:
             df = self.gen_supermetrics_task(ds_accounts=self.ds_accounts, flow=self)
 
+        # run validate_df task from task_utils
+        if self.validate_df_dict:
+            validation_df_task = validate_df.bind(
+                df, tests=self.validate_df_dict, flow=self
+            )
+            validation_df_task.set_upstream(df, flow=self)
+
         write_json = write_to_json.bind(
             dict_=self.expectation_suite,
             path=os.path.join(