From 9eb2e96e88fb039137ba2916b59e6e9eb769da7f Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 12 Oct 2023 15:45:25 +0200 Subject: [PATCH 01/47] =?UTF-8?q?=F0=9F=9A=80=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index bffda072a..1f0874453 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.4.20" + assert __version__ == "0.4.21" diff --git a/viadot/__init__.py b/viadot/__init__.py index b4ed79e09..e427a5547 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.4.20" +__version__ = "0.4.21" From e2d3cc61cfeac25440098345b64a2cf836f9414c Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 13 Oct 2023 16:17:41 +0200 Subject: [PATCH 02/47] =?UTF-8?q?=E2=9C=A8=20Added=20`validate=5Fdf`=20tas?= =?UTF-8?q?k?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 100 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 28442a739..080eebb9d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -23,7 +23,7 @@ from visions.typesets.complete_set import CompleteSet from viadot.config import local_config -from viadot.exceptions import CredentialError +from viadot.exceptions import CredentialError, ValidationError from viadot.tasks import AzureDataLakeUpload, AzureKeyVaultSecret logger = logging.get_logger() @@ -668,3 +668,101 @@ def anonymize_df( df.drop(columns=["temp_date_col"], inplace=True, errors="ignore") return df + + +import re + + +@task(timeout=3600) +def validate_df(df: pd.DataFrame, tests: dict = None) -> None: + """ + Task to validate the data on DataFrame level. + tests: + - `column_size`: dict{column: size} + - `column_unique_values`: list[columns] + - `column_list_to_match`: list[columns] + - `dataset_row_count`: dict: {'min': number, 'max', number} + - `column_match_regex`: dict: {column: 'regex'} + + Args: + df (pd.DataFrame): The data frame for validation. + tests (dict, optional): Tests to apply on the data frame. Defaults to None. + + Raises: + ValidationError: If validation failed for at least one test. + """ + failed_tests = 0 + if tests is not None: + # column_row_count + if "column_size" in tests: + try: + for k, v in tests["column_size"].items(): + column_max_length = ( + df.astype(str).apply(lambda s: s.str.len()).max().to_dict() + ) + try: + if v == column_max_length[k]: + logger.info(f"[column_size] for {k} passed.") + else: + logger.error( + f"[column_size] test for {k} failed. field lenght is different than {v}" + ) + failed_tests += 1 + except Exception as e: + logger.error(f"{e}") + except TypeError as e: + logger.error( + "Please provide `column_size` parameter as dictionary {'columns': value}." + ) + + # column_unique_values + if "column_unique_values" in tests: + for column in tests["column_unique_values"]: + df_size = df.shape[0] + if df[column].nunique() == df_size: + logger.info( + f"[column_unique_values] Values are unique for {column} column." + ) + else: + failed_tests += 1 + logger.error( + f"[column_unique_values] Values for {column} are not unique." 
+ ) + + # list_column_to_match + if "column_list_to_match" in tests: + if set(tests["column_list_to_match"]) == set(df.columns): + logger.info(f"[column_list_to_match] passed.") + else: + failed_tests += 1 + logger.error( + "[column_list_to_match] failed. Columns are different than expected." + ) + + # dataset_row_count + if "dataset_row_count" in tests: + row_count = len(df.iloc[:, 0]) + max_value = tests["dataset_row_count"]["max"] or 10_000_000 + min_value = tests["dataset_row_count"]["min"] or 0 + + if (row_count > min_value) and (row_count < max_value): + print("[dataset_row_count] passed.") + else: + failed_tests += 1 + logging.error( + f"[dataset_row_count] Row count is not between {min_value} and {max_value}" + ) + # to improve + if "column_match_regex" in tests: + for k, v in tests["column_match_regex"].items(): + if df[k].apply(lambda x: re.match("(g\w+)\W(g\w+)", x)): + print(f"[column_match_regex] on {k} column passed.") + else: + failed_tests += 1 + logging.error(f"[column_match_regex] on {k} column filed!") + + else: + return "No tests to run." + + if failed_tests > 0: + raise ValidationError(f"Validation failed for {failed_tests} test/tests.") From 5ec195654472c217a9f735e026bc4cdb06f44e5a Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 13 Oct 2023 16:26:10 +0200 Subject: [PATCH 03/47] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20efb4496..22dc6a2c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - +- Added `validate_df` task to task_utils. 
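For reference, a minimal usage sketch of the new `validate_df` task introduced in PATCH 02, following the `validate_df.run(df, tests=...)` call style used by the unit tests added later in this series; the DataFrame, column name, and thresholds below are illustrative only, and a failed check raises `viadot.exceptions.ValidationError`:

import pandas as pd

from viadot.exceptions import ValidationError
from viadot.task_utils import validate_df

df = pd.DataFrame({"sales_org": ["DE1", "PL1", "NL1"]})

try:
    # Each key in `tests` selects one check; all checks are evaluated and
    # failures are counted before a single ValidationError is raised.
    validate_df.run(
        df,
        tests={
            "column_size": {"sales_org": 3},             # longest value must be 3 characters
            "column_unique_values": ["sales_org"],       # values must be unique
            "dataset_row_count": {"min": 1, "max": 10},  # row count within range
        },
    )
except ValidationError as e:
    print(f"DataFrame validation failed: {e}")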
### Fixed ### Changed From d9bab5f6e34402c50cd521b7638bb27979106e03 Mon Sep 17 00:00:00 2001 From: Mateusz Pazdzior <59165045+m-paz@users.noreply.github.com> Date: Thu, 19 Oct 2023 09:24:51 +0100 Subject: [PATCH 04/47] Update task_utils.py --- viadot/task_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 080eebb9d..b77a90ebf 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -2,6 +2,7 @@ import json import os import shutil +import re from datetime import datetime, timedelta, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, List, Literal, Union, cast @@ -670,9 +671,6 @@ def anonymize_df( return df -import re - - @task(timeout=3600) def validate_df(df: pd.DataFrame, tests: dict = None) -> None: """ From c4a661c1bac590f32e22c4e2cfdd04c2523e43ac Mon Sep 17 00:00:00 2001 From: burzekj Date: Mon, 23 Oct 2023 11:49:51 +0200 Subject: [PATCH 05/47] =?UTF-8?q?=F0=9F=8E=A8=20added=20new=20field=20extr?= =?UTF-8?q?action?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 403ed7081..f39bff6d2 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -477,6 +477,8 @@ def run( key: value for (key, value) in attributes.items() if key in key_list } temp_dict["conversationId"] = json_file["id"] + temp_dict["startTime"] = json_file["startTime"] + temp_dict["endTime"] = json_file["endTime"] data_list.append(temp_dict) df = pd.DataFrame(data_list) From ec87569803886afc33fecb5fa3afbd7ed8fb985b Mon Sep 17 00:00:00 2001 From: burzekj Date: Mon, 23 Oct 2023 13:33:24 +0200 Subject: [PATCH 06/47] Changed CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20efb4496..d7263d1c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### Changed - +- Changed `GenesysToCSV` logic for end_point == "conversations". Added new fields to extraction. 
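The Genesys change referenced here is the one in PATCH 05: for end_point == "conversations", each extracted record now also carries the conversation's start and end timestamps. A schematic sketch of the record construction; the record-building lines mirror the diff above, while the payload, attributes, and key_list values are illustrative only:

import pandas as pd

json_file = {
    "id": "conv-0001",  # illustrative conversation payload
    "startTime": "2023-10-23T10:00:00Z",
    "endTime": "2023-10-23T10:05:00Z",
}
attributes = {"caseId": "123", "internal_note": "x"}  # illustrative conversation attributes
key_list = ["caseId"]  # illustrative subset of attribute keys to keep
data_list = []

# Keep only the whitelisted attributes, then attach the id and the new timestamp fields.
temp_dict = {key: value for (key, value) in attributes.items() if key in key_list}
temp_dict["conversationId"] = json_file["id"]
temp_dict["startTime"] = json_file["startTime"]
temp_dict["endTime"] = json_file["endTime"]
data_list.append(temp_dict)

df = pd.DataFrame(data_list)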
## [0.4.20] - 2023-10-12 ### Added From 1e93ea82ea8ef146d2cc92e6ed15b1e3c0e20e9e Mon Sep 17 00:00:00 2001 From: m-paz Date: Mon, 23 Oct 2023 18:02:48 +0100 Subject: [PATCH 07/47] =?UTF-8?q?=E2=9C=A8=20Added=20test=20for=20column?= =?UTF-8?q?=20sum+=20tests=20for=20validate=5Fdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 98 +++++++++++++++++++++++++++++++++++ viadot/task_utils.py | 60 +++++++++++++++------ 2 files changed, 142 insertions(+), 16 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 1faee7071..14e62ae42 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -7,6 +7,7 @@ import prefect import pytest +from viadot.exceptions import ValidationError from viadot.task_utils import ( add_ingestion_metadata_task, adls_bulk_upload, @@ -21,6 +22,7 @@ dtypes_to_json_task, union_dfs_task, write_to_json, + validate_df, ) @@ -393,3 +395,99 @@ def test_wrong_method(): ) with pytest.raises(ValueError, match="Method not found"): anonymize_df.run(data, ["name", "last_name", "email"], method="anonymize") + + +def test_validate_df_column_size_pass(): + df = pd.DataFrame({"col1": ["a", "bb", "ccc"]}) + tests = {"column_size": {"col1": 3}} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_column_size_fail(): + df = pd.DataFrame({"col1": ["a", "bb", "cccc"]}) + tests = {"column_size": {"col1": 3}} + with pytest.raises(ValidationError): + validate_df.run(df, tests) + + +def test_validate_df_column_unique_values_pass(): + df = pd.DataFrame({"col1": [1, 2, 3]}) + tests = {"column_unique_values": ["col1"]} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_column_unique_values_fail(): + df = pd.DataFrame({"col1": [1, 2, 2]}) + tests = {"column_unique_values": ["col1"]} + with pytest.raises(ValidationError): + validate_df.run(df, tests) + + +def test_validate_df_column_list_to_match_pass(): + df = pd.DataFrame({"col1": [1], "col2": [2]}) + tests = {"column_list_to_match": ["col1", "col2"]} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_column_list_to_match_fail(): + df = pd.DataFrame({"col1": [1]}) + tests = {"column_list_to_match": ["col1", "col2"]} + with pytest.raises(ValidationError): + validate_df.run(df, tests) + + +def test_validate_df_dataset_row_count_pass(): + df = pd.DataFrame({"col1": [1, 2, 3]}) + tests = {"dataset_row_count": {"min": 1, "max": 5}} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_dataset_row_count_fail(): + df = pd.DataFrame({"col1": [1, 2, 3, 4, 5, 6]}) + tests = {"dataset_row_count": {"min": 1, "max": 5}} + with pytest.raises(ValidationError): + validate_df.run(df, tests) + + +def test_validate_df_column_match_regex_pass(): + df = pd.DataFrame({"col1": ["A12", "B34", "C45"]}) + tests = {"column_match_regex": {"col1": "^[A-Z][0-9]{2}$"}} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_column_match_regex_fail(): + df = pd.DataFrame({"col1": ["A123", "B34", "C45"]}) + tests = {"column_match_regex": {"col1": "^[A-Z][0-9]{2}$"}} + with 
pytest.raises(ValidationError): + validate_df.run(df, tests) + + +def test_validate_df_column_sum_pass(): + df = pd.DataFrame({"col1": [1, 2, 3]}) + tests = {"column_sum": {"col1": {"min": 5, "max": 10}}} + try: + validate_df.run(df, tests) + except ValidationError: + assert False, "Validation failed but was expected to pass" + + +def test_validate_df_column_sum_fail(): + df = pd.DataFrame({"col1": [1, 2, 3, 4]}) + tests = {"column_sum": {"col1": {"min": 5, "max": 6}}} + with pytest.raises(ValidationError): + validate_df.run(df, tests) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index b77a90ebf..f903c464d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -2,7 +2,7 @@ import json import os import shutil -import re +import re from datetime import datetime, timedelta, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, List, Literal, Union, cast @@ -674,13 +674,14 @@ def anonymize_df( @task(timeout=3600) def validate_df(df: pd.DataFrame, tests: dict = None) -> None: """ - Task to validate the data on DataFrame level. + Task to validate the data on DataFrame level. All numbers in the ranges are inclusive. tests: - `column_size`: dict{column: size} - `column_unique_values`: list[columns] - `column_list_to_match`: list[columns] - `dataset_row_count`: dict: {'min': number, 'max', number} - `column_match_regex`: dict: {column: 'regex'} + - `column_sum`: dict: {column: {'min': number, 'max': number}} Args: df (pd.DataFrame): The data frame for validation. @@ -690,8 +691,9 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: ValidationError: If validation failed for at least one test. """ failed_tests = 0 + failed_tests_list = [] + if tests is not None: - # column_row_count if "column_size" in tests: try: for k, v in tests["column_size"].items(): @@ -706,6 +708,7 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: f"[column_size] test for {k} failed. field lenght is different than {v}" ) failed_tests += 1 + failed_tests_list.append("column_size error") except Exception as e: logger.error(f"{e}") except TypeError as e: @@ -713,7 +716,6 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: "Please provide `column_size` parameter as dictionary {'columns': value}." ) - # column_unique_values if "column_unique_values" in tests: for column in tests["column_unique_values"]: df_size = df.shape[0] @@ -723,44 +725,70 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: ) else: failed_tests += 1 + failed_tests_list.append("column_unique_values error") logger.error( f"[column_unique_values] Values for {column} are not unique." ) - # list_column_to_match if "column_list_to_match" in tests: if set(tests["column_list_to_match"]) == set(df.columns): logger.info(f"[column_list_to_match] passed.") else: failed_tests += 1 + failed_tests_list.append("column_list_to_match error") logger.error( "[column_list_to_match] failed. Columns are different than expected." 
) - # dataset_row_count if "dataset_row_count" in tests: row_count = len(df.iloc[:, 0]) - max_value = tests["dataset_row_count"]["max"] or 10_000_000 + max_value = tests["dataset_row_count"]["max"] or 100_000_000 min_value = tests["dataset_row_count"]["min"] or 0 if (row_count > min_value) and (row_count < max_value): print("[dataset_row_count] passed.") else: failed_tests += 1 - logging.error( - f"[dataset_row_count] Row count is not between {min_value} and {max_value}" + failed_tests_list.append("dataset_row_count error") + logger.error( + f"[dataset_row_count] Row count ({row_count}) is not between {min_value} and {max_value}." ) - # to improve + if "column_match_regex" in tests: for k, v in tests["column_match_regex"].items(): - if df[k].apply(lambda x: re.match("(g\w+)\W(g\w+)", x)): - print(f"[column_match_regex] on {k} column passed.") + try: + matches = df[k].apply(lambda x: bool(re.match(v, str(x)))) + if all(matches): + logger.info(f"[column_match_regex] on {k} column passed.") + else: + failed_tests += 1 + failed_tests_list.append("column_match_regex error") + logger.error(f"[column_match_regex] on {k} column failed!") + except Exception as e: + failed_tests += 1 + failed_tests_list.append("column_match_regex error") + logger.error(f"[column_match_regex] Error in {k} column: {e}") + + if "column_sum" in tests: + for column, bounds in tests["column_sum"].items(): + col_sum = df[column].sum() + min_bound = bounds["min"] + max_bound = bounds["max"] + if min_bound <= col_sum <= max_bound: + logger.info( + f"[column_sum] Sum of {col_sum} for {column} is within the expected range." + ) else: failed_tests += 1 - logging.error(f"[column_match_regex] on {k} column filed!") - + failed_tests_list.append("column_sum error") + logger.error( + f"[column_sum] Sum of {col_sum} for {column} is out of the expected range - <{min_bound}:{max_bound}>" + ) else: - return "No tests to run." + return "No dataframe tests to run." 
if failed_tests > 0: - raise ValidationError(f"Validation failed for {failed_tests} test/tests.") + failed_tests_msg = ", ".join(failed_tests_list) + raise ValidationError( + f"Validation failed for {failed_tests} test/tests: {failed_tests_msg}" + ) From 6b87f520e784e8a0e2ac712a1e5caadcbb47a30f Mon Sep 17 00:00:00 2001 From: m-paz Date: Tue, 24 Oct 2023 09:18:26 +0100 Subject: [PATCH 08/47] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Removed=20unused=20i?= =?UTF-8?q?mports=20and=20changed=20logger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 2 -- viadot/task_utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 14e62ae42..e77c24fdd 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -2,9 +2,7 @@ from typing import List from unittest import mock -import numpy as np import pandas as pd -import prefect import pytest from viadot.exceptions import ValidationError diff --git a/viadot/task_utils.py b/viadot/task_utils.py index f903c464d..e40da4b9e 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -746,7 +746,7 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: min_value = tests["dataset_row_count"]["min"] or 0 if (row_count > min_value) and (row_count < max_value): - print("[dataset_row_count] passed.") + logger.info(f"[dataset_row_count] passed.") else: failed_tests += 1 failed_tests_list.append("dataset_row_count error") From c3dd445cf74906e0ea208ac7ea3b5048b62a4403 Mon Sep 17 00:00:00 2001 From: m-paz Date: Tue, 24 Oct 2023 09:22:16 +0100 Subject: [PATCH 09/47] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Cleaned=20logger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index e40da4b9e..6173e2994 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -746,7 +746,7 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: min_value = tests["dataset_row_count"]["min"] or 0 if (row_count > min_value) and (row_count < max_value): - logger.info(f"[dataset_row_count] passed.") + logger.info("[dataset_row_count] passed.") else: failed_tests += 1 failed_tests_list.append("dataset_row_count error") From 2118b6dfd7028518707bf79823205c34b212aa33 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 24 Oct 2023 14:00:56 +0200 Subject: [PATCH 10/47] =?UTF-8?q?=F0=9F=8E=A8=20Added=20`validation=5Fdf?= =?UTF-8?q?=5Fdict`=20param?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...test_cloud_for_customers_report_to_adls.py | 61 +++++++++++++++++++ .../cloud_for_customers_report_to_adls.py | 19 ++++-- viadot/sources/cloud_for_customers.py | 4 +- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py index b506e4a14..04f97cf44 100644 --- a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py +++ b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py @@ -1,5 +1,8 @@ from viadot.config import local_config from viadot.flows import CloudForCustomersReportToADLS +import pytest + +from viadot.exceptions import ValidationError def test_cloud_for_customers_report_to_adls(): @@ -27,3 +30,61 @@ def 
test_cloud_for_customers_report_to_adls(): task_results = result.result.values() assert all([task_result.is_successful() for task_result in task_results]) + + assert len(flow.tasks) == 6 + + +def test_cloud_for_customers_report_to_adls_validation_fail(caplog): + credentials = local_config.get("CLOUD_FOR_CUSTOMERS") + credentials_prod = credentials["Prod"] + channels = ["VEL_B_AFS", "VEL_B_ASA"] + month = ["01"] + year = ["2021"] + flow = CloudForCustomersReportToADLS( + report_url=credentials_prod["server"], + env="Prod", + channels=channels, + months=month, + years=year, + name="test_c4c_report_to_adls", + local_file_path=f"test_c4c_report_to_adls.csv", + adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"], + adls_dir_path=credentials["adls_dir_path"], + validation_df_dict={"column_size": {"ChannelName ID": 10}}, + ) + try: + result = flow.run() + except ValidationError: + pass + + +def test_cloud_for_customers_report_to_adls_validation_success(): + credentials = local_config.get("CLOUD_FOR_CUSTOMERS") + credentials_prod = credentials["Prod"] + channels = ["VEL_B_AFS", "VEL_B_ASA"] + month = ["01"] + year = ["2021"] + flow = CloudForCustomersReportToADLS( + report_url=credentials_prod["server"], + env="Prod", + channels=channels, + months=month, + years=year, + name="test_c4c_report_to_adls", + local_file_path=f"test_c4c_report_to_adls.csv", + adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"], + adls_dir_path=credentials["adls_dir_path"], + validation_df_dict={"column_size": {"ChannelName ID": 13}}, + ) + + try: + result = flow.run() + except ValidationError: + assert False, "Validation failed but was expected to pass" + + assert result.is_successful() + + task_results = result.result.values() + assert all([task_result.is_successful() for task_result in task_results]) + + assert len(flow.tasks) == 7 diff --git a/viadot/flows/cloud_for_customers_report_to_adls.py b/viadot/flows/cloud_for_customers_report_to_adls.py index 332698060..27b586f45 100644 --- a/viadot/flows/cloud_for_customers_report_to_adls.py +++ b/viadot/flows/cloud_for_customers_report_to_adls.py @@ -9,6 +9,7 @@ df_to_csv, df_to_parquet, union_dfs_task, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, C4CReportToDF, C4CToDF from viadot.utils import slugify @@ -38,6 +39,7 @@ def __init__( adls_sp_credentials_secret: str = None, if_empty: str = "warn", if_exists: str = "replace", + validation_df_dict: dict = None, timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], @@ -49,16 +51,16 @@ def __init__( Args: name (str): The name of the flow. report_url (str, optional): The url to the API. Defaults to None. - url (str, optional): ??? - endpoint (str, optional): ??? - params (dict, optional): ??? - fields (list, optional): ??? + url (str, optional): The url to the C4C API. Defaults to None. + endpoint (str, optional): The C4C API endpoint. Defaults to None. + params (dict, optional): The query parameters like filter by creation date time. Defaults to None. + fields (list, optional): List of columns to put in DataFrame. Defaults to None. skip (int, optional): Initial index value of reading row. Defaults to 0. top (int, optional): The value of top reading row. Defaults to 1000. channels (List[str], optional): Filtering parameters passed to the url. Defaults to None. months (List[str], optional): Filtering parameters passed to the url. Defaults to None. years (List[str], optional): Filtering parameters passed to the url. Defaults to None. - env (str, optional): ??? 
+ env (str, optional): The credentials environments. Defaults to 'QA'. c4c_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with username and password for the Cloud for Customers instance. local_file_path (str, optional): Local destination path. Defaults to None. @@ -72,6 +74,8 @@ def __init__( Defaults to None. if_empty (str, optional): What to do if the Supermetrics query returns no data. Defaults to "warn". if_exists (str, optional): What to do if the local file already exists. Defaults to "replace". + validation_df_dict (dict, optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -82,6 +86,7 @@ def __init__( self.if_empty = if_empty self.env = env self.c4c_credentials_secret = c4c_credentials_secret + self.validation_df_dict = validation_df_dict self.timeout = timeout # AzureDataLakeUpload @@ -197,6 +202,10 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validation_df_dict: + validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + validation.set_upstream(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) if self.output_file_extension == ".parquet": diff --git a/viadot/sources/cloud_for_customers.py b/viadot/sources/cloud_for_customers.py index 4856439cf..1bb68375e 100644 --- a/viadot/sources/cloud_for_customers.py +++ b/viadot/sources/cloud_for_customers.py @@ -35,8 +35,8 @@ def __init__( report_url (str, optional): The url to the API in case of prepared report. Defaults to None. url (str, optional): The url to the API. Defaults to None. endpoint (str, optional): The endpoint of the API. Defaults to None. - params (Dict[str, Any]): The query parameters like filter by creation date time. Defaults to json format. - env (str, optional): The development environments. Defaults to 'QA'. + params (Dict[str, Any]): The query parameters like filter by creation date time. Defaults to None. + env (str, optional): The credentials environments. Defaults to 'QA'. credentials (Dict[str, Any], optional): The credentials are populated with values from config file or this parameter. Defaults to None than use credentials from local_config. 
""" From 6979defa268ba32f354ee90dad426fd6f8f6cddd Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 24 Oct 2023 14:04:33 +0200 Subject: [PATCH 11/47] =?UTF-8?q?=F0=9F=94=A5=20removed=20unused=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_cloud_for_customers_report_to_adls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py index 04f97cf44..5d7cb7091 100644 --- a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py +++ b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py @@ -1,7 +1,5 @@ from viadot.config import local_config from viadot.flows import CloudForCustomersReportToADLS -import pytest - from viadot.exceptions import ValidationError From 6f6581b1e1f767d35004482e79fa3ea1c7fc5964 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Tue, 24 Oct 2023 16:16:26 +0200 Subject: [PATCH 12/47] added validate_df parameter to EurostatToADLS --- .../flows/test_eurostat_to_adls.py | 26 ++++++++++++++++++- viadot/flows/eurostat_to_adls.py | 13 ++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_eurostat_to_adls.py b/tests/integration/flows/test_eurostat_to_adls.py index 9225655e9..e15f10a7d 100644 --- a/tests/integration/flows/test_eurostat_to_adls.py +++ b/tests/integration/flows/test_eurostat_to_adls.py @@ -6,7 +6,11 @@ from viadot.flows import EurostatToADLS -DATA = {"geo": ["PL", "DE", "NL"], "indicator": [35, 55, 77]} +DATA = { + "geo": ["PL", "DE", "NL"], + "indicator": [35, 55, 77], + "time": ["2023-01", "2023-51", "2023-07"], +} ADLS_FILE_NAME = "test_eurostat.parquet" ADLS_DIR_PATH = "raw/tests/" @@ -28,3 +32,23 @@ def test_eurostat_to_adls_run_flow(mocked_class): assert result.is_successful() os.remove("test_eurostat_to_adls_flow_run.parquet") os.remove("test_eurostat_to_adls_flow_run.json") + + +@mock.patch( + "viadot.tasks.EurostatToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_validate_df(mocked_class): + flow = EurostatToADLS( + "test_validate_df", + dataset_code="ILC_DI04", + overwrite_adls=True, + validate_df_dict={"column_size": {"time": 7}}, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + ) + result = flow.run() + assert result.is_successful() + os.remove("test_validate_df.parquet") + os.remove("test_validate_df.json") diff --git a/viadot/flows/eurostat_to_adls.py b/viadot/flows/eurostat_to_adls.py index 0348d7de4..e6c76a084 100644 --- a/viadot/flows/eurostat_to_adls.py +++ b/viadot/flows/eurostat_to_adls.py @@ -15,6 +15,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from ..tasks import AzureDataLakeUpload, EurostatToDF @@ -40,6 +41,7 @@ def __init__( adls_file_name: str = None, adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, + validate_df_dict: dict = None, if_exists: str = "replace", *args: List[Any], **kwargs: Dict[str, Any], @@ -70,6 +72,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. + If defined, triggers the `validate_df` task from task_utils. 
Defaults to None if_exists (str, optional): What to do if the file exists. Defaults to "replace". """ @@ -79,6 +83,9 @@ def __init__( self.base_url = base_url self.requested_columns = requested_columns + # validate df + self.validate_df_dict = validate_df_dict + # AzureDataLakeUpload self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -123,6 +130,12 @@ def gen_flow(self) -> Flow: df = df.bind(flow=self) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) From 8f5348af799fc63f94fa9be7682d325d5954356d Mon Sep 17 00:00:00 2001 From: dominikjedlinski Date: Tue, 24 Oct 2023 16:56:31 +0200 Subject: [PATCH 13/47] added df validation to aselite --- .../integration/flows/test_aselite_to_adls.py | 62 ++++++++++++++++++- viadot/flows/aselite_to_adls.py | 16 ++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 48146c293..32a31aca7 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -8,7 +8,7 @@ from prefect.tasks.secrets import PrefectSecret from viadot.flows.aselite_to_adls import ASELiteToADLS -from viadot.task_utils import df_converts_bytes_to_int, df_to_csv +from viadot.task_utils import df_converts_bytes_to_int, df_to_csv, validate_df from viadot.tasks import AzureDataLakeUpload from viadot.tasks.aselite import ASELiteToDF @@ -62,3 +62,63 @@ def test_aselite_to_adls(): assert MAIN_DF.shape == (10, 17) os.remove(TMP_FILE_NAME) + + +def test_aselite_to_adls_validate_df(): + credentials_secret = PrefectSecret("aselite").run() + vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() + + query_designer = """SELECT TOP 10 [ID] + ,[SpracheText] + ,[SpracheKat] + ,[SpracheMM] + ,[KatSprache] + ,[KatBasisSprache] + ,[CodePage] + ,[Font] + ,[Neu] + ,[Upd] + ,[UpdL] + ,[LosKZ] + ,[AstNr] + ,[KomKz] + ,[RKZ] + ,[ParentLanguageNo] + ,[UPD_FIELD] + FROM [UCRMDEV].[dbo].[CRM_00]""" + + validate_df_dict = { + "column_size": {"ParentLanguageNo": 1}, + "column_unique_values": ["ID"], + "column_list_to_match": [ + "SpracheText", + "SpracheMM", + "KatSprache", + "KatBasisSprache", + "CodePage", + ], + "dataset_row_count": {"min": 10, "max": 10}, + "column_match_regex": {"SpracheText", r"TE_.*"}, + } + + flow = ASELiteToADLS( + "Test flow", + query=query_designer, + sqldb_credentials_secret=credentials_secret, + vault_name=vault_name, + file_path=TMP_FILE_NAME, + validate_df_dict=validate_df_dict, + to_path="raw/supermetrics/mp/result_df_flow_at_des_m.csv", + run_config=None, + ) + + result = flow.run() + assert result.is_successful() + + MAIN_DF = pd.read_csv(TMP_FILE_NAME, delimiter="\t") + + assert isinstance(MAIN_DF, pd.DataFrame) == True + + assert MAIN_DF.shape == (10, 17) + + os.remove(TMP_FILE_NAME) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 86e9b215b..8fcdaddb3 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -2,7 +2,12 @@ from prefect import Flow -from viadot.task_utils import df_clean_column, df_converts_bytes_to_int, df_to_csv +from viadot.task_utils import ( + df_clean_column, + df_converts_bytes_to_int, + df_to_csv, + validate_df, +) from viadot.tasks 
import AzureDataLakeUpload from viadot.tasks.aselite import ASELiteToDF @@ -19,6 +24,7 @@ def __init__( to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", overwrite: bool = True, + validate_df_dict: dict = None, convert_bytes: bool = False, sp_credentials_secret: str = None, remove_special_characters: bool = None, @@ -41,6 +47,7 @@ def __init__( to_path (str): The path to an ADLS file. Defaults to None. if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. @@ -53,6 +60,7 @@ def __init__( self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite = overwrite + self.validate_df_dict = validate_df_dict self.file_path = file_path self.sep = sep @@ -83,6 +91,11 @@ def gen_flow(self) -> Flow: if self.remove_special_characters == True: df = df_clean_column(df, columns_to_clean=self.columns_to_clean, flow=self) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + create_csv = df_to_csv.bind( df, path=self.file_path, @@ -100,5 +113,6 @@ def gen_flow(self) -> Flow: flow=self, ) + # validation_task.set_upstream(df, flow=self) create_csv.set_upstream(df, flow=self) adls_upload.set_upstream(create_csv, flow=self) From 176f46a49960f1e8ec576ca363b70e40ac96f379 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Tue, 24 Oct 2023 17:42:38 +0200 Subject: [PATCH 14/47] added `validate_df` task to mysql flow --- tests/integration/flows/test_mysql_to_adls.py | 16 +++++++++++++++- viadot/flows/mysql_to_adls.py | 14 +++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/integration/flows/test_mysql_to_adls.py b/tests/integration/flows/test_mysql_to_adls.py index a66fca5db..942bab99d 100644 --- a/tests/integration/flows/test_mysql_to_adls.py +++ b/tests/integration/flows/test_mysql_to_adls.py @@ -1,5 +1,4 @@ from unittest import mock - from viadot.flows.mysql_to_adls import MySqlToADLS query = """SELECT * FROM `example-views`.`sales`""" @@ -23,3 +22,18 @@ def test_adls_gen1_to_azure_sql_new_mock(TEST_PARQUET_FILE_PATH): ) flow.run() mock_method.assert_called_with() + + +def test_validate_df(TEST_PARQUET_FILE_PATH): + with mock.patch.object(MySqlToADLS, "run", return_value=True) as mock_method: + flow = MySqlToADLS( + "test validate_df", + country_short="DE", + query=query, + file_path=TEST_PARQUET_FILE_PATH, + sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + to_path=f"raw/examples/{TEST_PARQUET_FILE_PATH}", + validate_df_dict={"column_size": {"sales_org": 3}}, + ) + flow.run() + mock_method.assert_called_with() diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index 4c18148fe..afe594e47 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -2,7 +2,7 @@ from prefect import Flow -from viadot.task_utils import df_to_csv +from viadot.task_utils import 
df_to_csv, validate_df from viadot.tasks import AzureDataLakeUpload from viadot.tasks.mysql_to_df import MySqlToDf @@ -20,6 +20,7 @@ def __init__( to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", overwrite_adls: bool = True, + validate_df_dict: dict = None, sp_credentials_secret: str = None, credentials_secret: str = None, timeout: int = 3600, @@ -41,6 +42,8 @@ def __init__( to_path (str): The path to an ADLS file. Defaults to None. if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". overwrite_adls (str, optional): Whether to overwrite_adls the destination file. Defaults to True. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. + If defined, triggers the `validate_df` task from task_utils. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. credentials_secret (str, optional): Key Vault name. Defaults to None. @@ -57,6 +60,9 @@ def __init__( self.vault_name = vault_name self.overwrite_adls = overwrite_adls + # validate df + self.validate_df_dict = validate_df_dict + # Upload to ADLS self.file_path = file_path self.sep = sep @@ -76,6 +82,12 @@ def gen_flow(self) -> Flow: credentials_secret=self.credentials_secret, query=self.query, flow=self ) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) + create_csv = df_to_csv.bind( df, path=self.file_path, From 60148cb036880eeff8803ed94676f362a4873c22 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Oct 2023 08:29:34 +0200 Subject: [PATCH 15/47] =?UTF-8?q?=F0=9F=8E=A8=20changed=20parameter=20name?= =?UTF-8?q?=20to=20`validate=5Fdf=5Fdict`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_cloud_for_customers_report_to_adls.py | 4 ++-- viadot/flows/cloud_for_customers_report_to_adls.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py index 5d7cb7091..f0661e314 100644 --- a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py +++ b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py @@ -48,7 +48,7 @@ def test_cloud_for_customers_report_to_adls_validation_fail(caplog): local_file_path=f"test_c4c_report_to_adls.csv", adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"], adls_dir_path=credentials["adls_dir_path"], - validation_df_dict={"column_size": {"ChannelName ID": 10}}, + validate_df_dict={"column_size": {"ChannelName ID": 10}}, ) try: result = flow.run() @@ -72,7 +72,7 @@ def test_cloud_for_customers_report_to_adls_validation_success(): local_file_path=f"test_c4c_report_to_adls.csv", adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"], adls_dir_path=credentials["adls_dir_path"], - validation_df_dict={"column_size": {"ChannelName ID": 13}}, + validate_df_dict={"column_size": {"ChannelName ID": 13}}, ) try: diff --git a/viadot/flows/cloud_for_customers_report_to_adls.py b/viadot/flows/cloud_for_customers_report_to_adls.py index 27b586f45..11b3e92bf 100644 --- a/viadot/flows/cloud_for_customers_report_to_adls.py +++ 
b/viadot/flows/cloud_for_customers_report_to_adls.py @@ -39,7 +39,7 @@ def __init__( adls_sp_credentials_secret: str = None, if_empty: str = "warn", if_exists: str = "replace", - validation_df_dict: dict = None, + validate_df_dict: dict = None, timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], @@ -74,7 +74,7 @@ def __init__( Defaults to None. if_empty (str, optional): What to do if the Supermetrics query returns no data. Defaults to "warn". if_exists (str, optional): What to do if the local file already exists. Defaults to "replace". - validation_df_dict (dict, optional): A dictionary with optional list of tests to verify the output + validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. @@ -86,7 +86,7 @@ def __init__( self.if_empty = if_empty self.env = env self.c4c_credentials_secret = c4c_credentials_secret - self.validation_df_dict = validation_df_dict + self.validate_df_dict = validate_df_dict self.timeout = timeout # AzureDataLakeUpload @@ -202,8 +202,8 @@ def gen_flow(self) -> Flow: flow=self, ) - if self.validation_df_dict: - validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + if self.validate_df_dict: + validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) validation.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) From ff508499ac3a7380adbe406521062ac6ab1c1e41 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Wed, 25 Oct 2023 09:18:25 +0200 Subject: [PATCH 16/47] add df validation to BigQueryToADLS --- viadot/flows/bigquery_to_adls.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index 8e8095b5a..5283a2bda 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -15,6 +15,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, BigQueryToDF @@ -40,6 +41,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + validation_df_dict: dict = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], @@ -78,6 +80,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validation_df_dict (dict, optional): An optional dictionary to verify the received dataframe. + When passed, `validate_df` task validation tests are triggered. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. 
""" @@ -91,6 +95,9 @@ def __init__( self.vault_name = vault_name self.credentials_key = credentials_key + # Validate DataFrame + self.validation_df_dict = validation_df_dict + # AzureDataLakeUpload self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -140,6 +147,10 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validation_df_dict: + validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + validation.set_upstream(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_to_be_loaded = df_map_mixed_dtypes_for_parquet( From d6f4e6c11bda43339bab019e46f9f006fab3294f Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Oct 2023 09:58:54 +0200 Subject: [PATCH 17/47] =?UTF-8?q?=E2=9C=A8=20Added=20validate=5Fdf=5Fdict?= =?UTF-8?q?=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_mediatool_to_adls.py | 47 +++++++++++++++++++ viadot/flows/mediatool_to_adls.py | 10 ++++ 2 files changed, 57 insertions(+) diff --git a/tests/integration/flows/test_mediatool_to_adls.py b/tests/integration/flows/test_mediatool_to_adls.py index 88746d16b..d7b5b2658 100644 --- a/tests/integration/flows/test_mediatool_to_adls.py +++ b/tests/integration/flows/test_mediatool_to_adls.py @@ -5,6 +5,7 @@ import pytest from viadot.flows import MediatoolToADLS +from viadot.exceptions import ValidationError DATA = {"country": ["DK", "DE"], "sales": [3, 4]} ADLS_FILE_NAME = "test_mediatool.parquet" @@ -28,5 +29,51 @@ def test_mediatool_to_adls_run_flow(mocked_class): ) result = flow.run() assert result.is_successful() + assert len(flow.tasks) == 10 + os.remove("test_mediatool_to_adls_flow_run.parquet") + os.remove("test_mediatool_to_adls_flow_run.json") + + +@mock.patch( + "viadot.tasks.MediatoolToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_mediatool_to_adls_run_flow_validate_fail(mocked_class): + flow = MediatoolToADLS( + "test_mediatool_to_adls_flow_run", + organization_ids=["1000000001", "200000001"], + media_entries_columns=["id", "name", "num"], + mediatool_credentials_key="MEDIATOOL-TESTS", + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={"column_size": {"country": 10}}, + ) + try: + flow.run() + except ValidationError: + pass + + +@mock.patch( + "viadot.tasks.MediatoolToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_mediatool_to_adls_run_flow_validate_success(mocked_class): + flow = MediatoolToADLS( + "test_mediatool_to_adls_flow_run", + organization_ids=["1000000001", "200000001"], + media_entries_columns=["id", "name", "num"], + mediatool_credentials_key="MEDIATOOL-TESTS", + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={"column_size": {"country": 2}}, + ) + result = flow.run() + assert result.is_successful() + assert len(flow.tasks) == 11 os.remove("test_mediatool_to_adls_flow_run.parquet") os.remove("test_mediatool_to_adls_flow_run.json") diff --git a/viadot/flows/mediatool_to_adls.py b/viadot/flows/mediatool_to_adls.py index f87a6293b..c4e92432b 100644 --- a/viadot/flows/mediatool_to_adls.py +++ b/viadot/flows/mediatool_to_adls.py @@ -16,6 +16,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from ..tasks import AzureDataLakeUpload, MediatoolToDF @@ -41,6 +42,7 @@ 
def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + validate_df_dict: Dict[str, Any] = None, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -66,6 +68,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. """ # MediatoolToDF self.organization_ids = organization_ids @@ -73,6 +77,7 @@ def __init__( self.mediatool_credentials_key = mediatool_credentials_key self.media_entries_columns = media_entries_columns self.vault_name = vault_name + self.validate_df_dict = validate_df_dict # AzureDataLakeUpload self.overwrite = overwrite_adls @@ -119,6 +124,11 @@ def gen_flow(self) -> Flow: media_entries_columns=self.media_entries_columns, flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) df_casted_to_str = cast_df_to_str(df_with_metadata, flow=self) From 62d906b218652efe00f316954b5a38955da8dd34 Mon Sep 17 00:00:00 2001 From: dominikjedlinski Date: Wed, 25 Oct 2023 12:20:53 +0200 Subject: [PATCH 18/47] added df_validation --- .../integration/flows/test_hubspot_to_adls.py | 79 +++++++++++++++++++ viadot/flows/hubspot_to_adls.py | 11 +++ 2 files changed, 90 insertions(+) diff --git a/tests/integration/flows/test_hubspot_to_adls.py b/tests/integration/flows/test_hubspot_to_adls.py index a67d00aca..d960fc079 100644 --- a/tests/integration/flows/test_hubspot_to_adls.py +++ b/tests/integration/flows/test_hubspot_to_adls.py @@ -6,6 +6,7 @@ import pytest from viadot.flows import HubspotToADLS +from viadot.exceptions import ValidationError DATA = { "id": {"0": "820306930"}, @@ -60,3 +61,81 @@ def test_hubspot_to_adls_flow_run(mocked_class): assert result.is_successful() os.remove("test_hubspot_to_adls_flow_run.parquet") os.remove("test_hubspot_to_adls_flow_run.json") + + +@mock.patch( + "viadot.tasks.HubspotToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_hubspot_to_adls_flow_run_validate_fail(mocked_class): + flow = HubspotToADLS( + "test_hubspot_to_adls_flow_run", + hubspot_credentials_key="HUBSPOT", + endpoint="line_items", + filters=[ + { + "filters": [ + { + "propertyName": "createdate", + "operator": "BETWEEN", + "highValue": "2021-01-01", + "value": "2021-01-01", + }, + {"propertyName": "quantity", "operator": "EQ", "value": "2"}, + ] + }, + { + "filters": [ + {"propertyName": "amount", "operator": "EQ", "value": "3744.000"} + ] + }, + ], + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={"column_size": {"id": 0}}, + ) + try: + flow.run() + except ValidationError: + pass + + +@mock.patch( + "viadot.tasks.HubspotToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_hubspot_to_adls_flow_run_validate_success(mocked_class): + flow = HubspotToADLS( + "test_hubspot_to_adls_flow_run", + hubspot_credentials_key="HUBSPOT", + endpoint="line_items", + filters=[ + { + "filters": [ + { + "propertyName": "createdate", + "operator": "BETWEEN", + "highValue": "2021-01-01", + "value": "2021-01-01", + }, + 
{"propertyName": "quantity", "operator": "EQ", "value": "2"}, + ] + }, + { + "filters": [ + {"propertyName": "amount", "operator": "EQ", "value": "3744.000"} + ] + }, + ], + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={"column_unique_values": ["id"]}, + ) + result = flow.run() + assert result.is_successful() + os.remove("test_hubspot_to_adls_flow_run.parquet") + os.remove("test_hubspot_to_adls_flow_run.json") diff --git a/viadot/flows/hubspot_to_adls.py b/viadot/flows/hubspot_to_adls.py index 8f79d5d1b..7a2fe4387 100644 --- a/viadot/flows/hubspot_to_adls.py +++ b/viadot/flows/hubspot_to_adls.py @@ -13,6 +13,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, HubspotToDF @@ -33,6 +34,7 @@ def __init__( adls_dir_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", overwrite_adls: bool = True, + validate_df_dict: Dict[str, Any] = None, vault_name: str = None, sp_credentials_secret: str = None, *args: List[any], @@ -75,6 +77,8 @@ def __init__( output_file_extension (str, optional): Output file extension. Defaults to ".parquet". if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". overwrite_adls (str, optional): Whether to overwrite the destination file in ADLS. Defaults to True. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. 
""" @@ -86,6 +90,7 @@ def __init__( self.hubspot_credentials = hubspot_credentials self.hubspot_credentials_key = hubspot_credentials_key self.output_file_extension = output_file_extension + self.validate_df_dict = validate_df_dict self.local_file_path = ( local_file_path or self.slugify(name) + self.output_file_extension @@ -137,6 +142,12 @@ def gen_flow(self) -> Flow: df_viadot_downloaded = add_ingestion_metadata_task.bind(df=df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_viadot_downloaded, flow=self) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) + df_to_be_loaded = df_map_mixed_dtypes_for_parquet( df_viadot_downloaded, dtypes_dict, flow=self ) From 2ca685ce898e8e997b8894012a61608a99a0312d Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Oct 2023 13:40:04 +0200 Subject: [PATCH 19/47] =?UTF-8?q?=E2=9C=A8=20Added=20`validate=5Fdf=5Fdict?= =?UTF-8?q?`=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_sap_rfc_to_adls.py | 44 +++++++++++++++++-- viadot/flows/sap_rfc_to_adls.py | 19 +++++--- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py index 5b9f0d556..ed33fa320 100644 --- a/tests/integration/flows/test_sap_rfc_to_adls.py +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -2,6 +2,7 @@ from viadot.flows import SAPRFCToADLS from viadot.sources import AzureDataLake from viadot.tasks import AzureDataLakeRemove +from viadot.exceptions import ValidationError try: import pyrfc @@ -10,15 +11,15 @@ ADLS_PATH = "raw/supermetrics/mp/test_file_sap.parquet" FILE_NAME = "test_file.parquet" +SAP_TEST_CREDS = local_config.get("SAP").get("QA") def test_sap_rfc_to_adls_query(): - sap_test_creds = local_config.get("SAP").get("QA") flow = SAPRFCToADLS( name="test flow", - query="SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%'", + query="SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%' LIMIT 5", func="BBP_RFC_READ_TABLE", - sap_credentials=sap_test_creds, + sap_credentials=SAP_TEST_CREDS, local_file_path=FILE_NAME, adls_path=ADLS_PATH, overwrite=True, @@ -27,4 +28,41 @@ def test_sap_rfc_to_adls_query(): assert result.is_successful() file = AzureDataLake(ADLS_PATH) assert file.exists() + assert len(flow.tasks) == 3 + AzureDataLakeRemove(ADLS_PATH) + + +def test_sap_rfc_to_adls_validation_fail(): + flow = SAPRFCToADLS( + name="test flow", + query="SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%' LIMIT 5", + func="BBP_RFC_READ_TABLE", + sap_credentials=SAP_TEST_CREDS, + local_file_path=FILE_NAME, + adls_path=ADLS_PATH, + overwrite=True, + validate_df_dict={"column_list_to_match": ["MATNR"]}, + ) + try: + result = flow.run() + except ValidationError: + pass + + +def test_sap_rfc_to_adls_validation_success(): + flow = SAPRFCToADLS( + name="test flow", + query="SELECT MATNR, MATKL FROM MARA WHERE LAEDA LIKE '2022%' LIMIT 5", + func="BBP_RFC_READ_TABLE", + sap_credentials=SAP_TEST_CREDS, + local_file_path=FILE_NAME, + adls_path=ADLS_PATH, + overwrite=True, + validate_df_dict={"column_list_to_match": ["MATNR", "MATKL"]}, + ) + result = flow.run() + assert result.is_successful() + file = AzureDataLake(ADLS_PATH) + assert file.exists() + assert len(flow.tasks) == 4 AzureDataLakeRemove(ADLS_PATH) diff --git a/viadot/flows/sap_rfc_to_adls.py 
b/viadot/flows/sap_rfc_to_adls.py index 0d56efeec..c2bc71d8d 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -1,8 +1,8 @@ from typing import Any, Dict, List, Literal -from prefect import Flow, task, unmapped +from prefect import Flow -from viadot.task_utils import concat_dfs, df_to_csv, df_to_parquet, set_new_kv +from viadot.task_utils import df_to_csv, df_to_parquet, set_new_kv, validate_df from viadot.tasks import AzureDataLakeUpload, SAPRFCToDF @@ -27,8 +27,9 @@ def __init__( vault_name: str = None, update_kv: bool = False, filter_column: str = None, - timeout: int = 3600, alternative_version: bool = False, + validate_df_dict: Dict[str, Any] = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -77,9 +78,11 @@ def __init__( vault_name(str, optional): The name of the vault from which to obtain the secrets. Defaults to None. update_kv (bool, optional): Whether or not to update key value on Prefect. Defaults to False. filter_column (str, optional): Name of the field based on which key value will be updated. Defaults to None. + alternative_version (bool, optional): Enable the use version 2 in source. Defaults to False. + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers + the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. - alternative_version (bool, optional): Enable the use version 2 in source. Defaults to False. """ self.query = query self.rfc_sep = rfc_sep @@ -96,8 +99,9 @@ def __init__( self.overwrite = overwrite self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name - self.timeout = timeout self.alternative_version = alternative_version + self.validate_df_dict = validate_df_dict + self.timeout = timeout self.update_kv = update_kv self.filter_column = filter_column @@ -119,6 +123,11 @@ def gen_flow(self) -> Flow: credentials=self.sap_credentials, flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) if self.output_file_extension == ".parquet": df_to_file = df_to_parquet.bind( From 9cead23459a5918232cd4562f4c51952968d24bc Mon Sep 17 00:00:00 2001 From: gwieloch Date: Wed, 25 Oct 2023 13:44:13 +0200 Subject: [PATCH 20/47] added validate_df task to VidClubToADLS flow --- .../integration/flows/test_vidclub_to_adls.py | 61 +++++++++++++++++++ viadot/flows/vid_club_to_adls.py | 13 ++++ 2 files changed, 74 insertions(+) diff --git a/tests/integration/flows/test_vidclub_to_adls.py b/tests/integration/flows/test_vidclub_to_adls.py index a0c86c2ec..c18eaad10 100644 --- a/tests/integration/flows/test_vidclub_to_adls.py +++ b/tests/integration/flows/test_vidclub_to_adls.py @@ -5,9 +5,11 @@ import pytest from viadot.flows import VidClubToADLS +from viadot.exceptions import ValidationError DATA = {"col1": ["aaa", "bbb", "ccc"], "col2": [11, 22, 33]} ADLS_FILE_NAME = "test_vid_club.parquet" +ADLS_FILE_NAME2 = "test_vid_club_validate_df.parquet" ADLS_DIR_PATH = "raw/test/" @@ -29,3 +31,62 @@ def test_vidclub_to_adls_run_flow(mocked_class): assert result.is_successful() os.remove("test_vidclub_to_adls_flow_run.parquet") os.remove("test_vidclub_to_adls_flow_run.json") + + +def test_vidclub_validate_df_task_success(caplog): + flow = VidClubToADLS( + 
"test_vidclub_validate_df_task_success", + source="product", + cols_to_drop=[ + "submissionProductID", + "submissionProductDate", + "brand", + "productCode", + ], + from_date="2023-10-25", + to_date="2023-10-25", + adls_dir_path="raw/tests", + adls_file_name="test.parquet", + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + overwrite_adls=True, + validate_df_dict={ + "column_size": {"submissionID": 5}, + "column_list_to_match": [ + "submissionID", + "regionID", + "productQuantity", + "unit", + ], + }, + ) + + result = flow.run() + assert result.is_successful() + + +def test_vidclub_validate_df_task_fail(caplog): + flow = VidClubToADLS( + "test_vidclub_validate_df_task_fail", + source="product", + cols_to_drop=[ + "submissionProductID", + "submissionProductDate", + "brand", + "productCode", + ], + from_date="2023-10-25", + to_date="2023-10-25", + adls_dir_path="raw/tests", + adls_file_name="test.parquet", + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + overwrite_adls=True, + validate_df_dict={ + "column_size": {"submissionID": 5}, + "column_unique_values": ["regionID"], + }, + ) + + try: + flow.run() + except ValidationError: + pass diff --git a/viadot/flows/vid_club_to_adls.py b/viadot/flows/vid_club_to_adls.py index 59d676c51..df9c0d531 100644 --- a/viadot/flows/vid_club_to_adls.py +++ b/viadot/flows/vid_club_to_adls.py @@ -16,6 +16,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, VidClubToDF @@ -44,6 +45,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + validate_df_dict: dict = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any] @@ -75,6 +77,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. 
""" # VidClubToDF @@ -89,6 +93,9 @@ def __init__( self.vidclub_credentials_secret = vidclub_credentials_secret self.vidclub_vault_name = vidclub_vault_name + # validate_df + self.validate_df_dict = validate_df_dict + # AzureDataLakeUpload self.adls_file_name = adls_file_name self.adls_dir_path = adls_dir_path @@ -147,6 +154,12 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + vid_club_df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(vid_club_df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(vid_club_df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) From 65c755309b579b54bdcba8fe8125983f4dcd7cb0 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Wed, 25 Oct 2023 13:51:51 +0200 Subject: [PATCH 21/47] added validate_df task to VidClubToADLS --- .../integration/flows/test_vidclub_to_adls.py | 61 +++++++++++++++++++ viadot/flows/vid_club_to_adls.py | 13 ++++ 2 files changed, 74 insertions(+) diff --git a/tests/integration/flows/test_vidclub_to_adls.py b/tests/integration/flows/test_vidclub_to_adls.py index a0c86c2ec..c18eaad10 100644 --- a/tests/integration/flows/test_vidclub_to_adls.py +++ b/tests/integration/flows/test_vidclub_to_adls.py @@ -5,9 +5,11 @@ import pytest from viadot.flows import VidClubToADLS +from viadot.exceptions import ValidationError DATA = {"col1": ["aaa", "bbb", "ccc"], "col2": [11, 22, 33]} ADLS_FILE_NAME = "test_vid_club.parquet" +ADLS_FILE_NAME2 = "test_vid_club_validate_df.parquet" ADLS_DIR_PATH = "raw/test/" @@ -29,3 +31,62 @@ def test_vidclub_to_adls_run_flow(mocked_class): assert result.is_successful() os.remove("test_vidclub_to_adls_flow_run.parquet") os.remove("test_vidclub_to_adls_flow_run.json") + + +def test_vidclub_validate_df_task_success(caplog): + flow = VidClubToADLS( + "test_vidclub_validate_df_task_success", + source="product", + cols_to_drop=[ + "submissionProductID", + "submissionProductDate", + "brand", + "productCode", + ], + from_date="2023-10-25", + to_date="2023-10-25", + adls_dir_path="raw/tests", + adls_file_name="test.parquet", + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + overwrite_adls=True, + validate_df_dict={ + "column_size": {"submissionID": 5}, + "column_list_to_match": [ + "submissionID", + "regionID", + "productQuantity", + "unit", + ], + }, + ) + + result = flow.run() + assert result.is_successful() + + +def test_vidclub_validate_df_task_fail(caplog): + flow = VidClubToADLS( + "test_vidclub_validate_df_task_fail", + source="product", + cols_to_drop=[ + "submissionProductID", + "submissionProductDate", + "brand", + "productCode", + ], + from_date="2023-10-25", + to_date="2023-10-25", + adls_dir_path="raw/tests", + adls_file_name="test.parquet", + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + overwrite_adls=True, + validate_df_dict={ + "column_size": {"submissionID": 5}, + "column_unique_values": ["regionID"], + }, + ) + + try: + flow.run() + except ValidationError: + pass diff --git a/viadot/flows/vid_club_to_adls.py b/viadot/flows/vid_club_to_adls.py index 59d676c51..df9c0d531 100644 --- a/viadot/flows/vid_club_to_adls.py +++ b/viadot/flows/vid_club_to_adls.py @@ -16,6 +16,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, VidClubToDF @@ -44,6 +45,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + validate_df_dict: dict = 
None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any] @@ -75,6 +77,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ # VidClubToDF @@ -89,6 +93,9 @@ def __init__( self.vidclub_credentials_secret = vidclub_credentials_secret self.vidclub_vault_name = vidclub_vault_name + # validate_df + self.validate_df_dict = validate_df_dict + # AzureDataLakeUpload self.adls_file_name = adls_file_name self.adls_dir_path = adls_dir_path @@ -147,6 +154,12 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + vid_club_df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(vid_club_df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(vid_club_df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) From 99cbcecf0157f7347f9495c18e8e5d97c65909a2 Mon Sep 17 00:00:00 2001 From: dominikjedlinski Date: Wed, 25 Oct 2023 13:53:05 +0200 Subject: [PATCH 22/47] added validate_df --- .../flows/test_salesforce_to_adls.py | 27 +++++++++++++++++++ viadot/flows/salesforce_to_adls.py | 12 +++++++++ 2 files changed, 39 insertions(+) diff --git a/tests/integration/flows/test_salesforce_to_adls.py b/tests/integration/flows/test_salesforce_to_adls.py index 60ec41aaa..ec68a1227 100644 --- a/tests/integration/flows/test_salesforce_to_adls.py +++ b/tests/integration/flows/test_salesforce_to_adls.py @@ -4,6 +4,7 @@ from viadot.flows import SalesforceToADLS from viadot.tasks import AzureDataLakeRemove +from viadot.exceptions import ValidationError ADLS_FILE_NAME = "test_salesforce.parquet" ADLS_DIR_PATH = "raw/tests/" @@ -32,3 +33,29 @@ def test_salesforce_to_adls(): vault_name="azuwevelcrkeyv001s", ) rm.run(sp_credentials_secret=credentials_secret) + + +def test_salesforce_to_adls_validate_success(): + credentials_secret = PrefectSecret( + "AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET" + ).run() + + flow = SalesforceToADLS( + "test_salesforce_to_adls_run_flow", + query="SELECT IsDeleted, FiscalYear FROM Opportunity LIMIT 50", + adls_sp_credentials_secret=credentials_secret, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={"column_list_to_match": ["IsDeleted", "FiscalYear"]}, + ) + + result = flow.run() + assert result.is_successful() + + os.remove("test_salesforce_to_adls_run_flow.parquet") + os.remove("test_salesforce_to_adls_run_flow.json") + rm = AzureDataLakeRemove( + path=ADLS_DIR_PATH + ADLS_FILE_NAME, + vault_name="azuwevelcrkeyv001s", + ) + rm.run(sp_credentials_secret=credentials_secret) diff --git a/viadot/flows/salesforce_to_adls.py b/viadot/flows/salesforce_to_adls.py index fe84be381..1ace9aa5a 100644 --- a/viadot/flows/salesforce_to_adls.py +++ b/viadot/flows/salesforce_to_adls.py @@ -16,6 +16,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, SalesforceToDF @@ -41,6 +42,7 @@ def __init__( adls_file_name: str = None, adls_sp_credentials_secret: str = None, if_exists: str = "replace", + validate_df_dict: 
Dict[str, Any] = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], @@ -70,6 +72,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -82,6 +86,7 @@ def __init__( self.env = env self.vault_name = vault_name self.credentials_secret = credentials_secret + self.validate_df_dict = validate_df_dict # AzureDataLakeUpload self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -135,6 +140,13 @@ def gen_flow(self) -> Flow: df_clean = df_clean_column.bind(df=df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df_clean, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) + + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) + df_to_be_loaded = df_map_mixed_dtypes_for_parquet( df_with_metadata, dtypes_dict, flow=self ) From c9cc2005a1202395cea04500fcdfc24231b5165c Mon Sep 17 00:00:00 2001 From: dominikjedlinski Date: Wed, 25 Oct 2023 14:39:23 +0200 Subject: [PATCH 23/47] updated validation_task for aselite --- tests/integration/flows/test_aselite_to_adls.py | 11 ++--------- viadot/flows/aselite_to_adls.py | 7 ++++--- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 32a31aca7..60022f47e 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -64,7 +64,7 @@ def test_aselite_to_adls(): os.remove(TMP_FILE_NAME) -def test_aselite_to_adls_validate_df(): +def test_aselite_to_adls_validate_success(): credentials_secret = PrefectSecret("aselite").run() vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() @@ -90,14 +90,7 @@ def test_aselite_to_adls_validate_df(): validate_df_dict = { "column_size": {"ParentLanguageNo": 1}, "column_unique_values": ["ID"], - "column_list_to_match": [ - "SpracheText", - "SpracheMM", - "KatSprache", - "KatBasisSprache", - "CodePage", - ], - "dataset_row_count": {"min": 10, "max": 10}, + "dataset_row_count": {"min": 0, "max": 10}, "column_match_regex": {"SpracheText", r"TE_.*"}, } diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 8fcdaddb3..bd77cf40f 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -24,7 +24,7 @@ def __init__( to_path: str = None, if_exists: Literal["replace", "append", "delete"] = "replace", overwrite: bool = True, - validate_df_dict: dict = None, + validate_df_dict: Dict[str, Any] = None, convert_bytes: bool = False, sp_credentials_secret: str = None, remove_special_characters: bool = None, @@ -47,7 +47,8 @@ def __init__( to_path (str): The path to an ADLS file. Defaults to None. if_exists (Literal, optional): What to do if the table exists. Defaults to "replace". overwrite (str, optional): Whether to overwrite the destination file. Defaults to True. 
- validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. @@ -95,6 +96,7 @@ def gen_flow(self) -> Flow: validation_task = validate_df.bind( df, tests=self.validate_df_dict, flow=self ) + validation_task.set_upstream(df, flow=self) create_csv = df_to_csv.bind( df, @@ -113,6 +115,5 @@ def gen_flow(self) -> Flow: flow=self, ) - # validation_task.set_upstream(df, flow=self) create_csv.set_upstream(df, flow=self) adls_upload.set_upstream(create_csv, flow=self) From 25e66b2a89d59a7bed515ea6f794923242ab4f0f Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Wed, 25 Oct 2023 14:41:23 +0200 Subject: [PATCH 24/47] =?UTF-8?q?=E2=9C=A8=20Added=20validate=5Fdf=5Fdict?= =?UTF-8?q?=20param=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_bigquery_to_adls.py | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_bigquery_to_adls.py b/tests/integration/flows/test_bigquery_to_adls.py index 1a867b58c..f986d0791 100644 --- a/tests/integration/flows/test_bigquery_to_adls.py +++ b/tests/integration/flows/test_bigquery_to_adls.py @@ -1,11 +1,16 @@ import os import pendulum -from prefect.tasks.secrets import PrefectSecret +import pytest +from unittest import mock +import pandas as pd +from prefect.tasks.secrets import PrefectSecret from viadot.flows import BigQueryToADLS from viadot.tasks import AzureDataLakeRemove +from viadot.exceptions import ValidationError + ADLS_DIR_PATH = "raw/tests/" ADLS_FILE_NAME = str(pendulum.now("utc")) + ".parquet" BIGQ_CREDENTIAL_KEY = "BIGQUERY-TESTS" @@ -72,6 +77,69 @@ def test_bigquery_to_adls_false(): assert result.is_failed() os.remove("test_bigquery_to_adls_overwrite_false.parquet") os.remove("test_bigquery_to_adls_overwrite_false.json") + + +DATA = { + "type": ["banner", "banner"], + "country": ["PL", "DE"], +} + + +@mock.patch( + "viadot.tasks.BigQueryToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_bigquery_to_adls_validate_df_fail(mocked_data): + flow_bigquery = BigQueryToADLS( + name="Test BigQuery to ADLS validate df fail", + dataset_name="official_empty", + table_name="space", + credentials_key=BIGQ_CREDENTIAL_KEY, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, + validation_df_dict={"column_list_to_match": ["type", "country", "test"]}, + ) + + try: + result = flow_bigquery.run() + except ValidationError: + pass + + os.remove("test_bigquery_to_adls_validate_df_fail.parquet") + os.remove("test_bigquery_to_adls_validate_df_fail.json") + + +@mock.patch( + "viadot.tasks.BigQueryToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_bigquery_to_adls_validate_df_success(mocked_data): + flow_bigquery = BigQueryToADLS( + name="Test BigQuery to 
ADLS validate df success", + dataset_name="official_empty", + table_name="space", + credentials_key=BIGQ_CREDENTIAL_KEY, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, + validation_df_dict={"column_list_to_match": ["type", "country"]}, + ) + result = flow_bigquery.run() + + result = flow_bigquery.run() + assert result.is_successful() + + task_results = result.result.values() + assert all([task_result.is_successful() for task_result in task_results]) + + os.remove("test_bigquery_to_adls_validate_df_success.parquet") + os.remove("test_bigquery_to_adls_validate_df_success.json") + rm = AzureDataLakeRemove( path=ADLS_DIR_PATH + ADLS_FILE_NAME, vault_name="azuwevelcrkeyv001s" ) From 6dce986240da9c21f16281511e3382fd74bfe3e5 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Wed, 25 Oct 2023 16:11:59 +0200 Subject: [PATCH 25/47] =?UTF-8?q?=E2=9C=A8=20Added=20df=20validation=20tas?= =?UTF-8?q?k=20to=20the=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/outlook_to_adls.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index 3f24a65d8..c736cd236 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -9,6 +9,7 @@ df_to_csv, df_to_parquet, union_dfs_task, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, OutlookToDF @@ -29,6 +30,7 @@ def __init__( limit: int = 10000, timeout: int = 3600, if_exists: Literal["append", "replace", "skip"] = "append", + validation_df_dict: dict = None, outlook_credentials_secret: str = "OUTLOOK", *args: List[Any], **kwargs: Dict[str, Any], @@ -54,6 +56,8 @@ def __init__( timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. if_exists (Literal['append', 'replace', 'skip'], optional): What to do if the local file already exists. Defaults to "append". + validation_df_dict (dict, optional): An optional dictionary to verify the received dataframe. + When passed, `validate_df` task validation tests are triggered. Defaults to None. 
""" self.mailbox_list = mailbox_list @@ -65,6 +69,9 @@ def __init__( self.local_file_path = local_file_path self.if_exsists = if_exists + # Validate DataFrame + self.validation_df_dict = validation_df_dict + # AzureDataLakeUpload self.adls_file_path = adls_file_path self.output_file_extension = output_file_extension @@ -98,6 +105,11 @@ def gen_flow(self) -> Flow: dfs = apply_map(self.gen_outlook_df, self.mailbox_list, flow=self) df = union_dfs_task.bind(dfs, flow=self) + + if self.validation_df_dict: + validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + validation.set_upstream(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) if self.output_file_extension == ".parquet": From 46db7a8216e47f216c5541e6b2f2e797424cf8e2 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Wed, 25 Oct 2023 16:14:59 +0200 Subject: [PATCH 26/47] added validate_df task to SAP BW class --- .../integration/flows/test_sap_bw_to_adls.py | 72 +++++++++++++++++++ viadot/flows/sap_bw_to_adls.py | 10 +++ viadot/sources/sap_bw.py | 1 + viadot/tasks/sap_bw.py | 12 ++-- 4 files changed, 90 insertions(+), 5 deletions(-) diff --git a/tests/integration/flows/test_sap_bw_to_adls.py b/tests/integration/flows/test_sap_bw_to_adls.py index c337336de..2c01049e8 100644 --- a/tests/integration/flows/test_sap_bw_to_adls.py +++ b/tests/integration/flows/test_sap_bw_to_adls.py @@ -5,6 +5,7 @@ import pytest from viadot.flows import SAPBWToADLS +from viadot.exceptions import ValidationError DATA = { "[0CALMONTH].[LEVEL01].[DESCRIPTION]": ["January 2023"], @@ -51,3 +52,74 @@ def test_sap_bw_to_adls_flow_run(mocked_class): assert result.is_successful() os.remove("test_sap_bw_to_adls_flow_run.parquet") os.remove("test_sap_bw_to_adls_flow_run.json") + + +def test_sap_bw_validate_df_task_success(caplog): + flow = SAPBWToADLS( + "test_sap_bw_validate_df_task_success", + mdx_query=""" + SELECT + { [Measures].[003YPR4ERQZRMSMFEQ8123HRR] } ON COLUMNS, + NON EMPTY + { + { [0CALDAY].[20230102] } * + { + [0COMP_CODE].[1120] + } + } + ON ROWS + FROM ZCSALBIL1/ZBW4_ZCSALBIL1_002_BOA + """, + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={ + "column_size": {"[0CALDAY].[LEVEL01].[MEMBER_CAPTION]": 10}, + "column_list_to_match": [ + "[0CALDAY].[LEVEL01].[MEMBER_CAPTION]", + "[0COMP_CODE].[LEVEL01].[MEMBER_CAPTION]", + "[Measures].[003YPR4ERQZRMSMFEQ8123HRR]", + ], + }, + ) + result = flow.run() + assert result.is_successful() + os.remove("test_sap_bw_validate_df_task_success.parquet") + os.remove("test_sap_bw_validate_df_task_success.json") + + +def test_sap_bw_validate_df_task_fail(caplog): + flow = SAPBWToADLS( + "test_sap_bw_validate_df_task_fail", + mdx_query=""" + SELECT + { [Measures].[003YPR4ERQZRMSMFEQ8123HRR] } ON COLUMNS, + NON EMPTY + { + { [0CALDAY].[20230102] } * + { + [0COMP_CODE].[1120] + } + } + ON ROWS + FROM ZCSALBIL1/ZBW4_ZCSALBIL1_002_BOA + """, + overwrite_adls=True, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + validate_df_dict={ + "column_size": {"[0CALDAY].[LEVEL01].[MEMBER_CAPTION]": 8}, + "column_list_to_match": [ + "[0CALDAY].[LEVEL01].[MEMBER_CAPTION]", + "[0COMP_CODE].[LEVEL01].[MEMBER_CAPTION]", + ], + }, + ) + + try: + flow.run() + except ValidationError: + pass + + os.remove("test_sap_bw_validate_df_task_fail.parquet") + os.remove("test_sap_bw_validate_df_task_fail.json") diff --git a/viadot/flows/sap_bw_to_adls.py b/viadot/flows/sap_bw_to_adls.py index 375f30775..9619df98e 100644 --- 
a/viadot/flows/sap_bw_to_adls.py +++ b/viadot/flows/sap_bw_to_adls.py @@ -14,6 +14,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, SAPBWToDF @@ -35,6 +36,7 @@ def __init__( overwrite_adls: bool = True, vault_name: str = None, sp_credentials_secret: str = None, + validate_df_dict: dict = None, *args: List[any], **kwargs: Dict[str, Any], ): @@ -56,6 +58,7 @@ def __init__( overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. vault_name (str, optional): The name of the vault from which to obtain the secrets.. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. """ self.sapbw_credentials = sapbw_credentials self.sapbw_credentials_key = sapbw_credentials_key @@ -89,6 +92,7 @@ def __init__( self.overwrite_adls = overwrite_adls self.vault_name = vault_name self.sp_credentials_secret = sp_credentials_secret + self.validate_df_dict = validate_df_dict super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -110,6 +114,12 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) + df_viadot_downloaded = add_ingestion_metadata_task.bind(df=df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_viadot_downloaded, flow=self) diff --git a/viadot/sources/sap_bw.py b/viadot/sources/sap_bw.py index 73b1d2efa..8f4fb0583 100644 --- a/viadot/sources/sap_bw.py +++ b/viadot/sources/sap_bw.py @@ -25,6 +25,7 @@ def __init__(self, credentials: dict, *args, **kwargs): Raises: CredentialError: If provided credentials are incorrect. """ + self.credentials = credentials if credentials is None: raise CredentialError("Missing credentials.") diff --git a/viadot/tasks/sap_bw.py b/viadot/tasks/sap_bw.py index 4d34f9960..acc92c246 100644 --- a/viadot/tasks/sap_bw.py +++ b/viadot/tasks/sap_bw.py @@ -1,6 +1,7 @@ import pandas as pd from prefect import Task from prefect.tasks.secrets import PrefectSecret +from viadot.tasks import AzureKeyVaultSecret from prefect.utilities import logging from viadot.exceptions import ValidationError @@ -27,13 +28,14 @@ def __init__( sapbw_credentials_key (str, optional): Azure KV secret. Defaults to "SAP". env (str, optional): SAP environment. Defaults to "BW". 
""" - if sapbw_credentials is None: - self.sapbw_credentials = credentials_loader.run( - credentials_secret=sapbw_credentials_key - ).get(env) + if not sapbw_credentials: + credentials_str = AzureKeyVaultSecret( + sapbw_credentials_key, + ).run() + self.sapbw_credentials = json.loads(credentials_str).get(env) else: - self.sapbw_credentials = sapbw_credentials + self.sapbw_credentials = PrefectSecret("SAP_BW").run() super().__init__( name="sapbw_to_df", From f41f523a001ca8b5427e310ce5860c3fd2f580f5 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Oct 2023 16:16:58 +0200 Subject: [PATCH 27/47] =?UTF-8?q?=E2=9C=A8=20Added=20validate=5Fdf=5Fdict?= =?UTF-8?q?=20parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/genesys_to_adls.py | 7 ++++++- viadot/tasks/genesys.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 4f8c54f3e..830c02c71 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -5,7 +5,7 @@ import pandas as pd from prefect import Flow, task -from viadot.task_utils import add_ingestion_metadata_task, adls_bulk_upload +from viadot.task_utils import add_ingestion_metadata_task, adls_bulk_upload, validate_df from viadot.tasks.genesys import GenesysToCSV @@ -95,6 +95,7 @@ def __init__( overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, credentials_genesys: Dict[str, Any] = None, + validate_df_dict: Dict[str, Any] = None, timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], @@ -143,6 +144,8 @@ def __init__( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. credentials(dict, optional): Credentials for the genesys api. Defaults to None. + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers + the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -165,6 +168,7 @@ def __init__( self.start_date = start_date self.end_date = end_date self.sep = sep + self.validate_df_dict = validate_df_dict self.timeout = timeout # AzureDataLake @@ -183,6 +187,7 @@ def gen_flow(self) -> Flow: timeout=self.timeout, local_file_path=self.local_file_path, sep=self.sep, + validate_df_dict=self.validate_df_dict, ) file_names = to_csv.bind( diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index f39bff6d2..b7386d815 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -10,6 +10,7 @@ from prefect.engine import signals from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs +from viadot.task_utils import validate_df from viadot.exceptions import APIError from viadot.sources import Genesys @@ -33,6 +34,7 @@ def __init__( conversationId_list: List[str] = None, key_list: List[str] = None, credentials_genesys: Dict[str, Any] = None, + validate_df_dict: Dict[str, Any] = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], @@ -54,6 +56,8 @@ def __init__( sep (str, optional): Separator in csv file. Defaults to "\t". conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. 
Defaults to None. key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers + the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -72,6 +76,7 @@ def __init__( self.sep = sep self.conversationId_list = conversationId_list self.key_list = key_list + self.validate_df_dict = validate_df_dict super().__init__( name=self.report_name, @@ -293,6 +298,7 @@ def merge_conversations_dfs(self, data_to_merge: list) -> DataFrame: "credentials_genesys", "conversationId_list", "key_list", + "validate_df_dict", ) def run( self, @@ -309,6 +315,7 @@ def run( conversationId_list: List[str] = None, key_list: List[str] = None, credentials_genesys: Dict[str, Any] = None, + validate_df_dict: Dict[str, Any] = None, ) -> List[str]: """ Task for downloading data from the Genesys API to DF. @@ -327,6 +334,8 @@ def run( report_columns (List[str], optional): List of exisiting column in report. Defaults to None. conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. + validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers + the `validate_df` task from task_utils. Defaults to None. Returns: List[str]: List of file names. @@ -488,6 +497,8 @@ def run( end = end_date.replace("-", "") file_name = f"WEBMESSAGE_{start}-{end}.csv" + if validate_df_dict: + validate_df(df=df, tests=validate_df_dict) df.to_csv( os.path.join(file_name), index=False, From 4f482a7bbdc2d6c21c0da2eebeb60ac8289fd16a Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Oct 2023 16:31:05 +0200 Subject: [PATCH 28/47] =?UTF-8?q?=F0=9F=8E=A8=20changed=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index b7386d815..6588810be 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -10,7 +10,7 @@ from prefect.engine import signals from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs -from viadot.task_utils import validate_df +from viadot.task_utils import * from viadot.exceptions import APIError from viadot.sources import Genesys From ef6002cb0939b339e569022967b0471e8eee9d78 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Wed, 25 Oct 2023 17:01:07 +0200 Subject: [PATCH 29/47] added `validate_df` task to `SupermetricsToADLS` flow --- .../flows/test_supermetrics_to_adls.py | 91 ++++++++++++++++++- viadot/flows/supermetrics_to_adls.py | 14 +++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_supermetrics_to_adls.py b/tests/integration/flows/test_supermetrics_to_adls.py index d901ad7ad..9738ddeb1 100644 --- a/tests/integration/flows/test_supermetrics_to_adls.py +++ b/tests/integration/flows/test_supermetrics_to_adls.py @@ -5,9 +5,10 @@ from prefect.storage import Local from viadot.flows import SupermetricsToADLS +from viadot.exceptions import ValidationError CWD = 
os.getcwd() -adls_dir_path = "raw/supermetrics" +adls_dir_path = "raw/tests/supermetrics" STORAGE = Local(path=CWD) logger = logging.getLogger(__name__) @@ -110,3 +111,91 @@ def test_supermetrics_to_adls_file_name(expectation_suite): ) result = flow.run() assert result.is_successful() + + +def test_supermetrics_to_adls_validate_df_success(expectation_suite): + flow = SupermetricsToADLS( + "test_supermetrics_to_adls_validate_df_success", + ds_id="GA", + ds_segments=[ + "R1fbzFNQQ3q_GYvdpRr42w", + "I8lnFFvdSFKc50lP7mBKNA", + "Lg7jR0VWS5OqGPARtGYKrw", + "h8ViuGLfRX-cCL4XKk6yfQ", + "-1", + ], + ds_accounts=["8326007", "58338899"], + date_range_type="last_year_inc", + fields=[ + {"id": "Date"}, + {"id": "segment", "split": "column"}, + {"id": "AvgPageLoadTime_calc"}, + ], + settings={"avoid_sampling": "true"}, + order_columns="alphabetic", + max_columns=100, + max_rows=10, + expectation_suite=expectation_suite, + evaluation_parameters=dict(previous_run_row_count=9), + adls_dir_path=adls_dir_path, + parallel=False, + validate_df_dict={ + "column_size": {"Date": 10}, + "column_list_to_match": [ + "Date", + "All Users", + "M-Site_Better Space: All Landing Page Sessions", + "M-site_Accessories: All Landing Page Sessions", + "M-site_More Space: All Landing Page Sessions", + "M-site_Replacement: All Landing Page Sessions", + ], + }, + ) + result = flow.run() + assert result.is_successful() + + task_results = result.result.values() + assert all([task_result.is_successful() for task_result in task_results]) + + +def test_supermetrics_to_adls_validate_df_fail(expectation_suite): + flow = SupermetricsToADLS( + "test_supermetrics_to_adls_validate_df_fail", + ds_id="GA", + ds_segments=[ + "R1fbzFNQQ3q_GYvdpRr42w", + "I8lnFFvdSFKc50lP7mBKNA", + "Lg7jR0VWS5OqGPARtGYKrw", + "h8ViuGLfRX-cCL4XKk6yfQ", + "-1", + ], + ds_accounts=["8326007", "58338899"], + date_range_type="last_year_inc", + fields=[ + {"id": "Date"}, + {"id": "segment", "split": "column"}, + {"id": "AvgPageLoadTime_calc"}, + ], + settings={"avoid_sampling": "true"}, + order_columns="alphabetic", + max_columns=100, + max_rows=10, + expectation_suite=expectation_suite, + evaluation_parameters=dict(previous_run_row_count=9), + adls_dir_path=adls_dir_path, + parallel=False, + validate_df_dict={ + "column_list_to_match": [ + "All Users", + "All Landing Page Sessions", + "All Landing Page Sessions", + "All Landing Page Sessions", + "All Landing Page Sessions", + ], + }, + ) + + try: + flow.run() + except ValidationError: + pass diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 38255a38f..5a104cff7 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -19,6 +19,7 @@ union_dfs_task, update_dtypes_dict, write_to_json, + validate_df, ) from viadot.tasks import ( AzureDataLakeUpload, @@ -68,6 +69,7 @@ def __init__( vault_name: str = None, check_missing_data: bool = True, timeout: int = 3600, + validate_df_dict: dict = None, *args: List[any], **kwargs: Dict[str, Any], ): @@ -112,6 +114,8 @@ def __init__( check_missing_data (bool, optional): Whether to check missing data. Defaults to True. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. + If defined, triggers the `validate_df` task from task_utils. Defaults to None. 
""" if not ds_user: try: @@ -140,6 +144,9 @@ def __init__( self.if_exists = if_exists self.output_file_extension = output_file_extension + # validate_df + self.validate_df_dict = validate_df_dict + # RunGreatExpectationsValidation self.expectation_suite = expectation_suite self.expectations_path = "/home/viadot/tmp/expectations" @@ -229,6 +236,13 @@ def gen_flow(self) -> Flow: else: df = self.gen_supermetrics_task(ds_accounts=self.ds_accounts, flow=self) + # run validate_df task from task_utils + if self.validate_df_dict: + validation_df_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_df_task.set_upstream(df, flow=self) + write_json = write_to_json.bind( dict_=self.expectation_suite, path=os.path.join( From 101efa6317430d92241774e6706b1cdac9be167c Mon Sep 17 00:00:00 2001 From: gwieloch Date: Wed, 25 Oct 2023 17:23:21 +0200 Subject: [PATCH 30/47] added `validate_df` task to `CustomerGaugeToADLS` class --- .../flows/test_customer_gauge_to_adls.py | 53 +++++++++++++++++++ viadot/flows/customer_gauge_to_adls.py | 13 +++++ 2 files changed, 66 insertions(+) diff --git a/tests/integration/flows/test_customer_gauge_to_adls.py b/tests/integration/flows/test_customer_gauge_to_adls.py index e6cdf1545..0e7afd3e2 100644 --- a/tests/integration/flows/test_customer_gauge_to_adls.py +++ b/tests/integration/flows/test_customer_gauge_to_adls.py @@ -5,6 +5,7 @@ import pytest from viadot.flows import CustomerGaugeToADLS +from viadot.exceptions import ValidationError DATA = { "user_name": ["Jane", "Bob"], @@ -15,6 +16,7 @@ "user_address_country_name": "United States", "user_address_country_code": "US", } + COLUMNS = ["user_name", "user_address_street"] ADLS_FILE_NAME = "test_customer_gauge.parquet" ADLS_DIR_PATH = "raw/tests/" @@ -40,3 +42,54 @@ def test_customer_gauge_to_adls_run_flow(mocked_class): assert result.is_successful() os.remove("test_customer_gauge_to_adls_flow_run.parquet") os.remove("test_customer_gauge_to_adls_flow_run.json") + + +@mock.patch( + "viadot.tasks.CustomerGaugeToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_customer_gauge_to_adls_run_flow_validation_success(mocked_class): + flow = CustomerGaugeToADLS( + "test_customer_gauge_to_adls_run_flow_validation_success", + endpoint="responses", + total_load=False, + anonymize=True, + columns_to_anonymize=COLUMNS, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=True, + validate_df_dict={"column_size": {"user_address_state": 2}}, + ) + result = flow.run() + assert result.is_successful() + assert len(flow.tasks) == 11 + + os.remove("test_customer_gauge_to_adls_run_flow_validation_success.parquet") + os.remove("test_customer_gauge_to_adls_run_flow_validation_success.json") + + +@mock.patch( + "viadot.tasks.CustomerGaugeToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_customer_gauge_to_adls_run_flow_validation_failure(mocked_class): + flow = CustomerGaugeToADLS( + "test_customer_gauge_to_adls_run_flow_validation_failure", + endpoint="responses", + total_load=False, + anonymize=True, + columns_to_anonymize=COLUMNS, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=True, + validate_df_dict={"column_size": {"user_name": 5}}, + ) + try: + flow.run() + except ValidationError: + pass + + os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.parquet") + os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.json") diff --git 
a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index 8053aeda3..6c54a0704 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -17,6 +17,7 @@ df_to_parquet, dtypes_to_json_task, update_dtypes_dict, + validate_df, ) from viadot.tasks import AzureDataLakeUpload, CustomerGaugeToDF @@ -52,6 +53,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + validate_df_dict: dict = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any] @@ -92,6 +94,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + validate_df_dict (Dict[str], optional): A dictionary with optional list of tests to verify the output dataframe. + If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ # CustomerGaugeToDF @@ -105,6 +109,9 @@ def __init__( self.end_date = end_date self.customer_gauge_credentials_secret = customer_gauge_credentials_secret + # validate_df + self.validate_df_dict = validate_df_dict + # anonymize_df self.anonymize = anonymize self.columns_to_anonymize = columns_to_anonymize @@ -169,6 +176,12 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + validation_task = validate_df.bind( + customerg_df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(customerg_df, flow=self) + if self.anonymize == True: anonymized_df = anonymize_df.bind( customerg_df, From a8de722acd31887024f27aa4d658895a9507b1ab Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 08:42:32 +0200 Subject: [PATCH 31/47] =?UTF-8?q?=F0=9F=8E=A8=20Updated=20validate=5Fdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 6588810be..7eec313b6 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -459,7 +459,8 @@ def run( date = start_date.replace("-", "") file_name = f"conversations_detail_{date}".upper() + ".csv" - + if validate_df_dict: + validate_df(df=final_df, tests=validate_df_dict) final_df.to_csv( os.path.join(self.local_file_path, file_name), index=False, From 5e49816e20487c2da7424aa097108e147f02f211 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 08:44:25 +0200 Subject: [PATCH 32/47] =?UTF-8?q?=F0=9F=8E=A8=20Added=20run=20to=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 7eec313b6..de47ddebf 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -460,7 +460,7 @@ def run( date = start_date.replace("-", "") file_name = f"conversations_detail_{date}".upper() + ".csv" if validate_df_dict: - validate_df(df=final_df, tests=validate_df_dict) + validate_df.run(df=final_df, tests=validate_df_dict) final_df.to_csv( os.path.join(self.local_file_path, file_name), index=False, @@ -499,7 +499,7 @@ def run( file_name = f"WEBMESSAGE_{start}-{end}.csv" if validate_df_dict: - validate_df(df=df, 
tests=validate_df_dict) + validate_df.run(df=df, tests=validate_df_dict) df.to_csv( os.path.join(file_name), index=False, From b7f9f5673d9c31e6ac47a24732801df2994ab842 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 09:28:52 +0200 Subject: [PATCH 33/47] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20outlook?= =?UTF-8?q?=5Fto=5Fadls=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../integration/flows/test_outlook_to_adls.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 tests/integration/flows/test_outlook_to_adls.py diff --git a/tests/integration/flows/test_outlook_to_adls.py b/tests/integration/flows/test_outlook_to_adls.py new file mode 100644 index 000000000..8250235cb --- /dev/null +++ b/tests/integration/flows/test_outlook_to_adls.py @@ -0,0 +1,91 @@ +import os +from unittest import mock +from datetime import date, timedelta +from prefect.tasks.secrets import PrefectSecret + +import pandas as pd +import pytest + +from viadot.flows import OutlookToADLS + + +ADLS_FILE_NAME = "test_outlook_to_adls.parquet" +ADLS_DIR_PATH = "raw/tests/" + +start_date = date.today() - timedelta(days=1) +start_date = start_date.strftime("%Y-%m-%d") +end_date = date.today().strftime("%Y-%m-%d") + +mailbox_list = [ + "romania.tehnic@velux.com", +] + +DATA = { + "sender": ["sender@mail.com"], + "receivers": ["receiver@mail.com"], +} + + +def test_outlook_to_adls_flow_run(): + flow = OutlookToADLS( + name="Test OutlookToADLS flow run", + mailbox_list=mailbox_list, + outbox_list=["Outbox", "Sent Items"], + start_date=start_date, + end_date=end_date, + local_file_path=ADLS_FILE_NAME, + adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + if_exists="replace", + timeout=4400, + ) + + result = flow.run() + assert result.is_successful() + + +def test_outlook_to_adls_run_flow_validate_fail(): + flow = OutlookToADLS( + name="Test OutlookToADLS validate flow df fail", + mailbox_list=mailbox_list, + outbox_list=["Outbox", "Sent Items"], + start_date=start_date, + end_date=end_date, + local_file_path=ADLS_FILE_NAME, + adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + if_exists="replace", + validation_df_dict={"column_list_to_match": ["test", "wrong", "columns"]}, + timeout=4400, + ) + + result = flow.run() + assert result.is_failed() + + +@mock.patch( + "viadot.tasks.OutlookToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_outlook_to_adls_run_flow_validate_success(mocked_task): + flow = OutlookToADLS( + name="Test OutlookToADLS validate flow df success", + mailbox_list=mailbox_list, + outbox_list=["Outbox", "Sent Items"], + start_date=start_date, + end_date=end_date, + local_file_path=ADLS_FILE_NAME, + adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, + adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + if_exists="replace", + validation_df_dict={"column_list_to_match": ["sender", "receivers"]}, + timeout=4400, + ) + + result = flow.run() + assert result.is_successful() + + os.remove("test_outlook_to_adls.parquet") + os.remove("romania_tehnic.csv") + os.remove("o365_token.txt") From bed6995d110233658372c19228f199a4213fb4a6 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 09:43:35 +0200 Subject: [PATCH 34/47] Cleaned imports and secrets --- tests/integration/flows/test_outlook_to_adls.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) 
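Note on the two invocation styles used throughout these commits: the *ToADLS flows wire the validation in with `validate_df.bind(...)` followed by `set_upstream(...)`, while the Genesys task calls `validate_df.run(...)` imperatively, because calling a Prefect 1.x task object directly requires an active flow context. Below is a minimal sketch of both patterns, assuming a viadot environment with these patches applied; the sample dataframe, the flow name, and the test dictionary are illustrative only, and a failing test surfaces as `viadot.exceptions.ValidationError`, as the integration tests above expect.

import pandas as pd
from prefect import Flow

from viadot.task_utils import validate_df

# Illustrative data and tests (not taken from any specific commit above).
df = pd.DataFrame({"sender": ["sender@mail.com"], "receivers": ["receiver@mail.com"]})
tests = {"column_list_to_match": ["sender", "receivers"]}

# Pattern 1: inside a flow definition, as in the *ToADLS flows above.
# Bind the task and make the source dataframe its explicit upstream dependency.
with Flow("validate_df_sketch") as flow:
    validation_task = validate_df.bind(df, tests=tests, flow=flow)
    validation_task.set_upstream(df, flow=flow)

# Pattern 2: imperatively inside another task's run(), as in the Genesys fix above.
# Call .run() directly instead of calling the task object itself.
validate_df.run(df=df, tests=tests)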
diff --git a/tests/integration/flows/test_outlook_to_adls.py b/tests/integration/flows/test_outlook_to_adls.py index 8250235cb..ad88ce5da 100644 --- a/tests/integration/flows/test_outlook_to_adls.py +++ b/tests/integration/flows/test_outlook_to_adls.py @@ -7,8 +7,11 @@ import pytest from viadot.flows import OutlookToADLS +from viadot.tasks import AzureDataLakeRemove - +ADLS_CREDENTIAL_SECRET = PrefectSecret( + "AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET" +).run() ADLS_FILE_NAME = "test_outlook_to_adls.parquet" ADLS_DIR_PATH = "raw/tests/" @@ -35,7 +38,7 @@ def test_outlook_to_adls_flow_run(): end_date=end_date, local_file_path=ADLS_FILE_NAME, adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, if_exists="replace", timeout=4400, ) @@ -53,7 +56,7 @@ def test_outlook_to_adls_run_flow_validate_fail(): end_date=end_date, local_file_path=ADLS_FILE_NAME, adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, if_exists="replace", validation_df_dict={"column_list_to_match": ["test", "wrong", "columns"]}, timeout=4400, @@ -77,7 +80,7 @@ def test_outlook_to_adls_run_flow_validate_success(mocked_task): end_date=end_date, local_file_path=ADLS_FILE_NAME, adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", + adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, if_exists="replace", validation_df_dict={"column_list_to_match": ["sender", "receivers"]}, timeout=4400, @@ -89,3 +92,7 @@ def test_outlook_to_adls_run_flow_validate_success(mocked_task): os.remove("test_outlook_to_adls.parquet") os.remove("romania_tehnic.csv") os.remove("o365_token.txt") + rm = AzureDataLakeRemove( + path=ADLS_DIR_PATH + ADLS_FILE_NAME, vault_name="azuwevelcrkeyv001s" + ) + rm.run(sp_credentials_secret=ADLS_CREDENTIAL_SECRET) From 5a5b153445d485c63fb064f6306c786bdf67d71c Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 10:16:34 +0200 Subject: [PATCH 35/47] Corrected parameter to validate_df_dict --- tests/integration/flows/test_outlook_to_adls.py | 4 ++-- viadot/flows/outlook_to_adls.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/integration/flows/test_outlook_to_adls.py b/tests/integration/flows/test_outlook_to_adls.py index ad88ce5da..dc215191d 100644 --- a/tests/integration/flows/test_outlook_to_adls.py +++ b/tests/integration/flows/test_outlook_to_adls.py @@ -58,7 +58,7 @@ def test_outlook_to_adls_run_flow_validate_fail(): adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, if_exists="replace", - validation_df_dict={"column_list_to_match": ["test", "wrong", "columns"]}, + validate_df_dict={"column_list_to_match": ["test", "wrong", "columns"]}, timeout=4400, ) @@ -82,7 +82,7 @@ def test_outlook_to_adls_run_flow_validate_success(mocked_task): adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, if_exists="replace", - validation_df_dict={"column_list_to_match": ["sender", "receivers"]}, + validate_df_dict={"column_list_to_match": ["sender", "receivers"]}, timeout=4400, ) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c736cd236..28575640c 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -30,7 +30,7 @@ def __init__( limit: int = 10000, timeout: 
int = 3600, if_exists: Literal["append", "replace", "skip"] = "append", - validation_df_dict: dict = None, + validate_df_dict: dict = None, outlook_credentials_secret: str = "OUTLOOK", *args: List[Any], **kwargs: Dict[str, Any], @@ -56,7 +56,7 @@ def __init__( timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. if_exists (Literal['append', 'replace', 'skip'], optional): What to do if the local file already exists. Defaults to "append". - validation_df_dict (dict, optional): An optional dictionary to verify the received dataframe. + validate_df_dict (dict, optional): An optional dictionary to verify the received dataframe. When passed, `validate_df` task validation tests are triggered. Defaults to None. """ @@ -70,7 +70,7 @@ def __init__( self.if_exsists = if_exists # Validate DataFrame - self.validation_df_dict = validation_df_dict + self.validate_df_dict = validate_df_dict # AzureDataLakeUpload self.adls_file_path = adls_file_path @@ -106,8 +106,8 @@ def gen_flow(self) -> Flow: df = union_dfs_task.bind(dfs, flow=self) - if self.validation_df_dict: - validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + if self.validate_df_dict: + validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) validation.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) From d18fabe9b6e6cb21fb5bef66db9616420712ede8 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 10:18:55 +0200 Subject: [PATCH 36/47] Corrected parameter to validate_df_dict --- tests/integration/flows/test_bigquery_to_adls.py | 5 ++--- viadot/flows/bigquery_to_adls.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/integration/flows/test_bigquery_to_adls.py b/tests/integration/flows/test_bigquery_to_adls.py index f986d0791..de793344a 100644 --- a/tests/integration/flows/test_bigquery_to_adls.py +++ b/tests/integration/flows/test_bigquery_to_adls.py @@ -100,9 +100,8 @@ def test_bigquery_to_adls_validate_df_fail(mocked_data): overwrite_adls=True, adls_dir_path=ADLS_DIR_PATH, adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, - validation_df_dict={"column_list_to_match": ["type", "country", "test"]}, + validate_df_dict={"column_list_to_match": ["type", "country", "test"]}, ) - try: result = flow_bigquery.run() except ValidationError: @@ -127,7 +126,7 @@ def test_bigquery_to_adls_validate_df_success(mocked_data): overwrite_adls=True, adls_dir_path=ADLS_DIR_PATH, adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, - validation_df_dict={"column_list_to_match": ["type", "country"]}, + validate_df_dict={"column_list_to_match": ["type", "country"]}, ) result = flow_bigquery.run() diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index 5283a2bda..6fa9c5eb9 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -41,7 +41,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", - validation_df_dict: dict = None, + validate_df_dict: dict = None, timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], @@ -80,7 +80,7 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". - validation_df_dict (dict, optional): An optional dictionary to verify the received dataframe. 
+ validate_df_dict (dict, optional): An optional dictionary to verify the received dataframe. When passed, `validate_df` task validation tests are triggered. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. @@ -96,7 +96,7 @@ def __init__( self.credentials_key = credentials_key # Validate DataFrame - self.validation_df_dict = validation_df_dict + self.validate_df_dict = validate_df_dict # AzureDataLakeUpload self.overwrite = overwrite_adls @@ -147,8 +147,8 @@ def gen_flow(self) -> Flow: flow=self, ) - if self.validation_df_dict: - validation = validate_df(df=df, tests=self.validation_df_dict, flow=self) + if self.validate_df_dict: + validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) validation.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) From ad6432fa01fab32bc4c8bc1a1e86e7aa19a20648 Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 10:30:37 +0200 Subject: [PATCH 37/47] Corrected validation if statement --- viadot/flows/bigquery_to_adls.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index 6fa9c5eb9..935b3f7a1 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -148,8 +148,10 @@ def gen_flow(self) -> Flow: ) if self.validate_df_dict: - validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation.set_upstream(df, flow=self) + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) From 6e396c4adccae2df793adf6715caaacd584fa56c Mon Sep 17 00:00:00 2001 From: jkrobicki Date: Thu, 26 Oct 2023 10:33:25 +0200 Subject: [PATCH 38/47] Corrected validation if statement --- viadot/flows/outlook_to_adls.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index 28575640c..606674ba1 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -107,8 +107,10 @@ def gen_flow(self) -> Flow: df = union_dfs_task.bind(dfs, flow=self) if self.validate_df_dict: - validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation.set_upstream(df, flow=self) + validation_task = validate_df.bind( + df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) From 34e40cb6987849210f3976e3d3d735e72fe15fd1 Mon Sep 17 00:00:00 2001 From: Mateusz Pazdzior <59165045+m-paz@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:07:45 +0100 Subject: [PATCH 39/47] Delete tests/integration/flows/test_outlook_to_adls.py --- .../integration/flows/test_outlook_to_adls.py | 98 ------------------- 1 file changed, 98 deletions(-) delete mode 100644 tests/integration/flows/test_outlook_to_adls.py diff --git a/tests/integration/flows/test_outlook_to_adls.py b/tests/integration/flows/test_outlook_to_adls.py deleted file mode 100644 index dc215191d..000000000 --- a/tests/integration/flows/test_outlook_to_adls.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -from unittest import mock -from datetime import date, timedelta -from prefect.tasks.secrets import PrefectSecret - 
-import pandas as pd -import pytest - -from viadot.flows import OutlookToADLS -from viadot.tasks import AzureDataLakeRemove - -ADLS_CREDENTIAL_SECRET = PrefectSecret( - "AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET" -).run() -ADLS_FILE_NAME = "test_outlook_to_adls.parquet" -ADLS_DIR_PATH = "raw/tests/" - -start_date = date.today() - timedelta(days=1) -start_date = start_date.strftime("%Y-%m-%d") -end_date = date.today().strftime("%Y-%m-%d") - -mailbox_list = [ - "romania.tehnic@velux.com", -] - -DATA = { - "sender": ["sender@mail.com"], - "receivers": ["receiver@mail.com"], -} - - -def test_outlook_to_adls_flow_run(): - flow = OutlookToADLS( - name="Test OutlookToADLS flow run", - mailbox_list=mailbox_list, - outbox_list=["Outbox", "Sent Items"], - start_date=start_date, - end_date=end_date, - local_file_path=ADLS_FILE_NAME, - adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, - if_exists="replace", - timeout=4400, - ) - - result = flow.run() - assert result.is_successful() - - -def test_outlook_to_adls_run_flow_validate_fail(): - flow = OutlookToADLS( - name="Test OutlookToADLS validate flow df fail", - mailbox_list=mailbox_list, - outbox_list=["Outbox", "Sent Items"], - start_date=start_date, - end_date=end_date, - local_file_path=ADLS_FILE_NAME, - adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, - if_exists="replace", - validate_df_dict={"column_list_to_match": ["test", "wrong", "columns"]}, - timeout=4400, - ) - - result = flow.run() - assert result.is_failed() - - -@mock.patch( - "viadot.tasks.OutlookToDF.run", - return_value=pd.DataFrame(data=DATA), -) -@pytest.mark.run -def test_outlook_to_adls_run_flow_validate_success(mocked_task): - flow = OutlookToADLS( - name="Test OutlookToADLS validate flow df success", - mailbox_list=mailbox_list, - outbox_list=["Outbox", "Sent Items"], - start_date=start_date, - end_date=end_date, - local_file_path=ADLS_FILE_NAME, - adls_file_path=ADLS_DIR_PATH + ADLS_FILE_NAME, - adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, - if_exists="replace", - validate_df_dict={"column_list_to_match": ["sender", "receivers"]}, - timeout=4400, - ) - - result = flow.run() - assert result.is_successful() - - os.remove("test_outlook_to_adls.parquet") - os.remove("romania_tehnic.csv") - os.remove("o365_token.txt") - rm = AzureDataLakeRemove( - path=ADLS_DIR_PATH + ADLS_FILE_NAME, vault_name="azuwevelcrkeyv001s" - ) - rm.run(sp_credentials_secret=ADLS_CREDENTIAL_SECRET) From ee2071dcf5ad68742f8a451b76ba8f1e745e5835 Mon Sep 17 00:00:00 2001 From: dominikjedlinski Date: Thu, 26 Oct 2023 12:28:26 +0200 Subject: [PATCH 40/47] reverted test_aselite_to_adls --- .../integration/flows/test_aselite_to_adls.py | 55 +------------------ 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/tests/integration/flows/test_aselite_to_adls.py b/tests/integration/flows/test_aselite_to_adls.py index 60022f47e..48146c293 100644 --- a/tests/integration/flows/test_aselite_to_adls.py +++ b/tests/integration/flows/test_aselite_to_adls.py @@ -8,7 +8,7 @@ from prefect.tasks.secrets import PrefectSecret from viadot.flows.aselite_to_adls import ASELiteToADLS -from viadot.task_utils import df_converts_bytes_to_int, df_to_csv, validate_df +from viadot.task_utils import df_converts_bytes_to_int, df_to_csv from viadot.tasks import AzureDataLakeUpload from viadot.tasks.aselite import ASELiteToDF @@ -62,56 +62,3 @@ def test_aselite_to_adls(): assert MAIN_DF.shape == (10, 17) 
os.remove(TMP_FILE_NAME) - - -def test_aselite_to_adls_validate_success(): - credentials_secret = PrefectSecret("aselite").run() - vault_name = PrefectSecret("AZURE_DEFAULT_KEYVAULT").run() - - query_designer = """SELECT TOP 10 [ID] - ,[SpracheText] - ,[SpracheKat] - ,[SpracheMM] - ,[KatSprache] - ,[KatBasisSprache] - ,[CodePage] - ,[Font] - ,[Neu] - ,[Upd] - ,[UpdL] - ,[LosKZ] - ,[AstNr] - ,[KomKz] - ,[RKZ] - ,[ParentLanguageNo] - ,[UPD_FIELD] - FROM [UCRMDEV].[dbo].[CRM_00]""" - - validate_df_dict = { - "column_size": {"ParentLanguageNo": 1}, - "column_unique_values": ["ID"], - "dataset_row_count": {"min": 0, "max": 10}, - "column_match_regex": {"SpracheText", r"TE_.*"}, - } - - flow = ASELiteToADLS( - "Test flow", - query=query_designer, - sqldb_credentials_secret=credentials_secret, - vault_name=vault_name, - file_path=TMP_FILE_NAME, - validate_df_dict=validate_df_dict, - to_path="raw/supermetrics/mp/result_df_flow_at_des_m.csv", - run_config=None, - ) - - result = flow.run() - assert result.is_successful() - - MAIN_DF = pd.read_csv(TMP_FILE_NAME, delimiter="\t") - - assert isinstance(MAIN_DF, pd.DataFrame) == True - - assert MAIN_DF.shape == (10, 17) - - os.remove(TMP_FILE_NAME) From d6590aed9a7fe955f9153bf7f8819f74bddbc4b6 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Thu, 26 Oct 2023 13:08:14 +0200 Subject: [PATCH 41/47] Sharepoint list connector --- CHANGELOG.md | 9 +- tests/integration/test_sharepoint.py | 152 +++++++++- viadot/flows/__init__.py | 4 +- viadot/flows/sharepoint_to_adls.py | 207 ++++++++++++- viadot/sources/__init__.py | 4 +- viadot/sources/sharepoint.py | 424 ++++++++++++++++++++++++++- viadot/tasks/__init__.py | 2 +- viadot/tasks/sharepoint.py | 175 ++++++++++- viadot/utils.py | 11 + 9 files changed, 970 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a0be2f82..2ede484c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `validate_df` task to task_utils. +- Added `SharepointList` source class. +- Added `SharepointListToDF` task class. +- Added `SharepointListToADLS` flow class. +- Added tests for `SharepointList`. +- Added `get_nested_dict` to untils.py. +- Added `validate_df` task to `SharepointToADLS` class. + ### Fixed ### Changed @@ -618,4 +625,4 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret - Moved from poetry to pip ### Fixed -- Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part +- Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part \ No newline at end of file diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index c784fa682..b56d10182 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -1,6 +1,8 @@ import os +import re import pandas as pd +from copy import deepcopy import pytest from prefect.tasks.secrets import PrefectSecret @@ -9,6 +11,7 @@ from viadot.sources import Sharepoint from viadot.task_utils import df_get_data_types_task from viadot.tasks.sharepoint import SharepointToDF +from viadot.sources import SharepointList def get_url() -> str: @@ -18,7 +21,7 @@ def get_url() -> str: Returns: str: File URL. 
""" - return local_config["SHAREPOINT"].get("url") + return local_config["SHAREPOINT"].get("file_url") @pytest.fixture(scope="session") @@ -163,3 +166,150 @@ def test_get_data_types(file_name): dtypes = dtypes_map.values() assert "String" in dtypes + + +# testing get_connection function passing invalid credentials and raises AuthenticationContext error. +def test_get_connection(): + site_url = "https://velux.sharepoint.com/" + credentials ={ + "SHAREPOINT_CERT": + {"TENANT": "xxx", + "CLIENT_ID": "123", + "SCOPES": "https://velux.sharepoint.com/", + "THUMBPRINT": "xyz", + "PRIVATE_KEY": "private"} + } + + spl = SharepointList(credentials=credentials) + with pytest.raises(AttributeError, match="'SharepointList' object has no attribute 'ctx'"): + spl.get_connection(site_url=site_url) + + +@pytest.fixture(scope="session") +def sharepoint_list(): + """ + Fixture for creating a Sharepoint class instance. + The class instance can be used within a test functions to interact with Sharepoint. + """ + spl = SharepointList() + yield spl + + +def test_valid_filters(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '<', 'value1': 10}, + 'filter2': {'dtype': 'str', 'operator1': '==', 'value1': 'value'}, + } + result = sharepoint_list.check_filters(filters) + assert result is True + +def test_invalid_dtype(sharepoint_list): + filters = { + 'filter1': {'dtype': 'list', 'operator1': '>', 'value1': 10}, + } + with pytest.raises(ValueError, match="dtype not allowed!"): + sharepoint_list.check_filters(filters) + +def test_missing_operator1(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'value1': 10}, + } + with pytest.raises(ValueError, match="Operator1 is missing!"): + sharepoint_list.check_filters(filters) + +def test_invalid_operator1(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '*', 'value1': 10}, + } + with pytest.raises(ValueError, match="Operator type not allowed!"): + sharepoint_list.check_filters(filters) + +def test_missing_value1(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': None}, + } + with pytest.raises(ValueError, match="Value for operator1 is missing!"): + sharepoint_list.check_filters(filters) + +def test_missing_operators_conjuction(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'operator2': '<', 'value2': 20}, + } + with pytest.raises(ValueError, match="Operators for conjuction is missing!"): + sharepoint_list.check_filters(filters) + +def test_invalid_operators_conjuction(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'operator2': '<', 'value2': 20, 'operators_conjuction': '!'}, + } + with pytest.raises(ValueError, match="Operators for conjuction not allowed!"): + sharepoint_list.check_filters(filters) + +def test_invalid_filters_conjuction(sharepoint_list): + filters = { + 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'filters_conjuction': '!'}, + } + with pytest.raises(ValueError, match="Filters operators for conjuction not allowed!"): + sharepoint_list.check_filters(filters) + +def test_valid_mapping(sharepoint_list): + filters = { + 'filter1': {'operator1': '>', 'operator2': '<=', 'operators_conjuction': '&', 'filters_conjuction': '|'}, + 'filter2': {'operator1': '==', 'operator2': '!=', 'operators_conjuction': '|'}, + } + expected_result = { + 'filter1': {'operator1': 'gt', 'operator2': 'le', 'operators_conjuction': 'and', 'filters_conjuction': 'or'}, + 'filter2': 
{'operator1': 'eq', 'operator2': 'ne', 'operators_conjuction': 'or'}, + } + result = sharepoint_list.operators_mapping(deepcopy(filters)) + assert result == expected_result + +def test_invalid_comparison_operator(sharepoint_list): + filters = { + 'filter1': {'operator1': '*', 'operator2': '<=', 'operators_conjuction': '&', 'filters_conjuction': '|'}, + } + error_message = "This comparison operator: * is not allowed. Please read the function documentation for details!" + with pytest.raises(ValueError, match=re.escape(error_message)): + sharepoint_list.operators_mapping(deepcopy(filters)) + +def test_invalid_logical_operator(sharepoint_list): + filters = { + 'filter1': {'operator1': '>', 'operator2': '<=', 'operators_conjuction': '!', 'filters_conjuction': '|'}, + } + error_message = "This conjuction(logical) operator: ! is not allowed. Please read the function documentation for details!" + with pytest.raises(ValueError, match=re.escape(error_message)): + sharepoint_list.operators_mapping(deepcopy(filters)) + +def test_single_filter_datetime_api(sharepoint_list): + filters = { + 'date_column': {'dtype': 'datetime', 'operator1': '>', 'value1': '2023-01-01'} + } + result = sharepoint_list.make_filter_for_api(filters) + expected_result = "date_column gt datetime'2023-01-01T00:00:00' " + assert result == expected_result + +def test_multiple_filters_api(sharepoint_list): + filters = { + 'int_column': {'dtype': 'int', 'operator1': '>=', 'value1': 10, 'operator2': '<', 'value2': 20}, + 'str_column': {'dtype': 'str', 'operator1': '==', 'value1': 'example'} + } + result = sharepoint_list.make_filter_for_api(filters) + expected_result = "int_column ge '10'int_column lt '20'str_column eq 'example'" + assert result == expected_result + +def test_single_df_filter(sharepoint_list): + filters = { + 'column1': {'operator1': '>', 'value1': 10} + } + result = sharepoint_list.make_filter_for_df(filters) + expected_result = "df.loc[(df.column1 > '10')]" + assert result == expected_result + +def test_multiple_df_filters(sharepoint_list): + filters = { + 'column1': {'operator1': '>', 'value1': 10, 'filters_conjuction': '&'}, + 'column2': {'operator1': '<', 'value1': 20} + } + result = sharepoint_list.make_filter_for_df(filters) + expected_result = "df.loc[(df.column1 > '10')&(df.column2 < '20')]" + assert result == expected_result \ No newline at end of file diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index de2f618ab..ed6b5a004 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -11,7 +11,7 @@ from .genesys_to_adls import GenesysToADLS from .outlook_to_adls import OutlookToADLS from .salesforce_to_adls import SalesforceToADLS -from .sharepoint_to_adls import SharepointToADLS +from .sharepoint_to_adls import SharepointToADLS, SharepointListToADLS from .supermetrics_to_adls import SupermetricsToADLS from .supermetrics_to_azure_sql import SupermetricsToAzureSQL @@ -46,4 +46,4 @@ from .sql_server_to_parquet import SQLServerToParquet from .sql_server_transform import SQLServerTransform from .transform_and_catalog import TransformAndCatalogToLuma -from .vid_club_to_adls import VidClubToADLS +from .vid_club_to_adls import VidClubToADLS \ No newline at end of file diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index df5562221..c234b4a88 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -3,12 +3,10 @@ from typing import Any, Dict, List import pendulum -from prefect import Flow, task +from prefect import 
Flow from prefect.backend import set_key_value from prefect.utilities import logging -logger = logging.get_logger() - from viadot.task_utils import ( add_ingestion_metadata_task, df_get_data_types_task, @@ -16,9 +14,13 @@ df_to_csv, df_to_parquet, dtypes_to_json_task, + validate_df, ) from viadot.tasks import AzureDataLakeUpload -from viadot.tasks.sharepoint import SharepointToDF +from viadot.tasks.sharepoint import SharepointToDF, SharepointListToDF + + +logger = logging.get_logger() class SharepointToADLS(Flow): @@ -38,6 +40,7 @@ def __init__( overwrite_adls: bool = False, if_empty: str = "warn", if_exists: str = "replace", + validate_df_dict: dict = None, timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], @@ -62,6 +65,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output + dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -74,6 +79,7 @@ def __init__( self.sheet_number = sheet_number self.validate_excel_file = validate_excel_file self.timeout = timeout + self.validate_df_dict = validate_df_dict # AzureDataLakeUpload self.overwrite = overwrite_adls @@ -117,6 +123,10 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation.set_upstream(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_mapped = df_map_mixed_dtypes_for_parquet.bind( @@ -169,3 +179,192 @@ def gen_flow(self) -> Flow: @staticmethod def slugify(name): return name.replace(" ", "_").lower() + + + +class SharepointListToADLS(Flow): + def __init__( + self, + name: str, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + filters: dict = None, + row_count: int = 5000, + sp_cert_credentials_secret: str = None, + vault_name: str = None, + path: str = None, + adls_dir_path: str = None, + adls_file_name: str = None, + adls_sp_credentials_secret: str = None, + overwrite_adls: bool = True, + output_file_extension: str = ".parquet", + validate_df_dict: dict = None, + *args: List[any], + **kwargs: Dict[str, Any], + ): + """ + Run Flow SharepointListToADLS. + + Args: + name (str): Prefect flow name. + list_title (str): Title of Sharepoint List. Default to None. + site_url (str): URL to set of Sharepoint Lists. Default to None. + required_fields (List[str]): Required fields(columns) need to be extracted from + Sharepoint List. Default to None. + field_property (List[str]): Property to expand with expand query method. + All propertys can be found under list.item.properties. + Default to ["Title"] + filters (dict): Dictionary with operators which filters the SharepointList output. 
+ allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') + allowed conjuction: ('&','|') + allowed operators: ('<','>','<=','>=','==','!=') + Example how to build the dict: + filters = { + 'Column_name_1' : + { + 'dtype': 'datetime', + 'value1':'YYYY-MM-DD', + 'value2':'YYYY-MM-DD', + 'operator1':'>=', + 'operator2':'<=', + 'operators_conjuction':'&', # conjuction operators allowed only when 2 values passed + 'filters_conjuction':'&', # conjuction filters allowed only when 2 columns passed + } + , + 'Column_name_2' : + { + 'dtype': 'str', + 'value1':'NM-PL', + 'operator1':'==', + }, + } + row_count (int): Number of downloaded rows in single request. Default to 5000. + sp_cert_credentials_secret (str): Credentials to verify Sharepoint connection. Default to None. + vault_name (str): KeyVaultSecret name. Default to None. + path (str): Local file path. Default to None. + adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. + adls_file_name (str, optional): Name of file in ADLS. Defaults to None. + adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, + CLIENT_SECRET) for the Azure Data Lake. Defaults to None. + overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. + + Returns: + .parquet file inside ADLS. + """ + + # SharepointListToDF + self.path = path + self.list_title = list_title + self.site_url = site_url + self.required_fields = required_fields + self.field_property = field_property + self.filters = filters + self.sp_cert_credentials_secret = sp_cert_credentials_secret + self.vault_name = vault_name + self.row_count = row_count + self.validate_df_dict = validate_df_dict + + + # AzureDataLakeUpload + self.adls_dir_path = adls_dir_path + self.adls_file_name = adls_file_name + self.overwrite = overwrite_adls + self.adls_sp_credentials_secret = adls_sp_credentials_secret + self.output_file_extension = output_file_extension + self.now = str(pendulum.now("utc")) + if self.path is not None: + self.local_file_path = ( + self.path + self.slugify(name) + self.output_file_extension + ) + else: + self.local_file_path = self.slugify(name) + self.output_file_extension + self.local_json_path = self.slugify(name) + ".json" + self.adls_dir_path = adls_dir_path + if adls_file_name is not None: + self.adls_file_path = os.path.join(adls_dir_path, adls_file_name) + self.adls_schema_file_dir_file = os.path.join( + adls_dir_path, "schema", Path(adls_file_name).stem + ".json" + ) + else: + self.adls_file_path = os.path.join( + adls_dir_path, self.now + self.output_file_extension + ) + self.adls_schema_file_dir_file = os.path.join( + adls_dir_path, "schema", self.now + ".json" + ) + + + super().__init__( + name=name, + *args, + **kwargs, + ) + + self.gen_flow() + + def gen_flow(self) -> Flow: + s = SharepointListToDF( + path = self.path, + list_title = self.list_title, + site_url = self.site_url, + required_fields = self.required_fields, + field_property = self.field_property, + filters = self.filters, + row_count = self.row_count, + credentials_secret=self.sp_cert_credentials_secret, + ) + df = s.run() + + if self.validate_df_dict: + validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation.set_upstream(df, flow=self) + + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) + 
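
# A hypothetical usage sketch for the flow documented above; the list title,
# site URL, column names and paths are placeholders and not taken from this
# patch -- only the parameter names and the `filters` structure come from the
# class itself.
from viadot.flows import SharepointListToADLS

flow = SharepointListToADLS(
    name="example sharepoint list extract",
    list_title="example_list",  # placeholder
    site_url="https://example.sharepoint.com/sites/example/",  # placeholder
    required_fields=["Created", "FactoryCode"],  # placeholders; filter keys must be listed here
    field_property="Title",
    filters={
        "Created": {
            "dtype": "datetime",
            "value1": "2023-01-01",
            "value2": "2023-06-30",
            "operator1": ">=",
            "operator2": "<=",
            "operators_conjuction": "&",
            "filters_conjuction": "&",
        },
        "FactoryCode": {"dtype": "str", "value1": "NM-PL", "operator1": "=="},
    },
    row_count=5000,
    sp_cert_credentials_secret="SHAREPOINT-CERT",  # Key Vault secret name, as in SharepointListToDF
    path="example_list.parquet",
    adls_dir_path="raw/sharepoint",  # placeholder
    adls_file_name="example_list.parquet",
    validate_df_dict={"dataset_row_count": {"min": 1, "max": 1_000_000}},
)
# Note: gen_flow() calls SharepointListToDF(...).run() directly, so the list is
# already queried while the flow object is being constructed, before flow.run().
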
df_mapped = df_map_mixed_dtypes_for_parquet.bind( + df_with_metadata, dtypes_dict, flow=self + ) + + df_to_file = df_to_parquet.bind( + df=df_mapped, + path=self.path, + flow=self, + ) + + file_to_adls_task = AzureDataLakeUpload() + file_to_adls_task.bind( + from_path=self.path, + to_path=self.adls_dir_path, + overwrite=self.overwrite, + sp_credentials_secret=self.adls_sp_credentials_secret, + flow=self, + ) + + dtypes_to_json_task.bind( + dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self + ) + + + json_to_adls_task = AzureDataLakeUpload() + json_to_adls_task.bind( + from_path=self.local_json_path, + to_path=self.adls_schema_file_dir_file, + overwrite=self.overwrite, + sp_credentials_secret=self.adls_sp_credentials_secret, + flow=self, + ) + + df_mapped.set_upstream(df_with_metadata, flow=self) + dtypes_to_json_task.set_upstream(df_mapped, flow=self) + df_to_file.set_upstream(dtypes_to_json_task, flow=self) + + file_to_adls_task.set_upstream(df_to_file, flow=self) + json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) + set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + + @staticmethod + def slugify(name): + return name.replace(" ", "_").lower() \ No newline at end of file diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index 094cec14e..8308c78c9 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -8,7 +8,7 @@ from .outlook import Outlook from .salesforce import Salesforce from .sftp import SftpConnector -from .sharepoint import Sharepoint +from .sharepoint import Sharepoint, SharepointList from .supermetrics import Supermetrics try: @@ -33,4 +33,4 @@ # APIS from .uk_carbon_intensity import UKCarbonIntensity -from .vid_club import VidClub +from .vid_club import VidClub \ No newline at end of file diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 61eda17f2..028499c94 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -1,10 +1,26 @@ -from typing import Any, Dict - -import sharepy - from ..config import local_config from ..exceptions import CredentialError from .base import Source +from viadot.utils import get_nested_dict + +from typing import Any, Dict, List +from fnmatch import fnmatch +from datetime import datetime +from copy import deepcopy +import pandas as pd + +import sharepy +from office365.runtime.auth.authentication_context import AuthenticationContext +from office365.sharepoint.client_context import ClientContext +from office365.runtime.client_request_exception import ClientRequestException +from prefect.utilities import logging + + +logger = logging.get_logger() + +# Print out how many rows was extracted in specific iteration +def log_of_progress(items): + logger.info("Items read: {0}".format(len(items))) class Sharepoint(Source): @@ -64,3 +80,403 @@ def download_file( url=download_from_path, filename=download_to_path, ) + + + +class SharepointList(Source): + """ + A Sharepoint_List class to connect and download data from sharpoint lists. 
+ + Args: + credentials (dict): Credentials should include: + - "tenant" + - "client_id" + - "scopes" + - "thumbprint" + - "private_key" + """ + + def __init__( + self, + credentials: Dict[str, Any] = None, + *args, + **kwargs, + ): + DEFAULT_CREDENTIALS = local_config.get("SHAREPOINT_CERT") + credentials = credentials or DEFAULT_CREDENTIALS + if credentials is None: + raise CredentialError("Credentials not found.") + + super().__init__(*args, credentials=credentials, **kwargs) + + + def get_connection( + self, + site_url: str = None, + ): + + # Connecting into Sharepoint with AuthenticationContext + try: + auth_context = AuthenticationContext(site_url) + auth_context.with_client_certificate(tenant=self.credentials["TENANT"], + client_id=self.credentials["CLIENT_ID"], + scopes=[self.credentials["SCOPES"]], + thumbprint=self.credentials["THUMBPRINT"], + private_key=self.credentials["PRIVATE_KEY"]) + + self.ctx = ClientContext(site_url, auth_context) + logger.info("Successfully connect to Sharepoint Lists") + + except Exception as ex: + logger.info(f"Error at ClientContext or authentication level: {ex}") + + return self.ctx + + + # Function for extracting list items from search fields + def _unpack_fields( + self, + list_item, + selected_fields: dict = None, + ): + + # Creating the body of dictionary + new_dict = dict() + + # For loop scanning the propertys of searching fields + item_values_dict = list_item.properties + for field,val in item_values_dict.items(): + nested_dict = get_nested_dict(val) + # Check if the dictionary is nested + if nested_dict != None: + # It might be that there are different field properties than expected + nested_value = nested_dict.get(selected_fields["FieldProperty"]) + if nested_value != None: + new_dict[field] = nested_value + else: + logger.info("I'm not the right value") + raise ValueError + else: + new_dict[field] = val + + return new_dict + + + def get_fields( + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + ): + + ctx = self.get_connection(site_url = site_url) + + # Get list of lists object by List Title + self.list_object = ctx.web.lists.get_by_title(list_title) + list_fields_all = self.list_object.fields + + # Get all or specifics list of objects + if required_fields is None: + ctx.load(list_fields_all) + ctx.execute_query() + + return list_fields_all + + else: + list_fields_required = [list_fields_all.get_by_internal_name_or_title(field).get() + for field in required_fields] + ctx.execute_batch() + + return list_fields_required + + + def select_expandable_user_fields( + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + ): + """ + Method to expand fields and get more informations. + field_property to expand can be: ID, Title, FieldTypeKind, TypeAsString and many more. + -> more properties can be discovered by getting list.item.properties. 
+ Default to "Title" + """ + + list_fields = self.get_fields(list_title=list_title, + site_url=site_url, + required_fields=required_fields) + + # Finding the "selected" fields + fields_to_select = [field.properties['InternalName'] +f"/{field_property}" + if fnmatch(field.properties['TypeAsString'],"User*") + else field.properties['InternalName'] for field in list_fields] + # Finding the "expanded" fields + fields_to_expand = [field.properties['InternalName'] for field in list_fields + if fnmatch(field.properties['TypeAsString'],f"User*")] + + # Creating the body of the function output + selected_fields = {"FieldInternalNames" : fields_to_select, "FieldToExpand": fields_to_expand, "FieldProperty": field_property} + + return selected_fields + + + def check_filters( + self, + filters: dict = None, + ) -> bool: + """ + Function to check if filters dict is valid. + example1: if operator2 is present value2 must be in place as well + example2: if dtype is not on allowed list it will throw an error + """ + + allowed_dtypes = ['datetime','date','bool','int', 'float', 'complex', 'str'] + allowed_conjuction = ['&','|'] + allowed_operators = ['<','>','<=','>=','==','!='] + + for parameters in filters.values(): + if parameters.get('dtype') not in allowed_dtypes: + raise ValueError(f"dtype not allowed! Expected {allowed_dtypes} got: {parameters.get('dtype')}.") + if parameters.get('operator1'): + if parameters.get('operator1') not in allowed_operators: + raise ValueError(f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator1')}.") + if not parameters.get('value1'): + raise ValueError('Value for operator1 is missing!') + elif not parameters.get('operator1') : raise ValueError('Operator1 is missing!') + if not parameters.get('operator2') and parameters.get('operators_conjuction') is not None: + raise ValueError(f"Operator conjuction allowed only with more than one filter operator!") + if parameters.get('operator2'): + if parameters.get('operator2') not in allowed_operators: + raise ValueError(f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator2')}.") + if not parameters.get('value2'): + raise ValueError('Value for operator2 is missing!') + if not parameters.get('operators_conjuction'): + raise ValueError(f"Operators for conjuction is missing! Expected {allowed_conjuction} got empty.") + if parameters.get('operators_conjuction') not in allowed_conjuction: + raise ValueError(f"Operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('operators_conjuction')}.") + if parameters.get('filters_conjuction'): + if len(filters.keys()) == 1 and parameters.get('filters_conjuction') is not None: + raise ValueError(f"Filters conjuction allowed only with more than one filter column!") + if parameters.get('filters_conjuction') not in allowed_conjuction: + raise ValueError(f"Filters operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('filters_conjuction')}.") + + return True + + + def operators_mapping( + self, + filters: dict = None, + ) -> dict: + """ + Function for mapping comparison and conjuction(logical) operators of filters to the format which is recognized by Microsoft API. + + Args: + filters (dict): A dictionar which contains operators. + + Returns: + New modified dict. 
+ """ + + filters_dict = deepcopy(filters) + operators = { + '<' : 'lt', + '>' : 'gt', + '<=' : 'le', + '>=' : 'ge', + '==' : 'eq', + '!=' : 'ne' + } + logical_op = { + '&' : 'and', + '|' : 'or' + } + + for parameters in filters_dict.values(): + if parameters.get('operator1'): + operator1_to_change = parameters.get('operator1') + if operator1_to_change in operators.keys(): + parameters['operator1'] = operators[operator1_to_change] + else: raise ValueError(f'This comparison operator: {operator1_to_change} is not allowed. Please read the function documentation for details!') + if parameters.get('operator2'): + operator2_to_change = parameters.get('operator2') + if operator2_to_change in operators.keys(): + parameters['operator2'] = operators[operator2_to_change] + else: raise ValueError(f'This comparison operator: {operator2_to_change} is not allowed. Please read the function documentation for details!') + if parameters.get('operators_conjuction'): + logical_op_to_change = parameters.get('operators_conjuction') + if logical_op_to_change in logical_op.keys(): + parameters['operators_conjuction'] = logical_op[logical_op_to_change] + else: raise ValueError(f'This conjuction(logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!') + if parameters.get('filters_conjuction'): + logical_fl_to_change = parameters.get('filters_conjuction') + if logical_fl_to_change in logical_op.keys(): + parameters['filters_conjuction'] = logical_op[logical_fl_to_change] + else: raise ValueError(f'This conjuction(logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!') + + return filters_dict + + + def make_filter_for_api( + self, + filters: dict + ) -> 'str': + """ + Function changing type of operators to match MS API style as 'str' passing to URL call. + + Args: + filters (dict): A dictionar which contains operators. + + Returns: + Output as string to pass as filter parameter to API. + """ + + filter_text = '' + filters_mod = self.operators_mapping(filters) + + for column, parameters in filters_mod.items(): + if parameters.get('dtype') in ['datetime','date']: + from_date1 = datetime.strptime(parameters.get('value1'),'%Y-%m-%d').isoformat() + filter_text = filter_text + f"{column} {parameters.get('operator1')} datetime'{from_date1}' " + if parameters.get('operator2'): + from_date2 = datetime.strptime(parameters.get('value2'),'%Y-%m-%d').isoformat() + filter_text = filter_text + f" {parameters.get('operators_conjuction')} {column} {parameters.get('operator2')} datetime'{from_date2}' " + elif parameters.get('dtype') not in ['datetime','date']: + filter_text = filter_text + f"{column} {parameters.get('operator1')} '{parameters.get('value1')}'" + if parameters.get('operator2'): + filter_text = filter_text + f"{column} {parameters.get('operator2')} '{parameters.get('value2')}'" + if parameters.get('filters_conjuction'): + filter_text = filter_text + f"{parameters.get('filters_conjuction')} " + + return filter_text + + + def make_filter_for_df( + self, + filters: dict = None, + ) -> 'str': + """ + Function changing dict operators into pandas DataFrame filters. + + Args: + filters (dict): A dictionar which contains operators. + + Returns: + Output as string to pass as filter to DataFrame. 
+ """ + + filter_in_df = 'df.loc[' + + for column, parameters in filters.items(): + filter_in_df = filter_in_df + f"(df.{column} {parameters.get('operator1', '')} '{parameters.get('value1', '')}'" + + if parameters.get('operator2'): + filter_in_df = filter_in_df + f") {parameters.get('operators_conjuction')} (df.{column} {parameters.get('operator2', '')} '{parameters.get('value2', '')}'" + + if parameters.get('filters_conjuction'): + filter_in_df = filter_in_df + ')' + parameters.get('filters_conjuction') + + else: + filter_in_df = filter_in_df + ')' + + filter_in_df = filter_in_df + ']' + + return filter_in_df + + + def list_item_to_df( + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + filters: dict = None, + row_count: int = 5000, + ): + """ + Method to extract data from Sharepoint List into DataFrame. + + Args: + list_title (str): Title of Sharepoint List. Default to None. + site_url (str): URL to set of Sharepoint Lists. Default to None. + required_fields (List[str]): Required fields(columns) need to be extracted from + Sharepoint List. Default to None. + field_property (List[str]): Property to expand with expand query method. + All propertys can be found under list.item.properties. + Default to ["Title"] + filters (dict): Dictionary with operators which filters the SharepointList output. + allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') + allowed conjuction: ('&','|') + allowed operators: ('<','>','<=','>=','==','!=') + Example how to build the dict: + filters = { + 'Column_name_1' : + { + 'dtype': 'datetime', + 'value1':'YYYY-MM-DD', + 'value2':'YYYY-MM-DD', + 'operator1':'>=', + 'operator2':'<=', + 'operators_conjuction':'&', + 'filters_conjuction':'&', + } + , + 'Column_name_2' : + { + 'dtype': 'str', + 'value1':'NM-PL', + 'operator1':'==', + }, + } + row_count (int): Number of downloaded rows in single request. Default to 5000. + + Returns: + pd.DataFrame + """ + + # checking if the passed filters dictionary is correctly built + if filters is not None: + self.check_filters(filters) + # checking that the filter parameters are included in the desired field parameters + for key in filters: + if key not in required_fields: + raise AttributeError(f"Filter '{key}' not included inside required fields. It is obligatory to extract data which is filtered!") + + # changing the body of the filter for MS API call + filter_text = self.make_filter_for_api(filters) + + download_all = False + + # extracting requeird_fields SP_List objects + selected_fields = self.select_expandable_user_fields(list_title=list_title, + site_url=site_url, + required_fields=required_fields, + field_property=field_property) + + try: + # Extract data below 5k rows or max limitation of the specific SP List with basic filtering. + if filters is None: + raise ValueError("There is no filter. Starting extraxction all data") + else: + list_items= self.list_object.items.filter(filter_text).select(selected_fields["FieldInternalNames"]).top(row_count).expand(selected_fields["FieldToExpand"]) + self.ctx.load(list_items) + self.ctx.execute_query() + + except (ClientRequestException, ValueError) as e: + # Extract all data from specific SP List without basic filtering. Additional logic for filtering applied on DataFreame level. 
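
# A small sketch of what the two filter builders above produce; the expected
# strings mirror the assertions in tests/integration/test_sharepoint.py, and
# the dummy credentials are placeholders needed only to construct the object.
from viadot.sources import SharepointList

spl = SharepointList(
    credentials={
        "TENANT": "fake",
        "CLIENT_ID": "fake",
        "SCOPES": "fake",
        "THUMBPRINT": "fake",
        "PRIVATE_KEY": "fake",
    }
)

api_filter = spl.make_filter_for_api(
    {"date_column": {"dtype": "datetime", "operator1": ">", "value1": "2023-01-01"}}
)
# api_filter == "date_column gt datetime'2023-01-01T00:00:00' "

df_filter = spl.make_filter_for_df(
    {
        "column1": {"operator1": ">", "value1": 10, "filters_conjuction": "&"},
        "column2": {"operator1": "<", "value1": 20},
    }
)
# df_filter == "df.loc[(df.column1 > '10')&(df.column2 < '20')]"
# list_item_to_df() applies this string with eval() when it has to fall back
# to downloading the full list.
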
+ logger.info(f"Exception SPQueryThrottledException occurred: {e}") + list_items = self.list_object.items.get_all(row_count,log_of_progress).select(selected_fields["FieldInternalNames"]).expand(selected_fields["FieldToExpand"]) + self.ctx.load(list_items) + self.ctx.execute_query() + download_all = True + + df = pd.DataFrame([self._unpack_fields(row_item,selected_fields) for row_item in list_items]) + + if download_all == True and filters is not None: + # Filter for desired range of created date and for factory Namyslow PL + self.logger.info("Filtering df with all data output") + filter_for_df = self.make_filter_for_df(filters) + df = eval(filter_for_df) + + return df \ No newline at end of file diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 02791852b..1e67d96a6 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -31,7 +31,7 @@ from .outlook import OutlookToDF from .prefect_date_range import GetFlowNewDateRange from .salesforce import SalesforceBulkUpsert, SalesforceToDF, SalesforceUpsert -from .sharepoint import SharepointToDF +from .sharepoint import SharepointToDF, SharepointListToDF from .sqlite import SQLiteInsert, SQLiteQuery, SQLiteSQLtoDF from .supermetrics import SupermetricsToCSV, SupermetricsToDF diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index 7ba9c4d41..cb7f30a19 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -1,16 +1,17 @@ +from typing import List +import pandas as pd import copy import json import os -from typing import List +import re -import pandas as pd from prefect import Task from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs from ..exceptions import ValidationError -from ..sources import Sharepoint +from ..sources import Sharepoint, SharepointList from .azure_key_vault import AzureKeyVaultSecret from ..utils import add_viadot_metadata_columns @@ -230,3 +231,171 @@ def run( df = self.df_replace_special_chars(df) self.logger.info(f"Successfully converted data to a DataFrame.") return df + + + +class SharepointListToDF(Task): + """ + Task to extract data from Sharepoint List into DataFrame. + + Args: + list_title (str): Title of Sharepoint List. Default to None. + site_url (str): URL to set of Sharepoint Lists. Default to None. + required_fields (List[str]): Required fields(columns) need to be extracted from + Sharepoint List. Default to None. + field_property (List[str]): Property to expand with expand query method. + All propertys can be found under list.item.properties. + Default to ["Title"] + filters (dict): Dictionary with operators which filters the SharepointList output. + allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') + allowed conjuction: ('&','|') + allowed operators: ('<','>','<=','>=','==','!=') + Example how to build the dict: + filters = { + 'Column_name_1' : + { + 'dtype': 'datetime', + 'value1':'YYYY-MM-DD', + 'value2':'YYYY-MM-DD', + 'operator1':'>=', + 'operator2':'<=', + 'operators_conjuction':'&', + 'filters_conjuction':'&', + } + , + 'Column_name_2' : + { + 'dtype': 'str', + 'value1':'NM-PL', + 'operator1':'==', + }, + } + row_count (int): Number of downloaded rows in single request. Default to 5000. 
+ + Returns: + pandas DataFrame + """ + + def __init__( + self, + path: str = None, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + filters: dict = None, + row_count: int = 5000, + credentials_secret: str = None, + vault_name: str = None, + *args, + **kwargs, + ): + + self.path = path + self.list_title = list_title + self.site_url = site_url + self.required_fields = required_fields + self.field_property = field_property + self.filters = filters + self.row_count = row_count + self.vault_name = vault_name + self.credentials_secret = credentials_secret + + if not credentials_secret: + # Attempt to read a default for the service principal secret name + try: + credentials_secret = PrefectSecret("SHAREPOINT-CERT").run() + except ValueError: + pass + + if credentials_secret: + credentials_str = AzureKeyVaultSecret( + secret = self.credentials_secret, vault_name=self.vault_name + ).run() + self.credentials = json.loads(credentials_str) + + + super().__init__( + *args, + **kwargs, + ) + + + def __call__(self): + """Download Sharepoint_List data to a .parquet file""" + super().__call__(self) + + + def _convert_camel_case_to_words( + self, + input_str: str) -> str: + + self.input_str = input_str + + words = re.findall(r'[A-Z][a-z]*|[0-9]+', self.input_str) + converted = ' '.join(words) + + return converted + + + def change_column_name( + self, + df: pd.DataFrame = None, + ): + s = SharepointList() + list_fields = s.get_fields( + list_title= self.list_title, + site_url= self.site_url, + required_fields= self.required_fields, + ) + + self.logger.info("Changing columns names") + column_names_correct = [field.properties['Title'] for field in list_fields] + column_names_code = [field.properties['InternalName'] for field in list_fields] + dictionary = dict(zip(column_names_code, column_names_correct)) + + # If duplicates in names from "Title" take "InternalName" + value_count = {} + duplicates = [] + + for key, value in dictionary.items(): + if value in value_count: + if value_count[value] not in duplicates: + duplicates.append(value_count[value]) + duplicates.append(key) + else: + value_count[value] = key + + for key in duplicates: + dictionary[key] = self._convert_camel_case_to_words(key) + + # Rename columns names inside DataFrame + df = df.rename(columns=dictionary) + + return df + + + def run( + self, + ) -> None: + """ + Run Task SharepointListToDF. 
+ + Returns: + pd.DataFrame + """ + + s = SharepointList(credentials=self.credentials,) + df_raw = s.list_item_to_df( + list_title = self.list_title, + site_url = self.site_url, + required_fields = self.required_fields, + field_property = self.field_property, + filters = self.filters, + row_count = self.row_count, + ) + + df = self.change_column_name(df=df_raw) + self.logger.info("Successfully changed structure of the DataFrame") + + return df \ No newline at end of file diff --git a/viadot/utils.py b/viadot/utils.py index 2b4c80538..49fa33e09 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -449,3 +449,14 @@ def wrapper(*args, **kwargs) -> pd.DataFrame: return wrapper return decorator + + +def get_nested_dict(d): + if isinstance(d, dict): + for lvl in d.values(): + if isinstance(lvl, dict): + return get_nested_dict(lvl) + else: + return d + else: + return None \ No newline at end of file From dceee62735e4f6327f317be55e8811f3d567cb08 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 13:46:34 +0200 Subject: [PATCH 42/47] =?UTF-8?q?=E2=9C=A8=20Added=20fetch=20all?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 006f22e39..ad2f298bd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -73,6 +73,7 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY black . + git fetch --all git checkout $GITHUB_HEAD_REF git commit -am "🎨 Format Python code with Black" git push From 44dc50feb5d84690ecc16a410c015eebb91923cb Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 14:11:57 +0200 Subject: [PATCH 43/47] =?UTF-8?q?=F0=9F=8E=A8=20formatted=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build.yml | 1 - viadot/flows/__init__.py | 2 +- viadot/flows/sharepoint_to_adls.py | 44 ++- viadot/sources/__init__.py | 2 +- viadot/sources/sharepoint.py | 434 +++++++++++++++++------------ viadot/tasks/__init__.py | 4 +- viadot/tasks/sharepoint.py | 78 +++--- viadot/utils.py | 2 +- 8 files changed, 314 insertions(+), 253 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ad2f298bd..006f22e39 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -73,7 +73,6 @@ jobs: git config --global user.email 'github-actions[bot]@users.noreply.github.com' git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY black . 
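
# Two small, pure helpers introduced by this patch, shown in isolation; the
# sample values below are made up for illustration.
import re
from viadot.utils import get_nested_dict

# get_nested_dict() digs down to the innermost dictionary; _unpack_fields()
# uses it to reach the expanded property (e.g. "Title") of a user field.
get_nested_dict({"Editor": {"Title": "Jane Doe"}})  # -> {"Title": "Jane Doe"}
get_nested_dict({"Title": "Jane Doe"})              # -> {"Title": "Jane Doe"}
get_nested_dict("plain value")                      # -> None

# The same regex split used by SharepointListToDF._convert_camel_case_to_words()
# when duplicated display names force a fallback to the internal column name:
" ".join(re.findall(r"[A-Z][a-z]*|[0-9]+", "FactoryCode2"))  # -> "Factory Code 2"
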
- git fetch --all git checkout $GITHUB_HEAD_REF git commit -am "🎨 Format Python code with Black" git push diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index ed6b5a004..e138735d6 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -46,4 +46,4 @@ from .sql_server_to_parquet import SQLServerToParquet from .sql_server_transform import SQLServerTransform from .transform_and_catalog import TransformAndCatalogToLuma -from .vid_club_to_adls import VidClubToADLS \ No newline at end of file +from .vid_club_to_adls import VidClubToADLS diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index c234b4a88..4b8e477dd 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -181,16 +181,15 @@ def slugify(name): return name.replace(" ", "_").lower() - class SharepointListToADLS(Flow): def __init__( self, name: str, - list_title: str = None, + list_title: str = None, site_url: str = None, required_fields: List[str] = None, field_property: str = "Title", - filters: dict = None, + filters: dict = None, row_count: int = 5000, sp_cert_credentials_secret: str = None, vault_name: str = None, @@ -202,7 +201,7 @@ def __init__( output_file_extension: str = ".parquet", validate_df_dict: dict = None, *args: List[any], - **kwargs: Dict[str, Any], + **kwargs: Dict[str, Any], ): """ Run Flow SharepointListToADLS. @@ -233,7 +232,7 @@ def __init__( 'filters_conjuction':'&', # conjuction filters allowed only when 2 columns passed } , - 'Column_name_2' : + 'Column_name_2' : { 'dtype': 'str', 'value1':'NM-PL', @@ -247,10 +246,10 @@ def __init__( adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. adls_file_name (str, optional): Name of file in ADLS. Defaults to None. adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. - + Returns: .parquet file inside ADLS. 
""" @@ -267,7 +266,6 @@ def __init__( self.row_count = row_count self.validate_df_dict = validate_df_dict - # AzureDataLakeUpload self.adls_dir_path = adls_dir_path self.adls_file_name = adls_file_name @@ -296,26 +294,25 @@ def __init__( adls_dir_path, "schema", self.now + ".json" ) - super().__init__( name=name, - *args, + *args, **kwargs, - ) + ) self.gen_flow() def gen_flow(self) -> Flow: - s = SharepointListToDF( - path = self.path, - list_title = self.list_title, - site_url = self.site_url, - required_fields = self.required_fields, - field_property = self.field_property, - filters = self.filters, - row_count = self.row_count, - credentials_secret=self.sp_cert_credentials_secret, - ) + s = SharepointListToDF( + path=self.path, + list_title=self.list_title, + site_url=self.site_url, + required_fields=self.required_fields, + field_property=self.field_property, + filters=self.filters, + row_count=self.row_count, + credentials_secret=self.sp_cert_credentials_secret, + ) df = s.run() if self.validate_df_dict: @@ -327,7 +324,7 @@ def gen_flow(self) -> Flow: df_mapped = df_map_mixed_dtypes_for_parquet.bind( df_with_metadata, dtypes_dict, flow=self ) - + df_to_file = df_to_parquet.bind( df=df_mapped, path=self.path, @@ -347,7 +344,6 @@ def gen_flow(self) -> Flow: dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self ) - json_to_adls_task = AzureDataLakeUpload() json_to_adls_task.bind( from_path=self.local_json_path, @@ -367,4 +363,4 @@ def gen_flow(self) -> Flow: @staticmethod def slugify(name): - return name.replace(" ", "_").lower() \ No newline at end of file + return name.replace(" ", "_").lower() diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index 8308c78c9..c0d96abe2 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -33,4 +33,4 @@ # APIS from .uk_carbon_intensity import UKCarbonIntensity -from .vid_club import VidClub \ No newline at end of file +from .vid_club import VidClub diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 028499c94..096de825b 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -82,15 +82,14 @@ def download_file( ) - class SharepointList(Source): """ A Sharepoint_List class to connect and download data from sharpoint lists. 
Args: credentials (dict): Credentials should include: - - "tenant" - - "client_id" + - "tenant" + - "client_id" - "scopes" - "thumbprint" - "private_key" @@ -106,46 +105,46 @@ def __init__( credentials = credentials or DEFAULT_CREDENTIALS if credentials is None: raise CredentialError("Credentials not found.") - - super().__init__(*args, credentials=credentials, **kwargs) + super().__init__(*args, credentials=credentials, **kwargs) def get_connection( - self, - site_url: str = None, - ): - + self, + site_url: str = None, + ): + # Connecting into Sharepoint with AuthenticationContext try: auth_context = AuthenticationContext(site_url) - auth_context.with_client_certificate(tenant=self.credentials["TENANT"], - client_id=self.credentials["CLIENT_ID"], - scopes=[self.credentials["SCOPES"]], - thumbprint=self.credentials["THUMBPRINT"], - private_key=self.credentials["PRIVATE_KEY"]) - + auth_context.with_client_certificate( + tenant=self.credentials["TENANT"], + client_id=self.credentials["CLIENT_ID"], + scopes=[self.credentials["SCOPES"]], + thumbprint=self.credentials["THUMBPRINT"], + private_key=self.credentials["PRIVATE_KEY"], + ) + self.ctx = ClientContext(site_url, auth_context) logger.info("Successfully connect to Sharepoint Lists") except Exception as ex: logger.info(f"Error at ClientContext or authentication level: {ex}") - return self.ctx - + return self.ctx # Function for extracting list items from search fields def _unpack_fields( - self, - list_item, - selected_fields: dict = None, - ): - + self, + list_item, + selected_fields: dict = None, + ): + # Creating the body of dictionary new_dict = dict() # For loop scanning the propertys of searching fields item_values_dict = list_item.properties - for field,val in item_values_dict.items(): + for field, val in item_values_dict.items(): nested_dict = get_nested_dict(val) # Check if the dictionary is nested if nested_dict != None: @@ -158,18 +157,17 @@ def _unpack_fields( raise ValueError else: new_dict[field] = val - - return new_dict - + + return new_dict def get_fields( - self, - list_title: str = None, - site_url: str = None, - required_fields: List[str] = None, - ): - - ctx = self.get_connection(site_url = site_url) + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + ): + + ctx = self.get_connection(site_url=site_url) # Get list of lists object by List Title self.list_object = ctx.web.lists.get_by_title(list_title) @@ -181,219 +179,278 @@ def get_fields( ctx.execute_query() return list_fields_all - + else: - list_fields_required = [list_fields_all.get_by_internal_name_or_title(field).get() - for field in required_fields] + list_fields_required = [ + list_fields_all.get_by_internal_name_or_title(field).get() + for field in required_fields + ] ctx.execute_batch() - + return list_fields_required - - + def select_expandable_user_fields( - self, - list_title: str = None, - site_url: str = None, - required_fields: List[str] = None, - field_property: str = "Title", - ): + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + ): """ Method to expand fields and get more informations. - field_property to expand can be: ID, Title, FieldTypeKind, TypeAsString and many more. - -> more properties can be discovered by getting list.item.properties. + field_property to expand can be: ID, Title, FieldTypeKind, TypeAsString and many more. + -> more properties can be discovered by getting list.item.properties. 
Default to "Title" """ - list_fields = self.get_fields(list_title=list_title, - site_url=site_url, - required_fields=required_fields) - + list_fields = self.get_fields( + list_title=list_title, site_url=site_url, required_fields=required_fields + ) + # Finding the "selected" fields - fields_to_select = [field.properties['InternalName'] +f"/{field_property}" - if fnmatch(field.properties['TypeAsString'],"User*") - else field.properties['InternalName'] for field in list_fields] + fields_to_select = [ + field.properties["InternalName"] + f"/{field_property}" + if fnmatch(field.properties["TypeAsString"], "User*") + else field.properties["InternalName"] + for field in list_fields + ] # Finding the "expanded" fields - fields_to_expand = [field.properties['InternalName'] for field in list_fields - if fnmatch(field.properties['TypeAsString'],f"User*")] - + fields_to_expand = [ + field.properties["InternalName"] + for field in list_fields + if fnmatch(field.properties["TypeAsString"], f"User*") + ] + # Creating the body of the function output - selected_fields = {"FieldInternalNames" : fields_to_select, "FieldToExpand": fields_to_expand, "FieldProperty": field_property} + selected_fields = { + "FieldInternalNames": fields_to_select, + "FieldToExpand": fields_to_expand, + "FieldProperty": field_property, + } return selected_fields - def check_filters( - self, - filters: dict = None, - ) -> bool: + self, + filters: dict = None, + ) -> bool: """ Function to check if filters dict is valid. example1: if operator2 is present value2 must be in place as well example2: if dtype is not on allowed list it will throw an error """ - allowed_dtypes = ['datetime','date','bool','int', 'float', 'complex', 'str'] - allowed_conjuction = ['&','|'] - allowed_operators = ['<','>','<=','>=','==','!='] - + allowed_dtypes = ["datetime", "date", "bool", "int", "float", "complex", "str"] + allowed_conjuction = ["&", "|"] + allowed_operators = ["<", ">", "<=", ">=", "==", "!="] + for parameters in filters.values(): - if parameters.get('dtype') not in allowed_dtypes: - raise ValueError(f"dtype not allowed! Expected {allowed_dtypes} got: {parameters.get('dtype')}.") - if parameters.get('operator1'): - if parameters.get('operator1') not in allowed_operators: - raise ValueError(f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator1')}.") - if not parameters.get('value1'): - raise ValueError('Value for operator1 is missing!') - elif not parameters.get('operator1') : raise ValueError('Operator1 is missing!') - if not parameters.get('operator2') and parameters.get('operators_conjuction') is not None: - raise ValueError(f"Operator conjuction allowed only with more than one filter operator!") - if parameters.get('operator2'): - if parameters.get('operator2') not in allowed_operators: - raise ValueError(f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator2')}.") - if not parameters.get('value2'): - raise ValueError('Value for operator2 is missing!') - if not parameters.get('operators_conjuction'): - raise ValueError(f"Operators for conjuction is missing! Expected {allowed_conjuction} got empty.") - if parameters.get('operators_conjuction') not in allowed_conjuction: - raise ValueError(f"Operators for conjuction not allowed! 
Expected {allowed_conjuction} got {parameters.get('operators_conjuction')}.") - if parameters.get('filters_conjuction'): - if len(filters.keys()) == 1 and parameters.get('filters_conjuction') is not None: - raise ValueError(f"Filters conjuction allowed only with more than one filter column!") - if parameters.get('filters_conjuction') not in allowed_conjuction: - raise ValueError(f"Filters operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('filters_conjuction')}.") + if parameters.get("dtype") not in allowed_dtypes: + raise ValueError( + f"dtype not allowed! Expected {allowed_dtypes} got: {parameters.get('dtype')}." + ) + if parameters.get("operator1"): + if parameters.get("operator1") not in allowed_operators: + raise ValueError( + f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator1')}." + ) + if not parameters.get("value1"): + raise ValueError("Value for operator1 is missing!") + elif not parameters.get("operator1"): + raise ValueError("Operator1 is missing!") + if ( + not parameters.get("operator2") + and parameters.get("operators_conjuction") is not None + ): + raise ValueError( + f"Operator conjuction allowed only with more than one filter operator!" + ) + if parameters.get("operator2"): + if parameters.get("operator2") not in allowed_operators: + raise ValueError( + f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator2')}." + ) + if not parameters.get("value2"): + raise ValueError("Value for operator2 is missing!") + if not parameters.get("operators_conjuction"): + raise ValueError( + f"Operators for conjuction is missing! Expected {allowed_conjuction} got empty." + ) + if parameters.get("operators_conjuction") not in allowed_conjuction: + raise ValueError( + f"Operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('operators_conjuction')}." + ) + if parameters.get("filters_conjuction"): + if ( + len(filters.keys()) == 1 + and parameters.get("filters_conjuction") is not None + ): + raise ValueError( + f"Filters conjuction allowed only with more than one filter column!" + ) + if parameters.get("filters_conjuction") not in allowed_conjuction: + raise ValueError( + f"Filters operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('filters_conjuction')}." + ) return True - def operators_mapping( - self, - filters: dict = None, - ) -> dict: - """ + self, + filters: dict = None, + ) -> dict: + """ Function for mapping comparison and conjuction(logical) operators of filters to the format which is recognized by Microsoft API. - + Args: - filters (dict): A dictionar which contains operators. - + filters (dict): A dictionar which contains operators. + Returns: New modified dict. """ filters_dict = deepcopy(filters) operators = { - '<' : 'lt', - '>' : 'gt', - '<=' : 'le', - '>=' : 'ge', - '==' : 'eq', - '!=' : 'ne' - } - logical_op = { - '&' : 'and', - '|' : 'or' + "<": "lt", + ">": "gt", + "<=": "le", + ">=": "ge", + "==": "eq", + "!=": "ne", } - + logical_op = {"&": "and", "|": "or"} + for parameters in filters_dict.values(): - if parameters.get('operator1'): - operator1_to_change = parameters.get('operator1') + if parameters.get("operator1"): + operator1_to_change = parameters.get("operator1") if operator1_to_change in operators.keys(): - parameters['operator1'] = operators[operator1_to_change] - else: raise ValueError(f'This comparison operator: {operator1_to_change} is not allowed. 
Please read the function documentation for details!') - if parameters.get('operator2'): - operator2_to_change = parameters.get('operator2') + parameters["operator1"] = operators[operator1_to_change] + else: + raise ValueError( + f"This comparison operator: {operator1_to_change} is not allowed. Please read the function documentation for details!" + ) + if parameters.get("operator2"): + operator2_to_change = parameters.get("operator2") if operator2_to_change in operators.keys(): - parameters['operator2'] = operators[operator2_to_change] - else: raise ValueError(f'This comparison operator: {operator2_to_change} is not allowed. Please read the function documentation for details!') - if parameters.get('operators_conjuction'): - logical_op_to_change = parameters.get('operators_conjuction') + parameters["operator2"] = operators[operator2_to_change] + else: + raise ValueError( + f"This comparison operator: {operator2_to_change} is not allowed. Please read the function documentation for details!" + ) + if parameters.get("operators_conjuction"): + logical_op_to_change = parameters.get("operators_conjuction") if logical_op_to_change in logical_op.keys(): - parameters['operators_conjuction'] = logical_op[logical_op_to_change] - else: raise ValueError(f'This conjuction(logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!') - if parameters.get('filters_conjuction'): - logical_fl_to_change = parameters.get('filters_conjuction') + parameters["operators_conjuction"] = logical_op[ + logical_op_to_change + ] + else: + raise ValueError( + f"This conjuction(logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!" + ) + if parameters.get("filters_conjuction"): + logical_fl_to_change = parameters.get("filters_conjuction") if logical_fl_to_change in logical_op.keys(): - parameters['filters_conjuction'] = logical_op[logical_fl_to_change] - else: raise ValueError(f'This conjuction(logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!') - - return filters_dict + parameters["filters_conjuction"] = logical_op[logical_fl_to_change] + else: + raise ValueError( + f"This conjuction(logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!" + ) + return filters_dict - def make_filter_for_api( - self, - filters: dict - ) -> 'str': + def make_filter_for_api(self, filters: dict) -> "str": """ Function changing type of operators to match MS API style as 'str' passing to URL call. Args: - filters (dict): A dictionar which contains operators. + filters (dict): A dictionar which contains operators. Returns: Output as string to pass as filter parameter to API. 
""" - - filter_text = '' + + filter_text = "" filters_mod = self.operators_mapping(filters) for column, parameters in filters_mod.items(): - if parameters.get('dtype') in ['datetime','date']: - from_date1 = datetime.strptime(parameters.get('value1'),'%Y-%m-%d').isoformat() - filter_text = filter_text + f"{column} {parameters.get('operator1')} datetime'{from_date1}' " - if parameters.get('operator2'): - from_date2 = datetime.strptime(parameters.get('value2'),'%Y-%m-%d').isoformat() - filter_text = filter_text + f" {parameters.get('operators_conjuction')} {column} {parameters.get('operator2')} datetime'{from_date2}' " - elif parameters.get('dtype') not in ['datetime','date']: - filter_text = filter_text + f"{column} {parameters.get('operator1')} '{parameters.get('value1')}'" - if parameters.get('operator2'): - filter_text = filter_text + f"{column} {parameters.get('operator2')} '{parameters.get('value2')}'" - if parameters.get('filters_conjuction'): + if parameters.get("dtype") in ["datetime", "date"]: + from_date1 = datetime.strptime( + parameters.get("value1"), "%Y-%m-%d" + ).isoformat() + filter_text = ( + filter_text + + f"{column} {parameters.get('operator1')} datetime'{from_date1}' " + ) + if parameters.get("operator2"): + from_date2 = datetime.strptime( + parameters.get("value2"), "%Y-%m-%d" + ).isoformat() + filter_text = ( + filter_text + + f" {parameters.get('operators_conjuction')} {column} {parameters.get('operator2')} datetime'{from_date2}' " + ) + elif parameters.get("dtype") not in ["datetime", "date"]: + filter_text = ( + filter_text + + f"{column} {parameters.get('operator1')} '{parameters.get('value1')}'" + ) + if parameters.get("operator2"): + filter_text = ( + filter_text + + f"{column} {parameters.get('operator2')} '{parameters.get('value2')}'" + ) + if parameters.get("filters_conjuction"): filter_text = filter_text + f"{parameters.get('filters_conjuction')} " return filter_text - def make_filter_for_df( - self, - filters: dict = None, - ) -> 'str': + self, + filters: dict = None, + ) -> "str": """ Function changing dict operators into pandas DataFrame filters. Args: - filters (dict): A dictionar which contains operators. + filters (dict): A dictionar which contains operators. Returns: Output as string to pass as filter to DataFrame. 
""" - filter_in_df = 'df.loc[' + filter_in_df = "df.loc[" for column, parameters in filters.items(): - filter_in_df = filter_in_df + f"(df.{column} {parameters.get('operator1', '')} '{parameters.get('value1', '')}'" + filter_in_df = ( + filter_in_df + + f"(df.{column} {parameters.get('operator1', '')} '{parameters.get('value1', '')}'" + ) - if parameters.get('operator2'): - filter_in_df = filter_in_df + f") {parameters.get('operators_conjuction')} (df.{column} {parameters.get('operator2', '')} '{parameters.get('value2', '')}'" + if parameters.get("operator2"): + filter_in_df = ( + filter_in_df + + f") {parameters.get('operators_conjuction')} (df.{column} {parameters.get('operator2', '')} '{parameters.get('value2', '')}'" + ) - if parameters.get('filters_conjuction'): - filter_in_df = filter_in_df + ')' + parameters.get('filters_conjuction') + if parameters.get("filters_conjuction"): + filter_in_df = filter_in_df + ")" + parameters.get("filters_conjuction") - else: - filter_in_df = filter_in_df + ')' + else: + filter_in_df = filter_in_df + ")" - filter_in_df = filter_in_df + ']' + filter_in_df = filter_in_df + "]" return filter_in_df - def list_item_to_df( - self, - list_title: str = None, - site_url: str = None, - required_fields: List[str] = None, - field_property: str = "Title", - filters: dict = None, - row_count: int = 5000, - ): + self, + list_title: str = None, + site_url: str = None, + required_fields: List[str] = None, + field_property: str = "Title", + filters: dict = None, + row_count: int = 5000, + ): """ Method to extract data from Sharepoint List into DataFrame. @@ -422,7 +479,7 @@ def list_item_to_df( 'filters_conjuction':'&', } , - 'Column_name_2' : + 'Column_name_2' : { 'dtype': 'str', 'value1':'NM-PL', @@ -430,7 +487,7 @@ def list_item_to_df( }, } row_count (int): Number of downloaded rows in single request. Default to 5000. - + Returns: pd.DataFrame """ @@ -441,42 +498,57 @@ def list_item_to_df( # checking that the filter parameters are included in the desired field parameters for key in filters: if key not in required_fields: - raise AttributeError(f"Filter '{key}' not included inside required fields. It is obligatory to extract data which is filtered!") - + raise AttributeError( + f"Filter '{key}' not included inside required fields. It is obligatory to extract data which is filtered!" + ) + # changing the body of the filter for MS API call filter_text = self.make_filter_for_api(filters) download_all = False # extracting requeird_fields SP_List objects - selected_fields = self.select_expandable_user_fields(list_title=list_title, - site_url=site_url, - required_fields=required_fields, - field_property=field_property) + selected_fields = self.select_expandable_user_fields( + list_title=list_title, + site_url=site_url, + required_fields=required_fields, + field_property=field_property, + ) try: # Extract data below 5k rows or max limitation of the specific SP List with basic filtering. if filters is None: raise ValueError("There is no filter. Starting extraxction all data") else: - list_items= self.list_object.items.filter(filter_text).select(selected_fields["FieldInternalNames"]).top(row_count).expand(selected_fields["FieldToExpand"]) + list_items = ( + self.list_object.items.filter(filter_text) + .select(selected_fields["FieldInternalNames"]) + .top(row_count) + .expand(selected_fields["FieldToExpand"]) + ) self.ctx.load(list_items) self.ctx.execute_query() except (ClientRequestException, ValueError) as e: # Extract all data from specific SP List without basic filtering. 
Additional logic for filtering applied on DataFreame level. logger.info(f"Exception SPQueryThrottledException occurred: {e}") - list_items = self.list_object.items.get_all(row_count,log_of_progress).select(selected_fields["FieldInternalNames"]).expand(selected_fields["FieldToExpand"]) + list_items = ( + self.list_object.items.get_all(row_count, log_of_progress) + .select(selected_fields["FieldInternalNames"]) + .expand(selected_fields["FieldToExpand"]) + ) self.ctx.load(list_items) self.ctx.execute_query() download_all = True - df = pd.DataFrame([self._unpack_fields(row_item,selected_fields) for row_item in list_items]) + df = pd.DataFrame( + [self._unpack_fields(row_item, selected_fields) for row_item in list_items] + ) - if download_all == True and filters is not None: + if download_all == True and filters is not None: # Filter for desired range of created date and for factory Namyslow PL self.logger.info("Filtering df with all data output") filter_for_df = self.make_filter_for_df(filters) df = eval(filter_for_df) - return df \ No newline at end of file + return df diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 1e67d96a6..ecba1d5c5 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -31,7 +31,7 @@ from .outlook import OutlookToDF from .prefect_date_range import GetFlowNewDateRange from .salesforce import SalesforceBulkUpsert, SalesforceToDF, SalesforceUpsert -from .sharepoint import SharepointToDF, SharepointListToDF +from .sharepoint import SharepointToDF, SharepointListToDF from .sqlite import SQLiteInsert, SQLiteQuery, SQLiteSQLtoDF from .supermetrics import SupermetricsToCSV, SupermetricsToDF @@ -57,4 +57,4 @@ from .sql_server import SQLServerCreateTable, SQLServerQuery, SQLServerToDF from .vid_club import VidClubToDF from .git import CloneRepo -from .luma import LumaIngest \ No newline at end of file +from .luma import LumaIngest diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index cb7f30a19..2a1cb0bc4 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -233,7 +233,6 @@ def run( return df - class SharepointListToDF(Task): """ Task to extract data from Sharepoint List into DataFrame. 
@@ -263,7 +262,7 @@ class SharepointListToDF(Task): 'filters_conjuction':'&', } , - 'Column_name_2' : + 'Column_name_2' : { 'dtype': 'str', 'value1':'NM-PL', @@ -279,18 +278,18 @@ class SharepointListToDF(Task): def __init__( self, path: str = None, - list_title: str = None, + list_title: str = None, site_url: str = None, required_fields: List[str] = None, field_property: str = "Title", - filters: dict = None, + filters: dict = None, row_count: int = 5000, credentials_secret: str = None, vault_name: str = None, *args, **kwargs, ): - + self.path = path self.list_title = list_title self.site_url = site_url @@ -300,7 +299,7 @@ def __init__( self.row_count = row_count self.vault_name = vault_name self.credentials_secret = credentials_secret - + if not credentials_secret: # Attempt to read a default for the service principal secret name try: @@ -310,50 +309,44 @@ def __init__( if credentials_secret: credentials_str = AzureKeyVaultSecret( - secret = self.credentials_secret, vault_name=self.vault_name + secret=self.credentials_secret, vault_name=self.vault_name ).run() self.credentials = json.loads(credentials_str) - super().__init__( *args, **kwargs, ) - def __call__(self): """Download Sharepoint_List data to a .parquet file""" super().__call__(self) - - def _convert_camel_case_to_words( - self, - input_str: str) -> str: - + def _convert_camel_case_to_words(self, input_str: str) -> str: + self.input_str = input_str - words = re.findall(r'[A-Z][a-z]*|[0-9]+', self.input_str) - converted = ' '.join(words) + words = re.findall(r"[A-Z][a-z]*|[0-9]+", self.input_str) + converted = " ".join(words) + + return converted - return converted - - def change_column_name( self, df: pd.DataFrame = None, ): s = SharepointList() list_fields = s.get_fields( - list_title= self.list_title, - site_url= self.site_url, - required_fields= self.required_fields, - ) - + list_title=self.list_title, + site_url=self.site_url, + required_fields=self.required_fields, + ) + self.logger.info("Changing columns names") - column_names_correct = [field.properties['Title'] for field in list_fields] - column_names_code = [field.properties['InternalName'] for field in list_fields] - dictionary = dict(zip(column_names_code, column_names_correct)) - + column_names_correct = [field.properties["Title"] for field in list_fields] + column_names_code = [field.properties["InternalName"] for field in list_fields] + dictionary = dict(zip(column_names_code, column_names_correct)) + # If duplicates in names from "Title" take "InternalName" value_count = {} duplicates = [] @@ -365,19 +358,18 @@ def change_column_name( duplicates.append(key) else: value_count[value] = key - + for key in duplicates: dictionary[key] = self._convert_camel_case_to_words(key) - + # Rename columns names inside DataFrame df = df.rename(columns=dictionary) return df - def run( self, - ) -> None: + ) -> None: """ Run Task SharepointListToDF. 
@@ -385,17 +377,19 @@ def run( pd.DataFrame """ - s = SharepointList(credentials=self.credentials,) + s = SharepointList( + credentials=self.credentials, + ) df_raw = s.list_item_to_df( - list_title = self.list_title, - site_url = self.site_url, - required_fields = self.required_fields, - field_property = self.field_property, - filters = self.filters, - row_count = self.row_count, - ) - + list_title=self.list_title, + site_url=self.site_url, + required_fields=self.required_fields, + field_property=self.field_property, + filters=self.filters, + row_count=self.row_count, + ) + df = self.change_column_name(df=df_raw) self.logger.info("Successfully changed structure of the DataFrame") - return df \ No newline at end of file + return df diff --git a/viadot/utils.py b/viadot/utils.py index 49fa33e09..d05cfdd95 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -459,4 +459,4 @@ def get_nested_dict(d): else: return d else: - return None \ No newline at end of file + return None From 61dc36f6dbc2168fb8ae93d1992b92dd92518b9a Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 14:18:50 +0200 Subject: [PATCH 44/47] =?UTF-8?q?=F0=9F=8E=A8=20formatted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_sharepoint.py | 123 ++++++++++++++++++++------- 1 file changed, 92 insertions(+), 31 deletions(-) diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index b56d10182..d1157fd7a 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -171,17 +171,20 @@ def test_get_data_types(file_name): # testing get_connection function passing invalid credentials and raises AuthenticationContext error. def test_get_connection(): site_url = "https://velux.sharepoint.com/" - credentials ={ - "SHAREPOINT_CERT": - {"TENANT": "xxx", + credentials = { + "SHAREPOINT_CERT": { + "TENANT": "xxx", "CLIENT_ID": "123", "SCOPES": "https://velux.sharepoint.com/", "THUMBPRINT": "xyz", - "PRIVATE_KEY": "private"} - } + "PRIVATE_KEY": "private", + } + } spl = SharepointList(credentials=credentials) - with pytest.raises(AttributeError, match="'SharepointList' object has no attribute 'ctx'"): + with pytest.raises( + AttributeError, match="'SharepointList' object has no attribute 'ctx'" + ): spl.get_connection(site_url=site_url) @@ -197,119 +200,177 @@ def sharepoint_list(): def test_valid_filters(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '<', 'value1': 10}, - 'filter2': {'dtype': 'str', 'operator1': '==', 'value1': 'value'}, + "filter1": {"dtype": "int", "operator1": "<", "value1": 10}, + "filter2": {"dtype": "str", "operator1": "==", "value1": "value"}, } result = sharepoint_list.check_filters(filters) assert result is True + def test_invalid_dtype(sharepoint_list): filters = { - 'filter1': {'dtype': 'list', 'operator1': '>', 'value1': 10}, + "filter1": {"dtype": "list", "operator1": ">", "value1": 10}, } with pytest.raises(ValueError, match="dtype not allowed!"): sharepoint_list.check_filters(filters) + def test_missing_operator1(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'value1': 10}, + "filter1": {"dtype": "int", "value1": 10}, } with pytest.raises(ValueError, match="Operator1 is missing!"): sharepoint_list.check_filters(filters) + def test_invalid_operator1(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '*', 'value1': 10}, + "filter1": {"dtype": "int", "operator1": "*", "value1": 10}, } with 
pytest.raises(ValueError, match="Operator type not allowed!"): sharepoint_list.check_filters(filters) + def test_missing_value1(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': None}, + "filter1": {"dtype": "int", "operator1": ">", "value1": None}, } with pytest.raises(ValueError, match="Value for operator1 is missing!"): sharepoint_list.check_filters(filters) + def test_missing_operators_conjuction(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'operator2': '<', 'value2': 20}, + "filter1": { + "dtype": "int", + "operator1": ">", + "value1": 10, + "operator2": "<", + "value2": 20, + }, } with pytest.raises(ValueError, match="Operators for conjuction is missing!"): sharepoint_list.check_filters(filters) + def test_invalid_operators_conjuction(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'operator2': '<', 'value2': 20, 'operators_conjuction': '!'}, + "filter1": { + "dtype": "int", + "operator1": ">", + "value1": 10, + "operator2": "<", + "value2": 20, + "operators_conjuction": "!", + }, } with pytest.raises(ValueError, match="Operators for conjuction not allowed!"): sharepoint_list.check_filters(filters) + def test_invalid_filters_conjuction(sharepoint_list): filters = { - 'filter1': {'dtype': 'int', 'operator1': '>', 'value1': 10, 'filters_conjuction': '!'}, + "filter1": { + "dtype": "int", + "operator1": ">", + "value1": 10, + "filters_conjuction": "!", + }, } - with pytest.raises(ValueError, match="Filters operators for conjuction not allowed!"): + with pytest.raises( + ValueError, match="Filters operators for conjuction not allowed!" + ): sharepoint_list.check_filters(filters) + def test_valid_mapping(sharepoint_list): filters = { - 'filter1': {'operator1': '>', 'operator2': '<=', 'operators_conjuction': '&', 'filters_conjuction': '|'}, - 'filter2': {'operator1': '==', 'operator2': '!=', 'operators_conjuction': '|'}, + "filter1": { + "operator1": ">", + "operator2": "<=", + "operators_conjuction": "&", + "filters_conjuction": "|", + }, + "filter2": {"operator1": "==", "operator2": "!=", "operators_conjuction": "|"}, } expected_result = { - 'filter1': {'operator1': 'gt', 'operator2': 'le', 'operators_conjuction': 'and', 'filters_conjuction': 'or'}, - 'filter2': {'operator1': 'eq', 'operator2': 'ne', 'operators_conjuction': 'or'}, + "filter1": { + "operator1": "gt", + "operator2": "le", + "operators_conjuction": "and", + "filters_conjuction": "or", + }, + "filter2": {"operator1": "eq", "operator2": "ne", "operators_conjuction": "or"}, } result = sharepoint_list.operators_mapping(deepcopy(filters)) assert result == expected_result + def test_invalid_comparison_operator(sharepoint_list): filters = { - 'filter1': {'operator1': '*', 'operator2': '<=', 'operators_conjuction': '&', 'filters_conjuction': '|'}, + "filter1": { + "operator1": "*", + "operator2": "<=", + "operators_conjuction": "&", + "filters_conjuction": "|", + }, } error_message = "This comparison operator: * is not allowed. Please read the function documentation for details!" 
with pytest.raises(ValueError, match=re.escape(error_message)): sharepoint_list.operators_mapping(deepcopy(filters)) + def test_invalid_logical_operator(sharepoint_list): filters = { - 'filter1': {'operator1': '>', 'operator2': '<=', 'operators_conjuction': '!', 'filters_conjuction': '|'}, + "filter1": { + "operator1": ">", + "operator2": "<=", + "operators_conjuction": "!", + "filters_conjuction": "|", + }, } error_message = "This conjuction(logical) operator: ! is not allowed. Please read the function documentation for details!" with pytest.raises(ValueError, match=re.escape(error_message)): sharepoint_list.operators_mapping(deepcopy(filters)) + def test_single_filter_datetime_api(sharepoint_list): filters = { - 'date_column': {'dtype': 'datetime', 'operator1': '>', 'value1': '2023-01-01'} + "date_column": {"dtype": "datetime", "operator1": ">", "value1": "2023-01-01"} } result = sharepoint_list.make_filter_for_api(filters) expected_result = "date_column gt datetime'2023-01-01T00:00:00' " assert result == expected_result + def test_multiple_filters_api(sharepoint_list): filters = { - 'int_column': {'dtype': 'int', 'operator1': '>=', 'value1': 10, 'operator2': '<', 'value2': 20}, - 'str_column': {'dtype': 'str', 'operator1': '==', 'value1': 'example'} + "int_column": { + "dtype": "int", + "operator1": ">=", + "value1": 10, + "operator2": "<", + "value2": 20, + }, + "str_column": {"dtype": "str", "operator1": "==", "value1": "example"}, } result = sharepoint_list.make_filter_for_api(filters) expected_result = "int_column ge '10'int_column lt '20'str_column eq 'example'" assert result == expected_result + def test_single_df_filter(sharepoint_list): - filters = { - 'column1': {'operator1': '>', 'value1': 10} - } + filters = {"column1": {"operator1": ">", "value1": 10}} result = sharepoint_list.make_filter_for_df(filters) expected_result = "df.loc[(df.column1 > '10')]" assert result == expected_result + def test_multiple_df_filters(sharepoint_list): filters = { - 'column1': {'operator1': '>', 'value1': 10, 'filters_conjuction': '&'}, - 'column2': {'operator1': '<', 'value1': 20} + "column1": {"operator1": ">", "value1": 10, "filters_conjuction": "&"}, + "column2": {"operator1": "<", "value1": 20}, } result = sharepoint_list.make_filter_for_df(filters) expected_result = "df.loc[(df.column1 > '10')&(df.column2 < '20')]" - assert result == expected_result \ No newline at end of file + assert result == expected_result From fa9b8143404e77bf8f95a4cf9dc962d3e26a0795 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 26 Oct 2023 14:26:45 +0200 Subject: [PATCH 45/47] =?UTF-8?q?=F0=9F=94=A5=20removed=20connection=20tes?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_sharepoint.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index d1157fd7a..502ffded0 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -168,26 +168,6 @@ def test_get_data_types(file_name): assert "String" in dtypes -# testing get_connection function passing invalid credentials and raises AuthenticationContext error. 
-def test_get_connection(): - site_url = "https://velux.sharepoint.com/" - credentials = { - "SHAREPOINT_CERT": { - "TENANT": "xxx", - "CLIENT_ID": "123", - "SCOPES": "https://velux.sharepoint.com/", - "THUMBPRINT": "xyz", - "PRIVATE_KEY": "private", - } - } - - spl = SharepointList(credentials=credentials) - with pytest.raises( - AttributeError, match="'SharepointList' object has no attribute 'ctx'" - ): - spl.get_connection(site_url=site_url) - - @pytest.fixture(scope="session") def sharepoint_list(): """ From 4338f55d82a906ed7bb33776c6a8d50d1798a853 Mon Sep 17 00:00:00 2001 From: gwieloch Date: Thu, 26 Oct 2023 16:04:26 +0200 Subject: [PATCH 46/47] added order for validation_task for vaidot flows --- viadot/flows/aselite_to_adls.py | 3 +++ viadot/flows/bigquery_to_adls.py | 3 +++ viadot/flows/cloud_for_customers_report_to_adls.py | 7 +++++-- viadot/flows/customer_gauge_to_adls.py | 3 +++ viadot/flows/eurostat_to_adls.py | 3 +++ viadot/flows/hubspot_to_adls.py | 3 +++ viadot/flows/mediatool_to_adls.py | 3 +++ viadot/flows/mysql_to_adls.py | 3 +++ viadot/flows/outlook_to_adls.py | 3 +++ viadot/flows/salesforce_to_adls.py | 3 +++ viadot/flows/sap_bw_to_adls.py | 3 +++ viadot/flows/sap_rfc_to_adls.py | 3 +++ viadot/flows/sharepoint_to_adls.py | 14 ++++++++++---- viadot/flows/supermetrics_to_adls.py | 3 +++ viadot/flows/vid_club_to_adls.py | 3 +++ 15 files changed, 54 insertions(+), 6 deletions(-) diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index bd77cf40f..61a91c963 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -115,5 +115,8 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + create_csv.set_upstream(validation_task, flow=self) + create_csv.set_upstream(df, flow=self) adls_upload.set_upstream(create_csv, flow=self) diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index 935b3f7a1..e09981ebe 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -197,6 +197,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + df_with_metadata.set_upstream(df, flow=self) dtypes_dict.set_upstream(df_with_metadata, flow=self) df_to_be_loaded.set_upstream(dtypes_dict, flow=self) diff --git a/viadot/flows/cloud_for_customers_report_to_adls.py b/viadot/flows/cloud_for_customers_report_to_adls.py index 11b3e92bf..386a7224e 100644 --- a/viadot/flows/cloud_for_customers_report_to_adls.py +++ b/viadot/flows/cloud_for_customers_report_to_adls.py @@ -203,8 +203,8 @@ def gen_flow(self) -> Flow: ) if self.validate_df_dict: - validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation.set_upstream(df, flow=self) + validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) @@ -232,6 +232,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + df_with_metadata.set_upstream(df, flow=self) df_to_file.set_upstream(df_with_metadata, flow=self) file_to_adls_task.set_upstream(df_to_file, flow=self) diff --git a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index 6c54a0704..39225330c 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -241,6 +241,9 @@ def gen_flow(self) -> Flow: 
flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) diff --git a/viadot/flows/eurostat_to_adls.py b/viadot/flows/eurostat_to_adls.py index e6c76a084..08a8677d6 100644 --- a/viadot/flows/eurostat_to_adls.py +++ b/viadot/flows/eurostat_to_adls.py @@ -178,6 +178,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) diff --git a/viadot/flows/hubspot_to_adls.py b/viadot/flows/hubspot_to_adls.py index 7a2fe4387..87f5e2504 100644 --- a/viadot/flows/hubspot_to_adls.py +++ b/viadot/flows/hubspot_to_adls.py @@ -183,6 +183,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_viadot_downloaded.set_upstream(validation_task, flow=self) + df_viadot_downloaded.set_upstream(df, flow=self) dtypes_dict.set_upstream(df_viadot_downloaded, flow=self) df_to_be_loaded.set_upstream(dtypes_dict, flow=self) diff --git a/viadot/flows/mediatool_to_adls.py b/viadot/flows/mediatool_to_adls.py index c4e92432b..156cfeef5 100644 --- a/viadot/flows/mediatool_to_adls.py +++ b/viadot/flows/mediatool_to_adls.py @@ -173,6 +173,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index afe594e47..c8c4c9c44 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -105,5 +105,8 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + create_csv.set_upstream(validation_task, flow=self) + create_csv.set_upstream(df, flow=self) adls_upload.set_upstream(create_csv, flow=self) diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index 606674ba1..dfeef1302 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -138,6 +138,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + df_with_metadata.set_upstream(df, flow=self) df_to_file.set_upstream(df_with_metadata, flow=self) file_to_adls_task.set_upstream(df_to_file, flow=self) diff --git a/viadot/flows/salesforce_to_adls.py b/viadot/flows/salesforce_to_adls.py index 1ace9aa5a..11dec54fd 100644 --- a/viadot/flows/salesforce_to_adls.py +++ b/viadot/flows/salesforce_to_adls.py @@ -189,6 +189,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_clean.set_upstream(validation_task, flow=self) + df_clean.set_upstream(df, flow=self) df_with_metadata.set_upstream(df_clean, flow=self) dtypes_dict.set_upstream(df_with_metadata, flow=self) diff --git a/viadot/flows/sap_bw_to_adls.py b/viadot/flows/sap_bw_to_adls.py index 9619df98e..90b965f92 100644 --- a/viadot/flows/sap_bw_to_adls.py +++ b/viadot/flows/sap_bw_to_adls.py @@ -166,6 +166,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + 
df_viadot_downloaded.set_upstream(validation_task, flow=self) + df_viadot_downloaded.set_upstream(df, flow=self) dtypes_dict.set_upstream(df_viadot_downloaded, flow=self) df_to_be_loaded.set_upstream(dtypes_dict, flow=self) diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index c2bc71d8d..d7a2ac390 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -154,6 +154,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_to_file.set_upstream(validation_task, flow=self) + df_to_file.set_upstream(df, flow=self) adls_upload.set_upstream(df_to_file, flow=self) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 4b8e477dd..eaf747bab 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -124,8 +124,8 @@ def gen_flow(self) -> Flow: ) if self.validate_df_dict: - validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation.set_upstream(df, flow=self) + validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) @@ -168,6 +168,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + df_mapped.set_upstream(df_with_metadata, flow=self) dtypes_to_json_task.set_upstream(df_mapped, flow=self) df_to_file.set_upstream(dtypes_to_json_task, flow=self) @@ -316,8 +319,8 @@ def gen_flow(self) -> Flow: df = s.run() if self.validate_df_dict: - validation = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation.set_upstream(df, flow=self) + validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation_task.set_upstream(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) @@ -353,6 +356,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + df_mapped.set_upstream(df_with_metadata, flow=self) dtypes_to_json_task.set_upstream(df_mapped, flow=self) df_to_file.set_upstream(dtypes_to_json_task, flow=self) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 5a104cff7..80253eb88 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -315,6 +315,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + write_json.set_upstream(df, flow=self) validation.set_upstream(write_json, flow=self) df_with_metadata.set_upstream(validation_upstream, flow=self) diff --git a/viadot/flows/vid_club_to_adls.py b/viadot/flows/vid_club_to_adls.py index df9c0d531..40f53d8ae 100644 --- a/viadot/flows/vid_club_to_adls.py +++ b/viadot/flows/vid_club_to_adls.py @@ -204,6 +204,9 @@ def gen_flow(self) -> Flow: flow=self, ) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) From 6be1ad8639aa2ea53029859bc363c4377465c1f6 Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 26 Oct 2023 16:08:29 +0100 
Subject: [PATCH 47/47] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog=20befo?= =?UTF-8?q?re=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ede484c9..76eb3280b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + +## [0.4.21] - 2023-10-26 ### Added - Added `validate_df` task to task_utils. - Added `SharepointList` source class. @@ -12,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `SharepointListToADLS` flow class. - Added tests for `SharepointList`. - Added `get_nested_dict` to untils.py. -- Added `validate_df` task to `SharepointToADLS` class. ### Fixed
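
The ordering change applied across the flow modules above follows one Prefect 1.x pattern: when a `validate_df_dict` is supplied, the task that writes or post-processes the DataFrame is made downstream of the validation task, so a failed validation stops the flow before any output is produced, while flows without validation keep their original task graph. A minimal, self-contained sketch of that dependency pattern (hypothetical task names, a generic `tests` dict, and Prefect 1.x assumed, as suggested by the `set_upstream(..., flow=self)` calls in the diffs):

from prefect import Flow, task


@task
def extract_df():
    # stand-in for a source task that produces tabular data
    return [{"id": 1}, {"id": 2}]


@task
def validate_df(df, tests=None):
    # stand-in for a validation task: raise on failure, return nothing
    if tests and len(df) < tests.get("min_rows", 0):
        raise ValueError("Validation failed.")


@task
def df_to_file(df):
    # stand-in for the step that writes the output
    print(f"writing {len(df)} rows")


with Flow("validation-ordering-sketch") as flow:
    df = extract_df()
    validation_task = validate_df(df, tests={"min_rows": 1})
    write_step = df_to_file(df)
    # the write step only runs after validation succeeds
    write_step.set_upstream(validation_task)

Keeping the extra `set_upstream` call conditional on `validate_df_dict`, as the diffs above do, makes the dependency optional: validation is enforced only when tests are requested, and the original task graph is untouched otherwise.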