From 62f7d4c3b92ab715224811c71cd7fbb2e25b0ff7 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 15 Dec 2022 14:39:41 +0100 Subject: [PATCH 01/55] =?UTF-8?q?=F0=9F=9A=80=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index 60eb8acdf..ac0879563 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.4.11" + assert __version__ == "0.4.12" diff --git a/viadot/__init__.py b/viadot/__init__.py index 58ce5cd17..9b084a609 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.4.11" +__version__ = "0.4.12" From de2d9d8864b9826fac1a3b8cac29ea5b95fa284a Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Thu, 22 Dec 2022 16:31:27 +0100 Subject: [PATCH 02/55] =?UTF-8?q?=E2=9C=A8=20Added=20dtype=20sort=20check?= =?UTF-8?q?=20into=20ADLSToAzureSQL=20flow.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 46 ++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 9cc371b5e..b1d662bef 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -87,6 +87,46 @@ def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): df.to_csv(path, sep=sep, index=False) +@task +def check_dtypes_sort( + df: pd.DataFrame, + dtypes: Dict[str, Any] = None, +) -> Dict[str, Any]: + """Check dtype column order to avoid malformation SQL table. + + Args: + df (pd.DataFrame): Data Frame from original ADLS file. + dtypes (Dict[str, Any], optional): Dictionary of columns and data type to apply + to the Data Frame downloaded. Defaults to None. + + Returns: + Dict[str, Any]: Sorted dtype. + """ + if df is None: + logger.warning("DataFrame is None") + else: + # first check if all dtypes keys are in df.columns + if all(d in df.columns for d in list(dtypes.keys())) and len(df.columns) == len( + list(dtypes.keys()) + ): + # check if have the same sort + matches = list(map(lambda x, y: x == y, df.columns, dtypes.keys())) + if not all(matches): + logger.warning( + "Some keys are not sorted in dtypes. Repositioning the key:value..." + ) + # re-sort in a new dtype + new_dtypes = dict() + for key in df.columns: + new_dtypes.update([(key, dtypes[key])]) + else: + logger.warning( + "dtype dictionary contains key(s) that not matching with the ADLS file columns name, or they have different length." 
+ ) + + return new_dtypes + + class ADLSToAzureSQL(Flow): def __init__( self, @@ -236,7 +276,11 @@ def gen_flow(self) -> Flow: dtypes = map_data_types_task.bind(self.local_json_path, flow=self) map_data_types_task.set_upstream(download_json_file_task, flow=self) else: - dtypes = self.dtypes + dtypes = check_dtypes_sort.bind( + df, + dtypes=self.dtypes, + flow=self, + ) df_reorder = check_column_order_task.bind( table=self.table, From b6a335d8fa82fd10e885fdacee7d53a0b318dbe6 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Thu, 22 Dec 2022 16:41:50 +0100 Subject: [PATCH 03/55] =?UTF-8?q?=F0=9F=93=9D=20Updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 178084905..8629b2138 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added `check_dtypes_sort` task into `ADLSToAzureSQL` to check if dtypes is properly sorted. # [0.4.11] - 2022-12-15 From d41f3fea9d7c6f84e325c4375d5c6ba3f34a7106 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Fri, 23 Dec 2022 14:51:23 +0100 Subject: [PATCH 04/55] =?UTF-8?q?=E2=9C=A8=20Added=20new=20task=20`adls=5F?= =?UTF-8?q?bulk=5Fupload`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 4c4d7c54f..983ed94ec 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -23,7 +23,7 @@ from visions.typesets.complete_set import CompleteSet from viadot.config import local_config -from viadot.tasks import AzureKeyVaultSecret +from viadot.tasks import AzureKeyVaultSecret, AzureDataLakeUpload from viadot.exceptions import CredentialError @@ -570,3 +570,36 @@ def credentials_loader(credentials_secret: str, vault_name: str = None) -> dict: raise CredentialError("Credentials secret not provided.") return credentials + + +@task +def adls_bulk_upload( + file_names: List[str], + file_name_relative_path: str = "", + adls_file_path: str = None, + adls_sp_credentials_secret: str = None, + adls_overwrite: bool = True, +) -> List[str]: + """Function that upload files to defined path in ADLS. + + Args: + file_names (List[str]): List of file names to generate paths. + file_name_relative_path (str, optional): Path where to save the file locally. Defaults to ''. + adls_file_path (str, optional): Azure Data Lake path. Defaults to None. + adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. 
+ + Returns: + List[str]: List of paths + """ + + file_to_adls_task = AzureDataLakeUpload() + + for file in file_names: + file_to_adls_task.run( + from_path=os.path.join(file_name_relative_path, file), + to_path=os.path.join(adls_file_path, file), + sp_credentials_secret=adls_sp_credentials_secret, + overwrite=adls_overwrite, + ) From cd0b7988cd471d1b9dc2779701a4930d2bd7b00e Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Fri, 23 Dec 2022 14:52:10 +0100 Subject: [PATCH 05/55] =?UTF-8?q?=F0=9F=8E=A8=20Improved=20`adls=5Fbulk=5F?= =?UTF-8?q?upload`=20task=20in=20genesys=20and=20mindful.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/genesys_to_adls.py | 35 ++---------------------------- viadot/flows/mindful_to_adls.py | 38 +-------------------------------- 2 files changed, 3 insertions(+), 70 deletions(-) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index c4c894a76..42eae247b 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -6,44 +6,13 @@ from viadot.task_utils import df_to_csv from viadot.tasks import AzureDataLakeUpload from viadot.tasks.genesys import GenesysToCSV, GenesysToDF -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_to_csv, df_to_parquet, + adls_bulk_upload, ) -file_to_adls_task = AzureDataLakeUpload() - - -@task -def adls_bulk_upload( - file_names: List[str], - adls_file_path: str = None, - adls_sp_credentials_secret: str = None, - adls_overwrite: bool = True, -) -> List[str]: - """ - Function that upload files to defined path in ADLS. - - Args: - file_names (List[str]): List of file names to generate paths. - adls_file_path (str, optional): Azure Data Lake path. Defaults to None. - adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. - adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. - Returns: - List[str]: List of paths - """ - - for file in file_names: - file_path = str(adls_file_path + "/" + file) - file_to_adls_task.run( - from_path=file, - to_path=file_path, - sp_credentials_secret=adls_sp_credentials_secret, - overwrite=adls_overwrite, - ) - @task def add_timestamp(files_names: List = None, sep: str = None) -> None: diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index 6b1981fba..58045755e 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -1,46 +1,10 @@ -import os from typing import Any, Dict, List, Literal import pandas as pd from datetime import datetime from prefect import Flow, task from viadot.tasks import MindfulToCSV -from viadot.tasks import AzureDataLakeUpload -from viadot.task_utils import add_ingestion_metadata_task - -file_to_adls_task = AzureDataLakeUpload() - - -@task -def adls_bulk_upload( - file_names: List[str], - file_name_relative_path: str = "", - adls_file_path: str = None, - adls_sp_credentials_secret: str = None, - adls_overwrite: bool = True, -) -> List[str]: - """Function that upload files to defined path in ADLS. - - Args: - file_names (List[str]): List of file names to generate paths. - file_name_relative_path (str, optional): Path where to save the file locally. Defaults to ''. - adls_file_path (str, optional): Azure Data Lake path. Defaults to None. 
- adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. - adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. - - Returns: - List[str]: List of paths - """ - - for file in file_names: - file_path = str(adls_file_path + "/" + file) - file_to_adls_task.run( - from_path=os.path.join(file_name_relative_path, file), - to_path=file_path, - sp_credentials_secret=adls_sp_credentials_secret, - overwrite=adls_overwrite, - ) +from viadot.task_utils import add_ingestion_metadata_task, adls_bulk_upload @task From 2c18d77a436a8ef993c542767879c86394e36f85 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Fri, 23 Dec 2022 14:52:54 +0100 Subject: [PATCH 06/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 178084905..5ede267f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added `adls_bulk_upload` task function to `task_utils.py` +### Changed +- Updated `genesys_to_adls.py` flow with the `adls_bulk_upload` task +- Updated `mindful_to_adls.py` flow with the `adls_bulk_upload` task # [0.4.11] - 2022-12-15 ### Added From 5c9bbb99bfc5b89811c4e95071481afb7cc91730 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Fri, 23 Dec 2022 15:09:44 +0100 Subject: [PATCH 07/55] =?UTF-8?q?=F0=9F=8E=A8=20improved=20genesys=20to=20?= =?UTF-8?q?adls=20estructure.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/genesys_to_adls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 42eae247b..8f7dc5578 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -13,6 +13,8 @@ adls_bulk_upload, ) +file_to_adls_task = AzureDataLakeUpload() + @task def add_timestamp(files_names: List = None, sep: str = None) -> None: From afb13fb737b8460524e84fe55a5276e0208338e8 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Thu, 29 Dec 2022 13:58:28 +0100 Subject: [PATCH 08/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docstring=20in?= =?UTF-8?q?=20azure=5Fsql.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/azure_sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index e8de0c4c5..50027e1d0 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -21,6 +21,8 @@ def get_credentials(credentials_secret: str, vault_name: str = None): """ Get Azure credentials. + If the credential secret is not provided it will be taken from Prefect Secrets. If Prefect Secrets does not + contain the credential, it will be taken from the local credential file. 
Args: credentials_secret (str): The name of the Azure Key Vault secret containing a dictionary From 8b78cdf9d04d78a0b19f55397f5393b1e58109f6 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 3 Jan 2023 10:31:01 +0100 Subject: [PATCH 09/55] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20added=20timeout=20pa?= =?UTF-8?q?rameter=20to=20all=20Tasks.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ viadot/tasks/aselite.py | 10 ++++++++- viadot/tasks/azure_blob_storage.py | 4 ++-- viadot/tasks/azure_data_lake.py | 24 ++++++++++++++++++++++ viadot/tasks/azure_sql.py | 29 +++++++++++++++++++------- viadot/tasks/bigquery.py | 4 ++++ viadot/tasks/cloud_for_customers.py | 4 ++++ viadot/tasks/duckdb.py | 2 +- viadot/tasks/epicor.py | 4 ++++ viadot/tasks/genesys.py | 6 ++++++ viadot/tasks/github.py | 5 ++++- viadot/tasks/mindful.py | 4 ++++ viadot/tasks/mysql_to_df.py | 4 ++++ viadot/tasks/outlook.py | 2 +- viadot/tasks/prefect_date_range.py | 6 +++++- viadot/tasks/salesforce.py | 6 ++++++ viadot/tasks/sftp.py | 8 ++++++++ viadot/tasks/sharepoint.py | 4 ++++ viadot/tasks/sql_server.py | 14 +++++++++++-- viadot/tasks/sqlite.py | 32 ++++++++++++++++++++++++----- viadot/tasks/supermetrics.py | 10 +++++---- 21 files changed, 159 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 178084905..af590867f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added `timeout` parameter to all `Task`s where it can be added. # [0.4.11] - 2022-12-15 diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 58bf6f887..ca9a4d971 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -13,13 +13,20 @@ class ASELiteToDF(Task): def __init__( - self, credentials: Dict[str, Any] = None, query: str = None, *args, **kwargs + self, + credentials: Dict[str, Any] = None, + query: str = None, + timeout: int = 3600, + *args, + **kwargs ): """ Task for obtaining data from ASElite source. Args: credentials (Dict[str, Any], optional): ASElite SQL Database credentials. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
Returns: Pandas DataFrame """ self.credentials = credentials @@ -27,6 +34,7 @@ def __init__( super().__init__( name="ASElite_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/azure_blob_storage.py b/viadot/tasks/azure_blob_storage.py index b15399bf2..0d8ae57d3 100644 --- a/viadot/tasks/azure_blob_storage.py +++ b/viadot/tasks/azure_blob_storage.py @@ -10,8 +10,8 @@ class BlobFromCSV(Task): Task for generating Azure Blob Storage from CSV file """ - def __init__(self, *args, **kwargs): - super().__init__(name="csv_to_blob_storage", *args, **kwargs) + def __init__(self, timeout: int = 3600, *args, **kwargs): + super().__init__(name="csv_to_blob_storage", timeout=timeout, *args, **kwargs) def __call__(self): """Generate a blob from a local CSV file""" diff --git a/viadot/tasks/azure_data_lake.py b/viadot/tasks/azure_data_lake.py index 9ca86ed0d..5b2329d45 100644 --- a/viadot/tasks/azure_data_lake.py +++ b/viadot/tasks/azure_data_lake.py @@ -23,6 +23,8 @@ class AzureDataLakeDownload(Task): recursive (bool, optional): Set this to true if downloading entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). """ @@ -34,6 +36,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -49,6 +52,7 @@ def __init__( name="adls_download", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -131,6 +135,8 @@ class AzureDataLakeUpload(Task): overwrite (bool, optional): Whether to overwrite files in the lake. Defaults to False. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -141,6 +147,7 @@ def __init__( overwrite: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -157,6 +164,7 @@ def __init__( name="adls_upload", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -245,6 +253,7 @@ def __init__( error_bad_lines: bool = None, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -261,6 +270,8 @@ def __init__( error_bad_lines (bool, optional): Whether to raise an exception on bad lines. Defaults to None. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.path = path self.sep = sep @@ -274,6 +285,7 @@ def __init__( name="adls_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -372,6 +384,8 @@ class AzureDataLakeCopy(Task): recursive (bool, optional): Set this to true if copy entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). """ @@ -383,6 +397,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -398,6 +413,7 @@ def __init__( name="adls_copy", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -478,6 +494,8 @@ class AzureDataLakeList(Task): path (str, optional): The path to the directory which contents you want to list. Defaults to None. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). @@ -493,6 +511,7 @@ def __init__( path: str = None, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -506,6 +525,7 @@ def __init__( name="adls_list", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -602,6 +622,8 @@ class AzureDataLakeRemove(Task): recursive (bool): Set this to true if removing entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): Maximum number of retries before failing. Defaults to 3. retry_delay (timedelta, optional): Time to wait before the next retry attempt. Defaults to timedelta(seconds=10). 
""" @@ -612,6 +634,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -626,6 +649,7 @@ def __init__( name="adls_rm", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index e8de0c4c5..94bbb4f85 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -50,9 +50,9 @@ def get_credentials(credentials_secret: str, vault_name: str = None): class CreateTableFromBlob(Task): - def __init__(self, sep="\t", *args, **kwargs): + def __init__(self, sep="\t", timeout: int = 3600, *args, **kwargs): self.sep = sep - super().__init__(name="blob_to_azure_sql", *args, **kwargs) + super().__init__(name="blob_to_azure_sql", timeout=timeout, *args, **kwargs) @defaults_from_attrs("sep") def run( @@ -107,6 +107,7 @@ def __init__( sep="\t", if_exists: Literal["fail", "replace", "append", "delete"] = "fail", credentials_secret: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -117,7 +118,7 @@ def __init__( self.sep = sep self.if_exists = if_exists self.credentials_secret = credentials_secret - super().__init__(name="azure_sql_bulk_insert", *args, **kwargs) + super().__init__(name="azure_sql_bulk_insert", timeout=timeout, *args, **kwargs) @defaults_from_attrs("sep", "if_exists", "credentials_secret") def run( @@ -178,6 +179,7 @@ def __init__( if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -193,6 +195,7 @@ def __init__( name="azure_sql_create_table", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -246,19 +249,22 @@ class AzureSQLDBQuery(Task): credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( self, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="azure_sql_db_query", *args, **kwargs) + super().__init__(name="azure_sql_db_query", timeout=timeout, *args, **kwargs) def run( self, @@ -294,19 +300,22 @@ class AzureSQLToDF(Task): credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( self, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="azure_sql_to_df", *args, **kwargs) + super().__init__(name="azure_sql_to_df", timeout=timeout, *args, **kwargs) def run( self, @@ -350,13 +359,16 @@ def __init__( df: pd.DataFrame = None, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="run_check_column_order", *args, **kwargs) + super().__init__( + name="run_check_column_order", timeout=timeout, *args, **kwargs + ) def df_change_order( self, df: pd.DataFrame = None, sql_column_list: List[str] = None @@ -443,6 +455,8 @@ class AzureSQLUpsert(Task): on (str, optional): The field on which to merge (upsert). Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -451,6 +465,7 @@ def __init__( table: str = None, on: str = None, credentials_secret: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -458,7 +473,7 @@ def __init__( self.table = table self.on = on self.credentials_secret = credentials_secret - super().__init__(name="azure_sql_upsert", *args, **kwargs) + super().__init__(name="azure_sql_upsert", timeout=timeout, *args, **kwargs) @defaults_from_attrs( "schema", diff --git a/viadot/tasks/bigquery.py b/viadot/tasks/bigquery.py index 315097425..e4831afad 100644 --- a/viadot/tasks/bigquery.py +++ b/viadot/tasks/bigquery.py @@ -28,6 +28,7 @@ def __init__( credentials_key: str = None, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -52,6 +53,8 @@ def __init__( credentials can be generated as key for User Principal inside a BigQuery project. Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret for Bigquery project. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.dataset_name = dataset_name self.table_name = table_name @@ -64,6 +67,7 @@ def __init__( super().__init__( name="bigquery_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index 9bc5e6b9d..2413f32b4 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -23,6 +23,7 @@ def __init__( env: str = "QA", max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, **kwargs, ): @@ -35,6 +36,7 @@ def __init__( name="c4c_report_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -122,6 +124,7 @@ def __init__( if_empty: str = "warn", max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, **kwargs, ): @@ -137,6 +140,7 @@ def __init__( name="c4c_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index a26a3ce07..d5d72028b 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -22,7 +22,7 @@ class DuckDBQuery(Task): def __init__( self, credentials: dict = None, - timeout: int = 600, + timeout: int = 3600, *args, **kwargs, ): diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index 51f12376f..271154162 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -18,6 +18,7 @@ def __init__( config_key: str = None, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", + timeout: int = 3600, *args, **kwargs, ) -> pd.DataFrame: @@ -32,6 +33,8 @@ def __init__( config_key (str, optional): Credential key to dictionary where details are stored. Defauls to None. start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: DataFrame with parsed API output @@ -44,6 +47,7 @@ def __init__( self.end_date_field = end_date_field super().__init__( name="epicor_orders_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index aec674f17..aab7685ec 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -30,6 +30,7 @@ def __init__( report_url: str = None, report_columns: List[str] = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -50,6 +51,8 @@ def __init__( schedule_id (str, optional): The ID of report. Defaults to None. report_url (str, optional): The url of report generated in json response. Defaults to None. report_columns (List[str], optional): List of exisiting column in report. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.logger = prefect.context.get("logger") @@ -69,6 +72,7 @@ def __init__( super().__init__( name=self.report_name, + timeout=timeout, *args, **kwargs, ) @@ -219,6 +223,7 @@ def __init__( report_url: str = None, report_columns: List[str] = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -233,6 +238,7 @@ def __init__( super().__init__( name="genesys_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/github.py b/viadot/tasks/github.py index 0a8e34e62..eb4789471 100644 --- a/viadot/tasks/github.py +++ b/viadot/tasks/github.py @@ -90,6 +90,8 @@ class DownloadGitHubFile(Task): to_path (str, optional): The destination path. Defaults to None. access_token_secret (str, optional): The Prefect secret containing GitHub token. Defaults to "github_token". branch (str, optional): The GitHub branch to use. Defaults to "main". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -99,6 +101,7 @@ def __init__( to_path: str = None, access_token_secret: str = "github_token", branch: str = "main", + timeout: int = 3600, **kwargs, ): self.repo = repo @@ -106,7 +109,7 @@ def __init__( self.to_path = to_path self.access_token_secret = access_token_secret self.branch = branch - super().__init__(name="download_github_file", **kwargs) + super().__init__(name="download_github_file", timeout=timeout, **kwargs) @defaults_from_attrs( "repo", "from_path", "to_path", "access_token_secret", "branch" diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index abc870817..fe1a86230 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -25,6 +25,7 @@ def __init__( region: Literal["us1", "us2", "us3", "ca1", "eu1", "au1"] = "eu1", file_extension: Literal["parquet", "csv"] = "csv", file_path: str = "", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -39,6 +40,8 @@ def __init__( region (Literal[us1, us2, us3, ca1, eu1, au1], optional): SD region from where to interact with the mindful API. Defaults to "eu1". file_extension (Literal[parquet, csv], optional): File extensions for storing responses. Defaults to "csv". file_path (str, optional): Path where to save the file locally. Defaults to ''. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Raises: CredentialError: If credentials are not provided in local_config or directly as a parameter inside run method. @@ -53,6 +56,7 @@ def __init__( super().__init__( name=report_name, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py index a2eeda4a8..62a13f7c9 100644 --- a/viadot/tasks/mysql_to_df.py +++ b/viadot/tasks/mysql_to_df.py @@ -17,6 +17,7 @@ def __init__( country_short: Literal["AT", "DE", "CH", None], credentials: Dict[str, Any] = None, query: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -27,6 +28,8 @@ def __init__( credentials (Dict[str, Any], optional): MySql Database credentials. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. country_short (Dict[str, Any], optional): Country short to select proper credential. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
Returns: Pandas DataFrame """ @@ -36,6 +39,7 @@ def __init__( super().__init__( name="MySQLToDF", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index ad70e3fe5..dbff99a54 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -17,7 +17,7 @@ def __init__( credentials: Dict[str, Any] = None, output_file_extension: str = ".csv", limit: int = 10000, - timeout: int = 1200, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): diff --git a/viadot/tasks/prefect_date_range.py b/viadot/tasks/prefect_date_range.py index 9d0fe9591..fda09a198 100644 --- a/viadot/tasks/prefect_date_range.py +++ b/viadot/tasks/prefect_date_range.py @@ -144,6 +144,7 @@ def __init__( self, flow_name: str = None, date_range_type: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -153,7 +154,9 @@ def __init__( Args: flow_name (str, optional): Prefect flow name. Defaults to None. date_range_type (str, optional): String argument that should look like this: 'last_X_days' - - X is a number of days. Defaults to None. + X is a number of days. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.flow_name = flow_name @@ -161,6 +164,7 @@ def __init__( super().__init__( name="prefect_extract_details", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/salesforce.py b/viadot/tasks/salesforce.py index a4a4527ce..7557f0744 100644 --- a/viadot/tasks/salesforce.py +++ b/viadot/tasks/salesforce.py @@ -56,6 +56,7 @@ def __init__( raise_on_error: bool = False, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -70,6 +71,7 @@ def __init__( name="salesforce_upsert", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -149,6 +151,7 @@ def __init__( raise_on_error: bool = False, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -163,6 +166,7 @@ def __init__( name="salesforce_bulk_upsert", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -253,6 +257,7 @@ def __init__( domain: str = "test", client_id: str = "viadot", env: str = "DEV", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -265,6 +270,7 @@ def __init__( super().__init__( name="salesforce_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sftp.py b/viadot/tasks/sftp.py index 1ee6b5f80..cf4365c2b 100644 --- a/viadot/tasks/sftp.py +++ b/viadot/tasks/sftp.py @@ -18,6 +18,7 @@ def __init__( credentials: Dict[str, Any] = None, sftp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -27,6 +28,8 @@ def __init__( credentials (Dict[str, Any], optional): SFTP credentials. Defaults to None. sftp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary credentials for SFTP connection. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
Returns: Pandas DataFrame """ @@ -36,6 +39,7 @@ def __init__( super().__init__( name="SftpToDF", + timeout=timeout, *args, **kwargs, ) @@ -87,6 +91,7 @@ def __init__( credentials: Dict[str, Any] = None, sftp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -96,6 +101,8 @@ def __init__( credentials (Dict[str, Any], optional): SFTP credentials. Defaults to None. sftp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary credentials for SFTP connection. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: files_list (List): List of files in SFTP server. @@ -106,6 +113,7 @@ def __init__( super().__init__( name="SftpList", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index a4ad69fa9..fa1b7e83a 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -28,6 +28,8 @@ class SharepointToDF(Task): sheet_number (int): Sheet number to be extracted from file. Counting from 0, if None all sheets are axtracted. Defaults to None. validate_excel_file (bool, optional): Check if columns in separate sheets are the same. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: Pandas data frame @@ -41,6 +43,7 @@ def __init__( sheet_number: int = None, validate_excel_file: bool = False, if_empty: str = "warn", + timeout: int = 3600, *args, **kwargs, ): @@ -54,6 +57,7 @@ def __init__( super().__init__( name="sharepoint_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sql_server.py b/viadot/tasks/sql_server.py index 556877d0f..203d1c06b 100644 --- a/viadot/tasks/sql_server.py +++ b/viadot/tasks/sql_server.py @@ -20,6 +20,8 @@ class SQLServerCreateTable(Task): dtypes (Dict[str, Any], optional): Data types to enforce. if_exists (Literal, optional): What to do if the table already exists. credentials (dict, optional): Credentials for the connection. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -31,6 +33,7 @@ def __init__( credentials: dict = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -43,6 +46,7 @@ def __init__( name="sql_server_create_table", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -89,6 +93,7 @@ class SQLServerToDF(Task): def __init__( self, config_key: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -97,11 +102,13 @@ def __init__( Args: config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.config_key = config_key - super().__init__(name="sql_server_to_df", *args, **kwargs) + super().__init__(name="sql_server_to_df", timeout=timeout, *args, **kwargs) @defaults_from_attrs("config_key") def run( @@ -135,6 +142,7 @@ class SQLServerQuery(Task): def __init__( self, config_key: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -143,10 +151,12 @@ def __init__( Args: config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.config_key = config_key - super().__init__(name="sql_server_query", *args, **kwargs) + super().__init__(name="sql_server_query", timeout=timeout, *args, **kwargs) @defaults_from_attrs("config_key") def run( diff --git a/viadot/tasks/sqlite.py b/viadot/tasks/sqlite.py index fbf767ab1..f72530fc1 100644 --- a/viadot/tasks/sqlite.py +++ b/viadot/tasks/sqlite.py @@ -17,6 +17,8 @@ class SQLiteInsert(Task): Args: db_path (str, optional): The path to the database to be used. Defaults to None. sql_path (str, optional): The path to the text file containing the query. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ @@ -28,6 +30,7 @@ def __init__( table_name: str = None, if_exists: str = "fail", dtypes: Dict[str, Any] = None, + timeout: int = 3600, *args, **kwargs, ): @@ -38,7 +41,7 @@ def __init__( self.schema = schema self.if_exists = if_exists - super().__init__(name="sqlite_insert", *args, **kwargs) + super().__init__(name="sqlite_insert", timeout=timeout, *args, **kwargs) @defaults_from_attrs("df", "db_path", "schema", "table_name", "if_exists", "dtypes") def run( @@ -75,14 +78,23 @@ class SQLiteSQLtoDF(Task): Args: db_path (str, optional): The path to the database to be used. Defaults to None. sql_path (str, optional): The path to the text file containing the query. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ - def __init__(self, db_path: str = None, sql_path: str = None, *args, **kwargs): + def __init__( + self, + db_path: str = None, + sql_path: str = None, + timeout: int = 3600, + *args, + **kwargs, + ): self.db_path = db_path self.sql_path = sql_path - super().__init__(name="sqlite_sql_to_df", *args, **kwargs) + super().__init__(name="sqlite_sql_to_df", timeout=timeout, *args, **kwargs) def __call__(self): """Generate a DataFrame from a SQLite SQL query""" @@ -111,12 +123,22 @@ class SQLiteQuery(Task): Args: query (str, optional): The query to execute on the database. Defaults to None. db_path (str, optional): The path to the database to be used. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
+ """ - def __init__(self, query: str = None, db_path: str = None, *args, **kwargs): + def __init__( + self, + query: str = None, + db_path: str = None, + timeout: int = 3600, + *args, + **kwargs, + ): self.query = query self.db_path = db_path - super().__init__(name="sqlite_query", *args, **kwargs) + super().__init__(name="sqlite_query", timeout=timeout, *args, **kwargs) def __call__(self): """Run an SQL query on SQLite""" diff --git a/viadot/tasks/supermetrics.py b/viadot/tasks/supermetrics.py index 5fe41f8f5..1187d46a9 100644 --- a/viadot/tasks/supermetrics.py +++ b/viadot/tasks/supermetrics.py @@ -16,7 +16,8 @@ class SupermetricsToCSV(Task): path (str, optional): The destination path. Defaults to "supermetrics_extract.csv". max_retries (int, optional): The maximum number of retries. Defaults to 5. retry_delay (timedelta, optional): The delay between task retries. Defaults to 10 seconds. - timeout (int, optional): Task timeout. Defaults to 30 minuntes. + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_rows (int, optional): Maximum number of rows the query results should contain. Defaults to 1 000 000. max_cols (int, optional): Maximum number of columns the query results should contain. Defaults to None. if_exists (str, optional): What to do if file already exists. Defaults to "replace". @@ -31,7 +32,7 @@ def __init__( path: str = "supermetrics_extract.csv", max_retries: int = 5, retry_delay: timedelta = timedelta(seconds=10), - timeout: int = 60 * 30, + timeout: int = 3600, max_rows: int = 1_000_000, if_exists: str = "replace", if_empty: str = "warn", @@ -173,7 +174,8 @@ class SupermetricsToDF(Task): if_empty (str, optional): What to do if query returns no data. Defaults to "warn". max_retries (int, optional): The maximum number of retries. Defaults to 5. retry_delay (timedelta, optional): The delay between task retries. Defaults to 10 seconds. - timeout (int, optional): Task timeout. Defaults to 30 minuntes. + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -183,7 +185,7 @@ def __init__( max_rows: int = 1_000_000, max_retries: int = 5, retry_delay: timedelta = timedelta(seconds=10), - timeout: int = 60 * 30, + timeout: int = 3600, **kwargs, ): From 23add93629ddec8a94774b477bb4ea71c849f32d Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 08:15:34 +0100 Subject: [PATCH 10/55] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20added=20timeout=20to?= =?UTF-8?q?=20more=20tasks.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/task_utils.py | 45 +++++++++++++++++++---------------------- viadot/tasks/bcp.py | 4 ++++ viadot/tasks/duckdb.py | 12 +++++++++-- viadot/tasks/sap_rfc.py | 8 +++----- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af590867f..7c562df34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `timeout` parameter to all `Task`s where it can be added. +- Added `timeout` parameter to all `Flow`s where it can be added. 
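(For reference, a minimal sketch of the two patterns these diffs use to wire the `timeout` parameter through tasks: the decorator argument for function-based tasks, and forwarding the value to `Task.__init__` for class-based tasks. The task names and bodies below are illustrative placeholders, not code from this patch.)

from prefect import Task, task


@task(timeout=3600)  # function-based task: the timeout (in seconds) is set on the decorator
def example_metadata_task(df):
    return df


class ExampleToDF(Task):
    def __init__(self, timeout: int = 3600, *args, **kwargs):
        # class-based task: accept timeout in the constructor and forward it to Task.__init__
        super().__init__(name="example_to_df", timeout=timeout, *args, **kwargs)

    def run(self, query: str = None):
        self.logger.info(f"Running query: {query}")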
# [0.4.11] - 2022-12-15 diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 4c4d7c54f..9d057599c 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -32,7 +32,7 @@ METADATA_COLUMNS = {"_viadot_downloaded_at_utc": "DATETIME"} -@task +@task(timeout=3600) def add_ingestion_metadata_task( df: pd.DataFrame, ): @@ -52,7 +52,7 @@ def add_ingestion_metadata_task( return df2 -@task +@task(timeout=3600) def get_latest_timestamp_file_path(files: List[str]) -> str: """ Return the name of the latest file in a given data lake directory, @@ -75,7 +75,7 @@ def get_latest_timestamp_file_path(files: List[str]) -> str: return latest_file -@task +@task(timeout=3600) def dtypes_to_json_task(dtypes_dict, local_json_path: str): """ Creates json file from a dictionary. @@ -87,7 +87,7 @@ def dtypes_to_json_task(dtypes_dict, local_json_path: str): json.dump(dtypes_dict, fp) -@task +@task(timeout=3600) def chunk_df(df: pd.DataFrame, size: int = 10_000) -> List[pd.DataFrame]: """ Creates pandas Dataframes list of chunks with a given size. @@ -100,7 +100,7 @@ def chunk_df(df: pd.DataFrame, size: int = 10_000) -> List[pd.DataFrame]: return chunks -@task +@task(timeout=3600) def df_get_data_types_task(df: pd.DataFrame) -> dict: """ Returns dictionary containing datatypes of pandas DataFrame columns. @@ -113,7 +113,7 @@ def df_get_data_types_task(df: pd.DataFrame) -> dict: return dtypes_dict -@task +@task(timeout=3600) def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: """Obtain SQL data types from a pandas DataFrame""" typeset = CompleteSet() @@ -156,14 +156,14 @@ def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: return dtypes_dict_fixed -@task +@task(timeout=3600) def update_dict(d: dict, d_new: dict) -> dict: d_copy = copy.deepcopy(d) d_copy.update(d_new) return d_copy -@task +@task(timeout=3600) def df_map_mixed_dtypes_for_parquet( df: pd.DataFrame, dtypes_dict: dict ) -> pd.DataFrame: @@ -185,7 +185,7 @@ def df_map_mixed_dtypes_for_parquet( return df_mapped -@task +@task(timeout=3600) def update_dtypes_dict(dtypes_dict: dict) -> dict: """ Task to update dtypes_dictionary that will be stored in the schema. It's required due to workaround Pandas to_parquet bug connected with mixed dtypes in object @@ -203,7 +203,7 @@ def update_dtypes_dict(dtypes_dict: dict) -> dict: return dtypes_dict_updated -@task +@task(timeout=3600) def df_to_csv( df: pd.DataFrame, path: str, @@ -243,7 +243,7 @@ def df_to_csv( out_df.to_csv(path, index=False, sep=sep) -@task +@task(timeout=3600) def df_to_parquet( df: pd.DataFrame, path: str, @@ -279,7 +279,7 @@ def df_to_parquet( out_df.to_parquet(path, index=False, **kwargs) -@task +@task(timeout=3600) def union_dfs_task(dfs: List[pd.DataFrame]): """ Create one DataFrame from a list of pandas DataFrames. @@ -289,7 +289,7 @@ def union_dfs_task(dfs: List[pd.DataFrame]): return pd.concat(dfs, ignore_index=True) -@task +@task(timeout=3600) def write_to_json(dict_, path): """ Creates json file from a dictionary. Log record informs about the writing file proccess. 
@@ -312,23 +312,20 @@ def write_to_json(dict_, path): logger.debug(f"Successfully wrote to {path}.") -@task +@task(timeout=3600) def cleanup_validation_clutter(expectations_path): ge_project_path = Path(expectations_path).parent shutil.rmtree(ge_project_path) -@task +@task(timeout=3600) def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: logger = prefect.context.get("logger") logger.info("Converting bytes in dataframe columns to list of integers") return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) -@task( - max_retries=3, - retry_delay=timedelta(seconds=10), -) +@task(max_retries=3, retry_delay=timedelta(seconds=10), timeout=3600) def df_to_dataset( df: pd.DataFrame, partitioning_flavor="hive", format="parquet", **kwargs ) -> None: @@ -436,7 +433,7 @@ def custom_mail_state_handler( return new_state -@task +@task(timeout=3600) def df_clean_column( df: pd.DataFrame, columns_to_clean: List[str] = None ) -> pd.DataFrame: @@ -473,7 +470,7 @@ def df_clean_column( return df -@task +@task(timeout=3600) def concat_dfs(dfs: List[pd.DataFrame]): """ Task to combine list of data frames into one. @@ -486,7 +483,7 @@ def concat_dfs(dfs: List[pd.DataFrame]): return pd.concat(dfs, axis=1) -@task +@task(timeout=3600) def cast_df_to_str(df: pd.DataFrame) -> pd.DataFrame: """ Task for casting an entire DataFrame to a string data type. Task is needed @@ -503,7 +500,7 @@ def cast_df_to_str(df: pd.DataFrame) -> pd.DataFrame: return df_mapped -@task +@task(timeout=3600) def set_new_kv(kv_name: str, df: pd.DataFrame, filter_column: str): """ Task for updating/setting key value on Prefect based on the newest @@ -532,7 +529,7 @@ def git_clone_url(self): return f"https://{self.git_token_secret}@{self.repo_host}/{self.repo}" -@task +@task(timeout=3600) def credentials_loader(credentials_secret: str, vault_name: str = None) -> dict: """ Function that gets credentials from azure Key Vault or PrefectSecret or from local config. diff --git a/viadot/tasks/bcp.py b/viadot/tasks/bcp.py index c81aba71c..e23b22459 100644 --- a/viadot/tasks/bcp.py +++ b/viadot/tasks/bcp.py @@ -34,6 +34,8 @@ class BCPTask(ShellTask): - on_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "skip". - credentials (dict, optional): The credentials to use for connecting with the database. - vault_name (str): The name of the vault from which to fetch the secret. + - timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. - **kwargs (dict, optional): Additional keyword arguments to pass to the Task constructor. """ @@ -49,6 +51,7 @@ def __init__( vault_name: str = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -67,6 +70,7 @@ def __init__( return_all=True, max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index d5d72028b..88fd69b7d 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -17,6 +17,8 @@ class DuckDBQuery(Task): Args: credentials (dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -35,7 +37,6 @@ def run( query: str, fetch_type: Literal["record", "dataframe"] = "record", credentials: dict = None, - timeout: int = None, ) -> Union[List[Record], bool]: """Run a query on DuckDB. @@ -71,6 +72,8 @@ class DuckDBCreateTableFromParquet(Task): if_exists (Literal, optional): What to do if the table already exists. if_empty (Literal, optional): What to do if ".parquet" file is emty. Defaults to "skip". credentials(dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Raises: ValueError: If the table exists and `if_exists`is set to `fail` or when parquet file @@ -86,6 +89,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["skip", "fail"] = "skip", credentials: dict = None, + timeout: int = 3600, *args, **kwargs, ): @@ -96,6 +100,7 @@ def __init__( super().__init__( name="duckdb_create_table", + timeout=timeout, *args, **kwargs, ) @@ -157,6 +162,8 @@ class DuckDBToDF(Task): if_empty (Literal[, optional): What to do if the query returns no data. Defaults to "warn". credentials (dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: a pandas DataFrame containing the table data. @@ -168,6 +175,7 @@ def __init__( table: str = None, if_empty: Literal["warn", "skip", "fail"] = "warn", credentials: dict = None, + timeout: int = 3600, *args, **kwargs, ): @@ -177,7 +185,7 @@ def __init__( self.if_empty = if_empty self.credentials = credentials - super().__init__(name="duckdb_to_df", *args, **kwargs) + super().__init__(name="duckdb_to_df", timeout=timeout, *args, **kwargs) @defaults_from_attrs("schema", "table", "if_empty", "credentials") def run( diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index a24f0ae33..b863db2fb 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -73,7 +73,6 @@ def __init__( "credentials", "max_retries", "retry_delay", - "timeout", ) def run( self, @@ -84,7 +83,6 @@ def run( rfc_total_col_width_character_limit: int = None, max_retries: int = None, retry_delay: timedelta = None, - timeout: int = None, ) -> pd.DataFrame: """Task run method. @@ -94,9 +92,9 @@ def run( multiple options are automatically tried. Defaults to None. func (str, optional): SAP RFC function to use. Defaults to None. rfc_total_col_width_character_limit (int, optional): Number of characters by which query will be split in chunks - in case of too many columns for RFC function. According to SAP documentation, the limit is - 512 characters. However, we observed SAP raising an exception even on a slightly lower number - of characters, so we add a safety margin. Defaults to None. + in case of too many columns for RFC function. According to SAP documentation, the limit is + 512 characters. However, we observed SAP raising an exception even on a slightly lower number + of characters, so we add a safety margin. Defaults to None. 
""" if query is None: raise ValueError("Please provide the query.") From d8b84fe4cb0830c4b2cb69e23240d308edc3db6e Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 08:16:16 +0100 Subject: [PATCH 11/55] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20added=20timeout=20to?= =?UTF-8?q?=20all=20Flows=20where=20it=20can=20be=20added.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_container_to_container.py | 8 ++++-- viadot/flows/adls_gen1_to_azure_sql.py | 11 +++++--- viadot/flows/adls_gen1_to_azure_sql_new.py | 12 ++++++--- viadot/flows/adls_gen1_to_gen2.py | 9 ++++--- viadot/flows/adls_to_azure_sql.py | 27 ++++++++++--------- viadot/flows/aselite_to_adls.py | 9 ++++--- viadot/flows/azure_sql_transform.py | 10 ++++--- viadot/flows/bigquery_to_adls.py | 14 ++++++---- .../cloud_for_customers_report_to_adls.py | 17 +++++++----- viadot/flows/duckdb_to_sql_server.py | 20 ++++++++------ viadot/flows/duckdb_transform.py | 10 ++++--- viadot/flows/epicor_to_duckdb.py | 11 +++++--- viadot/flows/flow_of_flows.py | 9 ++++--- viadot/flows/genesys_to_adls.py | 22 +++++++++++---- viadot/flows/mindful_to_adls.py | 11 +++++--- viadot/flows/multiple_flows.py | 11 +++++++- viadot/flows/mysql_to_adls.py | 11 +++++--- viadot/flows/outlook_to_adls.py | 11 ++++---- viadot/flows/prefect_logs.py | 6 ++++- viadot/flows/salesforce_to_adls.py | 14 ++++++---- viadot/flows/sap_rfc_to_adls.py | 10 ++++--- viadot/flows/sap_to_duckdb.py | 11 +++++--- viadot/flows/sftp_operations.py | 19 +++++++++---- viadot/flows/sharepoint_to_adls.py | 17 +++++++----- viadot/flows/sql_server_to_duckdb.py | 13 ++++----- viadot/flows/sql_server_transform.py | 9 ++++--- viadot/flows/supermetrics_to_adls.py | 17 +++++++----- viadot/flows/supermetrics_to_azure_sql.py | 14 ++++------ 28 files changed, 229 insertions(+), 134 deletions(-) diff --git a/viadot/flows/adls_container_to_container.py b/viadot/flows/adls_container_to_container.py index 7e59f3afc..e9018203e 100644 --- a/viadot/flows/adls_container_to_container.py +++ b/viadot/flows/adls_container_to_container.py @@ -6,12 +6,11 @@ from ..tasks import AzureDataLakeCopy -copy_task = AzureDataLakeCopy() logger = logging.get_logger(__name__) -@task +@task(timeout=3600) def is_stored_locally(f: Flow): return f.storage is None or isinstance(f.storage, Local) @@ -27,6 +26,8 @@ class ADLSContainerToContainer(Flow): ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. vault_name (str): The name of the vault from which to retrieve the secrets. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -36,6 +37,7 @@ def __init__( to_path: str, adls_sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -45,6 +47,7 @@ def __init__( self.to_path = to_path self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -53,6 +56,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + copy_task = AzureDataLakeCopy(timeout=self.timeout) copy_task.bind( from_path=self.from_path, to_path=self.to_path, diff --git a/viadot/flows/adls_gen1_to_azure_sql.py b/viadot/flows/adls_gen1_to_azure_sql.py index 0ad2e8a21..766a2b58e 100644 --- a/viadot/flows/adls_gen1_to_azure_sql.py +++ b/viadot/flows/adls_gen1_to_azure_sql.py @@ -5,10 +5,6 @@ from ..tasks import AzureDataLakeDownload, BlobFromCSV, CreateTableFromBlob -gen1_download_task = AzureDataLakeDownload(gen=1) -csv_to_blob_storage_task = BlobFromCSV() -blob_to_azure_sql_task = CreateTableFromBlob() - logger = logging.get_logger(__name__) @@ -24,6 +20,8 @@ class ADLSGen1ToAzureSQL(Flow): local_file_path (str): Where the gen1 file should be downloaded. sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials vault_name (str): The name of the vault from which to retrieve `sp_credentials_secret` + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -40,6 +38,7 @@ def __init__( if_exists: str = "replace", sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -55,6 +54,7 @@ def __init__( self.if_exists = if_exists self.sp_credentials_secret = sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -63,6 +63,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_download_task = AzureDataLakeDownload(gen=1, timeout=self.timeout) gen1_download_task.bind( from_path=self.path, to_path=self.local_file_path, @@ -71,12 +72,14 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + csv_to_blob_storage_task = BlobFromCSV(timeout=self.timeout) csv_to_blob_storage_task.bind( from_path=self.local_file_path, to_path=self.blob_path, overwrite=self.overwrite_blob, flow=self, ) + blob_to_azure_sql_task = CreateTableFromBlob(timeout=self.timeout) blob_to_azure_sql_task.bind( blob_path=self.blob_path, schema=self.schema, diff --git a/viadot/flows/adls_gen1_to_azure_sql_new.py b/viadot/flows/adls_gen1_to_azure_sql_new.py index 05f977280..972037d28 100644 --- a/viadot/flows/adls_gen1_to_azure_sql_new.py +++ b/viadot/flows/adls_gen1_to_azure_sql_new.py @@ -10,10 +10,6 @@ from ..tasks import AzureDataLakeToDF, AzureDataLakeUpload, AzureSQLCreateTable, BCPTask -gen1_to_df_task = AzureDataLakeToDF(gen=1) -gen2_upload_task = AzureDataLakeUpload(gen=2) -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() logger = logging.get_logger(__name__) @@ -47,6 +43,8 @@ class ADLSGen1ToAzureSQLNew(Flow): gen2_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen2 lake sqldb_credentials_secret (str): The Key Vault secret holding Azure SQL Database credentials vault_name (str): The name of the vault from which to retrieve 
`sp_credentials_secret` + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -69,6 +67,7 @@ def __init__( gen2_sp_credentials_secret: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -90,6 +89,7 @@ def __init__( self.gen2_sp_credentials_secret = gen2_sp_credentials_secret self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.dtypes.update(METADATA_COLUMNS) self.gen_flow() @@ -99,6 +99,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_to_df_task = AzureDataLakeToDF(gen=1, timeout=self.timeout) df = gen1_to_df_task.bind( path=self.gen1_path, gen=1, @@ -118,6 +119,7 @@ def gen_flow(self) -> Flow: sep=self.write_sep, flow=self, ) + gen2_upload_task = AzureDataLakeUpload(gen=2, timeout=self.timeout) gen2_upload_task.bind( from_path=self.local_file_path, to_path=self.gen2_path, @@ -126,6 +128,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -135,6 +138,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.schema, diff --git a/viadot/flows/adls_gen1_to_gen2.py b/viadot/flows/adls_gen1_to_gen2.py index 3764ec05b..fcea775d4 100644 --- a/viadot/flows/adls_gen1_to_gen2.py +++ b/viadot/flows/adls_gen1_to_gen2.py @@ -7,9 +7,6 @@ from ..tasks import AzureDataLakeDownload, AzureDataLakeUpload -gen1_download_task = AzureDataLakeDownload(gen=1) -gen2_upload_task = AzureDataLakeUpload(gen=2) - logger = logging.get_logger(__name__) @@ -37,6 +34,8 @@ class ADLSGen1ToGen2(Flow): gen1_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen1 lake gen2_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen2 lake vault_name (str): The name of the vault from which to retrieve the secrets. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
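A hedged usage sketch for the gen1-to-gen2 copy documented above, assuming the flow is exported from viadot.flows like the others and that the constructor arguments mirror the attributes used in gen_flow (gen1_path, gen2_path, local_file_path). The paths and secret names are invented; everything not shown keeps its defaults:

from viadot.flows import ADLSGen1ToGen2

flow = ADLSGen1ToGen2(
    name="orders gen1 to gen2",
    gen1_path="raw/orders.csv",                   # hypothetical gen1 source
    gen2_path="raw/orders/orders.csv",            # hypothetical gen2 target
    local_file_path="orders.csv",                 # intermediate local copy
    gen1_sp_credentials_secret="GEN1-SP-CREDS",   # hypothetical Key Vault secrets
    gen2_sp_credentials_secret="GEN2-SP-CREDS",
    timeout=3600,                                 # applies to both the download and the upload task
)
flow.run()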
""" def __init__( @@ -50,6 +49,7 @@ def __init__( gen1_sp_credentials_secret: str = None, gen2_sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -62,6 +62,7 @@ def __init__( self.gen1_sp_credentials_secret = gen1_sp_credentials_secret self.gen2_sp_credentials_secret = gen2_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -70,6 +71,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_download_task = AzureDataLakeDownload(gen=1, timeout=self.timeout) gen1_download_task.bind( from_path=self.gen1_path, to_path=self.local_file_path, @@ -79,6 +81,7 @@ def gen_flow(self) -> Flow: flow=self, ) add_ingestion_metadata.bind(path=self.local_file_path, sep=self.sep, flow=self) + gen2_upload_task = AzureDataLakeUpload(gen=2, timeout=self.timeout) gen2_upload_task.bind( from_path=self.local_file_path, to_path=self.gen2_path, diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 9cc371b5e..e69750a9e 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -21,23 +21,13 @@ logger = logging.get_logger(__name__) -lake_to_df_task = AzureDataLakeToDF() -download_json_file_task = AzureDataLakeDownload() -download_github_file_task = DownloadGitHubFile() -promote_to_conformed_task = AzureDataLakeCopy() -promote_to_operations_task = AzureDataLakeCopy() -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() -azure_query_task = AzureSQLDBQuery() -check_column_order_task = CheckColumnOrder() - -@task +@task(timeout=3600) def union_dfs_task(dfs: List[pd.DataFrame]): return pd.concat(dfs, ignore_index=True) -@task +@task(timeout=3600) def map_data_types_task(json_shema_path: str): file_dtypes = open(json_shema_path) dict_dtypes = json.load(file_dtypes) @@ -71,7 +61,7 @@ def map_data_types_task(json_shema_path: str): return dict_dtypes_mapped -@task +@task(timeout=3600) def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): # if table doesn't exist it will be created later - df equals None if df is None: @@ -109,6 +99,7 @@ def __init__( max_download_retries: int = 5, tags: List[str] = ["promotion"], vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -141,6 +132,8 @@ def __init__( max_download_retries (int, optional): How many times to retry the download. Defaults to 5. tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["promotion"]. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" adls_path = adls_path.strip("/") @@ -189,6 +182,7 @@ def __init__( self.max_download_retries = max_download_retries self.tags = tags self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -219,6 +213,7 @@ def get_promoted_path(self, env: str) -> str: return promoted_path def gen_flow(self) -> Flow: + lake_to_df_task = AzureDataLakeToDF(timeout=self.timeout) df = lake_to_df_task.bind( path=self.adls_path, sp_credentials_secret=self.adls_sp_credentials_secret, @@ -227,6 +222,7 @@ def gen_flow(self) -> Flow: ) if not self.dtypes: + download_json_file_task = AzureDataLakeDownload(timeout=self.timeout) download_json_file_task.bind( from_path=self.json_shema_path, to_path=self.local_json_path, @@ -238,6 +234,7 @@ def gen_flow(self) -> Flow: else: dtypes = self.dtypes + check_column_order_task = CheckColumnOrder(timeout=self.timeout) df_reorder = check_column_order_task.bind( table=self.table, schema=self.schema, @@ -263,6 +260,7 @@ def gen_flow(self) -> Flow: flow=self, ) + promote_to_conformed_task = AzureDataLakeCopy(timeout=self.timeout) promote_to_conformed_task.bind( from_path=self.adls_path, to_path=self.adls_path_conformed, @@ -270,6 +268,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + promote_to_operations_task = AzureDataLakeCopy(timeout=self.timeout) promote_to_operations_task.bind( from_path=self.adls_path_conformed, to_path=self.adls_path_operations, @@ -277,6 +276,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -286,6 +286,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.schema, diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 363c37fba..86e9b215b 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -6,9 +6,6 @@ from viadot.tasks import AzureDataLakeUpload from viadot.tasks.aselite import ASELiteToDF -df_task = ASELiteToDF() -file_to_adls_task = AzureDataLakeUpload() - class ASELiteToADLS(Flow): def __init__( @@ -26,6 +23,7 @@ def __init__( sp_credentials_secret: str = None, remove_special_characters: bool = None, columns_to_clean: List[str] = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -48,6 +46,8 @@ def __init__( remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret @@ -62,12 +62,14 @@ def __init__( self.sp_credentials_secret = sp_credentials_secret self.remove_special_characters = remove_special_characters self.columns_to_clean = columns_to_clean + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + df_task = ASELiteToDF(timeout=self.timeout) df = df_task.bind( query=self.query, credentials_secret=self.sqldb_credentials_secret, @@ -89,6 +91,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, diff --git a/viadot/flows/azure_sql_transform.py b/viadot/flows/azure_sql_transform.py index 2f854d5f9..4f1d5db78 100644 --- a/viadot/flows/azure_sql_transform.py +++ b/viadot/flows/azure_sql_transform.py @@ -2,9 +2,7 @@ from prefect import Flow -from ..tasks.azure_sql import AzureSQLDBQuery - -query_task = AzureSQLDBQuery() +from viadot.tasks.azure_sql import AzureSQLDBQuery class AzureSQLTransform(Flow): @@ -15,6 +13,7 @@ def __init__( sqldb_credentials_secret: str = None, vault_name: str = None, tags: List[str] = ["transform"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -28,17 +27,20 @@ def __init__( with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. tags (list, optional): Tag for marking flow. Defaults to "transform". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.query = query self.tags = tags self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name - self.tasks = [query_task] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = AzureSQLDBQuery(timeout=self.timeout) query_task.bind( query=self.query, credentials_secret=self.sqldb_credentials_secret, diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index cce497005..30dcc5a08 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -7,7 +7,7 @@ from prefect.backend import set_key_value from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_get_data_types_task, df_map_mixed_dtypes_for_parquet, @@ -16,11 +16,8 @@ dtypes_to_json_task, update_dtypes_dict, ) -from ..tasks import AzureDataLakeUpload, BigQueryToDF +from viadot.tasks import AzureDataLakeUpload, BigQueryToDF -bigquery_to_df_task = BigQueryToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() logger = logging.get_logger(__name__) @@ -44,6 +41,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -79,6 +77,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # BigQueryToDF self.credentials_key = credentials_key @@ -96,6 +96,7 @@ def __init__( self.if_exists = if_exists self.output_file_extension = output_file_extension self.now = str(pendulum.now("utc")) + self.timeout = timeout self.local_file_path = ( local_file_path or self.slugify(name) + self.output_file_extension @@ -125,6 +126,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + bigquery_to_df_task = BigQueryToDF(timeout=self.timeout) df = bigquery_to_df_task.bind( dataset_name=self.dataset_name, table_name=self.table_name, @@ -158,6 +160,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -171,6 +174,7 @@ def gen_flow(self) -> Flow: dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/cloud_for_customers_report_to_adls.py b/viadot/flows/cloud_for_customers_report_to_adls.py index 60c49c3b2..70481d9d4 100644 --- a/viadot/flows/cloud_for_customers_report_to_adls.py +++ b/viadot/flows/cloud_for_customers_report_to_adls.py @@ -4,18 +4,14 @@ import pendulum from prefect import Flow, Task, apply_map -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_to_csv, df_to_parquet, union_dfs_task, ) -from ..tasks import AzureDataLakeUpload, C4CReportToDF, C4CToDF -from ..utils import slugify - -file_to_adls_task = AzureDataLakeUpload() -c4c_report_to_df = C4CReportToDF() -c4c_to_df = C4CToDF() +from viadot.tasks import AzureDataLakeUpload, C4CReportToDF, C4CToDF +from viadot.utils import slugify class CloudForCustomersReportToADLS(Flow): @@ -42,6 +38,7 @@ def __init__( adls_sp_credentials_secret: str = None, if_empty: str = "warn", if_exists: str = "replace", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -75,6 +72,8 @@ def __init__( Defaults to None. if_empty (str, optional): What to do if the Supermetrics query returns no data. Defaults to "warn". if_exists (str, optional): What to do if the local file already exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.report_url = report_url @@ -83,6 +82,7 @@ def __init__( self.if_empty = if_empty self.env = env self.c4c_credentials_secret = c4c_credentials_secret + self.timeout = timeout # AzureDataLakeUpload self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -155,6 +155,7 @@ def gen_c4c( flow: Flow = None, ) -> Task: + c4c_to_df = C4CToDF(timeout=self.timeout) df = c4c_to_df.bind( url=url, endpoint=endpoint, @@ -170,6 +171,7 @@ def gen_c4c_report_months( self, report_urls_with_filters: Union[str, List[str]], flow: Flow = None ) -> Task: + c4c_report_to_df = C4CReportToDF(timeout=self.timeout) report = c4c_report_to_df.bind( report_url=report_urls_with_filters, skip=self.skip, @@ -214,6 +216,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/duckdb_to_sql_server.py b/viadot/flows/duckdb_to_sql_server.py index 5d83e14fb..890498fe8 100644 --- a/viadot/flows/duckdb_to_sql_server.py +++ b/viadot/flows/duckdb_to_sql_server.py @@ -5,18 +5,14 @@ from prefect import Flow, task from prefect.utilities import logging -from ..task_utils import df_to_csv as df_to_csv_task -from ..task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task -from ..tasks import BCPTask, DuckDBToDF, SQLServerCreateTable, DuckDBQuery +from viadot.task_utils import df_to_csv as df_to_csv_task +from viadot.task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task +from viadot.tasks import BCPTask, DuckDBToDF, SQLServerCreateTable, DuckDBQuery logger = logging.get_logger(__name__) -duckdb_to_df_task = DuckDBToDF() -create_table_task = SQLServerCreateTable() -bulk_insert_task = BCPTask() - -@task +@task(timeout=3600) def cleanup_csv_task(path: str): logger = prefect.context.get("logger") @@ -50,6 +46,7 @@ def __init__( on_bcp_error: Literal["skip", "fail"] = "skip", bcp_error_log_path="./log_file.log", tags: List[str] = ["load"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -76,6 +73,8 @@ def __init__( on_bcp_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "skip". bcp_error_log_path (string, optional): Full path of an error file. Defaults to "./log_file.log". tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["load"]. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # DuckDBToDF @@ -102,6 +101,7 @@ def __init__( # Global self.tags = tags + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -120,6 +120,7 @@ def slugify(name): def gen_flow(self) -> Flow: if self.duckdb_query is None: + duckdb_to_df_task = DuckDBToDF(timeout=self.timeout) df = duckdb_to_df_task.bind( schema=self.duckdb_schema, table=self.duckdb_table, @@ -147,6 +148,7 @@ def gen_flow(self) -> Flow: else: dtypes = get_sql_dtypes_from_df_task.bind(df=df, flow=self) + create_table_task = SQLServerCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.sql_server_schema, table=self.sql_server_table, @@ -155,6 +157,8 @@ def gen_flow(self) -> Flow: credentials=self.sql_server_credentials, flow=self, ) + + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.sql_server_schema, diff --git a/viadot/flows/duckdb_transform.py b/viadot/flows/duckdb_transform.py index 934539cc7..950a2812b 100644 --- a/viadot/flows/duckdb_transform.py +++ b/viadot/flows/duckdb_transform.py @@ -2,9 +2,7 @@ from prefect import Flow -from ..tasks.duckdb import DuckDBQuery - -query_task = DuckDBQuery() +from viadot.tasks.duckdb import DuckDBQuery class DuckDBTransform(Flow): @@ -14,6 +12,7 @@ def __init__( query: str, credentials: dict = None, tags: List[str] = ["transform"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -25,16 +24,19 @@ def __init__( query (str, required): The query to execute on the database. credentials (dict, optional): Credentials for the connection. Defaults to None. tags (list, optional): Tag for marking flow. Defaults to "transform". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.query = query self.credentials = credentials self.tags = tags - self.tasks = [query_task] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = DuckDBQuery(timeout=self.timeout) query_task.bind( query=self.query, credentials=self.credentials, diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py index b074b0f4b..77d4da895 100644 --- a/viadot/flows/epicor_to_duckdb.py +++ b/viadot/flows/epicor_to_duckdb.py @@ -2,8 +2,8 @@ from prefect import Flow -from ..task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet -from ..tasks import DuckDBCreateTableFromParquet, EpicorOrdersToDF +from viadot.task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet +from viadot.tasks import DuckDBCreateTableFromParquet, EpicorOrdersToDF class EpicorOrdersToDuckDB(Flow): @@ -22,6 +22,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["warn", "skip", "fail"] = "skip", duckdb_credentials: dict = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -42,6 +43,8 @@ def __init__( if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". if_empty (Literal, optional): What to do if Parquet file is empty. Defaults to "skip". duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.base_url = base_url self.epicor_credentials = epicor_credentials @@ -61,9 +64,11 @@ def __init__( self.df_task = EpicorOrdersToDF( base_url=self.base_url, filters_xml=self.filters_xml, + timeout=timeout, ) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, + timeout=timeout, ) self.gen_flow() diff --git a/viadot/flows/flow_of_flows.py b/viadot/flows/flow_of_flows.py index c6bf4b1a0..9f8053eb3 100644 --- a/viadot/flows/flow_of_flows.py +++ b/viadot/flows/flow_of_flows.py @@ -3,9 +3,6 @@ from prefect import Flow, Task, apply_map from prefect.tasks.prefect import StartFlowRun -start_flow_run_task = StartFlowRun(wait=True) -start_flow_run_task_2 = StartFlowRun(wait=True) - class Pipeline(Flow): def __init__( @@ -14,25 +11,29 @@ def __init__( project_name: str, extract_flows_names: List[str], transform_flow_name: str, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): self.extract_flows_names = extract_flows_names self.transform_flow_name = transform_flow_name self.project_name = project_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_start_flow_run_task(self, flow_name: str, flow: Flow = None) -> Task: + start_flow_run_task = StartFlowRun(wait=True, timeout=self.timeout) t = start_flow_run_task.bind( flow_name=flow_name, project_name=self.project_name, flow=flow ) return t - def gen_flow(self) -> Flow: + def gen_flow(self): extract_flow_runs = apply_map( self.gen_start_flow_run_task, self.extract_flows_names, flow=self ) + start_flow_run_task_2 = StartFlowRun(wait=True, timeout=self.timeout) transform_flow_run = start_flow_run_task_2.bind( flow_name=self.transform_flow_name, project_name=self.project_name, diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index c4c894a76..da867c176 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -12,8 +12,6 @@ df_to_parquet, ) -file_to_adls_task = AzureDataLakeUpload() - @task def adls_bulk_upload( @@ -21,6 +19,7 @@ def adls_bulk_upload( adls_file_path: str = None, adls_sp_credentials_secret: str = None, adls_overwrite: bool = True, + task_timeout: int = 3600, ) -> List[str]: """ Function that upload files to defined path in ADLS. @@ -31,12 +30,15 @@ def adls_bulk_upload( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. + task_timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: List[str]: List of paths """ for file in file_names: file_path = str(adls_file_path + "/" + file) + file_to_adls_task = AzureDataLakeUpload(timeout=task_timeout) file_to_adls_task.run( from_path=file, to_path=file_path, @@ -81,6 +83,7 @@ def __init__( overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -109,6 +112,8 @@ def __init__( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. 
credentials(dict, optional): Credentials for the genesys api. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # GenesysToCSV self.flow_name = name @@ -125,6 +130,8 @@ def __init__( self.end_date = end_date self.days_interval = days_interval self.sep = sep + self.timeout = timeout + # AzureDataLake self.local_file_path = local_file_path self.adls_file_path = adls_file_path @@ -138,7 +145,7 @@ def __init__( def gen_flow(self) -> Flow: - to_csv = GenesysToCSV() + to_csv = GenesysToCSV(timeout=self.timeout) if self.view_type == "queue_performance_detail_view": file_names = to_csv.bind( @@ -175,6 +182,7 @@ def gen_flow(self) -> Flow: file_names=file_names, adls_file_path=self.adls_file_path, adls_sp_credentials_secret=self.adls_sp_credentials_secret, + task_timeout=self.timeout, flow=self, ) @@ -197,6 +205,7 @@ def __init__( adls_sp_credentials_secret: str = None, credentials_secret: str = None, schedule_id: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -219,7 +228,8 @@ def __init__( Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret for Genesys project. Defaults to None. schedule_id (str, optional): ID of the schedule report job. Defaults to None. - + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.name = name @@ -235,6 +245,7 @@ def __init__( self.credentials_secret = credentials_secret self.if_exsists = if_exists self.schedule_id = schedule_id + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -242,7 +253,7 @@ def __init__( def gen_flow(self) -> Flow: - genesys_report = GenesysToDF() + genesys_report = GenesysToDF(timeout=self.timeout) df = genesys_report.bind( report_columns=self.columns, @@ -268,6 +279,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index 6b1981fba..296cef894 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -8,8 +8,6 @@ from viadot.tasks import AzureDataLakeUpload from viadot.task_utils import add_ingestion_metadata_task -file_to_adls_task = AzureDataLakeUpload() - @task def adls_bulk_upload( @@ -18,6 +16,7 @@ def adls_bulk_upload( adls_file_path: str = None, adls_sp_credentials_secret: str = None, adls_overwrite: bool = True, + task_timeout: int = 3600, ) -> List[str]: """Function that upload files to defined path in ADLS. @@ -35,6 +34,7 @@ def adls_bulk_upload( for file in file_names: file_path = str(adls_file_path + "/" + file) + file_to_adls_task = AzureDataLakeUpload(timeout=task_timeout) file_to_adls_task.run( from_path=os.path.join(file_name_relative_path, file), to_path=file_path, @@ -69,6 +69,7 @@ def __init__( region: Literal["us1", "us2", "us3", "ca1", "eu1", "au1"] = "eu1", file_extension: Literal["parquet", "csv"] = "csv", sep: str = "\t", + timeout: int = 3600, file_path: str = "", adls_file_path: str = None, adls_overwrite: bool = True, @@ -89,6 +90,8 @@ def __init__( region (Literal[us1, us2, us3, ca1, eu1, au1], optional): SD region from where to interact with the mindful API. Defaults to "eu1". file_extension (Literal[parquet, csv], optional): File extensions for storing responses. 
Defaults to "csv". sep (str, optional): Separator in csv file. Defaults to "\t". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. file_path (str, optional): Path where to save the file locally. Defaults to ''. adls_file_path (str, optional): The destination path at ADLS. Defaults to None. adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. @@ -105,6 +108,7 @@ def __init__( self.file_extension = file_extension self.sep = sep self.file_path = file_path + self.timeout = timeout self.adls_file_path = adls_file_path self.adls_overwrite = adls_overwrite @@ -115,7 +119,7 @@ def __init__( self.mind_flow() def mind_flow(self) -> Flow: - to_csv = MindfulToCSV() + to_csv = MindfulToCSV(timeout=self.timeout) file_names = to_csv.bind( credentials_mindful=self.credentials_mindful, @@ -137,6 +141,7 @@ def mind_flow(self) -> Flow: adls_file_path=self.adls_file_path, adls_sp_credentials_secret=self.adls_sp_credentials_secret, adls_overwrite=self.adls_overwrite, + task_timeout=self.timeout, flow=self, ) diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py index 8631d5799..461c4e2db 100644 --- a/viadot/flows/multiple_flows.py +++ b/viadot/flows/multiple_flows.py @@ -40,18 +40,27 @@ class MultipleFlows(Flow): flow_name(str): Name of a new flow. flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Flows have to be in the correct oreder. Defaults to [List[None]]. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( self, name: str, flows_list: List[List] = [List[None]], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): self.flows_list = flows_list + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: - run_flows_list.bind(flow_name=self.name, flows_list=self.flows_list, flow=self) + run_flows_list.bind( + flow_name=self.name, + flows_list=self.flows_list, + timeout=self.timeout, + flow=self, + ) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index a7a390717..4452a5536 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -6,8 +6,6 @@ from viadot.tasks import AzureDataLakeUpload from viadot.tasks.mysql_to_df import MySqlToDf -file_to_adls_task = AzureDataLakeUpload() - class MySqlToADLS(Flow): def __init__( @@ -24,6 +22,7 @@ def __init__( overwrite_adls: bool = True, sp_credentials_secret: str = None, credentials_secret: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -47,6 +46,8 @@ def __init__( credentials_secret (str, optional): Key Vault name. Defaults to None. columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
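The flows_list shape described in the MultipleFlows docstring above, as a usage sketch (flow and project names are placeholders; the flow-level timeout is left out because a later patch in this series removes it again):

from viadot.flows import MultipleFlows

flow = MultipleFlows(
    name="ordered run",
    flows_list=[
        ["extract source a", "dev"],   # [flow name, project name], runs first
        ["transform sources", "dev"],  # runs second
    ],
)
flow.run()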
""" # Connect to sql @@ -55,6 +56,7 @@ def __init__( self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite_adls = overwrite_adls + # Upload to ADLS self.file_path = file_path self.sep = sep @@ -62,6 +64,7 @@ def __init__( self.if_exists = if_exists self.sp_credentials_secret = sp_credentials_secret self.credentials_secret = credentials_secret + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -69,8 +72,7 @@ def __init__( def gen_flow(self) -> Flow: - df_task = MySqlToDf(country_short=self.country_short) - + df_task = MySqlToDf(country_short=self.country_short, timeout=self.timeout) df = df_task.bind( credentials_secret=self.credentials_secret, query=self.query, flow=self ) @@ -83,6 +85,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c5e86346d..af77bb553 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -10,9 +10,7 @@ union_dfs_task, credentials_loader, ) -from ..tasks import AzureDataLakeUpload, OutlookToDF - -file_to_adls_task = AzureDataLakeUpload() +from viadot.tasks import AzureDataLakeUpload, OutlookToDF class OutlookToADLS(Flow): @@ -28,7 +26,7 @@ def __init__( overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, limit: int = 10000, - timeout: int = 1200, + timeout: int = 3600, if_exists: Literal["append", "replace", "skip"] = "append", outlook_credentials_secret: str = "OUTLOOK", *args: List[Any], @@ -50,7 +48,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. outlook_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with outlook credentials. limit (int, optional): Number of fetched top messages. Defaults to 10000. - timeout (int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 1200. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. if_exists (Literal['append', 'replace', 'skip'], optional): What to do if the local file already exists. Defaults to "append". """ @@ -80,7 +79,6 @@ def gen_outlook_df( credentials_secret=self.outlook_credentials_secret ) outlook_to_df = OutlookToDF(timeout=self.timeout, credentials=credentials) - df = outlook_to_df.bind( mailbox_name=mailbox_list, start_date=self.start_date, @@ -113,6 +111,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/prefect_logs.py b/viadot/flows/prefect_logs.py index 3c377a4cb..6995d4dd3 100644 --- a/viadot/flows/prefect_logs.py +++ b/viadot/flows/prefect_logs.py @@ -12,7 +12,6 @@ from viadot.task_utils import add_ingestion_metadata_task, df_to_parquet logger = logging.get_logger() -azure_dl_upload_task = AzureDataLakeUpload() class PrefectLogs(Flow): @@ -27,6 +26,7 @@ def __init__( adls_sp_credentials_secret: str = None, vault_name: str = None, overwrite_adls: bool = True, + timeout: int = 3600, *args, **kwargs, ): @@ -46,6 +46,8 @@ def __init__( Defaults to None. 
vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Example query: { @@ -86,6 +88,7 @@ def __init__( self.adls_path = adls_path self.vault_name = vault_name self.overwrite_adls = overwrite_adls + self.timeout = timeout self.adls_sp_credentials_secret = adls_sp_credentials_secret if scheduled_start_time == "yesterday": @@ -247,6 +250,7 @@ def gen_flow(self) -> Flow: flow=self, ) + azure_dl_upload_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = azure_dl_upload_task.bind( from_path=self.local_file_path, to_path=self.adls_path, diff --git a/viadot/flows/salesforce_to_adls.py b/viadot/flows/salesforce_to_adls.py index 043b5118e..f98b63a23 100644 --- a/viadot/flows/salesforce_to_adls.py +++ b/viadot/flows/salesforce_to_adls.py @@ -7,7 +7,7 @@ from prefect.backend import set_key_value from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_clean_column, df_get_data_types_task, @@ -17,11 +17,8 @@ dtypes_to_json_task, update_dtypes_dict, ) -from ..tasks import AzureDataLakeUpload, SalesforceToDF +from viadot.tasks import AzureDataLakeUpload, SalesforceToDF -salesforce_to_df_task = SalesforceToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() logger = logging.get_logger(__name__) @@ -45,6 +42,7 @@ def __init__( adls_file_name: str = None, adls_sp_credentials_secret: str = None, if_exists: str = "replace", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -73,6 +71,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
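A hedged usage sketch for SalesforceToADLS, assuming the constructor arguments mirror the attributes set in __init__ (query, adls_sp_credentials_secret) and that the flow is exported from viadot.flows. The SOQL query and secret name are invented; everything else keeps its defaults:

from viadot.flows import SalesforceToADLS

flow = SalesforceToADLS(
    name="salesforce accounts to adls",
    query="SELECT Id, Name FROM Account",        # hypothetical SOQL query
    adls_sp_credentials_secret="ADLS-SP-CREDS",  # hypothetical Key Vault secret
    timeout=3600,
)
flow.run()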
""" # SalesforceToDF self.query = query @@ -89,6 +89,7 @@ def __init__( self.if_exists = if_exists self.output_file_extension = output_file_extension self.now = str(pendulum.now("utc")) + self.timeout = timeout self.local_file_path = ( local_file_path or self.slugify(name) + self.output_file_extension @@ -119,6 +120,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + salesforce_to_df_task = SalesforceToDF(timeout=self.timeout) df = salesforce_to_df_task.bind( query=self.query, table=self.table, @@ -153,6 +155,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -166,6 +169,7 @@ def gen_flow(self) -> Flow: dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 037ee5701..d23ffc428 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -6,9 +6,6 @@ from viadot.task_utils import concat_dfs, df_to_csv, df_to_parquet, set_new_kv from viadot.tasks import AzureDataLakeUpload, SAPRFCToDF -download_sap_task = SAPRFCToDF() -file_to_adls_task = AzureDataLakeUpload() - class SAPRFCToADLS(Flow): def __init__( @@ -29,6 +26,7 @@ def __init__( vault_name: str = None, update_kv: bool = False, filter_column: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -66,6 +64,8 @@ def __init__( vault_name(str, optional): The name of the vault from which to obtain the secrets. Defaults to None. update_kv (bool, optional): Whether or not to update key value on Prefect. Defaults to False. filter_column (str, optional): Name of the field based on which key value will be updated. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.query = query self.rfc_sep = rfc_sep @@ -80,6 +80,7 @@ def __init__( self.overwrite = overwrite self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout self.update_kv = update_kv self.filter_column = filter_column @@ -89,7 +90,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - + download_sap_task = SAPRFCToDF(timeout=self.timeout) df = download_sap_task( query=self.query, sep=self.rfc_sep, @@ -115,6 +116,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_path, diff --git a/viadot/flows/sap_to_duckdb.py b/viadot/flows/sap_to_duckdb.py index aa9847b14..081ae338c 100644 --- a/viadot/flows/sap_to_duckdb.py +++ b/viadot/flows/sap_to_duckdb.py @@ -7,13 +7,13 @@ logger = logging.get_logger() -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, cast_df_to_str, df_to_parquet, set_new_kv, ) -from ..tasks import DuckDBCreateTableFromParquet, SAPRFCToDF +from viadot.tasks import DuckDBCreateTableFromParquet, SAPRFCToDF class SAPToDuckDB(Flow): @@ -35,6 +35,7 @@ def __init__( duckdb_credentials: dict = None, update_kv: bool = False, filter_column: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -60,6 +61,8 @@ def __init__( duckdb_credentials (dict, optional): The config to use for connecting with DuckDB. Defaults to None. update_kv (bool, optional): Whether or not to update key value on Prefect. Defaults to False. filter_column (str, optional): Name of the field based on which key value will be updated. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # SAPRFCToDF @@ -81,9 +84,9 @@ def __init__( super().__init__(*args, name=name, **kwargs) - self.sap_to_df_task = SAPRFCToDF(credentials=sap_credentials) + self.sap_to_df_task = SAPRFCToDF(credentials=sap_credentials, timeout=timeout) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, timeout=timeout ) self.gen_flow() diff --git a/viadot/flows/sftp_operations.py b/viadot/flows/sftp_operations.py index 2396c8f3e..7bc6d60a1 100644 --- a/viadot/flows/sftp_operations.py +++ b/viadot/flows/sftp_operations.py @@ -7,11 +7,6 @@ from viadot.task_utils import add_ingestion_metadata_task -upload_to_adls = AzureDataLakeUpload() -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() - - class SftpToAzureSQL(Flow): def __init__( self, @@ -31,6 +26,7 @@ def __init__( on_bcp_error: Literal["skip", "fail"] = "fail", error_log_file_path: str = "SFTP_logs.log", vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -55,6 +51,8 @@ def __init__( on_bcp_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "fail". error_log_file_path (string, optional): Full path of an error file. Defaults to "./log_file.log". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SFTP self.from_path = from_path @@ -70,6 +68,7 @@ def __init__( self.sep = sep self.remove_tab = remove_tab + self.timeout = timeout # Read schema self.schema = schema @@ -108,6 +107,7 @@ def gen_flow(self) -> Flow: sftp = SftpToDF( sftp_credentials_secret=self.sftp_credentials_secret, credentials=self.sftp_credentials, + timeout=self.timeout, ) df = sftp.bind( from_path=self.from_path, @@ -122,6 +122,7 @@ def gen_flow(self) -> Flow: flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -132,6 +133,7 @@ def gen_flow(self) -> Flow: flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.file_name, schema=self.schema, @@ -163,6 +165,7 @@ def __init__( sftp_credentials: Dict[str, Any] = None, sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -183,12 +186,16 @@ def __init__( sftp_credentials (Dict[str, Any], optional): SFTP server credentials. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # SFTP self.from_path = from_path self.sftp_credentials_secret = sftp_credentials_secret self.sftp_credentials = sftp_credentials self.columns = columns + self.timeout = timeout + # File args if file_name is None: self.file_name = from_path.split("/")[-1] @@ -221,6 +228,7 @@ def gen_flow(self) -> Flow: ftp = SftpToDF( sftp_credentials_secret=self.sftp_credentials_secret, credentials=self.sftp_credentials, + timeout=self.timeout, ) df = ftp.bind( from_path=self.from_path, @@ -231,6 +239,7 @@ def gen_flow(self) -> Flow: df=df, remove_tab=self.remove_tab, path=self.file_name, flow=self ) + upload_to_adls = AzureDataLakeUpload(timeout=self.timeout) upload_df = upload_to_adls.bind( from_path=self.file_name, to_path=self.to_path, diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 08841abcb..d6ecb7b6d 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -9,7 +9,7 @@ logger = logging.get_logger() -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_get_data_types_task, df_map_mixed_dtypes_for_parquet, @@ -17,12 +17,8 @@ df_to_parquet, dtypes_to_json_task, ) -from ..tasks import AzureDataLakeUpload -from ..tasks.sharepoint import SharepointToDF - -excel_to_df_task = SharepointToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() +from viadot.tasks import AzureDataLakeUpload +from viadot.tasks.sharepoint import SharepointToDF class SharepointToADLS(Flow): @@ -42,6 +38,7 @@ def __init__( overwrite_adls: bool = False, if_empty: str = "warn", if_exists: str = "replace", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -65,6 +62,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SharepointToDF self.if_empty = if_empty @@ -74,6 +73,7 @@ def __init__( self.local_dir_path = local_dir_path self.sheet_number = sheet_number self.validate_excel_file = validate_excel_file + self.timeout = timeout # AzureDataLakeUpload self.overwrite = overwrite_adls @@ -107,6 +107,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: + excel_to_df_task = SharepointToDF(timeout=self.timeout) df = excel_to_df_task.bind( path_to_file=self.path_to_file, url_to_file=self.url_to_file, @@ -137,6 +138,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -148,6 +150,7 @@ def gen_flow(self) -> Flow: dtypes_to_json_task.bind( dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/sql_server_to_duckdb.py b/viadot/flows/sql_server_to_duckdb.py index f41348ab0..60461f47c 100644 --- a/viadot/flows/sql_server_to_duckdb.py +++ b/viadot/flows/sql_server_to_duckdb.py @@ -2,10 +2,8 @@ from prefect import Flow -from ..task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet -from ..tasks import DuckDBCreateTableFromParquet, SQLServerToDF - -df_task = SQLServerToDF() +from viadot.task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet +from viadot.tasks import DuckDBCreateTableFromParquet, SQLServerToDF class SQLServerToDuckDB(Flow): @@ -20,6 +18,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["warn", "skip", "fail"] = "skip", duckdb_credentials: dict = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -37,7 +36,8 @@ def __init__( if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". if_empty (Literal, optional): What to do if Parquet file is empty. Defaults to "skip". duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. - + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # SQLServerToDF self.sql_query = sql_query @@ -54,12 +54,13 @@ def __init__( super().__init__(*args, name=name, **kwargs) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, timeout=timeout ) self.gen_flow() def gen_flow(self) -> Flow: + df_task = SQLServerToDF() df = df_task.bind( config_key=self.sqlserver_config_key, query=self.sql_query, flow=self ) diff --git a/viadot/flows/sql_server_transform.py b/viadot/flows/sql_server_transform.py index dcefe59ed..324dc93bd 100644 --- a/viadot/flows/sql_server_transform.py +++ b/viadot/flows/sql_server_transform.py @@ -1,9 +1,7 @@ from prefect import Flow, config from typing import Any, Dict, List, Literal -from ..tasks import SQLServerQuery - -query_task = SQLServerQuery() +from viadot.tasks import SQLServerQuery class SQLServerTransform(Flow): @@ -12,6 +10,7 @@ def __init__( name: str, query: str, config_key: str, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -22,14 +21,18 @@ def __init__( name (str,required): The name of the flow. query (str, required): The query to execute on the database. 
config_key (str, required): Config key containing credentials for the SQL Server connection. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.query = query self.config_key = config_key + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = SQLServerQuery(timeout=self.timeout) query_task.bind( query=self.query, config_key=self.config_key, diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 883cd9964..38255a38f 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -8,7 +8,7 @@ from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, cleanup_validation_clutter, df_get_data_types_task, @@ -20,7 +20,7 @@ update_dtypes_dict, write_to_json, ) -from ..tasks import ( +from viadot.tasks import ( AzureDataLakeUpload, DownloadGitHubFile, GetFlowNewDateRange, @@ -30,12 +30,7 @@ logger = logging.get_logger(__name__) -supermetrics_to_df_task = SupermetricsToDF() -download_github_file_task = DownloadGitHubFile() validation_task = RunGreatExpectationsValidation() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() -prefect_get_new_date_range = GetFlowNewDateRange() class SupermetricsToADLS(Flow): @@ -72,6 +67,7 @@ def __init__( tags: List[str] = ["extract"], vault_name: str = None, check_missing_data: bool = True, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -114,6 +110,8 @@ def __init__( tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["extract"]. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. check_missing_data (bool, optional): Whether to check missing data. Defaults to True. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
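Going back to the SQLServerTransform hunk above: all of its constructor arguments are visible there, so a usage sketch is straightforward, assuming the flow is exported from viadot.flows; the query and config key themselves are placeholders:

from viadot.flows import SQLServerTransform

flow = SQLServerTransform(
    name="refresh staging",
    query="EXEC dbo.refresh_staging;",   # hypothetical SQL statement
    config_key="SQL_SERVER_DEV",         # hypothetical credentials config entry
    timeout=600,                         # shorter than the 3600 s default
)
flow.run()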
""" if not ds_user: try: @@ -124,6 +122,7 @@ def __init__( self.flow_name = name self.check_missing_data = check_missing_data + self.timeout = timeout # SupermetricsToDF self.ds_id = ds_id self.ds_accounts = ds_accounts @@ -191,6 +190,7 @@ def slugify(name): def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: + supermetrics_to_df_task = SupermetricsToDF(timeout=self.timeout) t = supermetrics_to_df_task.bind( ds_id=self.ds_id, ds_accounts=ds_accounts, @@ -215,6 +215,7 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.check_missing_data is True: if self.date_range_type is not None and "days" in self.date_range_type: + prefect_get_new_date_range = GetFlowNewDateRange(timeout=self.timeout) self.date_range_type = prefect_get_new_date_range.run( flow_name=self.flow_name, date_range_type=self.date_range_type, @@ -276,6 +277,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -289,6 +291,7 @@ def gen_flow(self) -> Flow: dtypes_to_json_task.bind( dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/supermetrics_to_azure_sql.py b/viadot/flows/supermetrics_to_azure_sql.py index ddb6e087e..c98c34c05 100644 --- a/viadot/flows/supermetrics_to_azure_sql.py +++ b/viadot/flows/supermetrics_to_azure_sql.py @@ -7,10 +7,6 @@ logger = logging.get_logger(__name__) -supermetrics_to_csv_task = SupermetricsToCSV() -csv_to_blob_storage_task = BlobFromCSV() -blob_to_azure_sql_task = CreateTableFromBlob() - class SupermetricsToAzureSQL(Flow): def __init__( @@ -42,6 +38,7 @@ def __init__( parallel: bool = True, tags: List[str] = ["extract"], sep: str = "\t", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -71,11 +68,7 @@ def __init__( self.parallel = parallel self.tags = tags self.sep = sep - self.tasks = [ - supermetrics_to_csv_task, - csv_to_blob_storage_task, - blob_to_azure_sql_task, - ] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -86,6 +79,7 @@ def slugify(name): def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: + supermetrics_to_csv_task = SupermetricsToCSV(timeout=self.timeout) t = supermetrics_to_csv_task.bind( ds_id=self.ds_id, ds_accounts=ds_accounts, @@ -121,12 +115,14 @@ def gen_flow(self) -> Flow: ds_accounts=self.ds_accounts, flow=self ) + csv_to_blob_storage_task = BlobFromCSV(timeout=self.timeout) csv_to_blob_storage_task.bind( from_path=self.local_file_path, to_path=self.blob_path, overwrite=self.overwrite_blob, flow=self, ) + blob_to_azure_sql_task = CreateTableFromBlob(timeout=self.timeout) blob_to_azure_sql_task.bind( blob_path=self.blob_path, schema=self.schema, From 50942757223bde79670ca056c955d88955e4d295 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 08:41:03 +0100 Subject: [PATCH 12/55] =?UTF-8?q?=F0=9F=8E=A8=20correct=20timeout=20in=20m?= =?UTF-8?q?ultiple=5Fflows.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/multiple_flows.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py index 461c4e2db..49e88bd34 
100644 --- a/viadot/flows/multiple_flows.py +++ b/viadot/flows/multiple_flows.py @@ -7,7 +7,7 @@ logger = logging.get_logger() -@task +@task(timeout=3600) def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): """ Task for running multiple flows in the given order. Task will create flow of flows. @@ -40,20 +40,16 @@ class MultipleFlows(Flow): flow_name(str): Name of a new flow. flows_list(List[List]): List containing lists of flow names and project names - [["flow1_name" , "project_name"], ["flow2_name" , "project_name"]]. Flows have to be in the correct oreder. Defaults to [List[None]]. - timeout(int, optional): The amount of time (in seconds) to wait while running this task before - a timeout occurs. Defaults to 3600. """ def __init__( self, name: str, flows_list: List[List] = [List[None]], - timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): self.flows_list = flows_list - self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -61,6 +57,5 @@ def gen_flow(self) -> Flow: run_flows_list.bind( flow_name=self.name, flows_list=self.flows_list, - timeout=self.timeout, flow=self, ) From fdc078ddc3a215d1bd3787b19d8c6315f7e608e3 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 08:49:13 +0100 Subject: [PATCH 13/55] =?UTF-8?q?=F0=9F=8E=A8=20added=20timeout=20sql=20se?= =?UTF-8?q?rver=20to=20duckdb.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sql_server_to_duckdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/viadot/flows/sql_server_to_duckdb.py b/viadot/flows/sql_server_to_duckdb.py index 60461f47c..71ea15d22 100644 --- a/viadot/flows/sql_server_to_duckdb.py +++ b/viadot/flows/sql_server_to_duckdb.py @@ -42,6 +42,7 @@ def __init__( # SQLServerToDF self.sql_query = sql_query self.sqlserver_config_key = sqlserver_config_key + self.timeout = timeout # DuckDBCreateTableFromParquet self.local_file_path = local_file_path @@ -60,7 +61,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - df_task = SQLServerToDF() + df_task = SQLServerToDF(timeout=self.timeout) df = df_task.bind( config_key=self.sqlserver_config_key, query=self.sql_query, flow=self ) From 921287979564a7fd5046b567bf9c33132261c20a Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 09:07:27 +0100 Subject: [PATCH 14/55] =?UTF-8?q?=F0=9F=90=9B=20fixed=20simple=20bug=20on?= =?UTF-8?q?=20duckbd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/duckdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index 88fd69b7d..0f12aad43 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -31,7 +31,7 @@ def __init__( self.credentials = credentials super().__init__(name="run_duckdb_query", timeout=timeout, *args, **kwargs) - @defaults_from_attrs("credentials", "timeout") + @defaults_from_attrs("credentials") def run( self, query: str, From f58e4f5ad525412230066677f6be147ac36cc532 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 09:21:53 +0100 Subject: [PATCH 15/55] =?UTF-8?q?=F0=9F=93=9D=20added=20Rafal=20feedback.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 983ed94ec..7ade37305 100644 --- a/viadot/task_utils.py +++ 
b/viadot/task_utils.py @@ -591,7 +591,7 @@ def adls_bulk_upload( adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. Returns: - List[str]: List of paths + List[str]: List of paths. """ file_to_adls_task = AzureDataLakeUpload() From c2fbde416837f3d61281e4f2086203ade1420d69 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 13:03:17 +0100 Subject: [PATCH 16/55] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20added=20timeout=20pa?= =?UTF-8?q?rameter.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/task_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 7ade37305..0cef1f1bc 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -579,6 +579,7 @@ def adls_bulk_upload( adls_file_path: str = None, adls_sp_credentials_secret: str = None, adls_overwrite: bool = True, + timeout: int = 3600, ) -> List[str]: """Function that upload files to defined path in ADLS. @@ -589,12 +590,13 @@ def adls_bulk_upload( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. - + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: List[str]: List of paths. """ - file_to_adls_task = AzureDataLakeUpload() + file_to_adls_task = AzureDataLakeUpload(timeout=timeout) for file in file_names: file_to_adls_task.run( From 403530042492d387c736c98afd89b44f5c431784 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 4 Jan 2023 16:02:17 +0100 Subject: [PATCH 17/55] =?UTF-8?q?=F0=9F=8E=A8=20added=20signal.FAIL=20to?= =?UTF-8?q?=20dtype=20sort.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index b1d662bef..f2ef5d3ce 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -5,8 +5,10 @@ import pandas as pd from prefect import Flow, task from prefect.backend import get_key_value +from prefect.engine import signals from prefect.utilities import logging + from viadot.tasks.azure_data_lake import AzureDataLakeDownload from ..tasks import ( @@ -89,13 +91,13 @@ def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): @task def check_dtypes_sort( - df: pd.DataFrame, + df: pd.DataFrame = None, dtypes: Dict[str, Any] = None, ) -> Dict[str, Any]: """Check dtype column order to avoid malformation SQL table. Args: - df (pd.DataFrame): Data Frame from original ADLS file. + df (pd.DataFrame, optional): Data Frame from original ADLS file. Defaults to None. dtypes (Dict[str, Any], optional): Dictionary of columns and data type to apply to the Data Frame downloaded. Defaults to None. @@ -103,7 +105,8 @@ def check_dtypes_sort( Dict[str, Any]: Sorted dtype. 
""" if df is None: - logger.warning("DataFrame is None") + logger.error("DataFrame argument is mandatory") + raise signals.FAIL("DataFrame is None.") else: # first check if all dtypes keys are in df.columns if all(d in df.columns for d in list(dtypes.keys())) and len(df.columns) == len( @@ -120,7 +123,8 @@ def check_dtypes_sort( for key in df.columns: new_dtypes.update([(key, dtypes[key])]) else: - logger.warning( + logger.error("There is a discrepancy with any of the columns.") + raise signals.FAIL( "dtype dictionary contains key(s) that not matching with the ADLS file columns name, or they have different length." ) From 8bc744851d67fd0eac32d2485b1aa6a3bc19d75e Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Mon, 9 Jan 2023 11:49:41 +0100 Subject: [PATCH 18/55] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Avoid=20Prefect=20FA?= =?UTF-8?q?IL=20in=20mindful=20flow.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mindful_to_adls.py | 36 +++++++++++++++++++++------------ viadot/tasks/mindful.py | 6 +++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index 6b1981fba..beddfe68d 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -4,16 +4,20 @@ from datetime import datetime from prefect import Flow, task +from prefect.utilities import logging +from prefect.engine.signals import FAIL +from prefect.triggers import all_successful from viadot.tasks import MindfulToCSV from viadot.tasks import AzureDataLakeUpload from viadot.task_utils import add_ingestion_metadata_task +logger = logging.get_logger() file_to_adls_task = AzureDataLakeUpload() @task def adls_bulk_upload( - file_names: List[str], + file_names: List[str] = None, file_name_relative_path: str = "", adls_file_path: str = None, adls_sp_credentials_secret: str = None, @@ -33,14 +37,17 @@ def adls_bulk_upload( List[str]: List of paths """ - for file in file_names: - file_path = str(adls_file_path + "/" + file) - file_to_adls_task.run( - from_path=os.path.join(file_name_relative_path, file), - to_path=file_path, - sp_credentials_secret=adls_sp_credentials_secret, - overwrite=adls_overwrite, - ) + if not file_names: + logger.warning("Avoided uploading any file to ADLS. No files were reported.") + else: + for file in file_names: + file_path = str(adls_file_path + "/" + file) + file_to_adls_task.run( + from_path=os.path.join(file_name_relative_path, file), + to_path=file_path, + sp_credentials_secret=adls_sp_credentials_secret, + overwrite=adls_overwrite, + ) @task @@ -51,10 +58,13 @@ def add_timestamp(files_names: List = None, sep: str = "\t") -> None: files_names (List, optional): File names where to add the new column. Defaults to None. sep (str, optional): Separator type to load and to save data. Defaults to "\t". """ - for file in files_names: - df = pd.read_csv(file, sep=sep) - df_updated = add_ingestion_metadata_task.run(df) - df_updated.to_csv(file, index=False, sep=sep) + if not files_names: + logger.warning("Avoided adding a timestamp. 
No files were reported.") + else: + for file in files_names: + df = pd.read_csv(file, sep=sep) + df_updated = add_ingestion_metadata_task.run(df) + df_updated.to_csv(file, index=False, sep=sep) class MindfulToADLS(Flow): diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index abc870817..7927de311 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -99,7 +99,7 @@ def run( ): if credentials_mindful is not None: - self.logger.info("Mindful credentials provided by user") + logger.info("Mindful credentials provided by user") elif credentials_mindful is None and credentials_secret is not None: credentials_str = AzureKeyVaultSecret( credentials_secret, vault_name=vault_name @@ -109,7 +109,7 @@ def run( else: try: credentials_mindful = local_config["MINDFUL"] - self.logger.info("Mindful credentials loaded from local config") + logger.info("Mindful credentials loaded from local config") except KeyError: credentials_mindful = None raise CredentialError("Credentials not found.") @@ -149,6 +149,6 @@ def run( logger.info("Successfully downloaded responses data from the Mindful API.") if not file_names: - raise TypeError("Files were not created.") + return None else: return file_names From ed2f2e2ef8e44aee1530550013d4db7e5d7a617b Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Thu, 12 Jan 2023 14:51:59 +0100 Subject: [PATCH 19/55] =?UTF-8?q?=E2=9C=85=20added=20test=20to=20bulk=20up?= =?UTF-8?q?load.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_task_utils.py | 21 +++++++++++++++++++++ viadot/flows/genesys_to_adls.py | 5 ++--- viadot/flows/mindful_to_adls.py | 4 ++-- viadot/task_utils.py | 4 +--- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index e2fcc7e7c..0b82fd0ed 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -5,6 +5,7 @@ import pandas as pd import prefect import pytest +from unittest import mock from viadot.task_utils import ( add_ingestion_metadata_task, @@ -18,9 +19,20 @@ dtypes_to_json_task, union_dfs_task, write_to_json, + adls_bulk_upload, ) +class MockClass: + def run( + from_path: str = "", + to_path: str = "", + sp_credentials_secret: str = "", + overwrite: bool = False, + ) -> None: + pass + + def count_dtypes(dtypes_dict: dict = None, dtypes_to_count: List[str] = None) -> int: dtypes_counter = 0 for v in dtypes_dict.values(): @@ -221,3 +233,12 @@ def test_df_clean_column_defined(): df = pd.DataFrame.from_dict(data) output = df_clean_column.run(df, ["col_2"]).to_dict() assert output == expected_output + + +@mock.patch("viadot.task_utils.AzureDataLakeUpload", return_value=MockClass) +@pytest.mark.bulk +def test_adls_bulk_upload(mock_upload): + file_names = ["random_1.csv", "random_2.csv"] + + adls_bulk_upload.run(file_names=file_names, adls_file_path="any/at/random") + mock_upload.assert_called_once() diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 8f7dc5578..dc9c79069 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -3,7 +3,6 @@ import pandas as pd from prefect import Flow, task -from viadot.task_utils import df_to_csv from viadot.tasks import AzureDataLakeUpload from viadot.tasks.genesys import GenesysToCSV, GenesysToDF from viadot.task_utils import ( @@ -142,7 +141,7 @@ def gen_flow(self) -> Flow: add_timestamp.bind(file_names, sep=self.sep, flow=self) - uploader = adls_bulk_upload( + adls_bulk_upload( 
file_names=file_names, adls_file_path=self.adls_file_path, adls_sp_credentials_secret=self.adls_sp_credentials_secret, @@ -150,7 +149,7 @@ def gen_flow(self) -> Flow: ) add_timestamp.set_upstream(file_names, flow=self) - uploader.set_upstream(add_timestamp, flow=self) + adls_bulk_upload.set_upstream(add_timestamp, flow=self) class GenesysReportToADLS(Flow): diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index 58045755e..1429b57ed 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -95,7 +95,7 @@ def mind_flow(self) -> Flow: add_timestamp.bind(file_names, sep=self.sep, flow=self) - uploader = adls_bulk_upload( + adls_bulk_upload( file_names=file_names, file_name_relative_path=self.file_path, adls_file_path=self.adls_file_path, @@ -105,4 +105,4 @@ def mind_flow(self) -> Flow: ) add_timestamp.set_upstream(file_names, flow=self) - uploader.set_upstream(add_timestamp, flow=self) + adls_bulk_upload.set_upstream(add_timestamp, flow=self) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 0cef1f1bc..55088b8b0 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -580,7 +580,7 @@ def adls_bulk_upload( adls_sp_credentials_secret: str = None, adls_overwrite: bool = True, timeout: int = 3600, -) -> List[str]: +) -> None: """Function that upload files to defined path in ADLS. Args: @@ -592,8 +592,6 @@ def adls_bulk_upload( adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. timeout (int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. - Returns: - List[str]: List of paths. """ file_to_adls_task = AzureDataLakeUpload(timeout=timeout) From bb526de65e3010e8967da8340d48b750aa0cb49c Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Thu, 12 Jan 2023 14:54:16 +0100 Subject: [PATCH 20/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ede267f8..6d50ed6c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `adls_bulk_upload` task function to `task_utils.py` +- Added `test_adls_bulk_upload` test function to `test_task_utils.py` ### Changed - Updated `genesys_to_adls.py` flow with the `adls_bulk_upload` task From 13578c0fd86e69e30a2bfa10d0bc23fa89973f39 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Mon, 16 Jan 2023 13:15:00 +0100 Subject: [PATCH 21/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20test=20and=20CHA?= =?UTF-8?q?NGELOG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 - tests/unit/test_task_utils.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d50ed6c6..5ede267f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Added `adls_bulk_upload` task function to `task_utils.py` -- Added `test_adls_bulk_upload` test function to `test_task_utils.py` ### Changed - Updated `genesys_to_adls.py` flow with the `adls_bulk_upload` task diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index 0b82fd0ed..61ff93a58 100644 --- a/tests/unit/test_task_utils.py +++ 
b/tests/unit/test_task_utils.py @@ -23,7 +23,7 @@ ) -class MockClass: +class MockAzureUploadClass: def run( from_path: str = "", to_path: str = "", @@ -235,7 +235,7 @@ def test_df_clean_column_defined(): assert output == expected_output -@mock.patch("viadot.task_utils.AzureDataLakeUpload", return_value=MockClass) +@mock.patch("viadot.task_utils.AzureDataLakeUpload", return_value=MockAzureUploadClass) @pytest.mark.bulk def test_adls_bulk_upload(mock_upload): file_names = ["random_1.csv", "random_2.csv"] From 9033990c9d9a2df5b3998415fe3cf9f955903475 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Mon, 16 Jan 2023 16:11:59 +0100 Subject: [PATCH 22/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 178084905..7b4041a88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] - +- Added `FileNotFoundError` to catch up failures in `MindfulToCSV` and when creating SQL tables. # [0.4.11] - 2022-12-15 ### Added From cf867ccbe8d31f7aefd25546c1f0ab45dd14360a Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 17 Jan 2023 13:01:13 +0100 Subject: [PATCH 23/55] =?UTF-8?q?=F0=9F=8E=A8=20added=20surveys=20endpoint?= =?UTF-8?q?=20to=20Mindful.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mindful.py | 36 +++++++++++++++++++++++++++++++++++- viadot/tasks/mindful.py | 14 ++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index 512a10cc1..2879962b0 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -1,4 +1,4 @@ -import os +import os, sys from io import StringIO from typing import Any, Dict, Literal @@ -182,6 +182,40 @@ def get_responses_list( return response + def get_survey_list( + self, + limit: int = 1000, + **kwargs, + ) -> Response: + """Gets a list of survey resources associated with the authenticated customer. + + Returns: + Response: request object with the response from the Mindful API. + """ + self.endpoint = "surveys" + params = { + "_limit": 1000, + } + + response = self._mindful_api_response( + endpoint=self.endpoint, + params=params, + ) + + if response.status_code == 200: + self.logger.info("Succesfully downloaded responses data from mindful API.") + elif response.status_code == 204 and not response.content.decode(): + self.logger.warning( + f"Thera are not responses data to download from {self.start_date} to {self.end_date}." + ) + else: + self.logger.error( + f"Failed to downloaded responses data. - {response.content}" + ) + raise APIError("Failed to downloaded responses data.") + + return response + def response_to_file( self, response: Response, diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index abc870817..265a970fd 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -128,6 +128,7 @@ def run( ) file_names = [] + # interactions interactions_response = mindful.get_interactions_list() if interactions_response.status_code == 200: interaction_file_name = mindful.response_to_file( @@ -139,6 +140,8 @@ def run( "Successfully downloaded interactions data from the Mindful API." 
) time.sleep(0.5) + + # responses responses_response = mindful.get_responses_list() if responses_response.status_code == 200: response_file_name = mindful.response_to_file( @@ -147,6 +150,17 @@ def run( ) file_names.append(response_file_name) logger.info("Successfully downloaded responses data from the Mindful API.") + time.sleep(0.5) + + # surveys + surveys_response = mindful.get_survey_list() + if surveys_response.status_code == 200: + surveys_file_name = mindful.response_to_file( + surveys_response, + file_path=file_path, + ) + file_names.append(surveys_file_name) + logger.info("Successfully downloaded surveys data from the Mindful API.") if not file_names: raise TypeError("Files were not created.") From 5836b24215aa5cc5e0a16b3ba2da98e6ccbba899 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 17 Jan 2023 13:01:35 +0100 Subject: [PATCH 24/55] =?UTF-8?q?=E2=9C=85=20updated=20test=5Fmindful.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_mindful.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/integration/test_mindful.py b/tests/integration/test_mindful.py index 5ae339f98..b26d6825d 100644 --- a/tests/integration/test_mindful.py +++ b/tests/integration/test_mindful.py @@ -67,6 +67,17 @@ def test_mindful_api_response3(mock_api_response): assert mf.endpoint == "responses" +@mock.patch("viadot.sources.mindful.handle_api_response", return_value=MockClass) +@pytest.mark.connect +def test_mindful_api_response4(mock_api_response): + mf = Mindful(header=header) + + response = mf.get_survey_list() + + assert response.status_code == 200 and isinstance(response.json(), list) + assert mf.endpoint == "surveys" + + @mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass) @pytest.mark.save def test_mindful_interactions(mock_connection): @@ -89,3 +100,15 @@ def test_mindful_responses(mock_connection): assert mf.endpoint == "responses" and isinstance(mf.endpoint, str) assert os.path.exists("responses.csv") os.remove("responses.csv") + + +@mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass) +@pytest.mark.save +def test_mindful_surveys(mock_connection): + mf = Mindful(header=header) + response = mf.get_survey_list() + mf.response_to_file(response) + + assert mf.endpoint == "surveys" and isinstance(mf.endpoint, str) + assert os.path.exists("surveys.csv") + os.remove("surveys.csv") From e736f2c105f7109391f435640e9bd2a618836801 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 17 Jan 2023 13:02:05 +0100 Subject: [PATCH 25/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 178084905..fcfe65b80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Added `get_survey_list` into `Mindful` Source file. +### Changed +- Changed `MindfulToCSV` task to download surveys info. 
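The surveys endpoint introduced in these commits follows the same pattern as the existing interactions and responses calls. A minimal, illustrative sketch of how it can be exercised — the `header` value below is a placeholder assumption, and real credentials would come from Azure Key Vault or local config as in the task code:

```python
from viadot.sources import Mindful

# Hypothetical auth header; the real one is built from Key Vault / local config.
header = {"Authorization": "Bearer <mindful-token>"}
mf = Mindful(header=header)

response = mf.get_survey_list()  # new "surveys" endpoint added in this patch series
if response.status_code == 200:
    # Writes surveys.csv alongside the other endpoint extracts, as in the tests.
    file_name = mf.response_to_file(response)
    print(file_name)
```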
# [0.4.11] - 2022-12-15 ### Added From a2f20627a1448c76c525d9e450d20cfeb633e4ed Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 17 Jan 2023 13:10:32 +0100 Subject: [PATCH 26/55] =?UTF-8?q?=F0=9F=93=9D=20remove=20some=20unused=20i?= =?UTF-8?q?mport.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mindful.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index 2879962b0..c642f6e16 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -1,4 +1,4 @@ -import os, sys +import os from io import StringIO from typing import Any, Dict, Literal From ae9e5e93f6e2e5908ab358a8bab340279d75657e Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 18 Jan 2023 10:06:10 +0100 Subject: [PATCH 27/55] =?UTF-8?q?=F0=9F=8E=A8=20added=20another=20option?= =?UTF-8?q?=20to=20avoid=20errors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index f2ef5d3ce..55f96a3e9 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -8,7 +8,6 @@ from prefect.engine import signals from prefect.utilities import logging - from viadot.tasks.azure_data_lake import AzureDataLakeDownload from ..tasks import ( @@ -95,6 +94,8 @@ def check_dtypes_sort( dtypes: Dict[str, Any] = None, ) -> Dict[str, Any]: """Check dtype column order to avoid malformation SQL table. + When data is loaded by the user, a data frame is passed to this task + to check the column sort with dtypes and re-sort if neccessary. Args: df (pd.DataFrame, optional): Data Frame from original ADLS file. Defaults to None. 
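The re-sort described in the docstring above amounts to rebuilding the `dtypes` mapping in DataFrame column order. A minimal standalone sketch of that step, using the same sample data as the unit tests:

```python
import pandas as pd

df = pd.DataFrame({"col1": ["rat", "cat"], "col2": [3, 4]})
dtypes = {"col2": "varchar(6)", "col1": "varchar(6)"}  # same keys as df.columns, different order

# Rebuild the mapping keyed in df.columns order, as check_dtypes_sort does.
if list(df.columns) != list(dtypes.keys()):
    dtypes = {col: dtypes[col] for col in df.columns}

print(dtypes)  # {'col1': 'varchar(6)', 'col2': 'varchar(6)'}
```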
@@ -122,6 +123,8 @@ def check_dtypes_sort( new_dtypes = dict() for key in df.columns: new_dtypes.update([(key, dtypes[key])]) + else: + new_dtypes = dtypes.copy() else: logger.error("There is a discrepancy with any of the columns.") raise signals.FAIL( From 5eb6619b46d325bc872c99e937713c79031b5872 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 18 Jan 2023 10:06:43 +0100 Subject: [PATCH 28/55] =?UTF-8?q?=E2=9C=85=20added=20test=20for=20check=5F?= =?UTF-8?q?dtypes=5Fsort.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_to_azure_sql.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_adls_to_azure_sql.py b/tests/integration/flows/test_adls_to_azure_sql.py index 950d479fb..dcbfee594 100644 --- a/tests/integration/flows/test_adls_to_azure_sql.py +++ b/tests/integration/flows/test_adls_to_azure_sql.py @@ -1,9 +1,12 @@ import os +import pytest +from unittest import mock import pandas as pd +from prefect.engine import signals from viadot.flows import ADLSToAzureSQL -from viadot.flows.adls_to_azure_sql import df_to_csv_task +from viadot.flows.adls_to_azure_sql import df_to_csv_task, check_dtypes_sort def test_get_promoted_adls_path_csv_file(): @@ -67,3 +70,35 @@ def test_df_to_csv_task_none(caplog): task.run(df, path=path, remove_tab=False) assert "DataFrame is None" in caplog.text assert os.path.isfile(path) == False + + +@pytest.mark.dtypes +def test_check_dtypes_sort(): + d = {"col1": ["rat", "cat"], "col2": [3, 4]} + df = pd.DataFrame(data=d) + dtypes = { + "col1": "varchar(6)", + "col2": "varchar(6)", + } + task = check_dtypes_sort + n_dtypes = task.run(df=df, dtypes=dtypes) + assert list(dtypes.keys()) == list(n_dtypes.keys()) + + dtypes = { + "col2": "varchar(6)", + "col1": "varchar(6)", + } + task = check_dtypes_sort + n_dtypes = task.run(df=df, dtypes=dtypes) + assert list(dtypes.keys()) != list(n_dtypes.keys()) + + dtypes = { + "col1": "varchar(6)", + "col3": "varchar(6)", + } + task = check_dtypes_sort + try: + n_dtypes = task.run(df=df, dtypes=dtypes) + assert False + except signals.FAIL: + assert True From f94dac915250027a97ce958a71cfc7a8518401fe Mon Sep 17 00:00:00 2001 From: AnnaGerlich <74189459+AnnaGerlich@users.noreply.github.com> Date: Thu, 19 Jan 2023 12:12:20 +0100 Subject: [PATCH 29/55] Fix sap rfc setup (#580) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🐛 fixed bug with missing permissions * 🐛changed base image to remote image * 📝 Improved set up instructions Co-authored-by: Rafał Ziemianek <49795849+Rafalz13@users.noreply.github.com> Co-authored-by: AnnaGerlich --- viadot/examples/sap_rfc/Dockerfile | 12 ++------ viadot/examples/sap_rfc/README.md | 46 +++++++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index f1160c8e7..c58a7d19b 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -1,4 +1,4 @@ -FROM viadot:dev +FROM ghcr.io/dyvenia/viadot/viadot:dev USER root @@ -9,6 +9,8 @@ ENV SAPNWRFC_HOME=/usr/local/sap/nwrfcsdk RUN ldconfig +USER ${USER} + ARG HTTP_PROXY="" ARG HTTPS_PROXY="" ARG NO_PROXY="" @@ -18,11 +20,3 @@ ENV NO_PROXY=$NO_PROXY RUN git config --global http.proxy ${HTTP_PROXY:-""} RUN pip install pyrfc==2.5.0 - -ARG VIADOT_USER=viadot_user -ARG GID=1111 -ARG UID=1111 -RUN groupadd -g $GID -o $VIADOT_USER -RUN useradd -m -u $UID -g 
$GID -o -s /bin/bash $VIADOT_USER - -USER $VIADOT_USER diff --git a/viadot/examples/sap_rfc/README.md b/viadot/examples/sap_rfc/README.md index 46e584e04..fcebcaf23 100644 --- a/viadot/examples/sap_rfc/README.md +++ b/viadot/examples/sap_rfc/README.md @@ -1,11 +1,49 @@ -## SAP RFC example +# SAP RFC example This is an example environment for running the `SAPRFC` connector. Note that we refer to a `sap_netweaver_rfc` folder in the Dockerfile. This is the folder containing the proprietary SAP NetWeaver driver that would have to be obtained and installed by the user. -### Running SAPRFC -To build the image, run `docker build . -t viadot:sap_rfc`, and spin it up with the provided `docker-compose`: `docker-compose up -d`. You can now open up Jupyter Lab at `localhost:5678`. +## Running SAPRFC +Clone the viadot, enter the sap_rfc folder, and build the image: +``` +git clone https://github.com/dyvenia/viadot.git && \ +cd viadot/viadot/examples/sap_rfc && \ +docker build -t viadot:sap_rfc . --no-cache +``` -To run tests, run eg. `docker exec -it viadot_saprfc_lab pytest tests/integration/test_sap_rfc.py`. \ No newline at end of file +Spin it up with the provided `docker-compose` +``` +docker-compose up -d +``` + +You can now open up Jupyter Lab at `localhost:5678`. + +## Config File +Credentials and other settings are stored in a file named `credentials.json`. A credential file needs to be written in json format. A typical credentials file looks like so: + +``` +{ + "SAP": { + "PROD": { + "sysnr": "system_number_prod", + "user": "user_name_prod", + "passwd": "password_prod", + "ashost": "host_name_prod" + }, + "DEV": { + "sysnr": "system_number_dev", + "user": "user_name_dev", + "passwd": "password_dev", + "ashost": "host_name_dev" + } + } +} +``` + +## Running tests +To run tests, run pytest: +``` +docker exec -it viadot_saprfc_lab pytest tests/integration/test_sap_rfc.py +``` \ No newline at end of file From eaffabcfe60324f74846b2e772793661778b61ad Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Mon, 23 Jan 2023 10:35:48 +0100 Subject: [PATCH 30/55] =?UTF-8?q?=E2=9C=85=20added=20feedback=20into=20tes?= =?UTF-8?q?t=20files.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_adls_to_azure_sql.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/flows/test_adls_to_azure_sql.py b/tests/integration/flows/test_adls_to_azure_sql.py index dcbfee594..b9f6edc0c 100644 --- a/tests/integration/flows/test_adls_to_azure_sql.py +++ b/tests/integration/flows/test_adls_to_azure_sql.py @@ -81,6 +81,7 @@ def test_check_dtypes_sort(): "col2": "varchar(6)", } task = check_dtypes_sort + n_dtypes = task.run(df=df, dtypes=dtypes) assert list(dtypes.keys()) == list(n_dtypes.keys()) @@ -88,7 +89,6 @@ def test_check_dtypes_sort(): "col2": "varchar(6)", "col1": "varchar(6)", } - task = check_dtypes_sort n_dtypes = task.run(df=df, dtypes=dtypes) assert list(dtypes.keys()) != list(n_dtypes.keys()) @@ -96,7 +96,6 @@ def test_check_dtypes_sort(): "col1": "varchar(6)", "col3": "varchar(6)", } - task = check_dtypes_sort try: n_dtypes = task.run(df=df, dtypes=dtypes) assert False From ce6cf2ef38e1f64f2570ebb10c49c33f043a0106 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 24 Jan 2023 09:38:27 +0100 Subject: [PATCH 31/55] =?UTF-8?q?=E2=9C=85=20added=20test=20to=20mindful.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_mindful.py | 17 
+++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/integration/test_mindful.py b/tests/integration/test_mindful.py index 5ae339f98..1b1575040 100644 --- a/tests/integration/test_mindful.py +++ b/tests/integration/test_mindful.py @@ -1,4 +1,5 @@ import os +import ast import pytest from unittest import mock from viadot.sources import Mindful @@ -30,6 +31,14 @@ def json(): return test +class MockClass2: + status_code = 204 + content = b"" + + def json(): + return None + + @pytest.mark.init def test_instance_mindful(): mf = Mindful(header=header) @@ -89,3 +98,11 @@ def test_mindful_responses(mock_connection): assert mf.endpoint == "responses" and isinstance(mf.endpoint, str) assert os.path.exists("responses.csv") os.remove("responses.csv") + + +@mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass2) +@pytest.mark.exception +def test_file_exception(mock_nindful_1): + mf = MindfulToCSV() + response = mf.run(credentials_mindful=credentials_mindful) + assert response == None From aa01dc48125d810b20fb0628a754f05bc99a5fde Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 24 Jan 2023 09:48:25 +0100 Subject: [PATCH 32/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20some=20docs=20fe?= =?UTF-8?q?edback.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mindful.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index c642f6e16..496862237 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -189,12 +189,15 @@ def get_survey_list( ) -> Response: """Gets a list of survey resources associated with the authenticated customer. + Args: + limit (int, optional): The number of matching interactions to return. Defaults to 1000. + Returns: - Response: request object with the response from the Mindful API. + Response: Request object with the response from the Mindful API. """ self.endpoint = "surveys" params = { - "_limit": 1000, + "_limit": limit, } response = self._mindful_api_response( @@ -210,7 +213,7 @@ def get_survey_list( ) else: self.logger.error( - f"Failed to downloaded responses data. - {response.content}" + f"Failed to download responses data. - {response.content}" ) raise APIError("Failed to downloaded responses data.") From 27e2ea4d0c50b76577127104adebb5341547aac4 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 24 Jan 2023 09:49:14 +0100 Subject: [PATCH 33/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcfe65b80..b5dfad8b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added - Added `get_survey_list` into `Mindful` Source file. 
### Changed From 4cf4b5f32197ecd5730c6afcf52ff4700a6af28d Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 24 Jan 2023 10:52:22 +0100 Subject: [PATCH 34/55] =?UTF-8?q?=F0=9F=8E=A8=20change=20task=20to=20merge?= =?UTF-8?q?=20in=20PR.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/mindful.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index 265a970fd..24474dad3 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -163,6 +163,6 @@ def run( logger.info("Successfully downloaded surveys data from the Mindful API.") if not file_names: - raise TypeError("Files were not created.") + return None else: return file_names From 2b39b030e475a36d2f917e0f0ddf66b7152eafbc Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Tue, 24 Jan 2023 11:28:29 +0100 Subject: [PATCH 35/55] =?UTF-8?q?=E2=9C=85=20updated=20mindful=20test.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_mindful.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/test_mindful.py b/tests/integration/test_mindful.py index 1b1575040..073434946 100644 --- a/tests/integration/test_mindful.py +++ b/tests/integration/test_mindful.py @@ -1,5 +1,4 @@ import os -import ast import pytest from unittest import mock from viadot.sources import Mindful @@ -102,7 +101,7 @@ def test_mindful_responses(mock_connection): @mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass2) @pytest.mark.exception -def test_file_exception(mock_nindful_1): +def test_file_exception(mock_mindful): mf = MindfulToCSV() response = mf.run(credentials_mindful=credentials_mindful) assert response == None From c5d53235efde63311fe167fa8a3aa09779459c75 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 10:25:43 +0100 Subject: [PATCH 36/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/mindful_to_adls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index beddfe68d..9eb6c740c 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -26,7 +26,7 @@ def adls_bulk_upload( """Function that upload files to defined path in ADLS. Args: - file_names (List[str]): List of file names to generate paths. + file_names (List[str]): List of file names to generate its paths. file_name_relative_path (str, optional): Path where to save the file locally. Defaults to ''. adls_file_path (str, optional): Azure Data Lake path. Defaults to None. adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with From ad0688144d9e4e857865ad34a2a1848b65b00a99 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 10:35:21 +0100 Subject: [PATCH 37/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b4041a88..6ce88d377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Added - Added `FileNotFoundError` to catch up failures in `MindfulToCSV` and when creating SQL tables. +- Added `check_dtypes_sort` task into `ADLSToAzureSQL` to check if dtypes is properly sorted. +- Added `timeout` parameter to all `Task`s where it can be added. +- Added `timeout` parameter to all `Flow`s where it can be added. +- Added `adls_bulk_upload` task function to `task_utils.py` +- Added `get_survey_list` into `Mindful` Source file. + +### Changed +- Updated `genesys_to_adls.py` flow with the `adls_bulk_upload` task +- Updated `mindful_to_adls.py` flow with the `adls_bulk_upload` task +- Changed `MindfulToCSV` task to download surveys info. # [0.4.11] - 2022-12-15 ### Added From 743bab6df2cf08c081e53e2f3b0db38024c8fb13 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 10:58:51 +0100 Subject: [PATCH 38/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_adls_to_azure_sql.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/tests/integration/flows/test_adls_to_azure_sql.py b/tests/integration/flows/test_adls_to_azure_sql.py index 950d479fb..b9f6edc0c 100644 --- a/tests/integration/flows/test_adls_to_azure_sql.py +++ b/tests/integration/flows/test_adls_to_azure_sql.py @@ -1,9 +1,12 @@ import os +import pytest +from unittest import mock import pandas as pd +from prefect.engine import signals from viadot.flows import ADLSToAzureSQL -from viadot.flows.adls_to_azure_sql import df_to_csv_task +from viadot.flows.adls_to_azure_sql import df_to_csv_task, check_dtypes_sort def test_get_promoted_adls_path_csv_file(): @@ -67,3 +70,34 @@ def test_df_to_csv_task_none(caplog): task.run(df, path=path, remove_tab=False) assert "DataFrame is None" in caplog.text assert os.path.isfile(path) == False + + +@pytest.mark.dtypes +def test_check_dtypes_sort(): + d = {"col1": ["rat", "cat"], "col2": [3, 4]} + df = pd.DataFrame(data=d) + dtypes = { + "col1": "varchar(6)", + "col2": "varchar(6)", + } + task = check_dtypes_sort + + n_dtypes = task.run(df=df, dtypes=dtypes) + assert list(dtypes.keys()) == list(n_dtypes.keys()) + + dtypes = { + "col2": "varchar(6)", + "col1": "varchar(6)", + } + n_dtypes = task.run(df=df, dtypes=dtypes) + assert list(dtypes.keys()) != list(n_dtypes.keys()) + + dtypes = { + "col1": "varchar(6)", + "col3": "varchar(6)", + } + try: + n_dtypes = task.run(df=df, dtypes=dtypes) + assert False + except signals.FAIL: + assert True From 737aca9d02ac0738be1e3ca0649bf2fad86e5988 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:03:09 +0100 Subject: [PATCH 39/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_mindful.py | 23 +++++++++++++++++++++++ tests/unit/test_task_utils.py | 21 +++++++++++++++++++++ viadot/examples/sap_rfc/Dockerfile | 2 ++ 3 files changed, 46 insertions(+) diff --git a/tests/integration/test_mindful.py b/tests/integration/test_mindful.py index 073434946..7961fbe16 100644 --- a/tests/integration/test_mindful.py +++ b/tests/integration/test_mindful.py @@ -75,6 +75,17 @@ def test_mindful_api_response3(mock_api_response): assert mf.endpoint == "responses" +@mock.patch("viadot.sources.mindful.handle_api_response", return_value=MockClass) +@pytest.mark.connect +def test_mindful_api_response4(mock_api_response): + mf = Mindful(header=header) + + response = 
mf.get_survey_list() + + assert response.status_code == 200 and isinstance(response.json(), list) + assert mf.endpoint == "surveys" + + @mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass) @pytest.mark.save def test_mindful_interactions(mock_connection): @@ -99,6 +110,18 @@ def test_mindful_responses(mock_connection): os.remove("responses.csv") +@mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass) +@pytest.mark.save +def test_mindful_surveys(mock_connection): + mf = Mindful(header=header) + response = mf.get_survey_list() + mf.response_to_file(response) + + assert mf.endpoint == "surveys" and isinstance(mf.endpoint, str) + assert os.path.exists("surveys.csv") + os.remove("surveys.csv") + + @mock.patch("viadot.sources.Mindful._mindful_api_response", return_value=MockClass2) @pytest.mark.exception def test_file_exception(mock_mindful): diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index e2fcc7e7c..61ff93a58 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -5,6 +5,7 @@ import pandas as pd import prefect import pytest +from unittest import mock from viadot.task_utils import ( add_ingestion_metadata_task, @@ -18,9 +19,20 @@ dtypes_to_json_task, union_dfs_task, write_to_json, + adls_bulk_upload, ) +class MockAzureUploadClass: + def run( + from_path: str = "", + to_path: str = "", + sp_credentials_secret: str = "", + overwrite: bool = False, + ) -> None: + pass + + def count_dtypes(dtypes_dict: dict = None, dtypes_to_count: List[str] = None) -> int: dtypes_counter = 0 for v in dtypes_dict.values(): @@ -221,3 +233,12 @@ def test_df_clean_column_defined(): df = pd.DataFrame.from_dict(data) output = df_clean_column.run(df, ["col_2"]).to_dict() assert output == expected_output + + +@mock.patch("viadot.task_utils.AzureDataLakeUpload", return_value=MockAzureUploadClass) +@pytest.mark.bulk +def test_adls_bulk_upload(mock_upload): + file_names = ["random_1.csv", "random_2.csv"] + + adls_bulk_upload.run(file_names=file_names, adls_file_path="any/at/random") + mock_upload.assert_called_once() diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index f1160c8e7..da913574d 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -9,6 +9,8 @@ ENV SAPNWRFC_HOME=/usr/local/sap/nwrfcsdk RUN ldconfig +USER ${USER} + ARG HTTP_PROXY="" ARG HTTPS_PROXY="" ARG NO_PROXY="" From 959363995f8b58d7f28595b15e774f2f571f35b8 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:08:54 +0100 Subject: [PATCH 40/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/examples/sap_rfc/Dockerfile | 2 +- viadot/examples/sap_rfc/README.md | 44 +++++++++++++++++++-- viadot/flows/adls_container_to_container.py | 8 +++- 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index da913574d..c5c5e1efa 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -1,4 +1,4 @@ -FROM viadot:dev +FROM ghcr.io/dyvenia/viadot/viadot:dev USER root diff --git a/viadot/examples/sap_rfc/README.md b/viadot/examples/sap_rfc/README.md index 46e584e04..f84eab580 100644 --- a/viadot/examples/sap_rfc/README.md +++ b/viadot/examples/sap_rfc/README.md @@ -5,7 +5,45 @@ This is an example environment for running the `SAPRFC` connector. 
Note that we refer to a `sap_netweaver_rfc` folder in the Dockerfile. This is the folder containing the proprietary SAP NetWeaver driver that would have to be obtained and installed by the user. -### Running SAPRFC -To build the image, run `docker build . -t viadot:sap_rfc`, and spin it up with the provided `docker-compose`: `docker-compose up -d`. You can now open up Jupyter Lab at `localhost:5678`. +## Running SAPRFC +Clone the viadot, enter the sap_rfc folder, and build the image: +``` +git clone https://github.com/dyvenia/viadot.git && \ +cd viadot/viadot/examples/sap_rfc && \ +docker build -t viadot:sap_rfc . --no-cache +``` -To run tests, run eg. `docker exec -it viadot_saprfc_lab pytest tests/integration/test_sap_rfc.py`. \ No newline at end of file +Spin it up with the provided `docker-compose` +``` +docker-compose up -d +``` + +You can now open up Jupyter Lab at `localhost:5678`. + +## Config File +Credentials and other settings are stored in a file named `credentials.json`. A credential file needs to be written in json format. A typical credentials file looks like so: + +``` +{ + "SAP": { + "PROD": { + "sysnr": "system_number_prod", + "user": "user_name_prod", + "passwd": "password_prod", + "ashost": "host_name_prod" + }, + "DEV": { + "sysnr": "system_number_dev", + "user": "user_name_dev", + "passwd": "password_dev", + "ashost": "host_name_dev" + } + } +} +``` + +## Running tests +To run tests, run pytest: +``` +docker exec -it viadot_saprfc_lab pytest tests/integration/test_sap_rfc.py +``` \ No newline at end of file diff --git a/viadot/flows/adls_container_to_container.py b/viadot/flows/adls_container_to_container.py index 7e59f3afc..e9018203e 100644 --- a/viadot/flows/adls_container_to_container.py +++ b/viadot/flows/adls_container_to_container.py @@ -6,12 +6,11 @@ from ..tasks import AzureDataLakeCopy -copy_task = AzureDataLakeCopy() logger = logging.get_logger(__name__) -@task +@task(timeout=3600) def is_stored_locally(f: Flow): return f.storage is None or isinstance(f.storage, Local) @@ -27,6 +26,8 @@ class ADLSContainerToContainer(Flow): ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. vault_name (str): The name of the vault from which to retrieve the secrets. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -36,6 +37,7 @@ def __init__( to_path: str, adls_sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -45,6 +47,7 @@ def __init__( self.to_path = to_path self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -53,6 +56,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + copy_task = AzureDataLakeCopy(timeout=self.timeout) copy_task.bind( from_path=self.from_path, to_path=self.to_path, From 8adb5d18ad85ab09bcd81057bab6c5b19755b4c9 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:13:25 +0100 Subject: [PATCH 41/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/azure_sql_transform.py | 10 +++++---- viadot/flows/bigquery_to_adls.py | 14 +++++++----- .../cloud_for_customers_report_to_adls.py | 17 ++++++++------ viadot/flows/duckdb_to_sql_server.py | 20 ++++++++++------- viadot/flows/duckdb_transform.py | 10 +++++---- viadot/flows/epicor_to_duckdb.py | 11 +++++++--- viadot/flows/flow_of_flows.py | 9 ++++---- viadot/flows/genesys_to_adls.py | 22 ++++++++++++++----- viadot/flows/multiple_flows.py | 8 +++++-- viadot/flows/mysql_to_adls.py | 11 ++++++---- viadot/flows/outlook_to_adls.py | 11 +++++----- viadot/flows/prefect_logs.py | 6 ++++- viadot/flows/salesforce_to_adls.py | 14 +++++++----- viadot/flows/sap_rfc_to_adls.py | 10 +++++---- viadot/flows/sap_to_duckdb.py | 11 ++++++---- viadot/flows/sftp_operations.py | 19 +++++++++++----- viadot/flows/sharepoint_to_adls.py | 17 ++++++++------ viadot/flows/sql_server_to_duckdb.py | 14 +++++++----- viadot/flows/sql_server_transform.py | 9 +++++--- viadot/flows/supermetrics_to_adls.py | 17 ++++++++------ viadot/flows/supermetrics_to_azure_sql.py | 14 +++++------- 21 files changed, 171 insertions(+), 103 deletions(-) diff --git a/viadot/flows/azure_sql_transform.py b/viadot/flows/azure_sql_transform.py index 2f854d5f9..4f1d5db78 100644 --- a/viadot/flows/azure_sql_transform.py +++ b/viadot/flows/azure_sql_transform.py @@ -2,9 +2,7 @@ from prefect import Flow -from ..tasks.azure_sql import AzureSQLDBQuery - -query_task = AzureSQLDBQuery() +from viadot.tasks.azure_sql import AzureSQLDBQuery class AzureSQLTransform(Flow): @@ -15,6 +13,7 @@ def __init__( sqldb_credentials_secret: str = None, vault_name: str = None, tags: List[str] = ["transform"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -28,17 +27,20 @@ def __init__( with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. tags (list, optional): Tag for marking flow. Defaults to "transform". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.query = query self.tags = tags self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name - self.tasks = [query_task] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = AzureSQLDBQuery(timeout=self.timeout) query_task.bind( query=self.query, credentials_secret=self.sqldb_credentials_secret, diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index cce497005..30dcc5a08 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -7,7 +7,7 @@ from prefect.backend import set_key_value from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_get_data_types_task, df_map_mixed_dtypes_for_parquet, @@ -16,11 +16,8 @@ dtypes_to_json_task, update_dtypes_dict, ) -from ..tasks import AzureDataLakeUpload, BigQueryToDF +from viadot.tasks import AzureDataLakeUpload, BigQueryToDF -bigquery_to_df_task = BigQueryToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() logger = logging.get_logger(__name__) @@ -44,6 +41,7 @@ def __init__( adls_sp_credentials_secret: str = None, overwrite_adls: bool = False, if_exists: str = "replace", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -79,6 +77,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # BigQueryToDF self.credentials_key = credentials_key @@ -96,6 +96,7 @@ def __init__( self.if_exists = if_exists self.output_file_extension = output_file_extension self.now = str(pendulum.now("utc")) + self.timeout = timeout self.local_file_path = ( local_file_path or self.slugify(name) + self.output_file_extension @@ -125,6 +126,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + bigquery_to_df_task = BigQueryToDF(timeout=self.timeout) df = bigquery_to_df_task.bind( dataset_name=self.dataset_name, table_name=self.table_name, @@ -158,6 +160,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -171,6 +174,7 @@ def gen_flow(self) -> Flow: dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/cloud_for_customers_report_to_adls.py b/viadot/flows/cloud_for_customers_report_to_adls.py index 60c49c3b2..70481d9d4 100644 --- a/viadot/flows/cloud_for_customers_report_to_adls.py +++ b/viadot/flows/cloud_for_customers_report_to_adls.py @@ -4,18 +4,14 @@ import pendulum from prefect import Flow, Task, apply_map -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_to_csv, df_to_parquet, union_dfs_task, ) -from ..tasks import AzureDataLakeUpload, C4CReportToDF, C4CToDF -from ..utils import slugify - -file_to_adls_task = AzureDataLakeUpload() -c4c_report_to_df = C4CReportToDF() -c4c_to_df = C4CToDF() +from viadot.tasks import AzureDataLakeUpload, C4CReportToDF, C4CToDF 
+from viadot.utils import slugify class CloudForCustomersReportToADLS(Flow): @@ -42,6 +38,7 @@ def __init__( adls_sp_credentials_secret: str = None, if_empty: str = "warn", if_exists: str = "replace", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -75,6 +72,8 @@ def __init__( Defaults to None. if_empty (str, optional): What to do if the Supermetrics query returns no data. Defaults to "warn". if_exists (str, optional): What to do if the local file already exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.report_url = report_url @@ -83,6 +82,7 @@ def __init__( self.if_empty = if_empty self.env = env self.c4c_credentials_secret = c4c_credentials_secret + self.timeout = timeout # AzureDataLakeUpload self.adls_sp_credentials_secret = adls_sp_credentials_secret @@ -155,6 +155,7 @@ def gen_c4c( flow: Flow = None, ) -> Task: + c4c_to_df = C4CToDF(timeout=self.timeout) df = c4c_to_df.bind( url=url, endpoint=endpoint, @@ -170,6 +171,7 @@ def gen_c4c_report_months( self, report_urls_with_filters: Union[str, List[str]], flow: Flow = None ) -> Task: + c4c_report_to_df = C4CReportToDF(timeout=self.timeout) report = c4c_report_to_df.bind( report_url=report_urls_with_filters, skip=self.skip, @@ -214,6 +216,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/duckdb_to_sql_server.py b/viadot/flows/duckdb_to_sql_server.py index 5d83e14fb..890498fe8 100644 --- a/viadot/flows/duckdb_to_sql_server.py +++ b/viadot/flows/duckdb_to_sql_server.py @@ -5,18 +5,14 @@ from prefect import Flow, task from prefect.utilities import logging -from ..task_utils import df_to_csv as df_to_csv_task -from ..task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task -from ..tasks import BCPTask, DuckDBToDF, SQLServerCreateTable, DuckDBQuery +from viadot.task_utils import df_to_csv as df_to_csv_task +from viadot.task_utils import get_sql_dtypes_from_df as get_sql_dtypes_from_df_task +from viadot.tasks import BCPTask, DuckDBToDF, SQLServerCreateTable, DuckDBQuery logger = logging.get_logger(__name__) -duckdb_to_df_task = DuckDBToDF() -create_table_task = SQLServerCreateTable() -bulk_insert_task = BCPTask() - -@task +@task(timeout=3600) def cleanup_csv_task(path: str): logger = prefect.context.get("logger") @@ -50,6 +46,7 @@ def __init__( on_bcp_error: Literal["skip", "fail"] = "skip", bcp_error_log_path="./log_file.log", tags: List[str] = ["load"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -76,6 +73,8 @@ def __init__( on_bcp_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "skip". bcp_error_log_path (string, optional): Full path of an error file. Defaults to "./log_file.log". tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["load"]. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # DuckDBToDF @@ -102,6 +101,7 @@ def __init__( # Global self.tags = tags + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -120,6 +120,7 @@ def slugify(name): def gen_flow(self) -> Flow: if self.duckdb_query is None: + duckdb_to_df_task = DuckDBToDF(timeout=self.timeout) df = duckdb_to_df_task.bind( schema=self.duckdb_schema, table=self.duckdb_table, @@ -147,6 +148,7 @@ def gen_flow(self) -> Flow: else: dtypes = get_sql_dtypes_from_df_task.bind(df=df, flow=self) + create_table_task = SQLServerCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.sql_server_schema, table=self.sql_server_table, @@ -155,6 +157,8 @@ def gen_flow(self) -> Flow: credentials=self.sql_server_credentials, flow=self, ) + + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.sql_server_schema, diff --git a/viadot/flows/duckdb_transform.py b/viadot/flows/duckdb_transform.py index 934539cc7..950a2812b 100644 --- a/viadot/flows/duckdb_transform.py +++ b/viadot/flows/duckdb_transform.py @@ -2,9 +2,7 @@ from prefect import Flow -from ..tasks.duckdb import DuckDBQuery - -query_task = DuckDBQuery() +from viadot.tasks.duckdb import DuckDBQuery class DuckDBTransform(Flow): @@ -14,6 +12,7 @@ def __init__( query: str, credentials: dict = None, tags: List[str] = ["transform"], + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -25,16 +24,19 @@ def __init__( query (str, required): The query to execute on the database. credentials (dict, optional): Credentials for the connection. Defaults to None. tags (list, optional): Tag for marking flow. Defaults to "transform". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.query = query self.credentials = credentials self.tags = tags - self.tasks = [query_task] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = DuckDBQuery(timeout=self.timeout) query_task.bind( query=self.query, credentials=self.credentials, diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py index b074b0f4b..77d4da895 100644 --- a/viadot/flows/epicor_to_duckdb.py +++ b/viadot/flows/epicor_to_duckdb.py @@ -2,8 +2,8 @@ from prefect import Flow -from ..task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet -from ..tasks import DuckDBCreateTableFromParquet, EpicorOrdersToDF +from viadot.task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet +from viadot.tasks import DuckDBCreateTableFromParquet, EpicorOrdersToDF class EpicorOrdersToDuckDB(Flow): @@ -22,6 +22,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["warn", "skip", "fail"] = "skip", duckdb_credentials: dict = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -42,6 +43,8 @@ def __init__( if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". if_empty (Literal, optional): What to do if Parquet file is empty. Defaults to "skip". duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.base_url = base_url self.epicor_credentials = epicor_credentials @@ -61,9 +64,11 @@ def __init__( self.df_task = EpicorOrdersToDF( base_url=self.base_url, filters_xml=self.filters_xml, + timeout=timeout, ) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, + timeout=timeout, ) self.gen_flow() diff --git a/viadot/flows/flow_of_flows.py b/viadot/flows/flow_of_flows.py index c6bf4b1a0..9f8053eb3 100644 --- a/viadot/flows/flow_of_flows.py +++ b/viadot/flows/flow_of_flows.py @@ -3,9 +3,6 @@ from prefect import Flow, Task, apply_map from prefect.tasks.prefect import StartFlowRun -start_flow_run_task = StartFlowRun(wait=True) -start_flow_run_task_2 = StartFlowRun(wait=True) - class Pipeline(Flow): def __init__( @@ -14,25 +11,29 @@ def __init__( project_name: str, extract_flows_names: List[str], transform_flow_name: str, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): self.extract_flows_names = extract_flows_names self.transform_flow_name = transform_flow_name self.project_name = project_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_start_flow_run_task(self, flow_name: str, flow: Flow = None) -> Task: + start_flow_run_task = StartFlowRun(wait=True, timeout=self.timeout) t = start_flow_run_task.bind( flow_name=flow_name, project_name=self.project_name, flow=flow ) return t - def gen_flow(self) -> Flow: + def gen_flow(self): extract_flow_runs = apply_map( self.gen_start_flow_run_task, self.extract_flows_names, flow=self ) + start_flow_run_task_2 = StartFlowRun(wait=True, timeout=self.timeout) transform_flow_run = start_flow_run_task_2.bind( flow_name=self.transform_flow_name, project_name=self.project_name, diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index c4c894a76..da867c176 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -12,8 +12,6 @@ df_to_parquet, ) -file_to_adls_task = AzureDataLakeUpload() - @task def adls_bulk_upload( @@ -21,6 +19,7 @@ def adls_bulk_upload( adls_file_path: str = None, adls_sp_credentials_secret: str = None, adls_overwrite: bool = True, + task_timeout: int = 3600, ) -> List[str]: """ Function that upload files to defined path in ADLS. @@ -31,12 +30,15 @@ def adls_bulk_upload( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. + task_timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: List[str]: List of paths """ for file in file_names: file_path = str(adls_file_path + "/" + file) + file_to_adls_task = AzureDataLakeUpload(timeout=task_timeout) file_to_adls_task.run( from_path=file, to_path=file_path, @@ -81,6 +83,7 @@ def __init__( overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -109,6 +112,8 @@ def __init__( adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. 
credentials(dict, optional): Credentials for the genesys api. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # GenesysToCSV self.flow_name = name @@ -125,6 +130,8 @@ def __init__( self.end_date = end_date self.days_interval = days_interval self.sep = sep + self.timeout = timeout + # AzureDataLake self.local_file_path = local_file_path self.adls_file_path = adls_file_path @@ -138,7 +145,7 @@ def __init__( def gen_flow(self) -> Flow: - to_csv = GenesysToCSV() + to_csv = GenesysToCSV(timeout=self.timeout) if self.view_type == "queue_performance_detail_view": file_names = to_csv.bind( @@ -175,6 +182,7 @@ def gen_flow(self) -> Flow: file_names=file_names, adls_file_path=self.adls_file_path, adls_sp_credentials_secret=self.adls_sp_credentials_secret, + task_timeout=self.timeout, flow=self, ) @@ -197,6 +205,7 @@ def __init__( adls_sp_credentials_secret: str = None, credentials_secret: str = None, schedule_id: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -219,7 +228,8 @@ def __init__( Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret for Genesys project. Defaults to None. schedule_id (str, optional): ID of the schedule report job. Defaults to None. - + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.name = name @@ -235,6 +245,7 @@ def __init__( self.credentials_secret = credentials_secret self.if_exsists = if_exists self.schedule_id = schedule_id + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -242,7 +253,7 @@ def __init__( def gen_flow(self) -> Flow: - genesys_report = GenesysToDF() + genesys_report = GenesysToDF(timeout=self.timeout) df = genesys_report.bind( report_columns=self.columns, @@ -268,6 +279,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/multiple_flows.py b/viadot/flows/multiple_flows.py index 8631d5799..49e88bd34 100644 --- a/viadot/flows/multiple_flows.py +++ b/viadot/flows/multiple_flows.py @@ -7,7 +7,7 @@ logger = logging.get_logger() -@task +@task(timeout=3600) def run_flows_list(flow_name: str, flows_list: List[List] = [List[None]]): """ Task for running multiple flows in the given order. Task will create flow of flows. @@ -54,4 +54,8 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - run_flows_list.bind(flow_name=self.name, flows_list=self.flows_list, flow=self) + run_flows_list.bind( + flow_name=self.name, + flows_list=self.flows_list, + flow=self, + ) diff --git a/viadot/flows/mysql_to_adls.py b/viadot/flows/mysql_to_adls.py index a7a390717..4452a5536 100644 --- a/viadot/flows/mysql_to_adls.py +++ b/viadot/flows/mysql_to_adls.py @@ -6,8 +6,6 @@ from viadot.tasks import AzureDataLakeUpload from viadot.tasks.mysql_to_df import MySqlToDf -file_to_adls_task = AzureDataLakeUpload() - class MySqlToADLS(Flow): def __init__( @@ -24,6 +22,7 @@ def __init__( overwrite_adls: bool = True, sp_credentials_secret: str = None, credentials_secret: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -47,6 +46,8 @@ def __init__( credentials_secret (str, optional): Key Vault name. Defaults to None. 
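
run_flows_list above now declares its timeout at the decorator level; the same @task(timeout=3600) form is applied to the helper tasks in adls_to_azure_sql.py and task_utils.py later in this series. A minimal sketch of a decorator-level timeout on a hypothetical helper task:

import pandas as pd
import prefect
from prefect import task


@task(timeout=3600)  # Prefect fails the task run if it exceeds 3600 seconds
def count_rows(df: pd.DataFrame) -> int:
    # Hypothetical helper, not part of viadot.
    logger = prefect.context.get("logger")
    logger.info(f"DataFrame has {len(df)} rows.")
    return len(df)
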
columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # Connect to sql @@ -55,6 +56,7 @@ def __init__( self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name self.overwrite_adls = overwrite_adls + # Upload to ADLS self.file_path = file_path self.sep = sep @@ -62,6 +64,7 @@ def __init__( self.if_exists = if_exists self.sp_credentials_secret = sp_credentials_secret self.credentials_secret = credentials_secret + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -69,8 +72,7 @@ def __init__( def gen_flow(self) -> Flow: - df_task = MySqlToDf(country_short=self.country_short) - + df_task = MySqlToDf(country_short=self.country_short, timeout=self.timeout) df = df_task.bind( credentials_secret=self.credentials_secret, query=self.query, flow=self ) @@ -83,6 +85,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, diff --git a/viadot/flows/outlook_to_adls.py b/viadot/flows/outlook_to_adls.py index c5e86346d..af77bb553 100644 --- a/viadot/flows/outlook_to_adls.py +++ b/viadot/flows/outlook_to_adls.py @@ -10,9 +10,7 @@ union_dfs_task, credentials_loader, ) -from ..tasks import AzureDataLakeUpload, OutlookToDF - -file_to_adls_task = AzureDataLakeUpload() +from viadot.tasks import AzureDataLakeUpload, OutlookToDF class OutlookToADLS(Flow): @@ -28,7 +26,7 @@ def __init__( overwrite_adls: bool = True, adls_sp_credentials_secret: str = None, limit: int = 10000, - timeout: int = 1200, + timeout: int = 3600, if_exists: Literal["append", "replace", "skip"] = "append", outlook_credentials_secret: str = "OUTLOOK", *args: List[Any], @@ -50,7 +48,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. outlook_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with outlook credentials. limit (int, optional): Number of fetched top messages. Defaults to 10000. - timeout (int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 1200. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. if_exists (Literal['append', 'replace', 'skip'], optional): What to do if the local file already exists. Defaults to "append". 
""" @@ -80,7 +79,6 @@ def gen_outlook_df( credentials_secret=self.outlook_credentials_secret ) outlook_to_df = OutlookToDF(timeout=self.timeout, credentials=credentials) - df = outlook_to_df.bind( mailbox_name=mailbox_list, start_date=self.start_date, @@ -113,6 +111,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, diff --git a/viadot/flows/prefect_logs.py b/viadot/flows/prefect_logs.py index 3c377a4cb..6995d4dd3 100644 --- a/viadot/flows/prefect_logs.py +++ b/viadot/flows/prefect_logs.py @@ -12,7 +12,6 @@ from viadot.task_utils import add_ingestion_metadata_task, df_to_parquet logger = logging.get_logger() -azure_dl_upload_task = AzureDataLakeUpload() class PrefectLogs(Flow): @@ -27,6 +26,7 @@ def __init__( adls_sp_credentials_secret: str = None, vault_name: str = None, overwrite_adls: bool = True, + timeout: int = 3600, *args, **kwargs, ): @@ -46,6 +46,8 @@ def __init__( Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite the file in ADLS. Defaults to True. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Example query: { @@ -86,6 +88,7 @@ def __init__( self.adls_path = adls_path self.vault_name = vault_name self.overwrite_adls = overwrite_adls + self.timeout = timeout self.adls_sp_credentials_secret = adls_sp_credentials_secret if scheduled_start_time == "yesterday": @@ -247,6 +250,7 @@ def gen_flow(self) -> Flow: flow=self, ) + azure_dl_upload_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = azure_dl_upload_task.bind( from_path=self.local_file_path, to_path=self.adls_path, diff --git a/viadot/flows/salesforce_to_adls.py b/viadot/flows/salesforce_to_adls.py index 043b5118e..f98b63a23 100644 --- a/viadot/flows/salesforce_to_adls.py +++ b/viadot/flows/salesforce_to_adls.py @@ -7,7 +7,7 @@ from prefect.backend import set_key_value from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_clean_column, df_get_data_types_task, @@ -17,11 +17,8 @@ dtypes_to_json_task, update_dtypes_dict, ) -from ..tasks import AzureDataLakeUpload, SalesforceToDF +from viadot.tasks import AzureDataLakeUpload, SalesforceToDF -salesforce_to_df_task = SalesforceToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() logger = logging.get_logger(__name__) @@ -45,6 +42,7 @@ def __init__( adls_file_name: str = None, adls_sp_credentials_secret: str = None, if_exists: str = "replace", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -73,6 +71,8 @@ def __init__( ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. if_exists (str, optional): What to do if the file exists. Defaults to "replace". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SalesforceToDF self.query = query @@ -89,6 +89,7 @@ def __init__( self.if_exists = if_exists self.output_file_extension = output_file_extension self.now = str(pendulum.now("utc")) + self.timeout = timeout self.local_file_path = ( local_file_path or self.slugify(name) + self.output_file_extension @@ -119,6 +120,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + salesforce_to_df_task = SalesforceToDF(timeout=self.timeout) df = salesforce_to_df_task.bind( query=self.query, table=self.table, @@ -153,6 +155,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -166,6 +169,7 @@ def gen_flow(self) -> Flow: dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/sap_rfc_to_adls.py b/viadot/flows/sap_rfc_to_adls.py index 037ee5701..d23ffc428 100644 --- a/viadot/flows/sap_rfc_to_adls.py +++ b/viadot/flows/sap_rfc_to_adls.py @@ -6,9 +6,6 @@ from viadot.task_utils import concat_dfs, df_to_csv, df_to_parquet, set_new_kv from viadot.tasks import AzureDataLakeUpload, SAPRFCToDF -download_sap_task = SAPRFCToDF() -file_to_adls_task = AzureDataLakeUpload() - class SAPRFCToADLS(Flow): def __init__( @@ -29,6 +26,7 @@ def __init__( vault_name: str = None, update_kv: bool = False, filter_column: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -66,6 +64,8 @@ def __init__( vault_name(str, optional): The name of the vault from which to obtain the secrets. Defaults to None. update_kv (bool, optional): Whether or not to update key value on Prefect. Defaults to False. filter_column (str, optional): Name of the field based on which key value will be updated. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.query = query self.rfc_sep = rfc_sep @@ -80,6 +80,7 @@ def __init__( self.overwrite = overwrite self.adls_sp_credentials_secret = adls_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout self.update_kv = update_kv self.filter_column = filter_column @@ -89,7 +90,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - + download_sap_task = SAPRFCToDF(timeout=self.timeout) df = download_sap_task( query=self.query, sep=self.rfc_sep, @@ -115,6 +116,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_path, diff --git a/viadot/flows/sap_to_duckdb.py b/viadot/flows/sap_to_duckdb.py index aa9847b14..081ae338c 100644 --- a/viadot/flows/sap_to_duckdb.py +++ b/viadot/flows/sap_to_duckdb.py @@ -7,13 +7,13 @@ logger = logging.get_logger() -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, cast_df_to_str, df_to_parquet, set_new_kv, ) -from ..tasks import DuckDBCreateTableFromParquet, SAPRFCToDF +from viadot.tasks import DuckDBCreateTableFromParquet, SAPRFCToDF class SAPToDuckDB(Flow): @@ -35,6 +35,7 @@ def __init__( duckdb_credentials: dict = None, update_kv: bool = False, filter_column: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -60,6 +61,8 @@ def __init__( duckdb_credentials (dict, optional): The config to use for connecting with DuckDB. Defaults to None. update_kv (bool, optional): Whether or not to update key value on Prefect. Defaults to False. filter_column (str, optional): Name of the field based on which key value will be updated. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # SAPRFCToDF @@ -81,9 +84,9 @@ def __init__( super().__init__(*args, name=name, **kwargs) - self.sap_to_df_task = SAPRFCToDF(credentials=sap_credentials) + self.sap_to_df_task = SAPRFCToDF(credentials=sap_credentials, timeout=timeout) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, timeout=timeout ) self.gen_flow() diff --git a/viadot/flows/sftp_operations.py b/viadot/flows/sftp_operations.py index 2396c8f3e..7bc6d60a1 100644 --- a/viadot/flows/sftp_operations.py +++ b/viadot/flows/sftp_operations.py @@ -7,11 +7,6 @@ from viadot.task_utils import add_ingestion_metadata_task -upload_to_adls = AzureDataLakeUpload() -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() - - class SftpToAzureSQL(Flow): def __init__( self, @@ -31,6 +26,7 @@ def __init__( on_bcp_error: Literal["skip", "fail"] = "fail", error_log_file_path: str = "SFTP_logs.log", vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -55,6 +51,8 @@ def __init__( on_bcp_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "fail". error_log_file_path (string, optional): Full path of an error file. Defaults to "./log_file.log". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SFTP self.from_path = from_path @@ -70,6 +68,7 @@ def __init__( self.sep = sep self.remove_tab = remove_tab + self.timeout = timeout # Read schema self.schema = schema @@ -108,6 +107,7 @@ def gen_flow(self) -> Flow: sftp = SftpToDF( sftp_credentials_secret=self.sftp_credentials_secret, credentials=self.sftp_credentials, + timeout=self.timeout, ) df = sftp.bind( from_path=self.from_path, @@ -122,6 +122,7 @@ def gen_flow(self) -> Flow: flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -132,6 +133,7 @@ def gen_flow(self) -> Flow: flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.file_name, schema=self.schema, @@ -163,6 +165,7 @@ def __init__( sftp_credentials: Dict[str, Any] = None, sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -183,12 +186,16 @@ def __init__( sftp_credentials (Dict[str, Any], optional): SFTP server credentials. Defaults to None. sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ # SFTP self.from_path = from_path self.sftp_credentials_secret = sftp_credentials_secret self.sftp_credentials = sftp_credentials self.columns = columns + self.timeout = timeout + # File args if file_name is None: self.file_name = from_path.split("/")[-1] @@ -221,6 +228,7 @@ def gen_flow(self) -> Flow: ftp = SftpToDF( sftp_credentials_secret=self.sftp_credentials_secret, credentials=self.sftp_credentials, + timeout=self.timeout, ) df = ftp.bind( from_path=self.from_path, @@ -231,6 +239,7 @@ def gen_flow(self) -> Flow: df=df, remove_tab=self.remove_tab, path=self.file_name, flow=self ) + upload_to_adls = AzureDataLakeUpload(timeout=self.timeout) upload_df = upload_to_adls.bind( from_path=self.file_name, to_path=self.to_path, diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 08841abcb..d6ecb7b6d 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -9,7 +9,7 @@ logger = logging.get_logger() -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_get_data_types_task, df_map_mixed_dtypes_for_parquet, @@ -17,12 +17,8 @@ df_to_parquet, dtypes_to_json_task, ) -from ..tasks import AzureDataLakeUpload -from ..tasks.sharepoint import SharepointToDF - -excel_to_df_task = SharepointToDF() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() +from viadot.tasks import AzureDataLakeUpload +from viadot.tasks.sharepoint import SharepointToDF class SharepointToADLS(Flow): @@ -42,6 +38,7 @@ def __init__( overwrite_adls: bool = False, if_empty: str = "warn", if_exists: str = "replace", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -65,6 +62,8 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SharepointToDF self.if_empty = if_empty @@ -74,6 +73,7 @@ def __init__( self.local_dir_path = local_dir_path self.sheet_number = sheet_number self.validate_excel_file = validate_excel_file + self.timeout = timeout # AzureDataLakeUpload self.overwrite = overwrite_adls @@ -107,6 +107,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: + excel_to_df_task = SharepointToDF(timeout=self.timeout) df = excel_to_df_task.bind( path_to_file=self.path_to_file, url_to_file=self.url_to_file, @@ -137,6 +138,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -148,6 +150,7 @@ def gen_flow(self) -> Flow: dtypes_to_json_task.bind( dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/sql_server_to_duckdb.py b/viadot/flows/sql_server_to_duckdb.py index f41348ab0..71ea15d22 100644 --- a/viadot/flows/sql_server_to_duckdb.py +++ b/viadot/flows/sql_server_to_duckdb.py @@ -2,10 +2,8 @@ from prefect import Flow -from ..task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet -from ..tasks import DuckDBCreateTableFromParquet, SQLServerToDF - -df_task = SQLServerToDF() +from viadot.task_utils import add_ingestion_metadata_task, cast_df_to_str, df_to_parquet +from viadot.tasks import DuckDBCreateTableFromParquet, SQLServerToDF class SQLServerToDuckDB(Flow): @@ -20,6 +18,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["warn", "skip", "fail"] = "skip", duckdb_credentials: dict = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -37,11 +36,13 @@ def __init__( if_exists (Literal, optional): What to do if the table already exists. Defaults to "fail". if_empty (Literal, optional): What to do if Parquet file is empty. Defaults to "skip". duckdb_credentials (dict, optional): Credentials for the DuckDB connection. Defaults to None. - + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" # SQLServerToDF self.sql_query = sql_query self.sqlserver_config_key = sqlserver_config_key + self.timeout = timeout # DuckDBCreateTableFromParquet self.local_file_path = local_file_path @@ -54,12 +55,13 @@ def __init__( super().__init__(*args, name=name, **kwargs) self.create_duckdb_table_task = DuckDBCreateTableFromParquet( - credentials=duckdb_credentials + credentials=duckdb_credentials, timeout=timeout ) self.gen_flow() def gen_flow(self) -> Flow: + df_task = SQLServerToDF(timeout=self.timeout) df = df_task.bind( config_key=self.sqlserver_config_key, query=self.sql_query, flow=self ) diff --git a/viadot/flows/sql_server_transform.py b/viadot/flows/sql_server_transform.py index dcefe59ed..324dc93bd 100644 --- a/viadot/flows/sql_server_transform.py +++ b/viadot/flows/sql_server_transform.py @@ -1,9 +1,7 @@ from prefect import Flow, config from typing import Any, Dict, List, Literal -from ..tasks import SQLServerQuery - -query_task = SQLServerQuery() +from viadot.tasks import SQLServerQuery class SQLServerTransform(Flow): @@ -12,6 +10,7 @@ def __init__( name: str, query: str, config_key: str, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -22,14 +21,18 @@ def __init__( name (str,required): The name of the flow. query (str, required): The query to execute on the database. config_key (str, required): Config key containing credentials for the SQL Server connection. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.query = query self.config_key = config_key + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + query_task = SQLServerQuery(timeout=self.timeout) query_task.bind( query=self.query, config_key=self.config_key, diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 883cd9964..38255a38f 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -8,7 +8,7 @@ from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, cleanup_validation_clutter, df_get_data_types_task, @@ -20,7 +20,7 @@ update_dtypes_dict, write_to_json, ) -from ..tasks import ( +from viadot.tasks import ( AzureDataLakeUpload, DownloadGitHubFile, GetFlowNewDateRange, @@ -30,12 +30,7 @@ logger = logging.get_logger(__name__) -supermetrics_to_df_task = SupermetricsToDF() -download_github_file_task = DownloadGitHubFile() validation_task = RunGreatExpectationsValidation() -file_to_adls_task = AzureDataLakeUpload() -json_to_adls_task = AzureDataLakeUpload() -prefect_get_new_date_range = GetFlowNewDateRange() class SupermetricsToADLS(Flow): @@ -72,6 +67,7 @@ def __init__( tags: List[str] = ["extract"], vault_name: str = None, check_missing_data: bool = True, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -114,6 +110,8 @@ def __init__( tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["extract"]. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. check_missing_data (bool, optional): Whether to check missing data. Defaults to True. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" if not ds_user: try: @@ -124,6 +122,7 @@ def __init__( self.flow_name = name self.check_missing_data = check_missing_data + self.timeout = timeout # SupermetricsToDF self.ds_id = ds_id self.ds_accounts = ds_accounts @@ -191,6 +190,7 @@ def slugify(name): def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: + supermetrics_to_df_task = SupermetricsToDF(timeout=self.timeout) t = supermetrics_to_df_task.bind( ds_id=self.ds_id, ds_accounts=ds_accounts, @@ -215,6 +215,7 @@ def gen_supermetrics_task( def gen_flow(self) -> Flow: if self.check_missing_data is True: if self.date_range_type is not None and "days" in self.date_range_type: + prefect_get_new_date_range = GetFlowNewDateRange(timeout=self.timeout) self.date_range_type = prefect_get_new_date_range.run( flow_name=self.flow_name, date_range_type=self.date_range_type, @@ -276,6 +277,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) file_to_adls_task.bind( from_path=self.local_file_path, to_path=self.adls_file_path, @@ -289,6 +291,7 @@ def gen_flow(self) -> Flow: dtypes_to_json_task.bind( dtypes_dict=dtypes_updated, local_json_path=self.local_json_path, flow=self ) + json_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) json_to_adls_task.bind( from_path=self.local_json_path, to_path=self.adls_schema_file_dir_file, diff --git a/viadot/flows/supermetrics_to_azure_sql.py b/viadot/flows/supermetrics_to_azure_sql.py index ddb6e087e..c98c34c05 100644 --- a/viadot/flows/supermetrics_to_azure_sql.py +++ b/viadot/flows/supermetrics_to_azure_sql.py @@ -7,10 +7,6 @@ logger = logging.get_logger(__name__) -supermetrics_to_csv_task = SupermetricsToCSV() -csv_to_blob_storage_task = BlobFromCSV() -blob_to_azure_sql_task = CreateTableFromBlob() - class SupermetricsToAzureSQL(Flow): def __init__( @@ -42,6 +38,7 @@ def __init__( parallel: bool = True, tags: List[str] = ["extract"], sep: str = "\t", + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -71,11 +68,7 @@ def __init__( self.parallel = parallel self.tags = tags self.sep = sep - self.tasks = [ - supermetrics_to_csv_task, - csv_to_blob_storage_task, - blob_to_azure_sql_task, - ] + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -86,6 +79,7 @@ def slugify(name): def gen_supermetrics_task( self, ds_accounts: Union[str, List[str]], flow: Flow = None ) -> Task: + supermetrics_to_csv_task = SupermetricsToCSV(timeout=self.timeout) t = supermetrics_to_csv_task.bind( ds_id=self.ds_id, ds_accounts=ds_accounts, @@ -121,12 +115,14 @@ def gen_flow(self) -> Flow: ds_accounts=self.ds_accounts, flow=self ) + csv_to_blob_storage_task = BlobFromCSV(timeout=self.timeout) csv_to_blob_storage_task.bind( from_path=self.local_file_path, to_path=self.blob_path, overwrite=self.overwrite_blob, flow=self, ) + blob_to_azure_sql_task = CreateTableFromBlob(timeout=self.timeout) blob_to_azure_sql_task.bind( blob_path=self.blob_path, schema=self.schema, From 56e0e856fea54454a378810f87a32497c1f69d6c Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:22:26 +0100 Subject: [PATCH 42/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- viadot/examples/sap_rfc/Dockerfile | 10 +----- viadot/examples/sap_rfc/README.md | 2 +- viadot/flows/adls_gen1_to_azure_sql.py | 11 +++--- viadot/flows/adls_gen1_to_azure_sql_new.py | 12 ++++--- 
viadot/flows/adls_gen1_to_gen2.py | 9 +++-- viadot/flows/adls_to_azure_sql.py | 27 +++++++------- viadot/flows/aselite_to_adls.py | 9 +++-- viadot/flows/genesys_to_adls.py | 41 +++------------------- viadot/flows/mindful_to_adls.py | 37 +------------------ 10 files changed, 49 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce88d377..e9c32fc51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated `mindful_to_adls.py` flow with the `adls_bulk_upload` task - Changed `MindfulToCSV` task to download surveys info. -# [0.4.11] - 2022-12-15 +## [0.4.11] - 2022-12-15 ### Added - Added into `Genesys` the new view type `AGENT`. diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index c5c5e1efa..24fa64969 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -19,12 +19,4 @@ ENV HTTPS_PROXY=$HTTPS_PROXY ENV NO_PROXY=$NO_PROXY RUN git config --global http.proxy ${HTTP_PROXY:-""} -RUN pip install pyrfc==2.5.0 - -ARG VIADOT_USER=viadot_user -ARG GID=1111 -ARG UID=1111 -RUN groupadd -g $GID -o $VIADOT_USER -RUN useradd -m -u $UID -g $GID -o -s /bin/bash $VIADOT_USER - -USER $VIADOT_USER +RUN pip install pyrfc==2.5.0 \ No newline at end of file diff --git a/viadot/examples/sap_rfc/README.md b/viadot/examples/sap_rfc/README.md index f84eab580..fcebcaf23 100644 --- a/viadot/examples/sap_rfc/README.md +++ b/viadot/examples/sap_rfc/README.md @@ -1,4 +1,4 @@ -## SAP RFC example +# SAP RFC example This is an example environment for running the `SAPRFC` connector. diff --git a/viadot/flows/adls_gen1_to_azure_sql.py b/viadot/flows/adls_gen1_to_azure_sql.py index 0ad2e8a21..766a2b58e 100644 --- a/viadot/flows/adls_gen1_to_azure_sql.py +++ b/viadot/flows/adls_gen1_to_azure_sql.py @@ -5,10 +5,6 @@ from ..tasks import AzureDataLakeDownload, BlobFromCSV, CreateTableFromBlob -gen1_download_task = AzureDataLakeDownload(gen=1) -csv_to_blob_storage_task = BlobFromCSV() -blob_to_azure_sql_task = CreateTableFromBlob() - logger = logging.get_logger(__name__) @@ -24,6 +20,8 @@ class ADLSGen1ToAzureSQL(Flow): local_file_path (str): Where the gen1 file should be downloaded. sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials vault_name (str): The name of the vault from which to retrieve `sp_credentials_secret` + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -40,6 +38,7 @@ def __init__( if_exists: str = "replace", sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -55,6 +54,7 @@ def __init__( self.if_exists = if_exists self.sp_credentials_secret = sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -63,6 +63,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_download_task = AzureDataLakeDownload(gen=1, timeout=self.timeout) gen1_download_task.bind( from_path=self.path, to_path=self.local_file_path, @@ -71,12 +72,14 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + csv_to_blob_storage_task = BlobFromCSV(timeout=self.timeout) csv_to_blob_storage_task.bind( from_path=self.local_file_path, to_path=self.blob_path, overwrite=self.overwrite_blob, flow=self, ) + blob_to_azure_sql_task = CreateTableFromBlob(timeout=self.timeout) blob_to_azure_sql_task.bind( blob_path=self.blob_path, schema=self.schema, diff --git a/viadot/flows/adls_gen1_to_azure_sql_new.py b/viadot/flows/adls_gen1_to_azure_sql_new.py index 05f977280..972037d28 100644 --- a/viadot/flows/adls_gen1_to_azure_sql_new.py +++ b/viadot/flows/adls_gen1_to_azure_sql_new.py @@ -10,10 +10,6 @@ from ..tasks import AzureDataLakeToDF, AzureDataLakeUpload, AzureSQLCreateTable, BCPTask -gen1_to_df_task = AzureDataLakeToDF(gen=1) -gen2_upload_task = AzureDataLakeUpload(gen=2) -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() logger = logging.get_logger(__name__) @@ -47,6 +43,8 @@ class ADLSGen1ToAzureSQLNew(Flow): gen2_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen2 lake sqldb_credentials_secret (str): The Key Vault secret holding Azure SQL Database credentials vault_name (str): The name of the vault from which to retrieve `sp_credentials_secret` + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -69,6 +67,7 @@ def __init__( gen2_sp_credentials_secret: str = None, sqldb_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -90,6 +89,7 @@ def __init__( self.gen2_sp_credentials_secret = gen2_sp_credentials_secret self.sqldb_credentials_secret = sqldb_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.dtypes.update(METADATA_COLUMNS) self.gen_flow() @@ -99,6 +99,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_to_df_task = AzureDataLakeToDF(gen=1, timeout=self.timeout) df = gen1_to_df_task.bind( path=self.gen1_path, gen=1, @@ -118,6 +119,7 @@ def gen_flow(self) -> Flow: sep=self.write_sep, flow=self, ) + gen2_upload_task = AzureDataLakeUpload(gen=2, timeout=self.timeout) gen2_upload_task.bind( from_path=self.local_file_path, to_path=self.gen2_path, @@ -126,6 +128,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -135,6 +138,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.schema, diff --git a/viadot/flows/adls_gen1_to_gen2.py b/viadot/flows/adls_gen1_to_gen2.py index 3764ec05b..fcea775d4 100644 --- a/viadot/flows/adls_gen1_to_gen2.py +++ b/viadot/flows/adls_gen1_to_gen2.py @@ -7,9 +7,6 @@ from ..tasks import AzureDataLakeDownload, AzureDataLakeUpload -gen1_download_task = AzureDataLakeDownload(gen=1) -gen2_upload_task = AzureDataLakeUpload(gen=2) - logger = logging.get_logger(__name__) @@ -37,6 +34,8 @@ class ADLSGen1ToGen2(Flow): gen1_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen1 lake gen2_sp_credentials_secret (str): The Key Vault secret holding Service Pricipal credentials for gen2 lake vault_name (str): The name of the vault from which to retrieve the secrets. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -50,6 +49,7 @@ def __init__( gen1_sp_credentials_secret: str = None, gen2_sp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -62,6 +62,7 @@ def __init__( self.gen1_sp_credentials_secret = gen1_sp_credentials_secret self.gen2_sp_credentials_secret = gen2_sp_credentials_secret self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -70,6 +71,7 @@ def slugify(name): return name.replace(" ", "_").lower() def gen_flow(self) -> Flow: + gen1_download_task = AzureDataLakeDownload(gen=1, timeout=self.timeout) gen1_download_task.bind( from_path=self.gen1_path, to_path=self.local_file_path, @@ -79,6 +81,7 @@ def gen_flow(self) -> Flow: flow=self, ) add_ingestion_metadata.bind(path=self.local_file_path, sep=self.sep, flow=self) + gen2_upload_task = AzureDataLakeUpload(gen=2, timeout=self.timeout) gen2_upload_task.bind( from_path=self.local_file_path, to_path=self.gen2_path, diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 9cc371b5e..e69750a9e 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -21,23 +21,13 @@ logger = logging.get_logger(__name__) -lake_to_df_task = AzureDataLakeToDF() -download_json_file_task = AzureDataLakeDownload() -download_github_file_task = DownloadGitHubFile() -promote_to_conformed_task = AzureDataLakeCopy() -promote_to_operations_task = AzureDataLakeCopy() -create_table_task = AzureSQLCreateTable() -bulk_insert_task = BCPTask() -azure_query_task = AzureSQLDBQuery() -check_column_order_task = CheckColumnOrder() - -@task +@task(timeout=3600) def union_dfs_task(dfs: List[pd.DataFrame]): return pd.concat(dfs, ignore_index=True) -@task +@task(timeout=3600) def map_data_types_task(json_shema_path: str): file_dtypes = open(json_shema_path) dict_dtypes = json.load(file_dtypes) @@ -71,7 +61,7 @@ def map_data_types_task(json_shema_path: str): return dict_dtypes_mapped -@task +@task(timeout=3600) def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): # if table doesn't exist it will be created later - df equals None if df is None: @@ -109,6 +99,7 @@ def __init__( max_download_retries: int = 5, tags: List[str] = ["promotion"], vault_name: str = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any], ): @@ -141,6 +132,8 @@ def __init__( max_download_retries (int, optional): How many times to retry the download. Defaults to 5. tags (List[str], optional): Flow tags to use, eg. to control flow concurrency. Defaults to ["promotion"]. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" adls_path = adls_path.strip("/") @@ -189,6 +182,7 @@ def __init__( self.max_download_retries = max_download_retries self.tags = tags self.vault_name = vault_name + self.timeout = timeout super().__init__(*args, name=name, **kwargs) @@ -219,6 +213,7 @@ def get_promoted_path(self, env: str) -> str: return promoted_path def gen_flow(self) -> Flow: + lake_to_df_task = AzureDataLakeToDF(timeout=self.timeout) df = lake_to_df_task.bind( path=self.adls_path, sp_credentials_secret=self.adls_sp_credentials_secret, @@ -227,6 +222,7 @@ def gen_flow(self) -> Flow: ) if not self.dtypes: + download_json_file_task = AzureDataLakeDownload(timeout=self.timeout) download_json_file_task.bind( from_path=self.json_shema_path, to_path=self.local_json_path, @@ -238,6 +234,7 @@ def gen_flow(self) -> Flow: else: dtypes = self.dtypes + check_column_order_task = CheckColumnOrder(timeout=self.timeout) df_reorder = check_column_order_task.bind( table=self.table, schema=self.schema, @@ -263,6 +260,7 @@ def gen_flow(self) -> Flow: flow=self, ) + promote_to_conformed_task = AzureDataLakeCopy(timeout=self.timeout) promote_to_conformed_task.bind( from_path=self.adls_path, to_path=self.adls_path_conformed, @@ -270,6 +268,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + promote_to_operations_task = AzureDataLakeCopy(timeout=self.timeout) promote_to_operations_task.bind( from_path=self.adls_path_conformed, to_path=self.adls_path_operations, @@ -277,6 +276,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + create_table_task = AzureSQLCreateTable(timeout=self.timeout) create_table_task.bind( schema=self.schema, table=self.table, @@ -286,6 +286,7 @@ def gen_flow(self) -> Flow: vault_name=self.vault_name, flow=self, ) + bulk_insert_task = BCPTask(timeout=self.timeout) bulk_insert_task.bind( path=self.local_file_path, schema=self.schema, diff --git a/viadot/flows/aselite_to_adls.py b/viadot/flows/aselite_to_adls.py index 363c37fba..86e9b215b 100644 --- a/viadot/flows/aselite_to_adls.py +++ b/viadot/flows/aselite_to_adls.py @@ -6,9 +6,6 @@ from viadot.tasks import AzureDataLakeUpload from viadot.tasks.aselite import ASELiteToDF -df_task = ASELiteToDF() -file_to_adls_task = AzureDataLakeUpload() - class ASELiteToADLS(Flow): def __init__( @@ -26,6 +23,7 @@ def __init__( sp_credentials_secret: str = None, remove_special_characters: bool = None, columns_to_clean: List[str] = None, + timeout: int = 3600, *args: List[any], **kwargs: Dict[str, Any] ): @@ -48,6 +46,8 @@ def __init__( remove_special_characters (str, optional): Call a function that remove special characters like escape symbols. Defaults to None. columns_to_clean (List(str), optional): Select columns to clean, used with remove_special_characters. If None whole data frame will be processed. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.query = query self.sqldb_credentials_secret = sqldb_credentials_secret @@ -62,12 +62,14 @@ def __init__( self.sp_credentials_secret = sp_credentials_secret self.remove_special_characters = remove_special_characters self.columns_to_clean = columns_to_clean + self.timeout = timeout super().__init__(*args, name=name, **kwargs) self.gen_flow() def gen_flow(self) -> Flow: + df_task = ASELiteToDF(timeout=self.timeout) df = df_task.bind( query=self.query, credentials_secret=self.sqldb_credentials_secret, @@ -89,6 +91,7 @@ def gen_flow(self) -> Flow: flow=self, ) + file_to_adls_task = AzureDataLakeUpload(timeout=self.timeout) adls_upload = file_to_adls_task.bind( from_path=self.file_path, to_path=self.to_path, diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index da867c176..6de3c23e5 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -6,47 +6,14 @@ from viadot.task_utils import df_to_csv from viadot.tasks import AzureDataLakeUpload from viadot.tasks.genesys import GenesysToCSV, GenesysToDF -from ..task_utils import ( +from viadot.task_utils import ( add_ingestion_metadata_task, df_to_csv, df_to_parquet, + adls_bulk_upload, ) -@task -def adls_bulk_upload( - file_names: List[str], - adls_file_path: str = None, - adls_sp_credentials_secret: str = None, - adls_overwrite: bool = True, - task_timeout: int = 3600, -) -> List[str]: - """ - Function that upload files to defined path in ADLS. - - Args: - file_names (List[str]): List of file names to generate paths. - adls_file_path (str, optional): Azure Data Lake path. Defaults to None. - adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. - adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. - task_timeout(int, optional): The amount of time (in seconds) to wait while running this task before - a timeout occurs. Defaults to 3600. - Returns: - List[str]: List of paths - """ - - for file in file_names: - file_path = str(adls_file_path + "/" + file) - file_to_adls_task = AzureDataLakeUpload(timeout=task_timeout) - file_to_adls_task.run( - from_path=file, - to_path=file_path, - sp_credentials_secret=adls_sp_credentials_secret, - overwrite=adls_overwrite, - ) - - @task def add_timestamp(files_names: List = None, sep: str = None) -> None: """Add new column _viadot_downloaded_at_utc into every genesys file. 
@@ -178,7 +145,7 @@ def gen_flow(self) -> Flow: add_timestamp.bind(file_names, sep=self.sep, flow=self) - uploader = adls_bulk_upload( + adls_bulk_upload( file_names=file_names, adls_file_path=self.adls_file_path, adls_sp_credentials_secret=self.adls_sp_credentials_secret, @@ -187,7 +154,7 @@ def gen_flow(self) -> Flow: ) add_timestamp.set_upstream(file_names, flow=self) - uploader.set_upstream(add_timestamp, flow=self) + adls_bulk_upload.set_upstream(add_timestamp, flow=self) class GenesysReportToADLS(Flow): diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py index dd5c365f4..973d03e44 100644 --- a/viadot/flows/mindful_to_adls.py +++ b/viadot/flows/mindful_to_adls.py @@ -9,47 +9,12 @@ from prefect.triggers import all_successful from viadot.tasks import MindfulToCSV from viadot.tasks import AzureDataLakeUpload -from viadot.task_utils import add_ingestion_metadata_task +from viadot.task_utils import add_ingestion_metadata_task, adls_bulk_upload logger = logging.get_logger() file_to_adls_task = AzureDataLakeUpload() -@task -def adls_bulk_upload( - file_names: List[str] = None, - file_name_relative_path: str = "", - adls_file_path: str = None, - adls_sp_credentials_secret: str = None, - adls_overwrite: bool = True, -) -> List[str]: - """Function that upload files to defined path in ADLS. - - Args: - file_names (List[str]): List of file names to generate its paths. - file_name_relative_path (str, optional): Path where to save the file locally. Defaults to ''. - adls_file_path (str, optional): Azure Data Lake path. Defaults to None. - adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. - adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. - - Returns: - List[str]: List of paths - """ - - if not file_names: - logger.warning("Avoided uploading any file to ADLS. No files were reported.") - else: - for file in file_names: - file_path = str(adls_file_path + "/" + file) - file_to_adls_task.run( - from_path=os.path.join(file_name_relative_path, file), - to_path=file_path, - sp_credentials_secret=adls_sp_credentials_secret, - overwrite=adls_overwrite, - ) - - @task def add_timestamp(files_names: List = None, sep: str = "\t") -> None: """Add new column _viadot_downloaded_at_utc into each file given in the function. 
From 53bd3a86721dbfe1cda2611c04950010649c4d8f Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:27:15 +0100 Subject: [PATCH 43/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/adls_to_azure_sql.py | 53 +++++++++++++++++++++++++++++-- viadot/flows/genesys_to_adls.py | 1 - viadot/sources/mindful.py | 35 ++++++++++++++++++++ 3 files changed, 86 insertions(+), 3 deletions(-) diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index e69750a9e..77bb2f352 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -5,11 +5,12 @@ import pandas as pd from prefect import Flow, task from prefect.backend import get_key_value +from prefect.engine import signals from prefect.utilities import logging from viadot.tasks.azure_data_lake import AzureDataLakeDownload -from ..tasks import ( +from viadot.tasks import ( AzureDataLakeCopy, AzureDataLakeToDF, AzureSQLCreateTable, @@ -77,6 +78,50 @@ def df_to_csv_task(df, remove_tab, path: str, sep: str = "\t"): df.to_csv(path, sep=sep, index=False) +@task(timeout=3600) +def check_dtypes_sort( + df: pd.DataFrame = None, + dtypes: Dict[str, Any] = None, +) -> Dict[str, Any]: + """Check dtype column order to avoid malformation SQL table. + When data is loaded by the user, a data frame is passed to this task + to check the column sort with dtypes and re-sort if neccessary. + Args: + df (pd.DataFrame, optional): Data Frame from original ADLS file. Defaults to None. + dtypes (Dict[str, Any], optional): Dictionary of columns and data type to apply + to the Data Frame downloaded. Defaults to None. + Returns: + Dict[str, Any]: Sorted dtype. + """ + if df is None: + logger.error("DataFrame argument is mandatory") + raise signals.FAIL("DataFrame is None.") + else: + # first check if all dtypes keys are in df.columns + if all(d in df.columns for d in list(dtypes.keys())) and len(df.columns) == len( + list(dtypes.keys()) + ): + # check if have the same sort + matches = list(map(lambda x, y: x == y, df.columns, dtypes.keys())) + if not all(matches): + logger.warning( + "Some keys are not sorted in dtypes. Repositioning the key:value..." + ) + # re-sort in a new dtype + new_dtypes = dict() + for key in df.columns: + new_dtypes.update([(key, dtypes[key])]) + else: + new_dtypes = dtypes.copy() + else: + logger.error("There is a discrepancy with any of the columns.") + raise signals.FAIL( + "dtype dictionary contains key(s) that not matching with the ADLS file columns name, or they have different length." 
+ ) + + return new_dtypes + + class ADLSToAzureSQL(Flow): def __init__( self, @@ -232,7 +277,11 @@ def gen_flow(self) -> Flow: dtypes = map_data_types_task.bind(self.local_json_path, flow=self) map_data_types_task.set_upstream(download_json_file_task, flow=self) else: - dtypes = self.dtypes + dtypes = check_dtypes_sort.bind( + df, + dtypes=self.dtypes, + flow=self, + ) check_column_order_task = CheckColumnOrder(timeout=self.timeout) df_reorder = check_column_order_task.bind( diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 6de3c23e5..e7c13e90e 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -3,7 +3,6 @@ import pandas as pd from prefect import Flow, task -from viadot.task_utils import df_to_csv from viadot.tasks import AzureDataLakeUpload from viadot.tasks.genesys import GenesysToCSV, GenesysToDF from viadot.task_utils import ( diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index 512a10cc1..4bf2bc892 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -182,6 +182,41 @@ def get_responses_list( return response + def get_survey_list( + self, + limit: int = 1000, + **kwargs, + ) -> Response: + """Gets a list of survey resources associated with the authenticated customer. + Args: + limit (int, optional): The number of matching interactions to return. Defaults to 1000. + Returns: + Response: Request object with the response from the Mindful API. + """ + self.endpoint = "surveys" + params = { + "_limit": limit, + } + + response = self._mindful_api_response( + endpoint=self.endpoint, + params=params, + ) + + if response.status_code == 200: + self.logger.info("Succesfully downloaded responses data from mindful API.") + elif response.status_code == 204 and not response.content.decode(): + self.logger.warning( + f"Thera are not responses data to download from {self.start_date} to {self.end_date}." + ) + else: + self.logger.error( + f"Failed to download responses data. 
- {response.content}" + ) + raise APIError("Failed to downloaded responses data.") + + return response + def response_to_file( self, response: Response, From d64f0af536d325362f049bed6ca45a6d893b3aec Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:32:11 +0100 Subject: [PATCH 44/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/examples/sap_rfc/Dockerfile | 2 +- viadot/task_utils.py | 45 ++++++++++++++--------------- viadot/tasks/aselite.py | 10 ++++++- viadot/tasks/azure_blob_storage.py | 4 +-- viadot/tasks/azure_data_lake.py | 24 +++++++++++++++ viadot/tasks/azure_sql.py | 29 ++++++++++++++----- viadot/tasks/bcp.py | 4 +++ viadot/tasks/bigquery.py | 4 +++ viadot/tasks/cloud_for_customers.py | 4 +++ viadot/tasks/duckdb.py | 16 +++++++--- viadot/tasks/epicor.py | 4 +++ viadot/tasks/genesys.py | 6 ++++ viadot/tasks/github.py | 5 +++- viadot/tasks/mindful.py | 10 +++++-- viadot/tasks/mysql_to_df.py | 4 +++ viadot/tasks/outlook.py | 2 +- viadot/tasks/prefect_date_range.py | 6 +++- viadot/tasks/salesforce.py | 6 ++++ viadot/tasks/sap_rfc.py | 8 ++--- viadot/tasks/sftp.py | 8 +++++ viadot/tasks/sharepoint.py | 4 +++ viadot/tasks/sql_server.py | 14 +++++++-- viadot/tasks/sqlite.py | 32 ++++++++++++++++---- viadot/tasks/supermetrics.py | 10 ++++--- 24 files changed, 200 insertions(+), 61 deletions(-) diff --git a/viadot/examples/sap_rfc/Dockerfile b/viadot/examples/sap_rfc/Dockerfile index 24fa64969..c58a7d19b 100644 --- a/viadot/examples/sap_rfc/Dockerfile +++ b/viadot/examples/sap_rfc/Dockerfile @@ -19,4 +19,4 @@ ENV HTTPS_PROXY=$HTTPS_PROXY ENV NO_PROXY=$NO_PROXY RUN git config --global http.proxy ${HTTP_PROXY:-""} -RUN pip install pyrfc==2.5.0 \ No newline at end of file +RUN pip install pyrfc==2.5.0 diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 4c4d7c54f..9d057599c 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -32,7 +32,7 @@ METADATA_COLUMNS = {"_viadot_downloaded_at_utc": "DATETIME"} -@task +@task(timeout=3600) def add_ingestion_metadata_task( df: pd.DataFrame, ): @@ -52,7 +52,7 @@ def add_ingestion_metadata_task( return df2 -@task +@task(timeout=3600) def get_latest_timestamp_file_path(files: List[str]) -> str: """ Return the name of the latest file in a given data lake directory, @@ -75,7 +75,7 @@ def get_latest_timestamp_file_path(files: List[str]) -> str: return latest_file -@task +@task(timeout=3600) def dtypes_to_json_task(dtypes_dict, local_json_path: str): """ Creates json file from a dictionary. @@ -87,7 +87,7 @@ def dtypes_to_json_task(dtypes_dict, local_json_path: str): json.dump(dtypes_dict, fp) -@task +@task(timeout=3600) def chunk_df(df: pd.DataFrame, size: int = 10_000) -> List[pd.DataFrame]: """ Creates pandas Dataframes list of chunks with a given size. @@ -100,7 +100,7 @@ def chunk_df(df: pd.DataFrame, size: int = 10_000) -> List[pd.DataFrame]: return chunks -@task +@task(timeout=3600) def df_get_data_types_task(df: pd.DataFrame) -> dict: """ Returns dictionary containing datatypes of pandas DataFrame columns. 
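The remainder of this patch repeats the same two idioms shown in the hunks above: decorated utility tasks gain `@task(timeout=3600)`, and `Task` subclasses accept a `timeout` argument that is forwarded to the base class. A minimal, illustrative sketch of both patterns (assuming Prefect 1.x, which these tasks build on; the names below are made up and not part of the repository):

import pandas as pd
from prefect import Task, task


@task(timeout=3600)  # Prefect cancels the task run after 3600 seconds
def example_row_count(df: pd.DataFrame) -> int:
    """Illustrative stand-in for the decorated utility tasks above."""
    return len(df)


class ExampleToDF(Task):
    """Illustrative stand-in for the Task subclasses patched below."""

    def __init__(self, timeout: int = 3600, *args, **kwargs):
        # Forward the new keyword to the base Task, the same pattern the patch
        # applies to each task class under viadot/tasks/.
        super().__init__(name="example_to_df", timeout=timeout, *args, **kwargs)

    def run(self) -> pd.DataFrame:
        return pd.DataFrame({"a": [1, 2, 3]})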
@@ -113,7 +113,7 @@ def df_get_data_types_task(df: pd.DataFrame) -> dict: return dtypes_dict -@task +@task(timeout=3600) def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: """Obtain SQL data types from a pandas DataFrame""" typeset = CompleteSet() @@ -156,14 +156,14 @@ def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict: return dtypes_dict_fixed -@task +@task(timeout=3600) def update_dict(d: dict, d_new: dict) -> dict: d_copy = copy.deepcopy(d) d_copy.update(d_new) return d_copy -@task +@task(timeout=3600) def df_map_mixed_dtypes_for_parquet( df: pd.DataFrame, dtypes_dict: dict ) -> pd.DataFrame: @@ -185,7 +185,7 @@ def df_map_mixed_dtypes_for_parquet( return df_mapped -@task +@task(timeout=3600) def update_dtypes_dict(dtypes_dict: dict) -> dict: """ Task to update dtypes_dictionary that will be stored in the schema. It's required due to workaround Pandas to_parquet bug connected with mixed dtypes in object @@ -203,7 +203,7 @@ def update_dtypes_dict(dtypes_dict: dict) -> dict: return dtypes_dict_updated -@task +@task(timeout=3600) def df_to_csv( df: pd.DataFrame, path: str, @@ -243,7 +243,7 @@ def df_to_csv( out_df.to_csv(path, index=False, sep=sep) -@task +@task(timeout=3600) def df_to_parquet( df: pd.DataFrame, path: str, @@ -279,7 +279,7 @@ def df_to_parquet( out_df.to_parquet(path, index=False, **kwargs) -@task +@task(timeout=3600) def union_dfs_task(dfs: List[pd.DataFrame]): """ Create one DataFrame from a list of pandas DataFrames. @@ -289,7 +289,7 @@ def union_dfs_task(dfs: List[pd.DataFrame]): return pd.concat(dfs, ignore_index=True) -@task +@task(timeout=3600) def write_to_json(dict_, path): """ Creates json file from a dictionary. Log record informs about the writing file proccess. @@ -312,23 +312,20 @@ def write_to_json(dict_, path): logger.debug(f"Successfully wrote to {path}.") -@task +@task(timeout=3600) def cleanup_validation_clutter(expectations_path): ge_project_path = Path(expectations_path).parent shutil.rmtree(ge_project_path) -@task +@task(timeout=3600) def df_converts_bytes_to_int(df: pd.DataFrame) -> pd.DataFrame: logger = prefect.context.get("logger") logger.info("Converting bytes in dataframe columns to list of integers") return df.applymap(lambda x: list(map(int, x)) if isinstance(x, bytes) else x) -@task( - max_retries=3, - retry_delay=timedelta(seconds=10), -) +@task(max_retries=3, retry_delay=timedelta(seconds=10), timeout=3600) def df_to_dataset( df: pd.DataFrame, partitioning_flavor="hive", format="parquet", **kwargs ) -> None: @@ -436,7 +433,7 @@ def custom_mail_state_handler( return new_state -@task +@task(timeout=3600) def df_clean_column( df: pd.DataFrame, columns_to_clean: List[str] = None ) -> pd.DataFrame: @@ -473,7 +470,7 @@ def df_clean_column( return df -@task +@task(timeout=3600) def concat_dfs(dfs: List[pd.DataFrame]): """ Task to combine list of data frames into one. @@ -486,7 +483,7 @@ def concat_dfs(dfs: List[pd.DataFrame]): return pd.concat(dfs, axis=1) -@task +@task(timeout=3600) def cast_df_to_str(df: pd.DataFrame) -> pd.DataFrame: """ Task for casting an entire DataFrame to a string data type. 
Task is needed @@ -503,7 +500,7 @@ def cast_df_to_str(df: pd.DataFrame) -> pd.DataFrame: return df_mapped -@task +@task(timeout=3600) def set_new_kv(kv_name: str, df: pd.DataFrame, filter_column: str): """ Task for updating/setting key value on Prefect based on the newest @@ -532,7 +529,7 @@ def git_clone_url(self): return f"https://{self.git_token_secret}@{self.repo_host}/{self.repo}" -@task +@task(timeout=3600) def credentials_loader(credentials_secret: str, vault_name: str = None) -> dict: """ Function that gets credentials from azure Key Vault or PrefectSecret or from local config. diff --git a/viadot/tasks/aselite.py b/viadot/tasks/aselite.py index 58bf6f887..ca9a4d971 100644 --- a/viadot/tasks/aselite.py +++ b/viadot/tasks/aselite.py @@ -13,13 +13,20 @@ class ASELiteToDF(Task): def __init__( - self, credentials: Dict[str, Any] = None, query: str = None, *args, **kwargs + self, + credentials: Dict[str, Any] = None, + query: str = None, + timeout: int = 3600, + *args, + **kwargs ): """ Task for obtaining data from ASElite source. Args: credentials (Dict[str, Any], optional): ASElite SQL Database credentials. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: Pandas DataFrame """ self.credentials = credentials @@ -27,6 +34,7 @@ def __init__( super().__init__( name="ASElite_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/azure_blob_storage.py b/viadot/tasks/azure_blob_storage.py index b15399bf2..0d8ae57d3 100644 --- a/viadot/tasks/azure_blob_storage.py +++ b/viadot/tasks/azure_blob_storage.py @@ -10,8 +10,8 @@ class BlobFromCSV(Task): Task for generating Azure Blob Storage from CSV file """ - def __init__(self, *args, **kwargs): - super().__init__(name="csv_to_blob_storage", *args, **kwargs) + def __init__(self, timeout: int = 3600, *args, **kwargs): + super().__init__(name="csv_to_blob_storage", timeout=timeout, *args, **kwargs) def __call__(self): """Generate a blob from a local CSV file""" diff --git a/viadot/tasks/azure_data_lake.py b/viadot/tasks/azure_data_lake.py index 9ca86ed0d..5b2329d45 100644 --- a/viadot/tasks/azure_data_lake.py +++ b/viadot/tasks/azure_data_lake.py @@ -23,6 +23,8 @@ class AzureDataLakeDownload(Task): recursive (bool, optional): Set this to true if downloading entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). """ @@ -34,6 +36,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -49,6 +52,7 @@ def __init__( name="adls_download", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -131,6 +135,8 @@ class AzureDataLakeUpload(Task): overwrite (bool, optional): Whether to overwrite files in the lake. Defaults to False. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to obtain the secret. 
Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -141,6 +147,7 @@ def __init__( overwrite: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -157,6 +164,7 @@ def __init__( name="adls_upload", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -245,6 +253,7 @@ def __init__( error_bad_lines: bool = None, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -261,6 +270,8 @@ def __init__( error_bad_lines (bool, optional): Whether to raise an exception on bad lines. Defaults to None. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.path = path self.sep = sep @@ -274,6 +285,7 @@ def __init__( name="adls_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -372,6 +384,8 @@ class AzureDataLakeCopy(Task): recursive (bool, optional): Set this to true if copy entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). """ @@ -383,6 +397,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -398,6 +413,7 @@ def __init__( name="adls_copy", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -478,6 +494,8 @@ class AzureDataLakeList(Task): path (str, optional): The path to the directory which contents you want to list. Defaults to None. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): [description]. Defaults to 3. retry_delay (timedelta, optional): [description]. Defaults to timedelta(seconds=10). @@ -493,6 +511,7 @@ def __init__( path: str = None, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -506,6 +525,7 @@ def __init__( name="adls_list", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -602,6 +622,8 @@ class AzureDataLakeRemove(Task): recursive (bool): Set this to true if removing entire directories. gen (int, optional): The generation of the Azure Data Lake. Defaults to 2. vault_name (str, optional): The name of the vault from which to fetch the secret. Defaults to None. 
+ timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_retries (int, optional): Maximum number of retries before failing. Defaults to 3. retry_delay (timedelta, optional): Time to wait before the next retry attempt. Defaults to timedelta(seconds=10). """ @@ -612,6 +634,7 @@ def __init__( recursive: bool = False, gen: int = 2, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -626,6 +649,7 @@ def __init__( name="adls_rm", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index e8de0c4c5..94bbb4f85 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -50,9 +50,9 @@ def get_credentials(credentials_secret: str, vault_name: str = None): class CreateTableFromBlob(Task): - def __init__(self, sep="\t", *args, **kwargs): + def __init__(self, sep="\t", timeout: int = 3600, *args, **kwargs): self.sep = sep - super().__init__(name="blob_to_azure_sql", *args, **kwargs) + super().__init__(name="blob_to_azure_sql", timeout=timeout, *args, **kwargs) @defaults_from_attrs("sep") def run( @@ -107,6 +107,7 @@ def __init__( sep="\t", if_exists: Literal["fail", "replace", "append", "delete"] = "fail", credentials_secret: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -117,7 +118,7 @@ def __init__( self.sep = sep self.if_exists = if_exists self.credentials_secret = credentials_secret - super().__init__(name="azure_sql_bulk_insert", *args, **kwargs) + super().__init__(name="azure_sql_bulk_insert", timeout=timeout, *args, **kwargs) @defaults_from_attrs("sep", "if_exists", "credentials_secret") def run( @@ -178,6 +179,7 @@ def __init__( if_exists: Literal["fail", "replace", "skip", "delete"] = "fail", credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), *args, @@ -193,6 +195,7 @@ def __init__( name="azure_sql_create_table", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -246,19 +249,22 @@ class AzureSQLDBQuery(Task): credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( self, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="azure_sql_db_query", *args, **kwargs) + super().__init__(name="azure_sql_db_query", timeout=timeout, *args, **kwargs) def run( self, @@ -294,19 +300,22 @@ class AzureSQLToDF(Task): credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with SQL db credentials (server, db_name, user, and password). vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( self, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="azure_sql_to_df", *args, **kwargs) + super().__init__(name="azure_sql_to_df", timeout=timeout, *args, **kwargs) def run( self, @@ -350,13 +359,16 @@ def __init__( df: pd.DataFrame = None, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): self.credentials_secret = credentials_secret self.vault_name = vault_name - super().__init__(name="run_check_column_order", *args, **kwargs) + super().__init__( + name="run_check_column_order", timeout=timeout, *args, **kwargs + ) def df_change_order( self, df: pd.DataFrame = None, sql_column_list: List[str] = None @@ -443,6 +455,8 @@ class AzureSQLUpsert(Task): on (str, optional): The field on which to merge (upsert). Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -451,6 +465,7 @@ def __init__( table: str = None, on: str = None, credentials_secret: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -458,7 +473,7 @@ def __init__( self.table = table self.on = on self.credentials_secret = credentials_secret - super().__init__(name="azure_sql_upsert", *args, **kwargs) + super().__init__(name="azure_sql_upsert", timeout=timeout, *args, **kwargs) @defaults_from_attrs( "schema", diff --git a/viadot/tasks/bcp.py b/viadot/tasks/bcp.py index c81aba71c..e23b22459 100644 --- a/viadot/tasks/bcp.py +++ b/viadot/tasks/bcp.py @@ -34,6 +34,8 @@ class BCPTask(ShellTask): - on_error (Literal["skip", "fail"], optional): What to do if error occurs. Defaults to "skip". - credentials (dict, optional): The credentials to use for connecting with the database. - vault_name (str): The name of the vault from which to fetch the secret. + - timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. - **kwargs (dict, optional): Additional keyword arguments to pass to the Task constructor. """ @@ -49,6 +51,7 @@ def __init__( vault_name: str = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -67,6 +70,7 @@ def __init__( return_all=True, max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/bigquery.py b/viadot/tasks/bigquery.py index 315097425..e4831afad 100644 --- a/viadot/tasks/bigquery.py +++ b/viadot/tasks/bigquery.py @@ -28,6 +28,7 @@ def __init__( credentials_key: str = None, credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -52,6 +53,8 @@ def __init__( credentials can be generated as key for User Principal inside a BigQuery project. Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret for Bigquery project. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" self.dataset_name = dataset_name self.table_name = table_name @@ -64,6 +67,7 @@ def __init__( super().__init__( name="bigquery_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/cloud_for_customers.py b/viadot/tasks/cloud_for_customers.py index 9bc5e6b9d..2413f32b4 100644 --- a/viadot/tasks/cloud_for_customers.py +++ b/viadot/tasks/cloud_for_customers.py @@ -23,6 +23,7 @@ def __init__( env: str = "QA", max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, **kwargs, ): @@ -35,6 +36,7 @@ def __init__( name="c4c_report_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -122,6 +124,7 @@ def __init__( if_empty: str = "warn", max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, **kwargs, ): @@ -137,6 +140,7 @@ def __init__( name="c4c_to_df", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/duckdb.py b/viadot/tasks/duckdb.py index a26a3ce07..0f12aad43 100644 --- a/viadot/tasks/duckdb.py +++ b/viadot/tasks/duckdb.py @@ -17,25 +17,26 @@ class DuckDBQuery(Task): Args: credentials (dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( self, credentials: dict = None, - timeout: int = 600, + timeout: int = 3600, *args, **kwargs, ): self.credentials = credentials super().__init__(name="run_duckdb_query", timeout=timeout, *args, **kwargs) - @defaults_from_attrs("credentials", "timeout") + @defaults_from_attrs("credentials") def run( self, query: str, fetch_type: Literal["record", "dataframe"] = "record", credentials: dict = None, - timeout: int = None, ) -> Union[List[Record], bool]: """Run a query on DuckDB. @@ -71,6 +72,8 @@ class DuckDBCreateTableFromParquet(Task): if_exists (Literal, optional): What to do if the table already exists. if_empty (Literal, optional): What to do if ".parquet" file is emty. Defaults to "skip". credentials(dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Raises: ValueError: If the table exists and `if_exists`is set to `fail` or when parquet file @@ -86,6 +89,7 @@ def __init__( if_exists: Literal["fail", "replace", "append", "skip", "delete"] = "fail", if_empty: Literal["skip", "fail"] = "skip", credentials: dict = None, + timeout: int = 3600, *args, **kwargs, ): @@ -96,6 +100,7 @@ def __init__( super().__init__( name="duckdb_create_table", + timeout=timeout, *args, **kwargs, ) @@ -157,6 +162,8 @@ class DuckDBToDF(Task): if_empty (Literal[, optional): What to do if the query returns no data. Defaults to "warn". credentials (dict, optional): The config to use for connecting with the db. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: a pandas DataFrame containing the table data. 
@@ -168,6 +175,7 @@ def __init__( table: str = None, if_empty: Literal["warn", "skip", "fail"] = "warn", credentials: dict = None, + timeout: int = 3600, *args, **kwargs, ): @@ -177,7 +185,7 @@ def __init__( self.if_empty = if_empty self.credentials = credentials - super().__init__(name="duckdb_to_df", *args, **kwargs) + super().__init__(name="duckdb_to_df", timeout=timeout, *args, **kwargs) @defaults_from_attrs("schema", "table", "if_empty", "credentials") def run( diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index 51f12376f..271154162 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -18,6 +18,7 @@ def __init__( config_key: str = None, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", + timeout: int = 3600, *args, **kwargs, ) -> pd.DataFrame: @@ -32,6 +33,8 @@ def __init__( config_key (str, optional): Credential key to dictionary where details are stored. Defauls to None. start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: DataFrame with parsed API output @@ -44,6 +47,7 @@ def __init__( self.end_date_field = end_date_field super().__init__( name="epicor_orders_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index aec674f17..aab7685ec 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -30,6 +30,7 @@ def __init__( report_url: str = None, report_columns: List[str] = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -50,6 +51,8 @@ def __init__( schedule_id (str, optional): The ID of report. Defaults to None. report_url (str, optional): The url of report generated in json response. Defaults to None. report_columns (List[str], optional): List of exisiting column in report. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.logger = prefect.context.get("logger") @@ -69,6 +72,7 @@ def __init__( super().__init__( name=self.report_name, + timeout=timeout, *args, **kwargs, ) @@ -219,6 +223,7 @@ def __init__( report_url: str = None, report_columns: List[str] = None, credentials_genesys: Dict[str, Any] = None, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -233,6 +238,7 @@ def __init__( super().__init__( name="genesys_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/github.py b/viadot/tasks/github.py index 0a8e34e62..eb4789471 100644 --- a/viadot/tasks/github.py +++ b/viadot/tasks/github.py @@ -90,6 +90,8 @@ class DownloadGitHubFile(Task): to_path (str, optional): The destination path. Defaults to None. access_token_secret (str, optional): The Prefect secret containing GitHub token. Defaults to "github_token". branch (str, optional): The GitHub branch to use. Defaults to "main". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -99,6 +101,7 @@ def __init__( to_path: str = None, access_token_secret: str = "github_token", branch: str = "main", + timeout: int = 3600, **kwargs, ): self.repo = repo @@ -106,7 +109,7 @@ def __init__( self.to_path = to_path self.access_token_secret = access_token_secret self.branch = branch - super().__init__(name="download_github_file", **kwargs) + super().__init__(name="download_github_file", timeout=timeout, **kwargs) @defaults_from_attrs( "repo", "from_path", "to_path", "access_token_secret", "branch" diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index 7927de311..fe1a86230 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -25,6 +25,7 @@ def __init__( region: Literal["us1", "us2", "us3", "ca1", "eu1", "au1"] = "eu1", file_extension: Literal["parquet", "csv"] = "csv", file_path: str = "", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -39,6 +40,8 @@ def __init__( region (Literal[us1, us2, us3, ca1, eu1, au1], optional): SD region from where to interact with the mindful API. Defaults to "eu1". file_extension (Literal[parquet, csv], optional): File extensions for storing responses. Defaults to "csv". file_path (str, optional): Path where to save the file locally. Defaults to ''. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Raises: CredentialError: If credentials are not provided in local_config or directly as a parameter inside run method. @@ -53,6 +56,7 @@ def __init__( super().__init__( name=report_name, + timeout=timeout, *args, **kwargs, ) @@ -99,7 +103,7 @@ def run( ): if credentials_mindful is not None: - logger.info("Mindful credentials provided by user") + self.logger.info("Mindful credentials provided by user") elif credentials_mindful is None and credentials_secret is not None: credentials_str = AzureKeyVaultSecret( credentials_secret, vault_name=vault_name @@ -109,7 +113,7 @@ def run( else: try: credentials_mindful = local_config["MINDFUL"] - logger.info("Mindful credentials loaded from local config") + self.logger.info("Mindful credentials loaded from local config") except KeyError: credentials_mindful = None raise CredentialError("Credentials not found.") @@ -149,6 +153,6 @@ def run( logger.info("Successfully downloaded responses data from the Mindful API.") if not file_names: - return None + raise TypeError("Files were not created.") else: return file_names diff --git a/viadot/tasks/mysql_to_df.py b/viadot/tasks/mysql_to_df.py index a2eeda4a8..62a13f7c9 100644 --- a/viadot/tasks/mysql_to_df.py +++ b/viadot/tasks/mysql_to_df.py @@ -17,6 +17,7 @@ def __init__( country_short: Literal["AT", "DE", "CH", None], credentials: Dict[str, Any] = None, query: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -27,6 +28,8 @@ def __init__( credentials (Dict[str, Any], optional): MySql Database credentials. Defaults to None. query(str, optional): Query to perform on a database. Defaults to None. country_short (Dict[str, Any], optional): Country short to select proper credential. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
Returns: Pandas DataFrame """ @@ -36,6 +39,7 @@ def __init__( super().__init__( name="MySQLToDF", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/outlook.py b/viadot/tasks/outlook.py index ad70e3fe5..dbff99a54 100644 --- a/viadot/tasks/outlook.py +++ b/viadot/tasks/outlook.py @@ -17,7 +17,7 @@ def __init__( credentials: Dict[str, Any] = None, output_file_extension: str = ".csv", limit: int = 10000, - timeout: int = 1200, + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): diff --git a/viadot/tasks/prefect_date_range.py b/viadot/tasks/prefect_date_range.py index 9d0fe9591..fda09a198 100644 --- a/viadot/tasks/prefect_date_range.py +++ b/viadot/tasks/prefect_date_range.py @@ -144,6 +144,7 @@ def __init__( self, flow_name: str = None, date_range_type: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -153,7 +154,9 @@ def __init__( Args: flow_name (str, optional): Prefect flow name. Defaults to None. date_range_type (str, optional): String argument that should look like this: 'last_X_days' - - X is a number of days. Defaults to None. + X is a number of days. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.flow_name = flow_name @@ -161,6 +164,7 @@ def __init__( super().__init__( name="prefect_extract_details", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/salesforce.py b/viadot/tasks/salesforce.py index a4a4527ce..7557f0744 100644 --- a/viadot/tasks/salesforce.py +++ b/viadot/tasks/salesforce.py @@ -56,6 +56,7 @@ def __init__( raise_on_error: bool = False, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -70,6 +71,7 @@ def __init__( name="salesforce_upsert", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -149,6 +151,7 @@ def __init__( raise_on_error: bool = False, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -163,6 +166,7 @@ def __init__( name="salesforce_bulk_upsert", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -253,6 +257,7 @@ def __init__( domain: str = "test", client_id: str = "viadot", env: str = "DEV", + timeout: int = 3600, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -265,6 +270,7 @@ def __init__( super().__init__( name="salesforce_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sap_rfc.py b/viadot/tasks/sap_rfc.py index a24f0ae33..b863db2fb 100644 --- a/viadot/tasks/sap_rfc.py +++ b/viadot/tasks/sap_rfc.py @@ -73,7 +73,6 @@ def __init__( "credentials", "max_retries", "retry_delay", - "timeout", ) def run( self, @@ -84,7 +83,6 @@ def run( rfc_total_col_width_character_limit: int = None, max_retries: int = None, retry_delay: timedelta = None, - timeout: int = None, ) -> pd.DataFrame: """Task run method. @@ -94,9 +92,9 @@ def run( multiple options are automatically tried. Defaults to None. func (str, optional): SAP RFC function to use. Defaults to None. rfc_total_col_width_character_limit (int, optional): Number of characters by which query will be split in chunks - in case of too many columns for RFC function. According to SAP documentation, the limit is - 512 characters. However, we observed SAP raising an exception even on a slightly lower number - of characters, so we add a safety margin. Defaults to None. + in case of too many columns for RFC function. 
According to SAP documentation, the limit is + 512 characters. However, we observed SAP raising an exception even on a slightly lower number + of characters, so we add a safety margin. Defaults to None. """ if query is None: raise ValueError("Please provide the query.") diff --git a/viadot/tasks/sftp.py b/viadot/tasks/sftp.py index 1ee6b5f80..cf4365c2b 100644 --- a/viadot/tasks/sftp.py +++ b/viadot/tasks/sftp.py @@ -18,6 +18,7 @@ def __init__( credentials: Dict[str, Any] = None, sftp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -27,6 +28,8 @@ def __init__( credentials (Dict[str, Any], optional): SFTP credentials. Defaults to None. sftp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary credentials for SFTP connection. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: Pandas DataFrame """ @@ -36,6 +39,7 @@ def __init__( super().__init__( name="SftpToDF", + timeout=timeout, *args, **kwargs, ) @@ -87,6 +91,7 @@ def __init__( credentials: Dict[str, Any] = None, sftp_credentials_secret: str = None, vault_name: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -96,6 +101,8 @@ def __init__( credentials (Dict[str, Any], optional): SFTP credentials. Defaults to None. sftp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary credentials for SFTP connection. Defaults to None. vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: files_list (List): List of files in SFTP server. @@ -106,6 +113,7 @@ def __init__( super().__init__( name="SftpList", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index a4ad69fa9..fa1b7e83a 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -28,6 +28,8 @@ class SharepointToDF(Task): sheet_number (int): Sheet number to be extracted from file. Counting from 0, if None all sheets are axtracted. Defaults to None. validate_excel_file (bool, optional): Check if columns in separate sheets are the same. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. Returns: pd.DataFrame: Pandas data frame @@ -41,6 +43,7 @@ def __init__( sheet_number: int = None, validate_excel_file: bool = False, if_empty: str = "warn", + timeout: int = 3600, *args, **kwargs, ): @@ -54,6 +57,7 @@ def __init__( super().__init__( name="sharepoint_to_df", + timeout=timeout, *args, **kwargs, ) diff --git a/viadot/tasks/sql_server.py b/viadot/tasks/sql_server.py index 556877d0f..203d1c06b 100644 --- a/viadot/tasks/sql_server.py +++ b/viadot/tasks/sql_server.py @@ -20,6 +20,8 @@ class SQLServerCreateTable(Task): dtypes (Dict[str, Any], optional): Data types to enforce. if_exists (Literal, optional): What to do if the table already exists. credentials (dict, optional): Credentials for the connection. 
+ timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ def __init__( @@ -31,6 +33,7 @@ def __init__( credentials: dict = None, max_retries: int = 3, retry_delay: timedelta = timedelta(seconds=10), + timeout: int = 3600, *args, **kwargs, ): @@ -43,6 +46,7 @@ def __init__( name="sql_server_create_table", max_retries=max_retries, retry_delay=retry_delay, + timeout=timeout, *args, **kwargs, ) @@ -89,6 +93,7 @@ class SQLServerToDF(Task): def __init__( self, config_key: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -97,11 +102,13 @@ def __init__( Args: config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.config_key = config_key - super().__init__(name="sql_server_to_df", *args, **kwargs) + super().__init__(name="sql_server_to_df", timeout=timeout, *args, **kwargs) @defaults_from_attrs("config_key") def run( @@ -135,6 +142,7 @@ class SQLServerQuery(Task): def __init__( self, config_key: str = None, + timeout: int = 3600, *args, **kwargs, ): @@ -143,10 +151,12 @@ def __init__( Args: config_key (str, optional): The key inside local config containing the credentials. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.config_key = config_key - super().__init__(name="sql_server_query", *args, **kwargs) + super().__init__(name="sql_server_query", timeout=timeout, *args, **kwargs) @defaults_from_attrs("config_key") def run( diff --git a/viadot/tasks/sqlite.py b/viadot/tasks/sqlite.py index fbf767ab1..f72530fc1 100644 --- a/viadot/tasks/sqlite.py +++ b/viadot/tasks/sqlite.py @@ -17,6 +17,8 @@ class SQLiteInsert(Task): Args: db_path (str, optional): The path to the database to be used. Defaults to None. sql_path (str, optional): The path to the text file containing the query. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ @@ -28,6 +30,7 @@ def __init__( table_name: str = None, if_exists: str = "fail", dtypes: Dict[str, Any] = None, + timeout: int = 3600, *args, **kwargs, ): @@ -38,7 +41,7 @@ def __init__( self.schema = schema self.if_exists = if_exists - super().__init__(name="sqlite_insert", *args, **kwargs) + super().__init__(name="sqlite_insert", timeout=timeout, *args, **kwargs) @defaults_from_attrs("df", "db_path", "schema", "table_name", "if_exists", "dtypes") def run( @@ -75,14 +78,23 @@ class SQLiteSQLtoDF(Task): Args: db_path (str, optional): The path to the database to be used. Defaults to None. sql_path (str, optional): The path to the text file containing the query. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" - def __init__(self, db_path: str = None, sql_path: str = None, *args, **kwargs): + def __init__( + self, + db_path: str = None, + sql_path: str = None, + timeout: int = 3600, + *args, + **kwargs, + ): self.db_path = db_path self.sql_path = sql_path - super().__init__(name="sqlite_sql_to_df", *args, **kwargs) + super().__init__(name="sqlite_sql_to_df", timeout=timeout, *args, **kwargs) def __call__(self): """Generate a DataFrame from a SQLite SQL query""" @@ -111,12 +123,22 @@ class SQLiteQuery(Task): Args: query (str, optional): The query to execute on the database. Defaults to None. db_path (str, optional): The path to the database to be used. Defaults to None. + timeout(int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. + """ - def __init__(self, query: str = None, db_path: str = None, *args, **kwargs): + def __init__( + self, + query: str = None, + db_path: str = None, + timeout: int = 3600, + *args, + **kwargs, + ): self.query = query self.db_path = db_path - super().__init__(name="sqlite_query", *args, **kwargs) + super().__init__(name="sqlite_query", timeout=timeout, *args, **kwargs) def __call__(self): """Run an SQL query on SQLite""" diff --git a/viadot/tasks/supermetrics.py b/viadot/tasks/supermetrics.py index 5fe41f8f5..1187d46a9 100644 --- a/viadot/tasks/supermetrics.py +++ b/viadot/tasks/supermetrics.py @@ -16,7 +16,8 @@ class SupermetricsToCSV(Task): path (str, optional): The destination path. Defaults to "supermetrics_extract.csv". max_retries (int, optional): The maximum number of retries. Defaults to 5. retry_delay (timedelta, optional): The delay between task retries. Defaults to 10 seconds. - timeout (int, optional): Task timeout. Defaults to 30 minuntes. + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. max_rows (int, optional): Maximum number of rows the query results should contain. Defaults to 1 000 000. max_cols (int, optional): Maximum number of columns the query results should contain. Defaults to None. if_exists (str, optional): What to do if file already exists. Defaults to "replace". @@ -31,7 +32,7 @@ def __init__( path: str = "supermetrics_extract.csv", max_retries: int = 5, retry_delay: timedelta = timedelta(seconds=10), - timeout: int = 60 * 30, + timeout: int = 3600, max_rows: int = 1_000_000, if_exists: str = "replace", if_empty: str = "warn", @@ -173,7 +174,8 @@ class SupermetricsToDF(Task): if_empty (str, optional): What to do if query returns no data. Defaults to "warn". max_retries (int, optional): The maximum number of retries. Defaults to 5. retry_delay (timedelta, optional): The delay between task retries. Defaults to 10 seconds. - timeout (int, optional): Task timeout. Defaults to 30 minuntes. + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. 
""" def __init__( @@ -183,7 +185,7 @@ def __init__( max_rows: int = 1_000_000, max_retries: int = 5, retry_delay: timedelta = timedelta(seconds=10), - timeout: int = 60 * 30, + timeout: int = 3600, **kwargs, ): From ea7f723f9af36728fd65879a4f45c7d2f929bb2d Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 11:36:54 +0100 Subject: [PATCH 45/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mindful.py | 2 ++ viadot/task_utils.py | 34 +++++++++++++++++++++++++++++++++- viadot/tasks/azure_sql.py | 2 ++ viadot/tasks/mindful.py | 16 +++++++++++++++- 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index 4bf2bc892..496862237 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -188,8 +188,10 @@ def get_survey_list( **kwargs, ) -> Response: """Gets a list of survey resources associated with the authenticated customer. + Args: limit (int, optional): The number of matching interactions to return. Defaults to 1000. + Returns: Response: Request object with the response from the Mindful API. """ diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 9d057599c..b81ce1a7d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -23,7 +23,7 @@ from visions.typesets.complete_set import CompleteSet from viadot.config import local_config -from viadot.tasks import AzureKeyVaultSecret +from viadot.tasks import AzureKeyVaultSecret, AzureDataLakeUpload from viadot.exceptions import CredentialError @@ -567,3 +567,35 @@ def credentials_loader(credentials_secret: str, vault_name: str = None) -> dict: raise CredentialError("Credentials secret not provided.") return credentials + + +@task(timeout=3600) +def adls_bulk_upload( + file_names: List[str], + file_name_relative_path: str = "", + adls_file_path: str = None, + adls_sp_credentials_secret: str = None, + adls_overwrite: bool = True, + timeout: int = 3600, +) -> None: + """Function that upload files to defined path in ADLS. + Args: + file_names (List[str]): List of file names to generate paths. + file_name_relative_path (str, optional): Path where to save the file locally. Defaults to ''. + adls_file_path (str, optional): Azure Data Lake path. Defaults to None. + adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with + ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET). Defaults to None. + adls_overwrite (bool, optional): Whether to overwrite files in the data lake. Defaults to True. + timeout (int, optional): The amount of time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. + """ + + file_to_adls_task = AzureDataLakeUpload(timeout=timeout) + + for file in file_names: + file_to_adls_task.run( + from_path=os.path.join(file_name_relative_path, file), + to_path=os.path.join(adls_file_path, file), + sp_credentials_secret=adls_sp_credentials_secret, + overwrite=adls_overwrite, + ) diff --git a/viadot/tasks/azure_sql.py b/viadot/tasks/azure_sql.py index 94bbb4f85..9f49b6011 100644 --- a/viadot/tasks/azure_sql.py +++ b/viadot/tasks/azure_sql.py @@ -21,6 +21,8 @@ def get_credentials(credentials_secret: str, vault_name: str = None): """ Get Azure credentials. + If the credential secret is not provided it will be taken from Prefect Secrets. 
If Prefect Secrets does not + contain the credential, it will be taken from the local credential file. Args: credentials_secret (str): The name of the Azure Key Vault secret containing a dictionary diff --git a/viadot/tasks/mindful.py b/viadot/tasks/mindful.py index fe1a86230..3f84b3eeb 100644 --- a/viadot/tasks/mindful.py +++ b/viadot/tasks/mindful.py @@ -132,6 +132,7 @@ def run( ) file_names = [] + # interactions interactions_response = mindful.get_interactions_list() if interactions_response.status_code == 200: interaction_file_name = mindful.response_to_file( @@ -143,6 +144,8 @@ def run( "Successfully downloaded interactions data from the Mindful API." ) time.sleep(0.5) + + # responses responses_response = mindful.get_responses_list() if responses_response.status_code == 200: response_file_name = mindful.response_to_file( @@ -151,8 +154,19 @@ def run( ) file_names.append(response_file_name) logger.info("Successfully downloaded responses data from the Mindful API.") + time.sleep(0.5) + + # surveys + surveys_response = mindful.get_survey_list() + if surveys_response.status_code == 200: + surveys_file_name = mindful.response_to_file( + surveys_response, + file_path=file_path, + ) + file_names.append(surveys_file_name) + logger.info("Successfully downloaded surveys data from the Mindful API.") if not file_names: - raise TypeError("Files were not created.") + return None else: return file_names From d86a48640aaec1bec791f05bf47fd03769492f7d Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 12:05:52 +0100 Subject: [PATCH 46/55] =?UTF-8?q?=F0=9F=90=9B=20added=20more=20time=20to?= =?UTF-8?q?=20wait=20for=20the=20reponse.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index aec674f17..7db99585c 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -156,8 +156,8 @@ def run( logger.info(f"Waiting for caching data in Genesys database.") # in order to wait for API POST request add it timeout_start = time.time() - # 30 seconds timeout is minimal but for safety added 60. - timeout = timeout_start + 60 + # 30 seconds timeout is minimal but for safety added 300. + timeout = timeout_start + 300 # while loop with timeout while time.time() < timeout: From 464a320f7dc126dd07971a1677c09e204d159126 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Jan 2023 12:21:18 +0100 Subject: [PATCH 47/55] =?UTF-8?q?=E2=8F=AA=20revert=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ viadot/flows/epicor_to_duckdb.py | 4 ++++ viadot/sources/epicor.py | 7 +++++-- viadot/tasks/epicor.py | 6 ++++++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b29541f76..e25fb55fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added `validate_date_filter` parameter to `Epicor` source, `EpicorOrdersToDF` task and `EpicorOrdersToDuckDB` flow. +This parameter enables user to decide whether or not filter should be validated. - Added `check_dtypes_sort` task into `ADLSToAzureSQL` to check if dtypes is properly sorted. - Added `timeout` parameter to all `Task`s where it can be added. - Added `timeout` parameter to all `Flow`s where it can be added. 
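For reference, a rough usage sketch of the `adls_bulk_upload` helper added to `task_utils.py` earlier in this series; the file names, paths, and secret name below are placeholders, not values from the repository. It is called directly via `.run()` here only for illustration; inside a flow it is bound with `.bind(...)`, as the flow changes later in the series do.

# Rough usage sketch only -- file names, paths and the secret name are placeholders.
from viadot.task_utils import adls_bulk_upload

adls_bulk_upload.run(
    file_names=["interactions.csv", "responses.csv", "surveys.csv"],
    file_name_relative_path="/tmp/mindful",
    adls_file_path="raw/mindful",
    adls_sp_credentials_secret="ADLS-SP-CREDENTIALS",  # assumed Key Vault secret name
    adls_overwrite=True,
    timeout=3600,
)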
diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py index 77d4da895..e60938fb3 100644 --- a/viadot/flows/epicor_to_duckdb.py +++ b/viadot/flows/epicor_to_duckdb.py @@ -15,6 +15,7 @@ def __init__( local_file_path: str, epicor_credentials: Dict[str, Any] = None, epicor_config_key: str = None, + validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", duckdb_table: str = None, @@ -36,6 +37,7 @@ def __init__( epicor_credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. epicor_config_key (str, optional): Credential key to dictionary where details are stored. Defaults to None. + validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". duckdb_table (str, optional): Destination table in DuckDB. Defaults to None. @@ -49,6 +51,7 @@ def __init__( self.base_url = base_url self.epicor_credentials = epicor_credentials self.epicor_config_key = epicor_config_key + self.validate_date_filter = validate_date_filter self.filters_xml = filters_xml self.end_date_field = end_date_field self.start_date_field = start_date_field @@ -78,6 +81,7 @@ def gen_flow(self) -> Flow: flow=self, credentials=self.epicor_credentials, config_key=self.epicor_config_key, + validate_date_filter=self.validate_date_filter, end_date_field=self.end_date_field, start_date_field=self.start_date_field, ) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 7d3fa000a..82e1b9ff6 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -183,6 +183,7 @@ def __init__( filters_xml: str, credentials: Dict[str, Any] = None, config_key: str = None, + validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", *args, @@ -197,6 +198,7 @@ def __init__( credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. config_key (str, optional): Credential key to dictionary where details are stored. + validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". 
""" @@ -212,6 +214,7 @@ def __init__( self.config_key = config_key self.base_url = base_url self.filters_xml = filters_xml + self.validate_date_filter = validate_date_filter self.start_date_field = start_date_field self.end_date_field = end_date_field @@ -266,8 +269,8 @@ def validate_filter(self) -> None: def get_xml_response(self): "Function for getting response from Epicor API" - - self.validate_filter() + if self.validate_date_filter == True: + self.validate_filter payload = self.filters_xml url = self.generate_url() headers = { diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index 271154162..d704809f5 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -16,6 +16,7 @@ def __init__( filters_xml: str, credentials: Dict[str, Any] = None, config_key: str = None, + validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", timeout: int = 3600, @@ -31,6 +32,7 @@ def __init__( filters_xml (str, required): Filters in form of XML. The date filter is necessary. credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, username and password. Defaults to None. config_key (str, optional): Credential key to dictionary where details are stored. Defauls to None. + validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". timeout(int, optional): The amount of time (in seconds) to wait while running this task before @@ -43,6 +45,7 @@ def __init__( self.config_key = config_key self.base_url = base_url self.filters_xml = filters_xml + self.validate_date_filter = validate_date_filter self.start_date_field = start_date_field self.end_date_field = end_date_field super().__init__( @@ -61,6 +64,7 @@ def __call__(self, *args, **kwargs): "config_key", "base_url", "filters_xml", + "validate_date_filter", "start_date_field", "end_date_field", ) @@ -70,6 +74,7 @@ def run( config_key: str = None, base_url: str = None, filters_xml: str = None, + validate_date_filter: bool = True, start_date_field: str = None, end_date_field: str = None, ): @@ -78,6 +83,7 @@ def run( config_key=config_key, base_url=base_url, filters_xml=filters_xml, + validate_date_filter=validate_date_filter, start_date_field=start_date_field, end_date_field=end_date_field, ) From db3133292377cab88499764ac1514643bf08ffbc Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 25 Jan 2023 12:33:08 +0100 Subject: [PATCH 48/55] =?UTF-8?q?=E2=8F=AA=20revert=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 -- viadot/flows/epicor_to_duckdb.py | 4 ---- viadot/sources/epicor.py | 6 +----- viadot/tasks/epicor.py | 6 ------ 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e25fb55fc..b29541f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added `validate_date_filter` parameter to `Epicor` source, `EpicorOrdersToDF` task and `EpicorOrdersToDuckDB` flow. -This parameter enables user to decide whether or not filter should be validated. - Added `check_dtypes_sort` task into `ADLSToAzureSQL` to check if dtypes is properly sorted. 
- Added `timeout` parameter to all `Task`s where it can be added. - Added `timeout` parameter to all `Flow`s where it can be added. diff --git a/viadot/flows/epicor_to_duckdb.py b/viadot/flows/epicor_to_duckdb.py index e60938fb3..77d4da895 100644 --- a/viadot/flows/epicor_to_duckdb.py +++ b/viadot/flows/epicor_to_duckdb.py @@ -15,7 +15,6 @@ def __init__( local_file_path: str, epicor_credentials: Dict[str, Any] = None, epicor_config_key: str = None, - validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", duckdb_table: str = None, @@ -37,7 +36,6 @@ def __init__( epicor_credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. epicor_config_key (str, optional): Credential key to dictionary where details are stored. Defaults to None. - validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". duckdb_table (str, optional): Destination table in DuckDB. Defaults to None. @@ -51,7 +49,6 @@ def __init__( self.base_url = base_url self.epicor_credentials = epicor_credentials self.epicor_config_key = epicor_config_key - self.validate_date_filter = validate_date_filter self.filters_xml = filters_xml self.end_date_field = end_date_field self.start_date_field = start_date_field @@ -81,7 +78,6 @@ def gen_flow(self) -> Flow: flow=self, credentials=self.epicor_credentials, config_key=self.epicor_config_key, - validate_date_filter=self.validate_date_filter, end_date_field=self.end_date_field, start_date_field=self.start_date_field, ) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index 82e1b9ff6..08547a88c 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -183,7 +183,6 @@ def __init__( filters_xml: str, credentials: Dict[str, Any] = None, config_key: str = None, - validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", *args, @@ -198,7 +197,6 @@ def __init__( credentials (Dict[str, Any], optional): Credentials to connect with Epicor API containing host, port, username and password. Defaults to None. config_key (str, optional): Credential key to dictionary where details are stored. - validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters field containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters field containing end date. Defaults to "EndInvoiceDate". 
""" @@ -214,7 +212,6 @@ def __init__( self.config_key = config_key self.base_url = base_url self.filters_xml = filters_xml - self.validate_date_filter = validate_date_filter self.start_date_field = start_date_field self.end_date_field = end_date_field @@ -269,8 +266,7 @@ def validate_filter(self) -> None: def get_xml_response(self): "Function for getting response from Epicor API" - if self.validate_date_filter == True: - self.validate_filter + self.validate_filter() payload = self.filters_xml url = self.generate_url() headers = { diff --git a/viadot/tasks/epicor.py b/viadot/tasks/epicor.py index d704809f5..271154162 100644 --- a/viadot/tasks/epicor.py +++ b/viadot/tasks/epicor.py @@ -16,7 +16,6 @@ def __init__( filters_xml: str, credentials: Dict[str, Any] = None, config_key: str = None, - validate_date_filter: bool = True, start_date_field: str = "BegInvoiceDate", end_date_field: str = "EndInvoiceDate", timeout: int = 3600, @@ -32,7 +31,6 @@ def __init__( filters_xml (str, required): Filters in form of XML. The date filter is necessary. credentials (Dict[str, Any], optional): Credentials to connect with Epicor Api containing host, port, username and password. Defaults to None. config_key (str, optional): Credential key to dictionary where details are stored. Defauls to None. - validate_date_filter (bool, optional): Whether or not validate xml date filters. Defaults to True. start_date_field (str, optional) The name of filters filed containing start date. Defaults to "BegInvoiceDate". end_date_field (str, optional) The name of filters filed containing end date. Defaults to "EndInvoiceDate". timeout(int, optional): The amount of time (in seconds) to wait while running this task before @@ -45,7 +43,6 @@ def __init__( self.config_key = config_key self.base_url = base_url self.filters_xml = filters_xml - self.validate_date_filter = validate_date_filter self.start_date_field = start_date_field self.end_date_field = end_date_field super().__init__( @@ -64,7 +61,6 @@ def __call__(self, *args, **kwargs): "config_key", "base_url", "filters_xml", - "validate_date_filter", "start_date_field", "end_date_field", ) @@ -74,7 +70,6 @@ def run( config_key: str = None, base_url: str = None, filters_xml: str = None, - validate_date_filter: bool = True, start_date_field: str = None, end_date_field: str = None, ): @@ -83,7 +78,6 @@ def run( config_key=config_key, base_url=base_url, filters_xml=filters_xml, - validate_date_filter=validate_date_filter, start_date_field=start_date_field, end_date_field=end_date_field, ) From ae440ecd0353f18d88dab0edf6f3e0c77268c599 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 25 Jan 2023 12:37:38 +0100 Subject: [PATCH 49/55] =?UTF-8?q?=F0=9F=8E=A8=20improved=20waiting=20time.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/genesys_to_adls.py | 1 + viadot/tasks/genesys.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index c4c894a76..069725533 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -143,6 +143,7 @@ def gen_flow(self) -> Flow: if self.view_type == "queue_performance_detail_view": file_names = to_csv.bind( view_type=self.view_type, + view_type_time_sleep=self.view_type_time_sleep, media_type_list=self.media_type_list, queueIds_list=self.queueIds_list, data_to_post_str=self.data_to_post, diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 
7db99585c..d35584aa3 100644
--- a/viadot/tasks/genesys.py
+++ b/viadot/tasks/genesys.py
@@ -153,11 +153,15 @@ def run(
         genesys.genesys_generate_exports()
 
         if view_type == "queue_performance_detail_view":
-            logger.info(f"Waiting for caching data in Genesys database.")
+            logger.info(
+                f"Waiting {view_type_time_sleep} seconds for caching data in Genesys database."
+            )
+            # sleep time to allow Genesys to generate all exports
+            time.sleep(view_type_time_sleep)
 
         # in order to wait for API POST request add it
         timeout_start = time.time()
-        # 30 seconds timeout is minimal but for safety added 300.
-        timeout = timeout_start + 300
+        # 30 seconds timeout is minimal, but 60 is used for safety.
+        timeout = timeout_start + 60
 
         # while loop with timeout
         while time.time() < timeout:

From 5f95f507378b50025ed31f610157ccbe9c274660 Mon Sep 17 00:00:00 2001
From: Diego-H-S
Date: Wed, 25 Jan 2023 14:16:00 +0100
Subject: [PATCH 50/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 178084905..5b005ee7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Added
+- Added `view_type_time_sleep` to the Genesys `queue_performance_detail_view`.
 
 # [0.4.11] - 2022-12-15
 
From 6489e20df3915fa0104a2ef9dd8743d9cc8f65ca Mon Sep 17 00:00:00 2001
From: Diego-H-S
Date: Thu, 26 Jan 2023 16:07:33 +0100
Subject: [PATCH 51/55] =?UTF-8?q?=F0=9F=90=9B=20fixed=20bugs=20in=20genesy?=
 =?UTF-8?q?s=20and=20mindful=20flows.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 viadot/flows/genesys_to_adls.py | 4 ++--
 viadot/flows/mindful_to_adls.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py
index e26a01ce2..7269467c9 100644
--- a/viadot/flows/genesys_to_adls.py
+++ b/viadot/flows/genesys_to_adls.py
@@ -145,11 +145,11 @@ def gen_flow(self) -> Flow:
 
         add_timestamp.bind(file_names, sep=self.sep, flow=self)
 
-        adls_bulk_upload(
+        adls_bulk_upload.bind(
             file_names=file_names,
             adls_file_path=self.adls_file_path,
             adls_sp_credentials_secret=self.adls_sp_credentials_secret,
-            task_timeout=self.timeout,
+            timeout=self.timeout,
             flow=self,
         )
 
diff --git a/viadot/flows/mindful_to_adls.py b/viadot/flows/mindful_to_adls.py
index 973d03e44..2e77173b0 100644
--- a/viadot/flows/mindful_to_adls.py
+++ b/viadot/flows/mindful_to_adls.py
@@ -110,13 +110,13 @@ def mind_flow(self) -> Flow:
 
         add_timestamp.bind(file_names, sep=self.sep, flow=self)
 
-        adls_bulk_upload(
+        adls_bulk_upload.bind(
             file_names=file_names,
             file_name_relative_path=self.file_path,
             adls_file_path=self.adls_file_path,
             adls_sp_credentials_secret=self.adls_sp_credentials_secret,
             adls_overwrite=self.adls_overwrite,
-            task_timeout=self.timeout,
+            timeout=self.timeout,
             flow=self,
         )
 
From fe5b9452a7f9cccc53314041ab88d4de192a8a4d Mon Sep 17 00:00:00 2001
From: Diego-H-S
Date: Fri, 27 Jan 2023 13:21:53 +0100
Subject: [PATCH 52/55] =?UTF-8?q?=F0=9F=93=9D=20updated=20docs.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 viadot/sources/mindful.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py
index 496862237..d13ff9e9c 100644
--- a/viadot/sources/mindful.py
+++ b/viadot/sources/mindful.py
@@ -206,16 +206,14 @@ def get_survey_list(
         )
 
         if response.status_code == 200:
-            self.logger.info("Succesfully downloaded responses data from mindful API.")
+            self.logger.info("Successfully downloaded surveys data from mindful API.")
         elif response.status_code == 204 and not response.content.decode():
             self.logger.warning(
-                f"Thera are not responses data to download from {self.start_date} to {self.end_date}."
+                f"There is no surveys data to download from {self.start_date} to {self.end_date}."
             )
         else:
-            self.logger.error(
-                f"Failed to download responses data. - {response.content}"
-            )
-            raise APIError("Failed to downloaded responses data.")
+            self.logger.error(f"Failed to download surveys data. - {response.content}")
+            raise APIError("Failed to download surveys data.")
 
         return response
 
From bae450ef88b84fcff4badf8455be8f55589e6089 Mon Sep 17 00:00:00 2001
From: Rafalz13
Date: Mon, 30 Jan 2023 08:51:34 +0100
Subject: [PATCH 53/55] =?UTF-8?q?=F0=9F=90=9B=20Added=20project=5Fname=20t?=
 =?UTF-8?q?o=20query?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 viadot/tasks/bigquery.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/viadot/tasks/bigquery.py b/viadot/tasks/bigquery.py
index e4831afad..89eb1ec5d 100644
--- a/viadot/tasks/bigquery.py
+++ b/viadot/tasks/bigquery.py
@@ -119,7 +119,7 @@ def run(
             df = bigquery.query_to_df(query)
         else:
             if start_date is not None and end_date is not None:
-                query = f"""SELECT * FROM `{dataset_name}.{table_name}`
+                query = f"""SELECT * FROM `{project}.{dataset_name}.{table_name}`
                 where {date_column_name} between PARSE_DATE("%Y-%m-%d", "{start_date}") and
                 PARSE_DATE("%Y-%m-%d", "{end_date}") order by {date_column_name} desc"""
             else:

From 4fe77382ef09f2a9d4da0fb18db1456a1f5bba75 Mon Sep 17 00:00:00 2001
From: Rafalz13
Date: Mon, 30 Jan 2023 14:55:38 +0100
Subject: [PATCH 54/55] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20tests=20before=20r?=
 =?UTF-8?q?elease?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../flows/test_adls_gen1_to_azure_sql_new.py |  41 +++---
 .../flows/test_sharepoint_to_adls.py         | 135 ++++++++----------
 2 files changed, 80 insertions(+), 96 deletions(-)

diff --git a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py
index bb7fe4c3f..1a0c7e778 100644
--- a/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py
+++ b/tests/integration/flows/test_adls_gen1_to_azure_sql_new.py
@@ -13,7 +13,6 @@
 
 
 def test_adls_gen1_to_azure_sql_new_init_args():
-
     flow = ADLSGen1ToAzureSQLNew(
         name="test_adls_gen1_gen2_flow",
         gen1_path="test_file_1.csv",
@@ -44,27 +43,23 @@ def test_adls_gen1_to_azure_sql_new_mock():
         mock_method.assert_called_with()
 
 
-def test_adls_gen1_to_azure_sql_new_flow_run_mock():
-
-    d = {"country": [1, 2], "sales": [3, 4]}
-    df = pd.DataFrame(data=d)
-
-    with mock.patch(
-        "viadot.flows.adls_gen1_to_azure_sql_new.gen1_to_df_task.bind"
-    ) as gen1_to_df_task_mock_bind_method_mock:
-        gen1_to_df_task_mock_bind_method_mock.return_value = df
-
-        flow = ADLSGen1ToAzureSQLNew(
-            name="test_adls_g1g2",
-            gen1_path="example_path",
-            gen2_path="raw/test/test.csv",
-            dtypes={"country": "VARCHAR(25)", "sales": "INT"},
-            if_exists="replace",
-            table="test",
-            schema="sandbox",
-        )
+@mock.patch(
+    "viadot.tasks.AzureDataLakeToDF.run",
+
return_value=pd.DataFrame(data={"country": [1, 2], "sales": [3, 4]}), +) +@pytest.mark.run +def test_adls_gen1_to_azure_sql_new_flow_run_mock(mocked_class): + flow = ADLSGen1ToAzureSQLNew( + name="test_adls_g1g2", + gen1_path="example_path", + gen2_path="raw/test/test.csv", + dtypes={"country": "VARCHAR(25)", "sales": "INT"}, + if_exists="replace", + table="test", + schema="sandbox", + ) - result = flow.run() + result = flow.run() - assert result.is_successful() - os.remove("test_adls_g1g2.csv") + assert result.is_successful() + os.remove("test_adls_g1g2.csv") diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index eaef177ec..2504f4fa1 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -1,6 +1,6 @@ import os from unittest import mock - +import pytest import pandas as pd import pendulum from prefect.tasks.secrets import PrefectSecret @@ -11,75 +11,64 @@ ADLS_FILE_NAME = str(pendulum.now("utc")) + ".csv" ADLS_DIR_PATH = "raw/tests/" CREDENTIALS_SECRET = PrefectSecret("AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET").run() - - -def test_sharepoint_to_adls_run_flow(): - - d = {"country": [1, 2], "sales": [3, 4]} - df = pd.DataFrame(data=d) - - with mock.patch( - "viadot.flows.sharepoint_to_adls.excel_to_df_task.bind" - ) as excel_to_df_task_mock: - excel_to_df_task_mock.return_value = df - - flow = SharepointToADLS( - "test_sharepoint_to_adls_run_flow", - output_file_extension=".csv", - adls_sp_credentials_secret=CREDENTIALS_SECRET, - adls_dir_path=ADLS_DIR_PATH, - adls_file_name=ADLS_FILE_NAME, - ) - result = flow.run() - assert result.is_successful() - os.remove("test_sharepoint_to_adls_run_flow.csv") - os.remove("test_sharepoint_to_adls_run_flow.json") - - -def test_sharepoint_to_adls_run_flow_overwrite_true(): - - d = {"country": [1, 2], "sales": [3, 4]} - df = pd.DataFrame(data=d) - - with mock.patch( - "viadot.flows.sharepoint_to_adls.excel_to_df_task.bind" - ) as excel_to_df_task_mock: - excel_to_df_task_mock.return_value = df - - flow = SharepointToADLS( - "test_sharepoint_to_adls_run_flow_overwrite_true", - output_file_extension=".csv", - adls_sp_credentials_secret=CREDENTIALS_SECRET, - adls_dir_path=ADLS_DIR_PATH, - adls_file_name=ADLS_FILE_NAME, - overwrite_adls=True, - ) - result = flow.run() - assert result.is_successful() - os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.csv") - os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.json") - - -def test_sharepoint_to_adls_run_flow_overwrite_false(): - - d = {"country": [1, 2], "sales": [3, 4]} - df = pd.DataFrame(data=d) - - with mock.patch( - "viadot.flows.sharepoint_to_adls.excel_to_df_task.bind" - ) as excel_to_df_task_mock: - excel_to_df_task_mock.return_value = df - - flow = SharepointToADLS( - "test_sharepoint_to_adls_run_flow_overwrite_false", - output_file_extension=".csv", - adls_sp_credentials_secret=CREDENTIALS_SECRET, - adls_dir_path=ADLS_DIR_PATH, - adls_file_name=ADLS_FILE_NAME, - overwrite_adls=False, - ) - result = flow.run() - - assert result.is_failed() - os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.csv") - os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.json") +DATA = {"country": [1, 2], "sales": [3, 4]} + + +@mock.patch( + "viadot.tasks.SharepointToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_to_adls_run_flow(mocked_class): + flow = SharepointToADLS( + "test_sharepoint_to_adls_run_flow", + 
output_file_extension=".csv", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + ) + result = flow.run() + assert result.is_successful() + os.remove("test_sharepoint_to_adls_run_flow.csv") + os.remove("test_sharepoint_to_adls_run_flow.json") + + +@mock.patch( + "viadot.tasks.SharepointToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_to_adls_run_flow_overwrite_true(mocked_class): + flow = SharepointToADLS( + "test_sharepoint_to_adls_run_flow_overwrite_true", + output_file_extension=".csv", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=True, + ) + result = flow.run() + assert result.is_successful() + os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.csv") + os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.json") + + +@mock.patch( + "viadot.tasks.SharepointToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_to_adls_run_flow_overwrite_false(mocked_class): + flow = SharepointToADLS( + "test_sharepoint_to_adls_run_flow_overwrite_false", + output_file_extension=".csv", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + adls_file_name=ADLS_FILE_NAME, + overwrite_adls=False, + ) + result = flow.run() + + assert result.is_failed() + os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.csv") + os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.json") From 5eaa398853e67743f8cdda6b5f691f2005a2294c Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 30 Jan 2023 15:57:24 +0100 Subject: [PATCH 55/55] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog=20befo?= =?UTF-8?q?re=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aeca55eed..c5de69999 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,15 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] + + +## [0.4.12] - 2023-01-31 ### Added - Added `view_type_time_sleep` to the Genesys `queue_performance_detail_view`. - ### Changed - Updated `genesys_to_adls.py` flow with the `adls_bulk_upload` task - Updated `mindful_to_adls.py` flow with the `adls_bulk_upload` task - Changed `MindfulToCSV` task to download surveys info. + ## [0.4.11] - 2022-12-15 ### Added - Added into `Genesys` the new view type `AGENT`.