From 551203c037000e3d46ffb6a392efbad597a87180 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 15 Nov 2023 14:11:44 +0100 Subject: [PATCH 01/54] =?UTF-8?q?=F0=9F=9A=80=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index 675dbfbdc..72df7dfca 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.4.22" + assert __version__ == "0.4.23" diff --git a/viadot/__init__.py b/viadot/__init__.py index ece529aa1..c6dd1e2c0 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.4.22" +__version__ = "0.4.23" From 53eae885bd9b8ddc34d4cf0ecb6ccddc1b0e358e Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 17 Nov 2023 13:31:53 +0100 Subject: [PATCH 02/54] =?UTF-8?q?=F0=9F=93=9D=20Added=20docstring=20to=20`?= =?UTF-8?q?slugify()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/viadot/utils.py b/viadot/utils.py index 5e3de784c..6ece6982e 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -23,6 +23,14 @@ def slugify(name: str) -> str: + """Function to change spaces to underscores and convert all characters to lowercase. + + Args: + name (str): String to convert. + + Returns: + str: Output text after conversion. + """ return name.replace(" ", "_").lower() From b056a05db18a63087aa36ac8e9437b409ec6a088 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 17 Nov 2023 13:40:02 +0100 Subject: [PATCH 03/54] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20`slugif?= =?UTF-8?q?y()`=20and=20`handle=5Fapi=5Fresponse()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 65 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 75ef30e97..cf1805a0d 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,8 +1,10 @@ +import json import logging import os import pandas as pd import pytest +from viadot.exceptions import APIError from viadot.signals import SKIP from viadot.utils import ( @@ -10,13 +12,15 @@ check_if_empty_file, gen_bulk_insert_query_from_df, check_value, + slugify, + handle_api_response, ) EMPTY_CSV_PATH = "empty.csv" EMPTY_PARQUET_PATH = "empty.parquet" -class ClassForDecorator: +class ClassForMetadataDecorator: source = "Source_name" def __init__(self): @@ -34,6 +38,13 @@ def to_df_decorated_parameter(self): return self.df +def test_slugify(): + """To test slugify() function functionalities work""" + test_string = "Text With Spaces Before Changes" + string_after_changes = slugify(test_string) + assert string_after_changes == "text_with_spaces_before_changes" + + def test_single_quotes_inside(): TEST_VALUE = "a'b" df1 = pd.DataFrame( @@ -139,17 +150,17 @@ def test_check_if_empty_file_no_data(caplog): def test_add_viadot_metadata_columns_base(): - df_base = ClassForDecorator().to_df() - df_decorated = ClassForDecorator().to_df_decorated() + df_base = ClassForMetadataDecorator().to_df() + df_decorated = ClassForMetadataDecorator().to_df_decorated() assert df_base.columns.to_list() == ["a", "b"] assert df_decorated.columns.to_list() == ["a", "b", 
"_viadot_source"] - assert df_decorated["_viadot_source"][0] == "ClassForDecorator" + assert df_decorated["_viadot_source"][0] == "ClassForMetadataDecorator" def test_add_viadot_metadata_columns_with_parameter(): - df_base = ClassForDecorator().to_df() - df_decorated = ClassForDecorator().to_df_decorated_parameter() + df_base = ClassForMetadataDecorator().to_df() + df_decorated = ClassForMetadataDecorator().to_df_decorated_parameter() assert df_base.columns.to_list() == ["a", "b"] assert df_decorated.columns.to_list() == ["a", "b", "_viadot_source"] @@ -202,3 +213,45 @@ def test_check_value_nonexistent_key(): } result = check_value(json_data, ["nonexistent_key"]) assert result is None + + +def test_handle_api_response_wrong_method(): + """Test to check if ValueError is thrown when wrong method is used.""" + + api_url = "https://api.api-ninjas.com/v1/randomuser" + with pytest.raises(ValueError, match="Method not found."): + handle_api_response(url=api_url, method="WRONG_METHOD") + + +def test_handle_api_response_credentials_not_provided(): + """Test to check if APIError is thrown when credentials are not provided.""" + + api_url = "https://api.api-ninjas.com/v1/randomuser" + with pytest.raises( + APIError, match="Perhaps your account credentials need to be refreshed?" + ): + handle_api_response(url=api_url) + + +def test_handle_api_response_wrong_url(): + """Test to check if APIError is thrown when api_url is wrong.""" + + api_url = "https://test.com/" + with pytest.raises(APIError, match="failed due to connection issues."): + handle_api_response(url=api_url) + + +def test_handle_api_response_unknown_error(): + """Test to check if APIError is thrown when there is something other than "url" under api_url.""" + + api_url = "test_string" + with pytest.raises(APIError, match="Unknown error"): + handle_api_response(url=api_url) + + +def test_handle_api_response_return_type(): + """Test to check if the connection is successful.""" + + api_url = "https://jsonplaceholder.typicode.com/posts" + response = handle_api_response(url=api_url) + assert response.status_code == 200 From ef4a6500008900338c3837e78e9930885a06c92f Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 17 Nov 2023 13:54:13 +0100 Subject: [PATCH 04/54] =?UTF-8?q?=E2=9C=85=20Added=20missing=20test=20for?= =?UTF-8?q?=20`check=5Fvalue()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index cf1805a0d..400541b38 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -167,8 +167,8 @@ def test_add_viadot_metadata_columns_with_parameter(): assert df_decorated["_viadot_source"][0] == "Source_name" -# Sample test checking the correctness of the function when the key is found def test_check_value_found(): + """Sample test checking the correctness of the function when the key is found.""" json_data = { "first_known_lvl": { "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} @@ -181,8 +181,8 @@ def test_check_value_found(): assert result == "phrase" -# Sample test checking the correctness of the function when the key is not found def test_check_value_not_found(): + """Sample test checking the correctness of the function when the key is not found.""" json_data = { "first_known_lvl": { "second_known_lvl": { @@ -197,15 +197,16 @@ def test_check_value_not_found(): assert result is None -# Sample test checking 
the correctness of the function with an empty dictionary def test_check_value_empty_dict(): + """Sample test checking the correctness of the function with an empty dictionary.""" json_data = {} result = check_value(json_data, ["searched_phrase"]) assert result is None -# Sample test checking the correctness of the function with a nonexistent key def test_check_value_nonexistent_key(): + """Sample test checking the correctness of the function with a nonexistent key.""" + json_data = { "first_known_lvl": { "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} @@ -215,6 +216,14 @@ def test_check_value_nonexistent_key(): assert result is None +def test_check_value_base_is_not_dict(): + result = check_value( + base="this_is_not_dict", + levels=["searched_phrase"], + ) + assert result == "this_is_not_dict" + + def test_handle_api_response_wrong_method(): """Test to check if ValueError is thrown when wrong method is used.""" From 78ea4132a9e28750146e7a9fc5dec7ecb5e3b878 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 22 Nov 2023 12:30:24 +0100 Subject: [PATCH 05/54] =?UTF-8?q?=E2=9C=85=20Added=20SQL=20related=20missi?= =?UTF-8?q?ng=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 41 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 400541b38..ea555b20a 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -7,11 +7,13 @@ from viadot.exceptions import APIError from viadot.signals import SKIP +from viadot.sources import AzureSQL from viadot.utils import ( add_viadot_metadata_columns, check_if_empty_file, gen_bulk_insert_query_from_df, check_value, + get_sql_server_table_dtypes, slugify, handle_api_response, ) @@ -38,6 +40,12 @@ def to_df_decorated_parameter(self): return self.df +@pytest.fixture(scope="function") +def azure_sql(TEST_CSV_FILE_PATH, TEST_CSV_FILE_BLOB_PATH): + azure_sql = AzureSQL(config_key="AZURE_SQL") + yield azure_sql + + def test_slugify(): """To test slugify() function functionalities work""" test_string = "Text With Spaces Before Changes" @@ -45,7 +53,7 @@ def test_slugify(): assert string_after_changes == "text_with_spaces_before_changes" -def test_single_quotes_inside(): +def test_bulk_insert_query_from_df_single_quotes_inside(): TEST_VALUE = "a'b" df1 = pd.DataFrame( { @@ -67,7 +75,7 @@ def test_single_quotes_inside(): ), test_insert_query -def test_single_quotes_outside(): +def test_bulk_insert_query_from_df_single_quotes_outside(): TEST_VALUE = "'a'" df1 = pd.DataFrame( { @@ -89,7 +97,7 @@ def test_single_quotes_outside(): ), test_insert_query -def test_double_quotes_inside(): +def test_bulk_insert_query_from_df_double_quotes_inside(): TEST_VALUE = 'a "b"' df1 = pd.DataFrame( { @@ -111,6 +119,16 @@ def test_double_quotes_inside(): ), test_insert_query +def test_bulk_insert_query_from_df_not_implemeted(): + TEST_VALUE = 'a "b"' + df1 = pd.DataFrame({"a": [TEST_VALUE]}) + with pytest.raises( + NotImplementedError, + match="this function only handles DataFrames with at least two columns.", + ): + gen_bulk_insert_query_from_df(df1, table_fqn="test_schema.test_table") + + def test_check_if_empty_file_csv(caplog): with open(EMPTY_CSV_PATH, "w"): pass @@ -264,3 +282,20 @@ def test_handle_api_response_return_type(): api_url = "https://jsonplaceholder.typicode.com/posts" response = handle_api_response(url=api_url) assert response.status_code == 200 + + 
+def test_get_sql_server_table_dtypes(azure_sql): + """Checks if dtypes is generated in a good way using `get_sql_server_table_dtypes` function.""" + + SCHEMA = "sandbox" + TABLE = "test_table_dtypes" + dtypes = {"country": "VARCHAR(100)", "sales": "INT"} + + azure_sql.create_table( + schema=SCHEMA, table=TABLE, dtypes=dtypes, if_exists="replace" + ) + + dtypes = get_sql_server_table_dtypes(schema=SCHEMA, table=TABLE, con=azure_sql.con) + assert isinstance(dtypes, dict) + assert list(dtypes.keys()) == ["country", "sales"] + assert list(dtypes.values()) == ["varchar(100)", "int"] From 91ea25c64e39e09be9180066a9f570e8c4d5b58d Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 22 Nov 2023 15:37:10 +0100 Subject: [PATCH 06/54] =?UTF-8?q?=E2=9C=85=20Added=20missing=20test=20for?= =?UTF-8?q?=20`gen=5Fbulk=5Finsert=5Fquery=5Ffrom=5Fdf`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 27 +++++++++++++++++++++++++-- viadot/utils.py | 8 +++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ea555b20a..d6e1b9b4c 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,4 +1,3 @@ -import json import logging import os @@ -41,7 +40,13 @@ def to_df_decorated_parameter(self): @pytest.fixture(scope="function") -def azure_sql(TEST_CSV_FILE_PATH, TEST_CSV_FILE_BLOB_PATH): +def example_dataframe(): + data = [(1, "_suffixnan", 1), (2, "Noneprefix", 0), (3, "fooNULLbar", 1, 2.34)] + return pd.DataFrame(data, columns=["id", "name", "is_deleted", "balance"]) + + +@pytest.fixture(scope="function") +def azure_sql(): azure_sql = AzureSQL(config_key="AZURE_SQL") yield azure_sql @@ -129,6 +134,24 @@ def test_bulk_insert_query_from_df_not_implemeted(): gen_bulk_insert_query_from_df(df1, table_fqn="test_schema.test_table") +def test_bulk_insert_query_from_df_full_return(example_dataframe): + result = gen_bulk_insert_query_from_df( + example_dataframe, + table_fqn="users", + chunksize=1000, + status="APPROVED", + address=None, + ) + + expected_result = """INSERT INTO users (id, name, is_deleted, balance, status, address) + +VALUES (1, '_suffixnan', 1, NULL, 'APPROVED', NULL), + (2, 'Noneprefix', 0, NULL, 'APPROVED', NULL), + (3, 'fooNULLbar', 1, 2.34, 'APPROVED', NULL)""" + + assert result == expected_result + + def test_check_if_empty_file_csv(caplog): with open(EMPTY_CSV_PATH, "w"): pass diff --git a/viadot/utils.py b/viadot/utils.py index 6ece6982e..fd2c11cae 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -145,12 +145,12 @@ def get_flow_last_run_date(flow_name: str) -> str: def get_sql_server_table_dtypes( - table, con: pyodbc.Connection, schema: str = None + table: str, con: pyodbc.Connection, schema: str = None ) -> dict: """Get column names and types from a SQL Server database table. Args: - table (_type_): The table for which to fetch dtypes. + table (str): The table for which to fetch dtypes. con (pyodbc.Connection): The connection to the database where the table is located. schema (str, optional): The schema where the table is located. Defaults to None. @@ -265,7 +265,7 @@ def build_merge_query( def gen_bulk_insert_query_from_df( - df: pd.DataFrame, table_fqn: str, chunksize=1000, **kwargs + df: pd.DataFrame, table_fqn: str, chunksize: int = 1000, **kwargs ) -> str: """ Converts a DataFrame to a bulk INSERT query. 
@@ -273,6 +273,7 @@ def gen_bulk_insert_query_from_df( Args: df (pd.DataFrame): The DataFrame which data should be put into the INSERT query. table_fqn (str): The fully qualified name (schema.table) of the table to be inserted into. + chunksize (int, optional): The size of chunk. Defaults to 1000. Returns: str: A bulk insert query that will insert all data from `df` into `table_fqn`. @@ -288,6 +289,7 @@ def gen_bulk_insert_query_from_df( >>> query = gen_bulk_insert_query_from_df(df, "users", status="APPROVED", address=None) >>> print(query) INSERT INTO users (id, name, is_deleted, balance, status, address) + VALUES (1, '_suffixnan', 1, NULL, 'APPROVED', NULL), (2, 'Noneprefix', 0, NULL, 'APPROVED', NULL), (3, 'fooNULLbar', 1, 2.34, 'APPROVED', NULL); From bcdfa981e066619bf2f856f30a11e7426c420407 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 22 Nov 2023 15:47:03 +0100 Subject: [PATCH 07/54] =?UTF-8?q?=E2=9C=85=20Added=20missing=20test=20for?= =?UTF-8?q?=20`union=5Fdict()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 10 ++++++++++ viadot/utils.py | 10 +++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index d6e1b9b4c..0b2ed5782 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -15,6 +15,7 @@ get_sql_server_table_dtypes, slugify, handle_api_response, + union_dict, ) EMPTY_CSV_PATH = "empty.csv" @@ -322,3 +323,12 @@ def test_get_sql_server_table_dtypes(azure_sql): assert isinstance(dtypes, dict) assert list(dtypes.keys()) == ["country", "sales"] assert list(dtypes.values()) == ["varchar(100)", "int"] + + +def test_union_dict_return(): + """Check if dictionaries are unioned in the correct way.""" + a = {"a": 1} + b = {"b": 2} + unioned_dict = union_dict(a, b) + assert isinstance(unioned_dict, dict) + assert unioned_dict == {"a": 1, "b": 2} diff --git a/viadot/utils.py b/viadot/utils.py index fd2c11cae..4690c8cbd 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -354,21 +354,21 @@ def _gen_insert_query_from_records(records: List[tuple]) -> str: return _gen_insert_query_from_records(tuples_escaped) -def union_dict(*dicts): +def union_dict(*dicts) -> dict: """ - Function that union list of dictionaries + Function that union list of dictionaries into a singe dictionary. Args: - dicts (List[Dict]): list of dictionaries with credentials. + *dicts: Variable number of dictionaries to be unioned. Returns: - Dict: A single dictionary createb by union method. + dict: A single dictionary containing the combined key-value pairs from all input dictionaries. 
Examples: >>> a = {"a":1} >>> b = {"b":2} - >>> union_credentials_dict(a ,b) + >>> union_dict(a ,b) {'a': 1, 'b': 2} """ From 74beb6e487da679bd649f8adbbd890fc319fbd7d Mon Sep 17 00:00:00 2001 From: burzekj Date: Fri, 24 Nov 2023 12:54:59 +0100 Subject: [PATCH 08/54] =?UTF-8?q?=E2=9C=A8=20Added=20new=20logic=20to=20ma?= =?UTF-8?q?p=20and=20reorder=20output=20df?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/genesys_to_adls.py | 8 +++++++ viadot/tasks/genesys.py | 39 ++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 830c02c71..1cebe5a65 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -89,6 +89,8 @@ def __init__( report_url: str = None, report_columns: List[str] = None, conversationId_list: List[str] = None, + mapping_dict: Dict[str, Any] = None, + columns_order: List[str] = None, key_list: List[str] = None, local_file_path: str = "", adls_file_path: str = None, @@ -137,6 +139,8 @@ def __init__( report_url (str, optional): The url of report generated in json response. Defaults to None. report_columns (List[str], optional): List of exisiting column in report. Defaults to None. conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. + mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. local_file_path (str, optional): The local path from which to upload the file(s). Defaults to "". adls_file_path (str, optional): The destination path at ADLS. Defaults to None. @@ -164,6 +168,8 @@ def __init__( self.report_url = report_url self.report_columns = report_columns self.conversationId_list = conversationId_list + self.mapping_dict = mapping_dict + self.columns_order = columns_order self.key_list = key_list self.start_date = start_date self.end_date = end_date @@ -199,6 +205,8 @@ def gen_flow(self) -> Flow: end_date=self.end_date, environment=self.environment, conversationId_list=self.conversationId_list, + mapping_dict=self.mapping_dict, + columns_order=self.columns_order, key_list=self.key_list, credentials_genesys=self.credentials_genesys, flow=self, diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 942249ac2..88014a6be 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -33,6 +33,8 @@ def __init__( local_file_path: str = "", sep: str = "\t", conversationId_list: List[str] = None, + mapping_dict: Dict[str, Any] = None, + columns_order: List[str] = None, key_list: List[str] = None, credentials_genesys: Dict[str, Any] = None, validate_df_dict: Dict[str, Any] = None, @@ -56,6 +58,8 @@ def __init__( local_file_path (str, optional): The local path from which to upload the file(s). Defaults to "". sep (str, optional): Separator in csv file. Defaults to "\t". conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. + mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. 
key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. @@ -76,6 +80,8 @@ def __init__( self.local_file_path = local_file_path self.sep = sep self.conversationId_list = conversationId_list + self.mapping_dict = mapping_dict + self.columns_order = columns_order self.key_list = key_list self.validate_df_dict = validate_df_dict @@ -298,6 +304,8 @@ def merge_conversations_dfs(self, data_to_merge: list) -> DataFrame: "report_columns", "credentials_genesys", "conversationId_list", + "mapping_dict", + "columns_order", "key_list", "validate_df_dict", ) @@ -314,6 +322,8 @@ def run( end_date: str = None, report_columns: List[str] = None, conversationId_list: List[str] = None, + mapping_dict: Dict[str, Any] = None, + columns_order: List[str] = None, key_list: List[str] = None, credentials_genesys: Dict[str, Any] = None, validate_df_dict: Dict[str, Any] = None, @@ -334,6 +344,8 @@ def run( report_url (str, optional): The url of report generated in json response. Defaults to None. report_columns (List[str], optional): List of exisiting column in report. Defaults to None. conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. + mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. 
@@ -461,8 +473,16 @@ def run( date = start_date.replace("-", "") file_name = f"conversations_detail_{date}".upper() + ".csv" + + # Possible transformation of DataFrame + if mapping_dict: + final_df.rename(columns=mapping_dict, inplace=True) + if columns_order: + final_df = df[columns_order] + if validate_df_dict: validate_df.run(df=final_df, tests=validate_df_dict) + final_df.to_csv( os.path.join(self.local_file_path, file_name), index=False, @@ -494,14 +514,21 @@ def run( data_list.append(temp_dict) df = pd.DataFrame(data_list) - df = df[df.columns[-1:]].join(df[df.columns[:-1]]) + + # Possible transformation of DataFrame + if mapping_dict: + df.rename(columns=mapping_dict, inplace=True) + if columns_order: + df = df[columns_order] + + if validate_df_dict: + validate_df.run(df=df, tests=validate_df_dict) start = start_date.replace("-", "") end = end_date.replace("-", "") file_name = f"WEBMESSAGE_{start}-{end}.csv" - if validate_df_dict: - validate_df.run(df=df, tests=validate_df_dict) + df.to_csv( os.path.join(file_name), index=False, @@ -568,6 +595,12 @@ def run( df = pd.DataFrame(data_list) + # Possible transformation of DataFrame + if mapping_dict: + df.rename(columns=mapping_dict, inplace=True) + if columns_order: + df = df[columns_order] + # data validation function (optional) if validate_df_dict: validate_df.run(df=df, tests=validate_df_dict) From 2885731a960eeca8e627aa580c179b4ca955842e Mon Sep 17 00:00:00 2001 From: burzekj Date: Fri, 24 Nov 2023 13:03:09 +0100 Subject: [PATCH 09/54] Added CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ef880c75..e07bac6e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Added new params for mapping and reordering DataFrame for `Genesys` task and flow. ### Fixed From e6c82e361bbfe169f730bcf23b2eb599f9085552 Mon Sep 17 00:00:00 2001 From: kiurieva Date: Fri, 24 Nov 2023 14:36:47 +0100 Subject: [PATCH 10/54] Updated api url in connector --- viadot/sources/vid_club.py | 84 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index e7819577a..327d9abf7 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -1,6 +1,7 @@ import json import os import urllib +from pandas.io.json import json_normalize from datetime import date, datetime, timedelta from typing import Any, Dict, List, Literal, Tuple @@ -46,7 +47,7 @@ def build_query( api_url: str, items_per_page: int, source: Literal["jobs", "product", "company", "survey"] = None, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> str: """ Builds the query from the inputs. 
@@ -128,7 +129,7 @@ def check_connection( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, url: str = None, ) -> Tuple[Dict[str, Any], str]: """ @@ -160,20 +161,37 @@ def check_connection( if url is None: url = self.credentials["url"] - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - + if source in ["jobs", "product", "company"]: + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + elif source == "survey": + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) return (response, first_url) def get_response( @@ -182,7 +200,7 @@ def get_response( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> pd.DataFrame: """ Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. 
@@ -207,14 +225,26 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") + if source in ["jobs", "product", "company"]: + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + ) - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) + elif source == "survey": + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -229,7 +259,8 @@ def get_response( ind = False if "data" in keys_list: - df = pd.DataFrame(response["data"]) + df = json_normalize(response["data"]) + df = pd.DataFrame(df) length = df.shape[0] page = 1 @@ -244,7 +275,8 @@ def get_response( url=url, headers=headers, method="GET", verify=False ) response = r.json() - df_page = pd.DataFrame(response["data"]) + df_page = json_normalize(response["data"]) + df_page = pd.DataFrame(df_page) if source == "product": df_page = df_page.transpose() length = df_page.shape[0] From a638571f35246c01e2216dd0a4c5774d5cf5522b Mon Sep 17 00:00:00 2001 From: kiurieva Date: Fri, 24 Nov 2023 14:52:45 +0100 Subject: [PATCH 11/54] Updated api url in connector --- viadot/sources/vid_club.py | 84 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index e7819577a..327d9abf7 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -1,6 +1,7 @@ import json import os import urllib +from pandas.io.json import json_normalize from datetime import date, datetime, timedelta from typing import Any, Dict, List, Literal, Tuple @@ -46,7 +47,7 @@ def build_query( api_url: str, items_per_page: int, source: Literal["jobs", "product", "company", "survey"] = None, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> str: """ Builds the query from the inputs. 
@@ -128,7 +129,7 @@ def check_connection( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, url: str = None, ) -> Tuple[Dict[str, Any], str]: """ @@ -160,20 +161,37 @@ def check_connection( if url is None: url = self.credentials["url"] - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - + if source in ["jobs", "product", "company"]: + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + elif source == "survey": + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) return (response, first_url) def get_response( @@ -182,7 +200,7 @@ def get_response( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> pd.DataFrame: """ Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. 
@@ -207,14 +225,26 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") + if source in ["jobs", "product", "company"]: + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + ) - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) + elif source == "survey": + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -229,7 +259,8 @@ def get_response( ind = False if "data" in keys_list: - df = pd.DataFrame(response["data"]) + df = json_normalize(response["data"]) + df = pd.DataFrame(df) length = df.shape[0] page = 1 @@ -244,7 +275,8 @@ def get_response( url=url, headers=headers, method="GET", verify=False ) response = r.json() - df_page = pd.DataFrame(response["data"]) + df_page = json_normalize(response["data"]) + df_page = pd.DataFrame(df_page) if source == "product": df_page = df_page.transpose() length = df_page.shape[0] From 5092684bb5450d6bd918c98e521efcbacdccdb34 Mon Sep 17 00:00:00 2001 From: Kateryna Iurieva Date: Mon, 27 Nov 2023 13:35:35 +0100 Subject: [PATCH 12/54] Update viadot/sources/vid_club.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wójcik <107313911+adrian-wojcik@users.noreply.github.com> --- viadot/sources/vid_club.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 327d9abf7..a4dacc4cb 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -190,7 +190,7 @@ def check_connection( response = response.json() else: raise ValidationError( - "Pick one these sources: jobs, product, company, survey" + "Pick one of these sources: jobs, product, company, survey" ) return (response, first_url) From 45250eadd91120c23e849afe2beffa53f9ead469 Mon Sep 17 00:00:00 2001 From: Kateryna Iurieva Date: Mon, 27 Nov 2023 13:35:40 +0100 Subject: [PATCH 13/54] Update viadot/sources/vid_club.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wójcik <107313911+adrian-wojcik@users.noreply.github.com> --- viadot/sources/vid_club.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index a4dacc4cb..497064c0b 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -243,7 +243,7 @@ def get_response( ) else: raise ValidationError( - "Pick one these sources: jobs, product, company, survey" + "Pick one of these sources: jobs, product, company, survey" ) if isinstance(response, dict): From 19b9812bf7f928184877dda04cad91c514fbb393 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Tue, 28 Nov 2023 13:43:55 +0100 Subject: [PATCH 14/54] function to check if df empty --- viadot/flows/sharepoint_to_adls.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 18e392a55..160e8355a 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -3,7 +3,8 @@ from typing import 
Any, Dict, List import pendulum -from prefect import Flow +from prefect import Flow, task +from typing import Literal from prefect.backend import set_key_value from prefect.utilities import logging @@ -186,6 +187,11 @@ def gen_flow(self) -> Flow: def slugify(name): return name.replace(" ", "_").lower() + @task(slug="check_df") + def check_if_df_empty(df): + if len(df.index) == 0: + logger.info("No data in the response. Df empty") + class SharepointListToADLS(Flow): def __init__( @@ -207,6 +213,7 @@ def __init__( output_file_extension: str = ".parquet", validate_df_dict: dict = None, set_prefect_kv: bool = False, + if_no_data_returned: Literal["continue", "warn", "fail"] = "continue", *args: List[any], **kwargs: Dict[str, Any], ): @@ -321,7 +328,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - s = SharepointListToDF( + df = SharepointListToDF( path=self.path, list_title=self.list_title, site_url=self.site_url, @@ -331,12 +338,12 @@ def gen_flow(self) -> Flow: row_count=self.row_count, credentials_secret=self.sp_cert_credentials_secret, ) - df = s.run() if self.validate_df_dict: validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) validation_task.set_upstream(df, flow=self) + check_if_df_empty.bind(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_mapped = df_map_mixed_dtypes_for_parquet.bind( From c4f07df790c17232f446d3ad2eaf8298da89ede9 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Tue, 28 Nov 2023 14:57:06 +0100 Subject: [PATCH 15/54] function to check if df empty enhanced --- viadot/flows/sharepoint_to_adls.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 160e8355a..2a2d6adb6 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -187,10 +187,11 @@ def gen_flow(self) -> Flow: def slugify(name): return name.replace(" ", "_").lower() - @task(slug="check_df") - def check_if_df_empty(df): - if len(df.index) == 0: - logger.info("No data in the response. Df empty") + +@task(slug="check_df") +def check_if_df_empty(df): + if len(df.index) == 0: + logger.info("No data in the response. Df empty.") class SharepointListToADLS(Flow): From df729f56cab4be4d85eebbf05f214bcef23b41e7 Mon Sep 17 00:00:00 2001 From: cgildenia Date: Wed, 29 Nov 2023 10:59:01 +0100 Subject: [PATCH 16/54] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20merged=20path=20and?= =?UTF-8?q?=20adls=5Ffile=5Fname=20into=20file=5Fname?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 18e392a55..1f120c418 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -193,9 +193,10 @@ def __init__( name: str, list_title: str, site_url: str, - path: str, + file_name: str, + # path: str, adls_dir_path: str, - adls_file_name: str, + # adls_file_name: str, filters: dict = None, required_fields: List[str] = None, field_property: str = "Title", @@ -219,6 +220,7 @@ def __init__( name (str): Prefect flow name. list_title (str): Title of Sharepoint List. site_url (str): URL to set of Sharepoint Lists. + file_name (str): PENDING path (str): Local file path. Default to None. 
adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. adls_file_name (str): Name of file in ADLS. Defaults to None. @@ -272,7 +274,7 @@ def __init__( """ # SharepointListToDF - self.path = path + self.file_name = file_name self.list_title = list_title self.site_url = site_url self.required_fields = required_fields @@ -285,32 +287,29 @@ def __init__( # AzureDataLakeUpload self.adls_dir_path = adls_dir_path - self.adls_file_name = adls_file_name self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret self.output_file_extension = output_file_extension self.set_prefect_kv = set_prefect_kv self.now = str(pendulum.now("utc")) - if self.path is not None: + if self.file_name is not None: self.local_file_path = ( - self.path + self.slugify(name) + self.output_file_extension + self.file_name + self.slugify(name) + self.output_file_extension ) - else: - self.local_file_path = self.slugify(name) + self.output_file_extension - self.local_json_path = self.slugify(name) + ".json" - self.adls_dir_path = adls_dir_path - if adls_file_name is not None: - self.adls_file_path = os.path.join(adls_dir_path, adls_file_name) + self.adls_file_path = os.path.join(adls_dir_path, file_name) self.adls_schema_file_dir_file = os.path.join( - adls_dir_path, "schema", Path(adls_file_name).stem + ".json" + adls_dir_path, "schema", Path(file_name).stem + ".json" ) else: + self.local_file_path = self.slugify(name) + self.output_file_extension self.adls_file_path = os.path.join( adls_dir_path, self.now + self.output_file_extension ) self.adls_schema_file_dir_file = os.path.join( adls_dir_path, "schema", self.now + ".json" ) + self.local_json_path = self.slugify(name) + ".json" + self.adls_dir_path = adls_dir_path super().__init__( name=name, @@ -322,7 +321,7 @@ def __init__( def gen_flow(self) -> Flow: s = SharepointListToDF( - path=self.path, + path=self.file_name, list_title=self.list_title, site_url=self.site_url, required_fields=self.required_fields, @@ -345,13 +344,13 @@ def gen_flow(self) -> Flow: df_to_file = df_to_parquet.bind( df=df_mapped, - path=self.path, + path=self.file_name, flow=self, ) file_to_adls_task = AzureDataLakeUpload() file_to_adls_task.bind( - from_path=self.path, + from_path=self.file_name, to_path=self.adls_dir_path, overwrite=self.overwrite, sp_credentials_secret=self.adls_sp_credentials_secret, From 759adf2107be74f33cd85a912123e4ae5da349dd Mon Sep 17 00:00:00 2001 From: cgildenia Date: Wed, 29 Nov 2023 11:01:39 +0100 Subject: [PATCH 17/54] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20updated=20docstring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 1f120c418..43b51df49 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -194,9 +194,7 @@ def __init__( list_title: str, site_url: str, file_name: str, - # path: str, adls_dir_path: str, - # adls_file_name: str, filters: dict = None, required_fields: List[str] = None, field_property: str = "Title", @@ -220,10 +218,8 @@ def __init__( name (str): Prefect flow name. list_title (str): Title of Sharepoint List. site_url (str): URL to set of Sharepoint Lists. - file_name (str): PENDING - path (str): Local file path. Default to None. + file_name (str): Name of file in ADLS. Defaults to None. 
adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. - adls_file_name (str): Name of file in ADLS. Defaults to None. filters (dict, optional): Dictionary with operators which filters the SharepointList output. Defaults to None. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') allowed conjunction: ('&','|') From 495f3c560f10d87684f341f83edab0243ecbac18 Mon Sep 17 00:00:00 2001 From: cgildenia Date: Wed, 29 Nov 2023 12:33:14 +0100 Subject: [PATCH 18/54] =?UTF-8?q?=E2=9C=A8=20added=20to=5Fcsv=20in=20list?= =?UTF-8?q?=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 34 ++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 43b51df49..bcc8c5881 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -65,6 +65,7 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". + if_exists (str, optional): What to do if the file already exists. Defaults to "replace". validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before @@ -206,6 +207,7 @@ def __init__( output_file_extension: str = ".parquet", validate_df_dict: dict = None, set_prefect_kv: bool = False, + if_exists: str = "replace", *args: List[any], **kwargs: Dict[str, Any], ): @@ -264,6 +266,7 @@ def __init__( output_file_extension (str, optional): Extension of the resulting file to be stored. Defaults to ".parquet". validate_df_dict (dict, optional): Whether to do an extra df validation before ADLS upload or not to do. Defaults to None. set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. + if_exists (str, optional): What to do if the file already exists. Defaults to "replace". Returns: .parquet file inside ADLS. 
@@ -280,6 +283,7 @@ def __init__( self.vault_name = vault_name self.row_count = row_count self.validate_df_dict = validate_df_dict + self.if_exists = if_exists # AzureDataLakeUpload self.adls_dir_path = adls_dir_path @@ -290,7 +294,8 @@ def __init__( self.now = str(pendulum.now("utc")) if self.file_name is not None: self.local_file_path = ( - self.file_name + self.slugify(name) + self.output_file_extension + self.file_name.split('.')[0] + self.output_file_extension + # self.file_name + self.slugify(name) + self.output_file_extension ) self.adls_file_path = os.path.join(adls_dir_path, file_name) self.adls_schema_file_dir_file = os.path.join( @@ -338,15 +343,30 @@ def gen_flow(self) -> Flow: df_with_metadata, dtypes_dict, flow=self ) - df_to_file = df_to_parquet.bind( - df=df_mapped, - path=self.file_name, - flow=self, - ) + # df_to_file = df_to_parquet.bind( + # df=df_mapped, + # path=self.file_name, + # flow=self, + # ) + + if self.output_file_extension == ".csv": + df_to_file = df_to_csv.bind( + df=df_with_metadata, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) + else: + df_to_file = df_to_parquet.bind( + df=df_mapped, + path=self.local_file_path, + if_exists=self.if_exists, + flow=self, + ) file_to_adls_task = AzureDataLakeUpload() file_to_adls_task.bind( - from_path=self.file_name, + from_path=self.local_file_path, to_path=self.adls_dir_path, overwrite=self.overwrite, sp_credentials_secret=self.adls_sp_credentials_secret, From 8a615189178235ea397cfbf8b419a0057e28b981 Mon Sep 17 00:00:00 2001 From: burzekj Date: Wed, 29 Nov 2023 13:04:33 +0100 Subject: [PATCH 19/54] =?UTF-8?q?=E2=9C=85=20added=20more=20tests=20for=20?= =?UTF-8?q?genesys=20Task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_genesys_task.py | 39 +++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/integration/tasks/test_genesys_task.py b/tests/integration/tasks/test_genesys_task.py index eb4978fa6..424a45c52 100644 --- a/tests/integration/tasks/test_genesys_task.py +++ b/tests/integration/tasks/test_genesys_task.py @@ -106,7 +106,10 @@ def genesys_api_connection(post_data_list, end_point, method="POST"): "messages": [], } ], + "pageCount": 2, + "entities": [{"id": "xxx"}], } + else: report = { "conversations": [ @@ -307,7 +310,7 @@ def test_genesys_conversations(mock_genesys, var_dictionary): @mock.patch("viadot.tasks.genesys.Genesys", return_value=MockGenesysTask) @pytest.mark.conv -def test_genesys_webmsg(mock_genesys, var_dictionary): +def test_genesys_webmsg_conversations(mock_genesys, var_dictionary): to_csv = GenesysToCSV() file_name = to_csv.run( view_type=None, @@ -324,3 +327,37 @@ def test_genesys_webmsg(mock_genesys, var_dictionary): mock_genesys.assert_called_once() assert file_name[0] == f"WEBMESSAGE_{start}-{end}.csv" + + +@mock.patch("viadot.tasks.genesys.Genesys", return_value=MockGenesysTask) +@pytest.mark.conv +def test_genesys_users(mock_genesys, var_dictionary): + to_csv = GenesysToCSV() + file_name = to_csv.run( + view_type=None, + end_point="users", + conversationId_list=var_dictionary["v_list"], + post_data_list=[""], + key_list=var_dictionary["key_list"], + start_date=var_dictionary["start_date"], + end_date=var_dictionary["end_date"], + ) + + mock_genesys.assert_called_once() + assert file_name[0] == f"All_Genesys_Users.csv" + + +@mock.patch("viadot.tasks.genesys.Genesys", return_value=MockGenesysTask) +@pytest.mark.conv +def 
test_genesys_queue_performance_detail_view(mock_genesys, var_dictionary): + genesys = GenesysToCSV() + output = genesys.run( + view_type="queue_performance_detail_view", + end_point=None, + conversationId_list=var_dictionary["v_list"], + post_data_list=[""], + key_list=var_dictionary["key_list"], + start_date=var_dictionary["start_date"], + end_date=var_dictionary["end_date"], + ) + assert output is None From 34859ed8516eae4919f946722cc7473fa5fd1102 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 29 Nov 2023 13:26:49 +0100 Subject: [PATCH 20/54] Modified logic for check df. Df check and flow Finish in the flow added --- viadot/flows/sharepoint_to_adls.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 2a2d6adb6..5bca3cc8e 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -7,6 +7,7 @@ from typing import Literal from prefect.backend import set_key_value from prefect.utilities import logging +from prefect.engine.state import Finished from viadot.task_utils import ( add_ingestion_metadata_task, @@ -189,9 +190,20 @@ def slugify(name): @task(slug="check_df") -def check_if_df_empty(df): +def check_if_df_empty(df, if_no_data_returned: str = "skip"): + # -> to task.utils + class NoDataReturnedError(Exception): + def __init__(self, message): + self.message = message + if len(df.index) == 0: - logger.info("No data in the response. Df empty.") + if if_no_data_returned == "skip": + logger.info("No data in the source response. Df empty.") + elif if_no_data_returned == "warn": + logger.warning("No data in the source response. Df empty.") + elif if_no_data_returned == "fail": + raise NoDataReturnedError("No data in the source response. Df empty.") + return True class SharepointListToADLS(Flow): @@ -214,7 +226,7 @@ def __init__( output_file_extension: str = ".parquet", validate_df_dict: dict = None, set_prefect_kv: bool = False, - if_no_data_returned: Literal["continue", "warn", "fail"] = "continue", + if_no_data_returned: Literal["skip", "warn", "fail"] = "skip", *args: List[any], **kwargs: Dict[str, Any], ): @@ -290,6 +302,7 @@ def __init__( self.vault_name = vault_name self.row_count = row_count self.validate_df_dict = validate_df_dict + self.if_no_data_returned = if_no_data_returned # AzureDataLakeUpload self.adls_dir_path = adls_dir_path @@ -339,12 +352,18 @@ def gen_flow(self) -> Flow: row_count=self.row_count, credentials_secret=self.sp_cert_credentials_secret, ) + df_empty = check_if_df_empty.bind(df, self.if_no_data_returned) + + if df_empty: + if self.if_no_data_returned == "warn": + raise Finished( + "Flow finished because there is no new data for ingestion." 
+ ) if self.validate_df_dict: validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) validation_task.set_upstream(df, flow=self) - check_if_df_empty.bind(df, flow=self) df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_mapped = df_map_mixed_dtypes_for_parquet.bind( From b10a9518944dc9161a669006a01305526058bbe5 Mon Sep 17 00:00:00 2001 From: cgildenia Date: Wed, 29 Nov 2023 13:44:56 +0100 Subject: [PATCH 21/54] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20removed=20unused=20c?= =?UTF-8?q?ode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index bcc8c5881..a539d6628 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -207,7 +207,6 @@ def __init__( output_file_extension: str = ".parquet", validate_df_dict: dict = None, set_prefect_kv: bool = False, - if_exists: str = "replace", *args: List[any], **kwargs: Dict[str, Any], ): @@ -266,7 +265,6 @@ def __init__( output_file_extension (str, optional): Extension of the resulting file to be stored. Defaults to ".parquet". validate_df_dict (dict, optional): Whether to do an extra df validation before ADLS upload or not to do. Defaults to None. set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. - if_exists (str, optional): What to do if the file already exists. Defaults to "replace". Returns: .parquet file inside ADLS. @@ -283,7 +281,6 @@ def __init__( self.vault_name = vault_name self.row_count = row_count self.validate_df_dict = validate_df_dict - self.if_exists = if_exists # AzureDataLakeUpload self.adls_dir_path = adls_dir_path @@ -295,7 +292,6 @@ def __init__( if self.file_name is not None: self.local_file_path = ( self.file_name.split('.')[0] + self.output_file_extension - # self.file_name + self.slugify(name) + self.output_file_extension ) self.adls_file_path = os.path.join(adls_dir_path, file_name) self.adls_schema_file_dir_file = os.path.join( @@ -331,36 +327,27 @@ def gen_flow(self) -> Flow: row_count=self.row_count, credentials_secret=self.sp_cert_credentials_secret, ) - df = s.run() if self.validate_df_dict: - validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation_task.set_upstream(df, flow=self) + validation_task = validate_df(df=s, tests=self.validate_df_dict, flow=self) + validation_task.set_upstream(s, flow=self) - df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(s, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_mapped = df_map_mixed_dtypes_for_parquet.bind( df_with_metadata, dtypes_dict, flow=self ) - - # df_to_file = df_to_parquet.bind( - # df=df_mapped, - # path=self.file_name, - # flow=self, - # ) if self.output_file_extension == ".csv": df_to_file = df_to_csv.bind( df=df_with_metadata, path=self.local_file_path, - if_exists=self.if_exists, flow=self, ) else: df_to_file = df_to_parquet.bind( df=df_mapped, path=self.local_file_path, - if_exists=self.if_exists, flow=self, ) From 6dc7a6fa54e527a0706c221f16fe19eb8cb9f2a7 Mon Sep 17 00:00:00 2001 From: cgildenia Date: Thu, 30 Nov 2023 10:05:10 +0100 Subject: [PATCH 22/54] 
=?UTF-8?q?=E2=9C=A8=20added=20separator=20argument?= =?UTF-8?q?=20for=20csv=20saving?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index a539d6628..787c78436 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -205,6 +205,7 @@ def __init__( vault_name: str = None, overwrite_adls: bool = True, output_file_extension: str = ".parquet", + sep: str = "\t", validate_df_dict: dict = None, set_prefect_kv: bool = False, *args: List[any], @@ -263,6 +264,7 @@ def __init__( vault_name (str, optional): KeyVaultSecret name. Default to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. output_file_extension (str, optional): Extension of the resulting file to be stored. Defaults to ".parquet". + sep (str, optional): The separator to use in the CSV. Defaults to "\t". validate_df_dict (dict, optional): Whether to do an extra df validation before ADLS upload or not to do. Defaults to None. set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. @@ -287,11 +289,12 @@ def __init__( self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret self.output_file_extension = output_file_extension + self.sep = sep self.set_prefect_kv = set_prefect_kv self.now = str(pendulum.now("utc")) if self.file_name is not None: self.local_file_path = ( - self.file_name.split('.')[0] + self.output_file_extension + self.file_name.split(".")[0] + self.output_file_extension ) self.adls_file_path = os.path.join(adls_dir_path, file_name) self.adls_schema_file_dir_file = os.path.join( @@ -317,7 +320,7 @@ def __init__( self.gen_flow() def gen_flow(self) -> Flow: - s = SharepointListToDF( + df = SharepointListToDF( path=self.file_name, list_title=self.list_title, site_url=self.site_url, @@ -329,19 +332,20 @@ def gen_flow(self) -> Flow: ) if self.validate_df_dict: - validation_task = validate_df(df=s, tests=self.validate_df_dict, flow=self) - validation_task.set_upstream(s, flow=self) + validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) + validation_task.set_upstream(df, flow=self) - df_with_metadata = add_ingestion_metadata_task.bind(s, flow=self) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) df_mapped = df_map_mixed_dtypes_for_parquet.bind( df_with_metadata, dtypes_dict, flow=self ) - + if self.output_file_extension == ".csv": df_to_file = df_to_csv.bind( df=df_with_metadata, path=self.local_file_path, + sep=self.sep, flow=self, ) else: From 7c6b9e36094303bef3aa0cf4ef3a1316d83e0da0 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 30 Nov 2023 11:07:17 +0100 Subject: [PATCH 23/54] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Replaced=20`check=5F?= =?UTF-8?q?value`=20and=20`get=5Fnested=5Fdict`=20with=20one=20-=20`get=5F?= =?UTF-8?q?nested=5Fvalue`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/utils.py | 60 +++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/viadot/utils.py b/viadot/utils.py index 4690c8cbd..03ea95aea 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -461,37 +461,43 @@ def wrapper(*args, 
**kwargs) -> pd.DataFrame: return decorator -def get_nested_dict(d): - if isinstance(d, dict): - for lvl in d.values(): - if isinstance(lvl, dict): - return get_nested_dict(lvl) - else: - return d - else: - return None - - -def check_value(base: Union[Dict, Any], levels: List) -> Union[None, Any]: +def get_nested_value( + nested_dict: dict, + levels_to_search: List[str] = None, +) -> Union[None, Any]: """ - Task to extract data from nested json file if there is any under passed parameters. - Otherwise return None. + Retrieve a value from a nested dictionary based on specified levels if the `levels_to_search` are provided. + Retrieve a key:value pair of the first deepest pair if `levels_to_search` is not provided. Args: - base (Dict, Any): variable with base lvl of the json, for example: - json_file["first_known_lvl"]["second_known_lvl"]["third_known_lvl"] - levels (List): List of potential lower levels of nested json for data retrieval. For example: - ["first_lvl_below_base", "second_lvl_below_base", "searched_phrase"] + nested_dict (dict): The nested dictionary to search for the value. + levels_to_search (List[str], optional): List of keys representing the levels to search. Defaults to None. + If provided, the function will attempt to retrieve the value at the specified levels. + If not provided, the function will recursively search for the first non-dictionary value. Returns: - Union[None, Any]: Searched value for the lowest level, in example data under "searched_phrase" key. + Union[None, Any]: The searched value for the specified level or the first key:value pair when + first non-dictionary value found during recursive search. + Returns None if the nested_dict is not a dictionary or if the specified levels are not found. """ - - for lvl in levels: - if isinstance(base, dict): - base = base.get(lvl) - if base is None: - return None + try: + if levels_to_search is not None: + for lvl in levels_to_search: + if isinstance(nested_dict[lvl], dict): + return get_nested_value( + nested_dict=nested_dict[levels_to_search.pop(0)], + levels_to_search=levels_to_search, + ) + else: + return nested_dict[lvl] else: - return base - return base + for lvl in nested_dict.values(): + if isinstance(lvl, dict): + return get_nested_value(nested_dict=lvl) + else: + return nested_dict + except KeyError as e: + return None + except TypeError as e: + logger.error(f"The 'nested_dict' must be a dictionary. 
{e}") + return None From 102fc933d27f79c723c03ed3f67440f23cd4b5b3 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 30 Nov 2023 11:09:01 +0100 Subject: [PATCH 24/54] =?UTF-8?q?=F0=9F=90=9B=20Added=20tests=20for=20`tes?= =?UTF-8?q?t=5Fnested=5Fvalue`=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 124 +++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 0b2ed5782..e0177b3ea 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -11,7 +11,8 @@ add_viadot_metadata_columns, check_if_empty_file, gen_bulk_insert_query_from_df, - check_value, + get_flow_last_run_date, + get_nested_value, get_sql_server_table_dtypes, slugify, handle_api_response, @@ -52,6 +53,27 @@ def azure_sql(): yield azure_sql +@pytest.fixture(scope="function") +def nested_dict(): + nested_dict = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "searched_lvl": { + "searched_phrase_1": "First value", + "searched_phrase_2": None, + "searched_phrase_3": "Found it!", + } + } + } + }, + "first_known_lvl_2": { + "second_known_lvl_2": {"searched_phrase_2": "Found it_2!"} + }, + } + return nested_dict + + def test_slugify(): """To test slugify() function functionalities work""" test_string = "Text With Spaces Before Changes" @@ -209,63 +231,6 @@ def test_add_viadot_metadata_columns_with_parameter(): assert df_decorated["_viadot_source"][0] == "Source_name" -def test_check_value_found(): - """Sample test checking the correctness of the function when the key is found.""" - json_data = { - "first_known_lvl": { - "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} - } - } - result = check_value( - json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], - ["searched_phrase"], - ) - assert result == "phrase" - - -def test_check_value_not_found(): - """Sample test checking the correctness of the function when the key is not found.""" - json_data = { - "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": {"other_phrase": "This won't be found"} - } - } - } - result = check_value( - json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], - ["searched_phrase"], - ) - assert result is None - - -def test_check_value_empty_dict(): - """Sample test checking the correctness of the function with an empty dictionary.""" - json_data = {} - result = check_value(json_data, ["searched_phrase"]) - assert result is None - - -def test_check_value_nonexistent_key(): - """Sample test checking the correctness of the function with a nonexistent key.""" - - json_data = { - "first_known_lvl": { - "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} - } - } - result = check_value(json_data, ["nonexistent_key"]) - assert result is None - - -def test_check_value_base_is_not_dict(): - result = check_value( - base="this_is_not_dict", - levels=["searched_phrase"], - ) - assert result == "this_is_not_dict" - - def test_handle_api_response_wrong_method(): """Test to check if ValueError is thrown when wrong method is used.""" @@ -332,3 +297,46 @@ def test_union_dict_return(): unioned_dict = union_dict(a, b) assert isinstance(unioned_dict, dict) assert unioned_dict == {"a": 1, "b": 2} + + +def test_get_nested_value_found(nested_dict): + """Sample test checking the correctness of the function when the key is found.""" + result = get_nested_value( + 
nested_dict=nested_dict["first_known_lvl"]["second_known_lvl"][ + "third_known_lvl" + ], + levels_to_search=["searched_lvl", "searched_phrase_3"], + ) + assert result == "Found it!" + + +def test_get_nested_value_not_found(nested_dict): + """Sample test checking the correctness of the function when the key is not found.""" + result = get_nested_value( + nested_dict["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], + levels_to_search=["searched_wrong_lvl"], + ) + assert result is None + + +def test_get_nested_value_nested_dict_is_string(caplog): + """Sample test checking the correctness of the function when non-dictionary value is provided as nested_dict.""" + with caplog.at_level(logging.WARNING): + get_nested_value( + nested_dict="this_is_not_dict", + levels_to_search=["searched_phrase"], + ) + assert "The 'nested_dict' must be a dictionary." in caplog.text + + +def test_get_nested_value_without_levels(nested_dict): + """Sample test checking the correctness of the function when only `nested_value` is provided.""" + result_1 = get_nested_value(nested_dict=nested_dict) + result_2 = get_nested_value(nested_dict=nested_dict["first_known_lvl_2"]) + + assert result_1 == { + "searched_phrase_1": "First value", + "searched_phrase_2": None, + "searched_phrase_3": "Found it!", + } + assert result_2 == {"searched_phrase_2": "Found it_2!"} From 895671638c8766421a9d5ecfedc8fa776886e628 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 30 Nov 2023 11:10:06 +0100 Subject: [PATCH 25/54] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Changed=20funtion=20?= =?UTF-8?q?from=20`get=5Fnested=5Fdict`=20to=20`get=5Fnested=5Fvalue`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sharepoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index fbbd1b08b..08e616326 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -10,7 +10,7 @@ from office365.sharepoint.client_context import ClientContext from prefect.utilities import logging -from viadot.utils import get_nested_dict +from viadot.utils import get_nested_value from ..config import local_config from ..exceptions import CredentialError @@ -168,7 +168,7 @@ def _unpack_fields( item_values_dict = list_item.properties if item_values_dict: for field, val in item_values_dict.items(): - nested_dict = get_nested_dict(val) + nested_dict = get_nested_value(val) # Check if the values are nested if nested_dict != None: # Check if field has expandable type From dfacdfab48b098ba306a22dfbaf6e5dec1ca29d8 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 30 Nov 2023 11:33:24 +0100 Subject: [PATCH 26/54] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Changed=20funtion=20?= =?UTF-8?q?from=20`check=5Fvalue`=20to=20`get=5Fnested=5Fvalue`=20in=20gen?= =?UTF-8?q?esys?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 50 +++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 942249ac2..0b9d803b8 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -13,7 +13,7 @@ from viadot.exceptions import APIError from viadot.sources import Genesys -from viadot.utils import check_value +from viadot.utils import get_nested_value from viadot.task_utils import * logger = logging.get_logger() @@ -537,31 +537,43 @@ def run( # For loop to extract data from specific 
page for id in range(0, num_ids): record_dict = {} - record_dict["Id"] = check_value(json_file["entities"][id], ["id"]) - record_dict["Name"] = check_value( - json_file["entities"][id], ["name"] + record_dict["Id"] = get_nested_value( + nested_dict=json_file["entities"][id], levels_to_search=["id"] ) - record_dict["DivisionName"] = check_value( - json_file["entities"][id], ["division", "name"] + record_dict["Name"] = get_nested_value( + nested_dict=json_file["entities"][id], levels_to_search=["name"] ) - record_dict["Email"] = check_value( - json_file["entities"][id], ["email"] + record_dict["DivisionName"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["division", "name"], ) - record_dict["State"] = check_value( - json_file["entities"][id], ["state"] + record_dict["Email"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["email"], ) - record_dict["Title"] = check_value( - json_file["entities"][id], ["title"] + record_dict["State"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["state"], ) - record_dict["Username"] = check_value( - json_file["entities"][id], ["username"] + record_dict["Title"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["title"], ) - record_dict["SystemPresence"] = check_value( - json_file["entities"][id], - ["presence", "presenceDefinition", "systemPresence"], + record_dict["Username"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["username"], ) - record_dict["DateLastLogin"] = check_value( - json_file["entities"][id], ["dateLastLogin"] + record_dict["SystemPresence"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=[ + "presence", + "presenceDefinition", + "systemPresence", + ], + ) + record_dict["DateLastLogin"] = get_nested_value( + nested_dict=json_file["entities"][id], + levels_to_search=["dateLastLogin"], ) data_list.append(record_dict) From 416ca6e85b155cceb69fbaa1935b248833a1a96d Mon Sep 17 00:00:00 2001 From: burzekj Date: Fri, 1 Dec 2023 10:35:05 +0100 Subject: [PATCH 27/54] Changed dosc string for new class arguments --- viadot/flows/genesys_to_adls.py | 13 +++++++++++++ viadot/tasks/genesys.py | 26 ++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/viadot/flows/genesys_to_adls.py b/viadot/flows/genesys_to_adls.py index 1cebe5a65..29ec1277a 100644 --- a/viadot/flows/genesys_to_adls.py +++ b/viadot/flows/genesys_to_adls.py @@ -140,7 +140,20 @@ def __init__( report_columns (List[str], optional): List of exisiting column in report. Defaults to None. conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + Example of mapping_dict: + mapping_dict = { + "col1": "column1", + "col_3": "column3", + "colum2": "column2", + } + where keys in dictionary mapping_dict are current DataFrame columns names. columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. + Example of columns_order: + columns_order = [ + "column1", + "column2", + "column3", + ] key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. local_file_path (str, optional): The local path from which to upload the file(s). Defaults to "". adls_file_path (str, optional): The destination path at ADLS. Defaults to None. 
diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 88014a6be..d974dd587 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -59,7 +59,20 @@ def __init__( sep (str, optional): Separator in csv file. Defaults to "\t". conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + Example of mapping_dict: + mapping_dict = { + "col1": "column1", + "col_3": "column3", + "colum2": "column2", + } + where keys in dictionary mapping_dict are current DataFrame columns names. columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. + Example of columns_order: + columns_order = [ + "column1", + "column2", + "column3", + ] key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. @@ -345,7 +358,20 @@ def run( report_columns (List[str], optional): List of exisiting column in report. Defaults to None. conversationId_list (List[str], optional): List of conversationId passed as attribute of GET method. Defaults to None. mapping_dict (dict, optional): Mapping dictionary from user in json format. Defaults to None. + Example of mapping_dict: + mapping_dict = { + "col1": "column1", + "col_3": "column3", + "colum2": "column2", + } + where keys in dictionary mapping_dict are current DataFrame columns names. columns_order (List, optional): Columns order list to change column order inside pd.DataFrame. Defaults to None. + Example of columns_order: + columns_order = [ + "column1", + "column2", + "column3", + ] key_list (List[str], optional): List of keys needed to specify the columns in the GET request method. Defaults to None. validate_df_dict (Dict[str,Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. 
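
For reference, the `mapping_dict` and `columns_order` arguments documented in the patch above describe plain pandas-style post-processing: the keys of `mapping_dict` are the current DataFrame column names, its values are the target names, and `columns_order` fixes the final column order. A minimal standalone sketch under those assumptions (the column names are illustrative only, and the actual Genesys task may apply these steps differently):

    import pandas as pd

    # Hypothetical Genesys-style output, used only for illustration.
    df = pd.DataFrame({"col1": [1, 2], "colum2": ["a", "b"], "col_3": [0.1, 0.2]})

    # Keys are the current column names, values are the target names.
    mapping_dict = {"col1": "column1", "col_3": "column3", "colum2": "column2"}

    # Final column order expected downstream.
    columns_order = ["column1", "column2", "column3"]

    df = df.rename(columns=mapping_dict)[columns_order]
    print(list(df.columns))  # ['column1', 'column2', 'column3']
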
From b6cfd413cc9448fdf28f603e98bbeaa919538a15 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Fri, 1 Dec 2023 11:54:20 +0100 Subject: [PATCH 28/54] update for sharepoint list to df with function for checking df --- viadot/flows/sharepoint_to_adls.py | 119 +++++++++++++++-------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 5bca3cc8e..2c233eaf8 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -3,11 +3,12 @@ from typing import Any, Dict, List import pendulum -from prefect import Flow, task +from prefect import Flow, task, case +from prefect.engine.state import Failed +from prefect.engine.runner import ENDRUN from typing import Literal from prefect.backend import set_key_value from prefect.utilities import logging -from prefect.engine.state import Finished from viadot.task_utils import ( add_ingestion_metadata_task, @@ -190,20 +191,19 @@ def slugify(name): @task(slug="check_df") -def check_if_df_empty(df, if_no_data_returned: str = "skip"): +def check_if_df_empty(df, if_no_data_returned: str = "fail"): # -> to task.utils - class NoDataReturnedError(Exception): + class NoDataReturnedError(BaseException): def __init__(self, message): self.message = message - if len(df.index) == 0: - if if_no_data_returned == "skip": - logger.info("No data in the source response. Df empty.") - elif if_no_data_returned == "warn": + if df.empty: + if if_no_data_returned == "warn": logger.warning("No data in the source response. Df empty.") + return True + # raise ENDRUN(state=Failed("Failed task raised")) elif if_no_data_returned == "fail": - raise NoDataReturnedError("No data in the source response. Df empty.") - return True + raise NoDataReturnedError("No data in the source response. Df empty...") class SharepointListToADLS(Flow): @@ -352,63 +352,64 @@ def gen_flow(self) -> Flow: row_count=self.row_count, credentials_secret=self.sp_cert_credentials_secret, ) - df_empty = check_if_df_empty.bind(df, self.if_no_data_returned) - if df_empty: - if self.if_no_data_returned == "warn": - raise Finished( - "Flow finished because there is no new data for ingestion." 
- ) - - if self.validate_df_dict: - validation_task = validate_df(df=df, tests=self.validate_df_dict, flow=self) - validation_task.set_upstream(df, flow=self) - - df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) - dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) - df_mapped = df_map_mixed_dtypes_for_parquet.bind( - df_with_metadata, dtypes_dict, flow=self - ) + if self.if_no_data_returned != "skip": + df_empty = check_if_df_empty.bind(df, self.if_no_data_returned, flow=self) + # If df empty there is no reason to run other tasks + else: + df_empty = False - df_to_file = df_to_parquet.bind( - df=df_mapped, - path=self.path, - flow=self, - ) + with case(df_empty, False): + if self.validate_df_dict: + validation_task = validate_df( + df=df, tests=self.validate_df_dict, flow=self + ) + validation_task.set_upstream(df, flow=self) - file_to_adls_task = AzureDataLakeUpload() - file_to_adls_task.bind( - from_path=self.path, - to_path=self.adls_dir_path, - overwrite=self.overwrite, - sp_credentials_secret=self.adls_sp_credentials_secret, - flow=self, - ) + df_with_metadata = add_ingestion_metadata_task.bind(df, flow=self) + dtypes_dict = df_get_data_types_task.bind(df_with_metadata, flow=self) + df_mapped = df_map_mixed_dtypes_for_parquet.bind( + df_with_metadata, dtypes_dict, flow=self + ) - dtypes_to_json_task.bind( - dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self - ) + df_to_file = df_to_parquet.bind( + df=df_mapped, + path=self.path, + flow=self, + ) - json_to_adls_task = AzureDataLakeUpload() - json_to_adls_task.bind( - from_path=self.local_json_path, - to_path=self.adls_schema_file_dir_file, - overwrite=self.overwrite, - sp_credentials_secret=self.adls_sp_credentials_secret, - flow=self, - ) + file_to_adls_task = AzureDataLakeUpload() + file_to_adls_task.bind( + from_path=self.path, + to_path=self.adls_dir_path, + overwrite=self.overwrite, + sp_credentials_secret=self.adls_sp_credentials_secret, + flow=self, + ) - if self.validate_df_dict: - df_with_metadata.set_upstream(validation_task, flow=self) + dtypes_to_json_task.bind( + dtypes_dict=dtypes_dict, local_json_path=self.local_json_path, flow=self + ) - df_mapped.set_upstream(df_with_metadata, flow=self) - dtypes_to_json_task.set_upstream(df_mapped, flow=self) - df_to_file.set_upstream(dtypes_to_json_task, flow=self) + json_to_adls_task = AzureDataLakeUpload() + json_to_adls_task.bind( + from_path=self.local_json_path, + to_path=self.adls_schema_file_dir_file, + overwrite=self.overwrite, + sp_credentials_secret=self.adls_sp_credentials_secret, + flow=self, + ) - file_to_adls_task.set_upstream(df_to_file, flow=self) - json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - if self.set_prefect_kv == True: - set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + if self.validate_df_dict: + df_with_metadata.set_upstream(validation_task, flow=self) + dtypes_dict.set_upstream(df_with_metadata, flow=self) + df_mapped.set_upstream(df_with_metadata, flow=self) + dtypes_to_json_task.set_upstream(df_mapped, flow=self) + df_to_file.set_upstream(dtypes_to_json_task, flow=self) + file_to_adls_task.set_upstream(df_to_file, flow=self) + json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) + if self.set_prefect_kv == True: + set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod def slugify(name): From 9c260505e197c60a25f7717e481c014617154117 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Sat, 2 Dec 2023 16:47:23 +0100 Subject: [PATCH 
29/54] changed with case in sharepoint to adls --- viadot/flows/sharepoint_to_adls.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 2c233eaf8..b2d6d22cb 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -204,6 +204,10 @@ def __init__(self, message): # raise ENDRUN(state=Failed("Failed task raised")) elif if_no_data_returned == "fail": raise NoDataReturnedError("No data in the source response. Df empty...") + elif if_no_data_returned == "skip": + return False + else: + return False class SharepointListToADLS(Flow): @@ -353,11 +357,7 @@ def gen_flow(self) -> Flow: credentials_secret=self.sp_cert_credentials_secret, ) - if self.if_no_data_returned != "skip": - df_empty = check_if_df_empty.bind(df, self.if_no_data_returned, flow=self) - # If df empty there is no reason to run other tasks - else: - df_empty = False + df_empty = check_if_df_empty.bind(df, self.if_no_data_returned, flow=self) with case(df_empty, False): if self.validate_df_dict: From 371223b8d4d50405e0397e684ff596f030fdd80f Mon Sep 17 00:00:00 2001 From: cgildenia Date: Mon, 4 Dec 2023 13:23:28 +0100 Subject: [PATCH 30/54] =?UTF-8?q?=E2=9C=85=20added=20tests=20for=20sharepo?= =?UTF-8?q?int=20list?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_sharepoint_to_adls.py | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index b3019bd67..51a1c1956 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -6,15 +6,18 @@ import pytest from prefect.tasks.secrets import PrefectSecret -from viadot.flows import SharepointToADLS +from viadot.flows import SharepointToADLS, SharepointListToADLS from viadot.tasks import AzureDataLakeRemove -ADLS_FILE_NAME = str(pendulum.now("utc")) + ".csv" +ADLS_FILE_NAME = pendulum.now("utc").strftime("%Y-%m-%d_%H:%M:%S_%Z%z") ADLS_DIR_PATH = "raw/tests/" CREDENTIALS_SECRET = PrefectSecret("AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET").run() DATA = {"country": [1, 2], "sales": [3, 4]} +SharepointToADLS + + @mock.patch( "viadot.tasks.SharepointToDF.run", return_value=pd.DataFrame(data=DATA), @@ -73,3 +76,68 @@ def test_sharepoint_to_adls_run_flow_overwrite_false(mocked_class): assert result.is_failed() os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.csv") os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.json") + + +# SharepointListToADLS +@mock.patch( + "viadot.tasks.SharepointListToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_list_to_adls_run_flow_csv(mocked_class): + flow = SharepointListToADLS( + "test_sharepoint_to_adls_run_flow", + output_file_extension=".csv", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + file_name=ADLS_FILE_NAME, + list_title="", + site_url="", + ) + result = flow.run() + assert result.is_successful() + os.remove(ADLS_FILE_NAME + ".csv") + os.remove("test_sharepoint_to_adls_run_flow.json") + + +@mock.patch( + "viadot.tasks.SharepointListToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_list_to_adls_run_flow_parquet(mocked_class): + flow = SharepointListToADLS( + "test_sharepoint_to_adls_run_flow", + 
output_file_extension=".parquet", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + file_name=ADLS_FILE_NAME, + list_title="", + site_url="", + ) + result = flow.run() + assert result.is_successful() + os.remove(ADLS_FILE_NAME + ".parquet") + os.remove("test_sharepoint_to_adls_run_flow.json") + + +@mock.patch( + "viadot.tasks.SharepointListToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_list_to_adls_run_flow_overwrite_true(mocked_class): + flow = SharepointListToADLS( + "test_sharepoint_to_adls_run_flow_overwrite_true", + output_file_extension=".csv", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + file_name=ADLS_FILE_NAME, + overwrite_adls=True, + list_title="", + site_url="", + ) + result = flow.run() + assert result.is_successful() + os.remove(ADLS_FILE_NAME + ".csv") + os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.json") From c0ae042be3b8685109b37b1cd65a29bb75efc77e Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:32:59 +0100 Subject: [PATCH 31/54] =?UTF-8?q?=F0=9F=90=9B=20Added=20warning=20logger?= =?UTF-8?q?=20for=20credential?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 6432ac8e8..f39fee297 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from prefect.utilities import logging +from prefect.engine.state import Failed try: import pyrfc @@ -257,7 +258,14 @@ def __init__( self._con = None DEFAULT_CREDENTIALS = local_config.get("SAP").get("DEV") - credentials = kwargs.pop("credentials", None) or DEFAULT_CREDENTIALS + + credentials = kwargs.pop("credentials", None) + if credentials is None: + credentials = DEFAULT_CREDENTIALS + logger.warning( + "WARNING!!! Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + ) + if credentials is None: raise CredentialError("Missing credentials.") From a999d91516b63647b593351629998bbfdb5e3096 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:40:35 +0100 Subject: [PATCH 32/54] Add changes to changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ef880c75..2ba7a52ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### Changed - +- Changed __init__ in SAPRFC class in source in order to raise warning in prefect when credentials will be taken from DEV. 
## [0.4.22] - 2023-11-15 ### Added From d69733b631ae78c48167b2cef0f5ed6a76e253b8 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:58:32 +0100 Subject: [PATCH 33/54] =?UTF-8?q?=F0=9F=8E=A8=20Delete=20"WARNING!!!"=20fr?= =?UTF-8?q?om=20warning=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index f39fee297..806e61250 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -263,7 +263,7 @@ def __init__( if credentials is None: credentials = DEFAULT_CREDENTIALS logger.warning( - "WARNING!!! Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." ) if credentials is None: From 5e4fa56a09049af109d4fd26f41c405d78b410ec Mon Sep 17 00:00:00 2001 From: gwieloch Date: Mon, 4 Dec 2023 15:17:24 +0100 Subject: [PATCH 34/54] added conn.close after each session to sappw --- viadot/sources/sap_bw.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/viadot/sources/sap_bw.py b/viadot/sources/sap_bw.py index 94e3347a9..90b70dfec 100644 --- a/viadot/sources/sap_bw.py +++ b/viadot/sources/sap_bw.py @@ -1,7 +1,6 @@ import textwrap from typing import List - -from pyrfc import Connection +import pyrfc from viadot.exceptions import CredentialError, ValidationError from viadot.sources.base import Source @@ -31,14 +30,15 @@ def __init__(self, credentials: dict, *args, **kwargs): super().__init__(*args, credentials=credentials, **kwargs) - def get_connection(self) -> Connection: + def get_connection(self) -> pyrfc.Connection: """ Function to create the connection with SAP BW. Returns: Connection: Connection to SAP. """ - return Connection( + + return pyrfc.Connection( ashost=self.credentials.get("ashost"), sysnr=self.credentials.get("sysnr"), user=self.credentials.get("user"), @@ -126,5 +126,6 @@ def get_output_data(self, mdx_query: str) -> dict: datasetid = properties["DATASETID"] query_output = conn.call("RSR_MDX_GET_FLAT_DATA", DATASETID=datasetid) + conn.close() # close connection after full session return query_output From b08dbfffc5e129a32a9ab7fd2ab39d6224bcb0ca Mon Sep 17 00:00:00 2001 From: cgildenia Date: Mon, 4 Dec 2023 16:05:00 +0100 Subject: [PATCH 35/54] =?UTF-8?q?=E2=9C=A8=20list=20extension=20is=20now?= =?UTF-8?q?=20a=20literal?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/sharepoint_to_adls.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 787c78436..79b511c53 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Literal import pendulum from prefect import Flow @@ -65,7 +65,6 @@ def __init__( Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_empty (str, optional): What to do if query returns no data. Defaults to "warn". - if_exists (str, optional): What to do if the file already exists. 
Defaults to "replace". validate_df_dict (dict, optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before @@ -204,7 +203,7 @@ def __init__( sp_cert_credentials_secret: str = None, vault_name: str = None, overwrite_adls: bool = True, - output_file_extension: str = ".parquet", + output_file_extension: Literal[".parquet", ".csv"] = ".parquet", sep: str = "\t", validate_df_dict: dict = None, set_prefect_kv: bool = False, @@ -263,7 +262,7 @@ def __init__( If not passed it will take cred's from your .config/credentials.json Default to None. vault_name (str, optional): KeyVaultSecret name. Default to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. - output_file_extension (str, optional): Extension of the resulting file to be stored. Defaults to ".parquet". + output_file_extension (str, optional): Extension of the resulting file to be stored, either ".csv" or ".parquet". Defaults to ".parquet". sep (str, optional): The separator to use in the CSV. Defaults to "\t". validate_df_dict (dict, optional): Whether to do an extra df validation before ADLS upload or not to do. Defaults to None. set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. @@ -348,12 +347,14 @@ def gen_flow(self) -> Flow: sep=self.sep, flow=self, ) - else: + elif self.output_file_extension == ".parquet": df_to_file = df_to_parquet.bind( df=df_mapped, path=self.local_file_path, flow=self, ) + else: + raise ValueError("Output file extension can only be '.csv' or '.parquet'") file_to_adls_task = AzureDataLakeUpload() file_to_adls_task.bind( From 8540e19f585f3422167354a3d2568031e2bd91a3 Mon Sep 17 00:00:00 2001 From: cgildenia Date: Mon, 4 Dec 2023 16:25:28 +0100 Subject: [PATCH 36/54] =?UTF-8?q?=E2=9C=85=20added=20wrong=20extension=20t?= =?UTF-8?q?est?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_sharepoint_to_adls.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index 51a1c1956..5603c49ee 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -9,15 +9,14 @@ from viadot.flows import SharepointToADLS, SharepointListToADLS from viadot.tasks import AzureDataLakeRemove -ADLS_FILE_NAME = pendulum.now("utc").strftime("%Y-%m-%d_%H:%M:%S_%Z%z") +ADLS_FILE_NAME = str(pendulum.now("utc")) + ".csv" +ADLS_FILE_NAME_LIST = pendulum.now("utc").strftime("%Y-%m-%d_%H:%M:%S_%Z%z") ADLS_DIR_PATH = "raw/tests/" CREDENTIALS_SECRET = PrefectSecret("AZURE_DEFAULT_ADLS_SERVICE_PRINCIPAL_SECRET").run() DATA = {"country": [1, 2], "sales": [3, 4]} -SharepointToADLS - - +# SharepointToADLS @mock.patch( "viadot.tasks.SharepointToDF.run", return_value=pd.DataFrame(data=DATA), @@ -72,7 +71,6 @@ def test_sharepoint_to_adls_run_flow_overwrite_false(mocked_class): overwrite_adls=False, ) result = flow.run() - assert result.is_failed() os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.csv") os.remove("test_sharepoint_to_adls_run_flow_overwrite_false.json") @@ -90,13 +88,13 @@ def test_sharepoint_list_to_adls_run_flow_csv(mocked_class): output_file_extension=".csv", 
adls_sp_credentials_secret=CREDENTIALS_SECRET, adls_dir_path=ADLS_DIR_PATH, - file_name=ADLS_FILE_NAME, + file_name=ADLS_FILE_NAME_LIST, list_title="", site_url="", ) result = flow.run() assert result.is_successful() - os.remove(ADLS_FILE_NAME + ".csv") + os.remove(ADLS_FILE_NAME_LIST + ".csv") os.remove("test_sharepoint_to_adls_run_flow.json") @@ -111,16 +109,36 @@ def test_sharepoint_list_to_adls_run_flow_parquet(mocked_class): output_file_extension=".parquet", adls_sp_credentials_secret=CREDENTIALS_SECRET, adls_dir_path=ADLS_DIR_PATH, - file_name=ADLS_FILE_NAME, + file_name=ADLS_FILE_NAME_LIST, list_title="", site_url="", ) result = flow.run() assert result.is_successful() - os.remove(ADLS_FILE_NAME + ".parquet") + os.remove(ADLS_FILE_NAME_LIST + ".parquet") os.remove("test_sharepoint_to_adls_run_flow.json") +@mock.patch( + "viadot.tasks.SharepointListToDF.run", + return_value=pd.DataFrame(data=DATA), +) +@pytest.mark.run +def test_sharepoint_list_to_adls_run_flow_wrong_extension(mocked_class): + with pytest.raises(ValueError) as exc: + flow = SharepointListToADLS( + "test_sharepoint_to_adls_run_flow", + output_file_extension=".s", + adls_sp_credentials_secret=CREDENTIALS_SECRET, + adls_dir_path=ADLS_DIR_PATH, + file_name=ADLS_FILE_NAME_LIST, + list_title="", + site_url="", + ) + result = flow.run() + assert "Output file extension can only be '.csv' or '.parquet'" in str(exc.value) + + @mock.patch( "viadot.tasks.SharepointListToDF.run", return_value=pd.DataFrame(data=DATA), @@ -132,12 +150,12 @@ def test_sharepoint_list_to_adls_run_flow_overwrite_true(mocked_class): output_file_extension=".csv", adls_sp_credentials_secret=CREDENTIALS_SECRET, adls_dir_path=ADLS_DIR_PATH, - file_name=ADLS_FILE_NAME, + file_name=ADLS_FILE_NAME_LIST, overwrite_adls=True, list_title="", site_url="", ) result = flow.run() assert result.is_successful() - os.remove(ADLS_FILE_NAME + ".csv") + os.remove(ADLS_FILE_NAME_LIST + ".csv") os.remove("test_sharepoint_to_adls_run_flow_overwrite_true.json") From 91e9d0bbeb5ace04806801798bdff36aac48f36b Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Tue, 5 Dec 2023 10:01:27 +0100 Subject: [PATCH 37/54] =?UTF-8?q?=F0=9F=8E=A8=20Change=20structure=20of=20?= =?UTF-8?q?'if'=20instruction=20and=20added=20to=20SAPRFCV2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 806e61250..16cd5483d 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd from prefect.utilities import logging -from prefect.engine.state import Failed try: import pyrfc @@ -262,13 +261,12 @@ def __init__( credentials = kwargs.pop("credentials", None) if credentials is None: credentials = DEFAULT_CREDENTIALS + if credentials is None: + raise CredentialError("Missing credentials.") logger.warning( "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." 
) - if credentials is None: - raise CredentialError("Missing credentials.") - super().__init__(*args, credentials=credentials, **kwargs) self.sep = sep @@ -702,9 +700,15 @@ def __init__( self._con = None DEFAULT_CREDENTIALS = local_config.get("SAP").get("DEV") - credentials = kwargs.pop("credentials", None) or DEFAULT_CREDENTIALS + + credentials = kwargs.pop("credentials", None) if credentials is None: - raise CredentialError("Missing credentials.") + credentials = DEFAULT_CREDENTIALS + if credentials is None: + raise CredentialError("Missing credentials.") + logger.warning( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + ) super().__init__(*args, credentials=credentials, **kwargs) From 6334ad427bafd4ffd29d8310694bba3e5bcca4b7 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Tue, 5 Dec 2023 10:07:12 +0100 Subject: [PATCH 38/54] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20new=20f?= =?UTF-8?q?unctionalities=20for=20SAPRFC=20and=20SAPRFCV2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++-- tests/integration/test_sap_rfc.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ba7a52ea..68c49c1c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - +- Added tests for new functionalities in SAPRFC and SAPRFCV2 regarding passing credentials ### Fixed ### Changed -- Changed __init__ in SAPRFC class in source in order to raise warning in prefect when credentials will be taken from DEV. +- Changed __init__ in SAPRFC and SAPRFCV2 class in source in order to raise warning in prefect when credentials will be taken from DEV. ## [0.4.22] - 2023-11-15 ### Added diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py index 20078d312..fd2298323 100644 --- a/tests/integration/test_sap_rfc.py +++ b/tests/integration/test_sap_rfc.py @@ -187,3 +187,19 @@ def test___build_pandas_filter_query_v2(): sap2._build_pandas_filter_query(sap2.client_side_filters) == "thirdlongcolname == 01234" ), sap2._build_pandas_filter_query(sap2.client_side_filters) + + +def test_default_credentials_warning_SAPRFC(caplog): + _ = SAPRFC() + assert ( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + in caplog.text + ) + + +def test_default_credentials_warning_SAPRFCV2(caplog): + _ = SAPRFCV2() + assert ( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." 
+ in caplog.text + ) From 4f3efc96d72830e2033452d1c9110b0b072a88ff Mon Sep 17 00:00:00 2001 From: kiurieva Date: Tue, 5 Dec 2023 12:02:09 +0100 Subject: [PATCH 39/54] cleaned check_connection and get_response methods --- viadot/flows/vid_club_to_adls.py | 4 +- viadot/sources/vid_club.py | 81 ++++++++++---------------------- viadot/tasks/vid_club.py | 4 +- 3 files changed, 30 insertions(+), 59 deletions(-) diff --git a/viadot/flows/vid_club_to_adls.py b/viadot/flows/vid_club_to_adls.py index 40f53d8ae..de7267479 100644 --- a/viadot/flows/vid_club_to_adls.py +++ b/viadot/flows/vid_club_to_adls.py @@ -31,7 +31,7 @@ def __init__( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, cols_to_drop: List[str] = None, vid_club_credentials: Dict[str, Any] = None, @@ -60,7 +60,7 @@ def __init__( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. Defaults to 100. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] days_interval (int, optional): Days specified in date range per API call (test showed that 30-40 is optimal for performance). Defaults to 30. cols_to_drop (List[str], optional): List of columns to drop. Defaults to None. vid_club_credentials (Dict[str, Any], optional): Stores the credentials information. Defaults to None. diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 327d9abf7..9aef751ad 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -58,7 +58,7 @@ def build_query( api_url (str): Generic part of the URL to Vid Club API. items_per_page (int): number of entries per page. source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] Returns: str: Final query with all filters added. @@ -67,7 +67,8 @@ def build_query( ValidationError: If any source different than the ones in the list are used. 
""" if source in ["jobs", "product", "company"]: - url = f"{api_url}{source}?from={from_date}&to={to_date}®ion={region}&limit={items_per_page}" + region_url_string = f"®ion={region}" if region else "" + url = f"{api_url}{source}?from={from_date}&to={to_date}{region_url_string}&limit={items_per_page}" elif source == "survey": url = f"{api_url}{source}?language=en&type=question" else: @@ -141,7 +142,7 @@ def check_connection( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] url (str, optional): Generic part of the URL to Vid Club API. Defaults to None. Returns: @@ -161,37 +162,19 @@ def check_connection( if url is None: url = self.credentials["url"] - if source in ["jobs", "product", "company"]: - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - elif source == "survey": - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - else: - raise ValidationError( - "Pick one these sources: jobs, product, company, survey" - ) + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() return (response, first_url) def get_response( @@ -210,7 +193,7 @@ def get_response( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] Returns: pd.DataFrame: Table of the data carried in the response. 
@@ -225,26 +208,14 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") - if source in ["jobs", "product", "company"]: - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - ) - elif source == "survey": - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) - else: - raise ValidationError( - "Pick one these sources: jobs, product, company, survey" - ) + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -304,7 +275,7 @@ def total_load( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. Returns: diff --git a/viadot/tasks/vid_club.py b/viadot/tasks/vid_club.py index 0814a306f..aba7025dc 100644 --- a/viadot/tasks/vid_club.py +++ b/viadot/tasks/vid_club.py @@ -85,7 +85,7 @@ def run( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: str = "all", + region: str = None, days_interval: int = 30, cols_to_drop: List[str] = None, ) -> pd.DataFrame: @@ -98,7 +98,7 @@ def run( from_date (str, optional): Start date for the query, by default is the oldest date in the data, '2022-03-22'. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (str, optional): Region filter for the query. Valid inputs: ["bg", "hu", "hr", "pl", "ro", "si", "all"]. Defaults to "all". + region (str, optional): Region filter for the query. Valid inputs: ["bg", "hu", "hr", "pl", "ro", "si", "all"]. Defaults to None. days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. cols_to_drop (List[str], optional): List of columns to drop. Defaults to None. 
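
The region change in the patch above boils down to making the `&region=` query fragment optional: when `region` is None it is omitted from the request URL entirely, in line with the December 2023 note that the 'all' value no longer works for the company and jobs endpoints. A small standalone sketch of that URL-building logic, using a placeholder base URL rather than the real Vid Club endpoint:

    from typing import Optional


    def build_url(
        api_url: str,
        source: str,
        from_date: str,
        to_date: str,
        items_per_page: int,
        region: Optional[str] = None,
    ) -> str:
        # The region filter is appended only when a region is actually given.
        region_part = f"&region={region}" if region else ""
        return (
            f"{api_url}{source}?from={from_date}&to={to_date}"
            f"{region_part}&limit={items_per_page}"
        )


    # With a region, the filter appears in the query string...
    print(build_url("https://example.com/api/", "product", "2022-03-22", "2023-12-05", 100, "pl"))
    # ...and without one it is dropped entirely.
    print(build_url("https://example.com/api/", "product", "2022-03-22", "2023-12-05", 100))
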
From 635c146116a5501d9bfe34218fe946766bf4cf09 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Tue, 5 Dec 2023 13:01:36 +0100 Subject: [PATCH 40/54] changed raise to endrun --- viadot/flows/sharepoint_to_adls.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index b2d6d22cb..0255d69e2 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -193,17 +193,13 @@ def slugify(name): @task(slug="check_df") def check_if_df_empty(df, if_no_data_returned: str = "fail"): # -> to task.utils - class NoDataReturnedError(BaseException): - def __init__(self, message): - self.message = message if df.empty: if if_no_data_returned == "warn": logger.warning("No data in the source response. Df empty.") return True - # raise ENDRUN(state=Failed("Failed task raised")) elif if_no_data_returned == "fail": - raise NoDataReturnedError("No data in the source response. Df empty...") + raise ENDRUN(state=Failed("No data in the source response. Df empty...")) elif if_no_data_returned == "skip": return False else: From 48ddcd65336a60d2a8bd8fc6403aded2157398bd Mon Sep 17 00:00:00 2001 From: kiurieva Date: Tue, 5 Dec 2023 15:52:54 +0100 Subject: [PATCH 41/54] unify region parameter --- viadot/sources/vid_club.py | 2 +- viadot/tasks/vid_club.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 9aef751ad..fe6e76098 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -263,7 +263,7 @@ def total_load( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, ) -> pd.DataFrame: """ diff --git a/viadot/tasks/vid_club.py b/viadot/tasks/vid_club.py index aba7025dc..aff0e09ea 100644 --- a/viadot/tasks/vid_club.py +++ b/viadot/tasks/vid_club.py @@ -85,7 +85,7 @@ def run( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: str = None, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, cols_to_drop: List[str] = None, ) -> pd.DataFrame: From be1a72e93ae0a3e2c90bba3780af38605f1ca0bc Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Tue, 5 Dec 2023 16:41:48 +0100 Subject: [PATCH 42/54] Moved tasks to task_utils + tests added --- tests/integration/tasks/test_task_utils.py | 42 +++- viadot/flows/sharepoint_to_adls.py | 17 +- viadot/task_utils.py | 233 ++++++++++++++++++++- 3 files changed, 274 insertions(+), 18 deletions(-) diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py index f22d55022..d10cceb4d 100644 --- a/tests/integration/tasks/test_task_utils.py +++ b/tests/integration/tasks/test_task_utils.py @@ -1,9 +1,15 @@ +import pytest import pandas as pd from prefect.backend import get_key_value, set_key_value from prefect.engine.state import Failed, Success from prefect.tasks.secrets import PrefectSecret -from viadot.task_utils import custom_mail_state_handler, set_new_kv +from viadot.task_utils import ( + custom_mail_state_handler, + set_new_kv, + search_for_msg_in_logs, + check_if_df_empty, +) def test_custom_state_handler(): @@ -28,3 +34,37 @@ def test_set_new_kv(): result = get_key_value("test_for_setting_kv") assert result == "72" set_key_value(key="test_for_setting_kv", value=None) + + +def 
test_search_for_msg_in_logs(): + logs = [ + {"message": "Error occurred"}, + {"message": "Warning: Invalid input"}, + {"message": "Log message"}, + ] + + # Test when the message is found in the logs + assert search_for_msg_in_logs.run(logs, "Error occurred") == True + + # Test when the message is not found in the logs + assert search_for_msg_in_logs.run(logs, "Info message") == False + + +def test_check_if_df_empty(): + df = pd.DataFrame() + from prefect.engine import signals + + # Test when the DataFrame is empty and if_no_data_returned is "warn" + assert check_if_df_empty.run(df, if_no_data_returned="warn") == True + + # Test when the DataFrame is empty and if_no_data_returned is "fail" + try: + check_if_df_empty.run(df, if_no_data_returned="fail") + except: + print("Task failed") + # Test when the DataFrame is empty and if_no_data_returned is "skip" + assert check_if_df_empty.run(df, if_no_data_returned="skip") == False + + # Test when the DataFrame is not empty + df = pd.DataFrame({"col": [1, 2, 3]}) + assert check_if_df_empty.run(df) == False diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 0255d69e2..6ee31f56e 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -21,6 +21,7 @@ ) from viadot.tasks import AzureDataLakeUpload from viadot.tasks.sharepoint import SharepointListToDF, SharepointToDF +from viadot.task_utils import check_if_df_empty logger = logging.get_logger() @@ -190,22 +191,6 @@ def slugify(name): return name.replace(" ", "_").lower() -@task(slug="check_df") -def check_if_df_empty(df, if_no_data_returned: str = "fail"): - # -> to task.utils - - if df.empty: - if if_no_data_returned == "warn": - logger.warning("No data in the source response. Df empty.") - return True - elif if_no_data_returned == "fail": - raise ENDRUN(state=Failed("No data in the source response. Df empty...")) - elif if_no_data_returned == "skip": - return False - else: - return False - - class SharepointListToADLS(Flow): def __init__( self, diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6a532f932..c87387efb 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -3,9 +3,10 @@ import os import re import shutil +import pendulum from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, List, Literal, Union, cast +from typing import TYPE_CHECKING, Any, Callable, List, Literal, Union, cast, Tuple import pandas as pd import prefect @@ -14,6 +15,7 @@ from prefect import Flow, Task, task from prefect.backend import set_key_value from prefect.engine.state import Failed +from prefect.engine.runner import ENDRUN from prefect.storage import Git from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging @@ -792,3 +794,232 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: raise ValidationError( f"Validation failed for {failed_tests} test/tests: {failed_tests_msg}" ) + + +@task(timeout=3600, slug="check_df") +def check_if_df_empty(df, if_no_data_returned: str = "fail") -> bool: + """ + Check if a DataFrame received as a data source response is empty. + If fail is expected , this task will finish with ENDRUN(Failed()) state. + + Args: + df (pandas.DataFrame): The DataFrame to check. + if_no_data_returned (str, optional): The action to take if no data is returned in the DataFrame. + Options are "fail" (default), "warn", or "skip". 
+ + Returns: + bool: True if the DataFrame is empty and the action is "warn", False otherwise. + + Raises: + ENDRUN: If the DataFrame is empty and the action is "fail". + + Example: + >>> df = pd.DataFrame() + >>> check_if_df_empty(df, if_no_data_returned="warn") + True + """ + if df.empty: + if if_no_data_returned == "warn": + logger.warning("No data in the source response. Df empty.") + return True + elif if_no_data_returned == "fail": + raise ENDRUN(state=Failed("No data in the source response. Df empty...")) + elif if_no_data_returned == "skip": + return False + else: + return False + + +@task(timeout=3600) +def get_flow_run_id(client: prefect.Client, flow_name: str, state: str) -> str: + """Gets the last flow run ID based on the name of the flow and time of its run in descending order of th flows runs + + Args: + client (prefect.Client): The Prefect client used to execute the GraphQL query. + flow_name (str): The name of the flow to search for. + state (str): The state of the flow run to filter by. + + Returns: + str: The ID of the last flow run that matches the given flow name and state. + + Raises: + ValueError: If the given flow name cannot be found in the Prefect Cloud API. + + Example: + >>> client = prefect.Client() + >>> flow_name = "My Flow" + >>> state = "SUCCESS" + >>> get_flow_run_id(client, flow_name, state) + "flow_run_id_12345" + """ + # Construct the GraphQL query + query = f""" + {{ + flow_run( + where: {{ + flow: {{ + name: {{_eq: "{flow_name}"}} + }} + state: {{_eq: "{state}"}} + }} + order_by : {{end_time: desc}} + limit : 1 + ){{ + id + }} + }} + """ + # Execute the GraphQL query + response = client.graphql(query) + result_data = response.get("data").get("flow_run") + if result_data: + flow_run_id = result_data.get("id")[0] + return flow_run_id + else: + raise ValueError("Given flow name cannot be found in the Prefect Cloud API") + + +@task(timeout=3600) +def get_task_logs(client: prefect.Client, flow_run_id: str, task_slug: str) -> List: + """ + Retrieves the logs for a specific task in a flow run using the Prefect client and GraphQL query. + + Args: + client (prefect.Client): The Prefect client used to execute the GraphQL query. + flow_run_id (str): The ID of the flow run. + task_slug (str): The slug of the task to retrieve logs for. + + Returns: + List[Dict[str, Union[str, List[Dict[str, str]]]]]: A list of log entries for the specified task. + Each log entry is a dictionary with 'message' and 'level' keys. + + Raises: + ValueError: If no data is available for the given task slug. 
+ + Example: + >>> client = prefect.Client() + >>> flow_run_id = "flow_run_id_12345" + >>> task_slug = "my_task" + >>> get_task_logs(client, flow_run_id, task_slug) + [{'message': 'Log message 1', 'level': 'INFO'}, {'message': 'Log message 2', 'level': 'DEBUG'}] + """ + # Construct the GraphQL query + query = f""" + {{ + task_run( + where: {{ + flow_run_id: {{_eq: "{flow_run_id}"}}, + task: {{slug: {{_eq: "{task_slug}"}}}} + }} + ) {{ + state + logs {{ + message + level + }} + }} + }} + """ + # Execute the GraphQL query + logger.info("Executing GraphQL query to get task logs") + response = client.graphql(query) + result_data = response.get("data").get("task_run") + # Extract task logs + if result_data: + logs = result_data[0].get("logs") + return logs + else: + raise ValueError("No data available for the given task slug") + + +@task(timeout=3600) +def send_email_notification( + from_address: Union[str, Tuple], + to_address: Union[str, List[str], List[Tuple], Tuple[str]], + content: str, + subject: str, + vault_name: str, + mail_credentials_secret: str, + timezone: str = "Europe/Warsaw", +) -> str: + """ + Sends an email notification using SendGrid API. + + Args: + from_address (Union[str, Tuple]): The email address of the sender. + to_address (Union[str, List[str], List[Tuple], Tuple[str]]): The email address(es) of the recipient(s). + content (str): The content of the email. + subject (str): The subject of the email. + vault_name (str): The name of the Azure Key Vault. + mail_credentials_secret (str): The secret name for the SendGrid API key. + timezone (str, optional): The timezone to use for the current datetime. Defaults to "Europe/Warsaw". + + Returns: + str: The response from the SendGrid API. + + Raises: + Exception: If the API key is not provided. + + Example: + >>> send_email_notification("sender@example.com", "recipient@example.com", "Hello!", "Test Email", "my-vault", "sendgrid-api-key") + 'Email sent successfully' + """ + + # Retrieve the SendGrid API key from the secret + if mail_credentials_secret is None: + mail_credentials_secret = PrefectSecret("SENDGRID_DEFAULT_SECRET").run() + elif mail_credentials_secret is not None: + credentials_str = AzureKeyVaultSecret( + mail_credentials_secret, vault_name=vault_name + ).run() + api_key = json.loads(credentials_str).get("API_KEY") + else: + raise Exception("Please provide API KEY") + + # Get the current datetime in the specified timezone + curr_dt = pendulum.now(tz=timezone) + + # Create the email message + message = Mail( + from_email=from_address, + to_emails=to_address, + subject=subject, + html_content=f"{content}", + ) + + # Send the email using SendGrid API + send_grid = SendGridAPIClient(api_key) + response = send_grid.send(message) + return response + + +@task(timeout=3600) +def search_for_msg_in_logs(logs: list, log_info: str) -> bool: + """ + Searches for a specific message in Prefect flow or task logs. + + Args: + logs (list): The logs to search in. + log_info (str): The message to search for. + + Returns: + bool: True if the message is found, False otherwise. + + Example: + >>> logs = [ + ... {"message": "Error occurred"}, + ... {"message": "Warning: Invalid input"}, + ... {"message": "Log message"} + ... 
] + >>> search_for_msg_in_logs(logs, "Error occurred") + True + """ + found_msg = False + + # Iterate over each log entry + for value in logs: + if value.get("message") == log_info: + found_msg = True + break + + return found_msg From 2f7ed2a1b6cb6fdf124a8322ff0a0d9ebf18cf48 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 6 Dec 2023 09:10:56 +0000 Subject: [PATCH 43/54] =?UTF-8?q?=F0=9F=8E=A8=20Format=20Python=20code=20w?= =?UTF-8?q?ith=20Black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_sharepoint_to_adls.py | 2 +- tests/integration/test_sap_rfc.py | 4 ++-- tests/unit/test_utils.py | 2 +- viadot/sources/sap_bw.py | 1 + viadot/tasks/genesys.py | 2 +- viadot/utils.py | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index 5603c49ee..f0597c41a 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -6,7 +6,7 @@ import pytest from prefect.tasks.secrets import PrefectSecret -from viadot.flows import SharepointToADLS, SharepointListToADLS +from viadot.flows import SharepointListToADLS, SharepointToADLS from viadot.tasks import AzureDataLakeRemove ADLS_FILE_NAME = str(pendulum.now("utc")) + ".csv" diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py index fd2298323..0ca9a2a1c 100644 --- a/tests/integration/test_sap_rfc.py +++ b/tests/integration/test_sap_rfc.py @@ -187,8 +187,8 @@ def test___build_pandas_filter_query_v2(): sap2._build_pandas_filter_query(sap2.client_side_filters) == "thirdlongcolname == 01234" ), sap2._build_pandas_filter_query(sap2.client_side_filters) - - + + def test_default_credentials_warning_SAPRFC(caplog): _ = SAPRFC() assert ( diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 75ef30e97..c29fbc014 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -8,8 +8,8 @@ from viadot.utils import ( add_viadot_metadata_columns, check_if_empty_file, - gen_bulk_insert_query_from_df, check_value, + gen_bulk_insert_query_from_df, ) EMPTY_CSV_PATH = "empty.csv" diff --git a/viadot/sources/sap_bw.py b/viadot/sources/sap_bw.py index 90b70dfec..e70f79b36 100644 --- a/viadot/sources/sap_bw.py +++ b/viadot/sources/sap_bw.py @@ -1,5 +1,6 @@ import textwrap from typing import List + import pyrfc from viadot.exceptions import CredentialError, ValidationError diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index d974dd587..feafbaccf 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -13,8 +13,8 @@ from viadot.exceptions import APIError from viadot.sources import Genesys -from viadot.utils import check_value from viadot.task_utils import * +from viadot.utils import check_value logger = logging.get_logger() diff --git a/viadot/utils.py b/viadot/utils.py index 5e3de784c..cd34adb8a 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -2,7 +2,7 @@ import os import re from itertools import chain -from typing import Union, Any, Callable, Dict, List, Literal +from typing import Any, Callable, Dict, List, Literal, Union import pandas as pd import prefect From 7aac8b0c7033dba90127e3ee1be89502670ddbc9 Mon Sep 17 00:00:00 2001 From: kiurieva Date: Wed, 6 Dec 2023 12:39:53 +0100 Subject: [PATCH 44/54] Fixed total_load method, updated tests --- tests/integration/test_vid_club.py | 1 - viadot/sources/vid_club.py | 5 +++++ 2 
files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_vid_club.py b/tests/integration/test_vid_club.py
index 6c2bd4544..50c3015cf 100644
--- a/tests/integration/test_vid_club.py
+++ b/tests/integration/test_vid_club.py
@@ -66,7 +66,6 @@ def test_url_string():
     expected_elements = [
         f"from={from_date}",
         f"to={to_date}",
-        "region=all",
         f"limit={items_per_page}",
         api_url,
     ]
diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py
index fe6e76098..4da4e4f45 100644
--- a/viadot/sources/vid_club.py
+++ b/viadot/sources/vid_club.py
@@ -310,6 +310,11 @@ def total_load(
             items_per_page=items_per_page,
             region=region,
         )
+        list_columns = df.columns[
+            df.applymap(lambda x: isinstance(x, list)).any()
+        ].tolist()
+        for i in list_columns:
+            df[i] = df[i].apply(lambda x: tuple(x) if isinstance(x, list) else x)
         df.drop_duplicates(inplace=True)

         if df.empty:

From 8f447aa93ce4e0956070a03b6897d44633b00522 Mon Sep 17 00:00:00 2001
From: Angelika Tarnawa
Date: Wed, 6 Dec 2023 14:41:50 +0100
Subject: [PATCH 45/54] =?UTF-8?q?=F0=9F=94=8A=20Updated=20logger=20warning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/integration/test_sap_rfc.py | 4 ++--
 viadot/sources/sap_rfc.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py
index 0ca9a2a1c..28ab044a2 100644
--- a/tests/integration/test_sap_rfc.py
+++ b/tests/integration/test_sap_rfc.py
@@ -192,7 +192,7 @@ def test___build_pandas_filter_query_v2():
 def test_default_credentials_warning_SAPRFC(caplog):
     _ = SAPRFC()
     assert (
-        "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow."
+        "Your credentials will use DEV environment. If you would like to use different one - please specify it."
         in caplog.text
     )
@@ -200,6 +200,6 @@ def test_default_credentials_warning_SAPRFCV2(caplog):
 def test_default_credentials_warning_SAPRFCV2(caplog):
     _ = SAPRFCV2()
     assert (
-        "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow."
+        "Your credentials will use DEV environment. If you would like to use different one - please specify it."
        in caplog.text
     )
diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py
index 16cd5483d..a9d109148 100644
--- a/viadot/sources/sap_rfc.py
+++ b/viadot/sources/sap_rfc.py
@@ -264,7 +264,7 @@ def __init__(
         if credentials is None:
             raise CredentialError("Missing credentials.")
         logger.warning(
-            "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow."
+            "Your credentials will use DEV environment. If you would like to use different one - please specify it."
        )
         super().__init__(*args, credentials=credentials, **kwargs)

From 8f6075bb33a01651ac42f70409e9af7de4189eca Mon Sep 17 00:00:00 2001
From: marcinpurtak
Date: Wed, 6 Dec 2023 14:43:45 +0100
Subject: [PATCH 46/54] Updated tests, removed unnecessary imports, changed file path parameter

---
 .../flows/test_sharepoint_to_adls.py | 19 +++++++++++++++----
 viadot/flows/sharepoint_to_adls.py   |  4 ++--
 viadot/task_utils.py                 |  1 +
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py
index bf7b1e5e5..93a31f5d1 100644
--- a/tests/integration/flows/test_sharepoint_to_adls.py
+++ b/tests/integration/flows/test_sharepoint_to_adls.py
@@ -171,6 +171,11 @@ def test_sharepoint_list_to_adls_run_flow_overwrite_true(mocked_class):
 )
 @pytest.mark.run
 def test_sharepoint_list_to_adls_run_flow_fail_on_no_data_returned(mocked_class):
+    """
+    Test will check if the flow fails when an empty DF is passed
+    with the given parameter if_no_data_returned = "fail".
+    CSV file should not be generated!
+    """
     flow = SharepointListToADLS(
         "test_sharepoint_to_adls_run_flow",
         output_file_extension=".csv",
@@ -183,8 +188,6 @@ def test_sharepoint_list_to_adls_run_flow_fail_on_no_data_returned(mocked_class)
     )
     result = flow.run()
     assert result.is_failed()
-    os.remove(ADLS_FILE_NAME_LIST + ".csv")
-    os.remove("test_sharepoint_to_adls_run_flow.json")


 @mock.patch(
@@ -193,6 +196,11 @@ def test_sharepoint_list_to_adls_run_flow_fail_on_no_data_returned(mocked_class)
 @pytest.mark.run
 def test_sharepoint_list_to_adls_run_flow_success_on_no_data_returned(mocked_class):
+    """
+    Test will check if the flow succeeds when an empty DF is passed
+    with the given parameter if_no_data_returned = "skip".
+    An empty CSV file should be generated!
+    """
     flow = SharepointListToADLS(
         "test_sharepoint_to_adls_run_flow",
         output_file_extension=".csv",
@@ -217,6 +225,11 @@ def test_sharepoint_list_to_adls_run_flow_success_on_no_data_returned(mocked_cla
 def test_sharepoint_list_to_adls_run_flow_success_warn_on_no_data_returned(
     mocked_class,
 ):
+    """
+    Test will check if the flow succeeds with a warning when an empty DF is passed
+    with the given parameter if_no_data_returned = "warn".
+    CSV file should not be generated!
+    """
     # Get prefect client instance
     flow = SharepointListToADLS(
         "test_sharepoint_to_adls_run_flow",
@@ -230,5 +243,3 @@ def test_sharepoint_list_to_adls_run_flow_success_warn_on_no_data_returned(
     )
     result = flow.run()
     assert result.is_successful()
-    os.remove(ADLS_FILE_NAME_LIST + ".csv")
-    os.remove("test_sharepoint_to_adls_run_flow.json")
diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py
index edac65238..e9dfe4b72 100644
--- a/viadot/flows/sharepoint_to_adls.py
+++ b/viadot/flows/sharepoint_to_adls.py
@@ -224,7 +224,7 @@ def __init__(
             name (str): Prefect flow name.
             list_title (str): Title of Sharepoint List.
             site_url (str): URL to set of Sharepoint Lists.
-            file_name (str): Name of file in ADLS. Defaults to None.
+            file_name (str): Name of file(without extension) in ADLS . Defaults to None.
             adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None.
             filters (dict, optional): Dictionary with operators which filters the SharepointList output. Defaults to None.
allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') @@ -371,7 +371,7 @@ def gen_flow(self) -> Flow: file_to_adls_task = AzureDataLakeUpload() file_to_adls_task.bind( - from_path=self.path, + from_path=self.local_file_path, to_path=self.adls_dir_path, overwrite=self.overwrite, sp_credentials_secret=self.adls_sp_credentials_secret, diff --git a/viadot/task_utils.py b/viadot/task_utils.py index c87387efb..b7a518033 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -29,6 +29,7 @@ from viadot.exceptions import CredentialError, ValidationError from viadot.tasks import AzureDataLakeUpload, AzureKeyVaultSecret + logger = logging.get_logger() METADATA_COLUMNS = {"_viadot_downloaded_at_utc": "DATETIME"} From 8d326ea069b2df6ee275479c988e917c88a01555 Mon Sep 17 00:00:00 2001 From: Marcin Purtak <44641138+marcinpurtak@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:08:03 +0100 Subject: [PATCH 47/54] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added literals for if_no_data_returned and missing dots Co-authored-by: Rafał Ziemianek <49795849+Rafalz13@users.noreply.github.com> --- viadot/task_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index b7a518033..6fc00d760 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -798,14 +798,14 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: @task(timeout=3600, slug="check_df") -def check_if_df_empty(df, if_no_data_returned: str = "fail") -> bool: +def check_if_df_empty(df, if_no_data_returned: Literal["fail", "warn", "skip"] = "fail") -> bool: """ Check if a DataFrame received as a data source response is empty. If fail is expected , this task will finish with ENDRUN(Failed()) state. Args: df (pandas.DataFrame): The DataFrame to check. - if_no_data_returned (str, optional): The action to take if no data is returned in the DataFrame. + if_no_data_returned (Literal["fail", "warn", "skip"], optional): The action to take if no data is returned in the DataFrame. Defaults to "fail". Options are "fail" (default), "warn", or "skip". Returns: @@ -833,7 +833,7 @@ def check_if_df_empty(df, if_no_data_returned: str = "fail") -> bool: @task(timeout=3600) def get_flow_run_id(client: prefect.Client, flow_name: str, state: str) -> str: - """Gets the last flow run ID based on the name of the flow and time of its run in descending order of th flows runs + """Gets the last flow run ID based on the name of the flow and time of its run in descending order of the flow runs. Args: client (prefect.Client): The Prefect client used to execute the GraphQL query. 
@@ -922,7 +922,7 @@ def get_task_logs(client: prefect.Client, flow_run_id: str, task_slug: str) -> L }} """ # Execute the GraphQL query - logger.info("Executing GraphQL query to get task logs") + logger.info("Executing GraphQL query to get task logs.") response = client.graphql(query) result_data = response.get("data").get("task_run") # Extract task logs @@ -930,7 +930,7 @@ def get_task_logs(client: prefect.Client, flow_run_id: str, task_slug: str) -> L logs = result_data[0].get("logs") return logs else: - raise ValueError("No data available for the given task slug") + raise ValueError("No data available for the given task slug.") @task(timeout=3600) From b63d16e756df04ed28f14fe0d839666b794c47d5 Mon Sep 17 00:00:00 2001 From: Marcin Purtak <44641138+marcinpurtak@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:12:09 +0100 Subject: [PATCH 48/54] Dot added MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rafał Ziemianek <49795849+Rafalz13@users.noreply.github.com> --- viadot/flows/sharepoint_to_adls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index e9dfe4b72..0d26d75aa 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -224,7 +224,7 @@ def __init__( name (str): Prefect flow name. list_title (str): Title of Sharepoint List. site_url (str): URL to set of Sharepoint Lists. - file_name (str): Name of file(without extension) in ADLS . Defaults to None. + file_name (str): Name of file (without extension) in ADLS. Defaults to None. adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. filters (dict, optional): Dictionary with operators which filters the SharepointList output. Defaults to None. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') From 9fd6e6f628c0f9d710e643c1a557540dc2b02ac5 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 6 Dec 2023 15:14:04 +0100 Subject: [PATCH 49/54] Formatting fix --- viadot/task_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6fc00d760..4459c715d 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -798,7 +798,9 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: @task(timeout=3600, slug="check_df") -def check_if_df_empty(df, if_no_data_returned: Literal["fail", "warn", "skip"] = "fail") -> bool: +def check_if_df_empty( + df, if_no_data_returned: Literal["fail", "warn", "skip"] = "fail" +) -> bool: """ Check if a DataFrame received as a data source response is empty. If fail is expected , this task will finish with ENDRUN(Failed()) state. From bd0d155049ff628f9d530b10917a49370552f6bf Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 7 Dec 2023 10:52:29 +0100 Subject: [PATCH 50/54] =?UTF-8?q?=F0=9F=90=9B=20Changed=20`cols=5Fto=5Fdro?= =?UTF-8?q?p`=20in=20VidClub?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_vid_club.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/tasks/test_vid_club.py b/tests/integration/tasks/test_vid_club.py index 8fad7fdde..6ba849c13 100644 --- a/tests/integration/tasks/test_vid_club.py +++ b/tests/integration/tasks/test_vid_club.py @@ -56,7 +56,7 @@ def test_drop_columns(var_dictionary): Args: var_dictionary: Dictionary with example arguments for run method. 
""" - cols_to_drop = ["regionID", "submissionDate"] + cols_to_drop = ["__v", "status"] vc_to_df = VidClubToDF(credentials=CREDENTIALS) output_with_dropped = vc_to_df.run( From 41204afbcd496b4f4a781de859b14aed93294871 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 7 Dec 2023 11:05:40 +0100 Subject: [PATCH 51/54] =?UTF-8?q?=F0=9F=90=9B=20Changed=20test=20for=20Vid?= =?UTF-8?q?Club=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_vidclub_to_adls.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/integration/flows/test_vidclub_to_adls.py b/tests/integration/flows/test_vidclub_to_adls.py index 0f6705579..79592aeb9 100644 --- a/tests/integration/flows/test_vidclub_to_adls.py +++ b/tests/integration/flows/test_vidclub_to_adls.py @@ -82,11 +82,9 @@ def test_vidclub_validate_df_task_fail(caplog): overwrite_adls=True, validate_df_dict={ "column_size": {"submissionID": 5}, - "column_unique_values": ["regionID"], + "column_unique_values": ["id"], }, ) - try: - flow.run() - except ValidationError: - pass + result = flow.run() + assert result.is_failed() From 434fc66da8be416b95510abb1c48ee48b4498d17 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 7 Dec 2023 11:50:27 +0100 Subject: [PATCH 52/54] =?UTF-8?q?=F0=9F=94=A5=20Removed=20unused=20paramet?= =?UTF-8?q?ers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_bigquery_to_adls.py | 12 +++--------- tests/integration/flows/test_mysql_to_adls.py | 2 -- tests/integration/flows/test_salesforce_to_adls.py | 2 -- tests/integration/flows/test_vidclub_to_adls.py | 2 -- 4 files changed, 3 insertions(+), 15 deletions(-) diff --git a/tests/integration/flows/test_bigquery_to_adls.py b/tests/integration/flows/test_bigquery_to_adls.py index b4503c6e9..e6116c9c0 100644 --- a/tests/integration/flows/test_bigquery_to_adls.py +++ b/tests/integration/flows/test_bigquery_to_adls.py @@ -101,13 +101,9 @@ def test_bigquery_to_adls_validate_df_fail(mocked_data): adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET, validate_df_dict={"column_list_to_match": ["type", "country", "test"]}, ) - try: - result = flow_bigquery.run() - except ValidationError: - pass - os.remove("test_bigquery_to_adls_validate_df_fail.parquet") - os.remove("test_bigquery_to_adls_validate_df_fail.json") + result = flow_bigquery.run() + assert result.is_failed() @mock.patch( @@ -138,7 +134,5 @@ def test_bigquery_to_adls_validate_df_success(mocked_data): os.remove("test_bigquery_to_adls_validate_df_success.parquet") os.remove("test_bigquery_to_adls_validate_df_success.json") - rm = AzureDataLakeRemove( - path=ADLS_DIR_PATH + ADLS_FILE_NAME, vault_name="azuwevelcrkeyv001s" - ) + rm = AzureDataLakeRemove(path=ADLS_DIR_PATH + ADLS_FILE_NAME) rm.run(sp_credentials_secret=ADLS_CREDENTIAL_SECRET) diff --git a/tests/integration/flows/test_mysql_to_adls.py b/tests/integration/flows/test_mysql_to_adls.py index c968d48a3..768b5cf7c 100644 --- a/tests/integration/flows/test_mysql_to_adls.py +++ b/tests/integration/flows/test_mysql_to_adls.py @@ -18,7 +18,6 @@ def test_adls_gen1_to_azure_sql_new_mock(TEST_PARQUET_FILE_PATH): query=query, file_path=TEST_PARQUET_FILE_PATH, to_path=f"raw/examples/{TEST_PARQUET_FILE_PATH}", - sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA-DEV", overwrite_adls=True, ) flow.run() @@ -32,7 +31,6 @@ def test_validate_df(TEST_PARQUET_FILE_PATH): country_short="DE", query=query, 
file_path=TEST_PARQUET_FILE_PATH, - sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", to_path=f"raw/examples/{TEST_PARQUET_FILE_PATH}", validate_df_dict={"column_size": {"sales_org": 3}}, ) diff --git a/tests/integration/flows/test_salesforce_to_adls.py b/tests/integration/flows/test_salesforce_to_adls.py index 8c032f308..b58c51f3a 100644 --- a/tests/integration/flows/test_salesforce_to_adls.py +++ b/tests/integration/flows/test_salesforce_to_adls.py @@ -30,7 +30,6 @@ def test_salesforce_to_adls(): os.remove("test_salesforce_to_adls_run_flow.json") rm = AzureDataLakeRemove( path=ADLS_DIR_PATH + ADLS_FILE_NAME, - vault_name="azuwevelcrkeyv001s", ) rm.run(sp_credentials_secret=credentials_secret) @@ -56,6 +55,5 @@ def test_salesforce_to_adls_validate_success(): os.remove("test_salesforce_to_adls_run_flow.json") rm = AzureDataLakeRemove( path=ADLS_DIR_PATH + ADLS_FILE_NAME, - vault_name="azuwevelcrkeyv001s", ) rm.run(sp_credentials_secret=credentials_secret) diff --git a/tests/integration/flows/test_vidclub_to_adls.py b/tests/integration/flows/test_vidclub_to_adls.py index 79592aeb9..c3a7dcaf4 100644 --- a/tests/integration/flows/test_vidclub_to_adls.py +++ b/tests/integration/flows/test_vidclub_to_adls.py @@ -47,7 +47,6 @@ def test_vidclub_validate_df_task_success(caplog): to_date="2023-10-25", adls_dir_path="raw/tests", adls_file_name="test.parquet", - adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", overwrite_adls=True, validate_df_dict={ "column_size": {"submissionID": 5}, @@ -78,7 +77,6 @@ def test_vidclub_validate_df_task_fail(caplog): to_date="2023-10-25", adls_dir_path="raw/tests", adls_file_name="test.parquet", - adls_sp_credentials_secret="App-Azure-CR-DatalakeGen2-AIA", overwrite_adls=True, validate_df_dict={ "column_size": {"submissionID": 5}, From a9b6f66aaf48a24ab7fd3c4e97e93765d0a9be13 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Thu, 7 Dec 2023 12:12:01 +0100 Subject: [PATCH 53/54] =?UTF-8?q?=F0=9F=94=A5=20Removed=20test=20for=20`ge?= =?UTF-8?q?t=5Fsql=5Fserver=5Ftable=5Fdtypes`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 21517ec8e..38564ed9e 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -6,14 +6,11 @@ from viadot.exceptions import APIError from viadot.signals import SKIP -from viadot.sources import AzureSQL from viadot.utils import ( add_viadot_metadata_columns, check_if_empty_file, gen_bulk_insert_query_from_df, - get_flow_last_run_date, get_nested_value, - get_sql_server_table_dtypes, slugify, handle_api_response, union_dict, @@ -48,12 +45,6 @@ def example_dataframe(): return pd.DataFrame(data, columns=["id", "name", "is_deleted", "balance"]) -@pytest.fixture(scope="function") -def azure_sql(): - azure_sql = AzureSQL(config_key="AZURE_SQL") - yield azure_sql - - @pytest.fixture(scope="function") def nested_dict(): nested_dict = { @@ -274,23 +265,6 @@ def test_handle_api_response_return_type(): assert response.status_code == 200 -def test_get_sql_server_table_dtypes(azure_sql): - """Checks if dtypes is generated in a good way using `get_sql_server_table_dtypes` function.""" - - SCHEMA = "sandbox" - TABLE = "test_table_dtypes" - dtypes = {"country": "VARCHAR(100)", "sales": "INT"} - - azure_sql.create_table( - schema=SCHEMA, table=TABLE, dtypes=dtypes, if_exists="replace" - ) - - dtypes = 
get_sql_server_table_dtypes(schema=SCHEMA, table=TABLE, con=azure_sql.con)
-    assert isinstance(dtypes, dict)
-    assert list(dtypes.keys()) == ["country", "sales"]
-    assert list(dtypes.values()) == ["varchar(100)", "int"]
-
-
 def test_union_dict_return():
     """Check if dictionaries are unioned in the correct way."""
     a = {"a": 1}

From a1044222efce72843fe32f3d8317f5268468fdb3 Mon Sep 17 00:00:00 2001
From: Rafalz13
Date: Thu, 7 Dec 2023 12:25:58 +0100
Subject: [PATCH 54/54] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog=20befo?=
 =?UTF-8?q?re=20release?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe9d467f9..c0fe463f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,16 +6,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
-- Added tests for new functionalities in SAPRFC and SAPRFCV2 regarding passing credentials
+
+### Fixed
+
+### Changed
+
+## [0.4.23] - 2023-12-07
+### Added
+- Added tests for new functionalities in SAPRFC and SAPRFCV2 regarding passing credentials.
 - Added new params for mapping and reordering DataFrame for `Genesys` task and flow.
-- Tasks to search for logs in the flow
-- Tasks to find flow ID
-- Tasks used to control flows in multiflows by searching for a given log from a given task
+- Added `get_task_logs` task to search for logs in the flow.
+- Added `get_flow_run_id` task to find the flow ID.
+- Added `search_for_msg_in_logs` task used to control flows in multiflows by searching for a given log message from a given task.
+- Added closing session to `SAPBW`.
+- Added `CSV` as a new output extension to `SharepointListToADLS` flow.
+
 ### Fixed
+- Fixed creation of the URL in the `VidClub` source class. When `region=None`, the region parameter will not be included in the URL.
 ### Changed
-- if_no_data_returned added for sharepoint list flow which can fail,warn in case of no data returend or skip (continue) execution in the old way
-- Changed __init__ in SAPRFC and SAPRFCV2 class in source in order to raise warning in prefect when credentials will be taken from DEV.
+- `if_no_data_returned` added for the SharePoint list flow, which can fail or warn when no data is returned, or skip (continue) execution in the old way.
+- Changed `__init__` in the `SAPRFC` and `SAPRFCV2` source classes in order to raise a warning in Prefect when credentials are taken from DEV.
 ## [0.4.22] - 2023-11-15
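
A minimal usage sketch of the multiflow-control tasks introduced in this series, assuming an upstream flow named "Upstream Flow" that finished in a "Success" state (both values are illustrative, not taken from the patches); the "check_df" slug comes from the check_if_df_empty task decorator above, and in Prefect 1.x the task functions can be called directly via .run(), as the unit tests above do:

    import prefect

    from viadot.task_utils import get_flow_run_id, get_task_logs, search_for_msg_in_logs

    client = prefect.Client()

    # Find the ID of the last matching run of the upstream flow (illustrative name and state).
    flow_run_id = get_flow_run_id.run(client, flow_name="Upstream Flow", state="Success")

    # Pull the logs of the task registered with slug="check_df" in that run.
    logs = get_task_logs.run(client, flow_run_id=flow_run_id, task_slug="check_df")

    # React to the warning emitted by check_if_df_empty(if_no_data_returned="warn").
    if search_for_msg_in_logs.run(logs, "No data in the source response. Df empty."):
        print("Upstream flow returned no data - skipping the downstream load.")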