From 3fd261002eec974690256526f70c003cd3ede36e Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 24 Oct 2023 12:24:52 +0200 Subject: [PATCH 01/86] pass global variable to function --- viadot/sources/customer_gauge.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index a696bba32..beee84944 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -215,19 +215,25 @@ def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: Returns: Dict[str, Any]: The flattened dictionary. """ - out = {} + result = {} - def flattify(x, key=""): - if type(x) is dict: + if not isinstance(json_response, dict): + raise TypeError("Input must be a dictionary.") + + def flattify(x, key="", out = None): + if out is None: + out = result + + if isinstance(x, dict): for a in x: - flattify(x[a], key + a + "_") + flattify(x[a], key + a + "_", out) else: out[key[:-1]] = x flattify(json_response) - return out - + return result + def to_df(self, json_response: Dict[str, Any] = None) -> pd.DataFrame: """ Flatten dictionary structure and convert it into pandas DataFrame. Cleans column names. From c543c1d0fd42b9158a6bc30f0946dabcd08b9f32 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 04:41:53 +0200 Subject: [PATCH 02/86] limit source class to producing list of dicts --- viadot/sources/customer_gauge.py | 83 +++----------------------------- 1 file changed, 7 insertions(+), 76 deletions(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index beee84944..4fccf45f5 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -165,87 +165,21 @@ def get_cursor(self, json_response: Dict[str, Any] = None) -> int: ) return cur - - def properties_cleaning( - self, json_response: Dict[str, Any] = None - ) -> Dict[str, Any]: - """ - Returns initialy cleaned data. The cleaning of the additional params is depend on the endpoint. - - Args: - json_response (Dict[str, Any], optional): Dictionary with nested structure that contains data and cursor parameter value. Defaults to None. - - Returns: - Dict[str, Any]: Dictionary that contains cleaned data corresponding to one record. - """ - clean_properties = { - d["field"]: d["reference"] for d in json_response["properties"] - } - json_response["properties"] = clean_properties - - if self.endpoint == "responses": - json_response["drivers"] = ( - " ".join(map(str, json_response["drivers"])) - .replace("label", ",") - .replace(r"{',':", " ") - .replace(r"'", "") - .replace("}", "") - .strip() - .replace(" ", ",") - ) - json_response["tags"] = " ".join(map(str, json_response["tags"])).replace( - "[]", "" - ) - json_response["questions"] = " ".join( - map(str, json_response["questions"]) - ).replace("[]", "") - else: - pass - - return json_response - - def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: - """ - Function that flattens a nested structure of the JSON object into a single-level dictionary. - Uses a nested `flatten()` function to recursively combine nested keys in the JSON object with '_' to create the flattened keys. - - Args: - json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary. Defaults to None. - - Returns: - Dict[str, Any]: The flattened dictionary. 
- """ - result = {} - - if not isinstance(json_response, dict): - raise TypeError("Input must be a dictionary.") - - def flattify(x, key="", out = None): - if out is None: - out = result - - if isinstance(x, dict): - for a in x: - flattify(x[a], key + a + "_", out) - else: - out[key[:-1]] = x - - flattify(json_response) - - return result - def to_df(self, json_response: Dict[str, Any] = None) -> pd.DataFrame: + def to_df(self, + json_response: Dict[str, Any] = None, + ) -> List[Dict[str, Any]]: """ - Flatten dictionary structure and convert it into pandas DataFrame. Cleans column names. + Extract and return the 'data' part of a JSON response as a list of dictionaries. Args: json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary that contains data and cursor parameter value. Defaults to None. Raises: - ValueError: If data value not found. + ValueError: If the 'data' key is not present in the provided JSON response. Returns: - pd.DataFrame: pandas.DataFrame + List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' part of the JSON response. """ try: response_json = json_response["data"] @@ -253,8 +187,5 @@ def to_df(self, json_response: Dict[str, Any] = None) -> pd.DataFrame: raise ValueError( "Provided argument doesn't contain 'data' value. Pass json returned from the endpoint." ) - clean_json = list(map(self.properties_cleaning, response_json)) - df = pd.DataFrame(list(map(self.flatten_json, clean_json))) - df.columns = df.columns.str.lower().str.replace(" ", "_") - return df + return response_json From d1189a4e02e8c9c5d38b61435954ea1b9bf2767b Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 04:43:32 +0200 Subject: [PATCH 03/86] adjust function name --- viadot/sources/customer_gauge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index 4fccf45f5..317a35876 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -166,7 +166,7 @@ def get_cursor(self, json_response: Dict[str, Any] = None) -> int: return cur - def to_df(self, + def to_list_of_jsons(self, json_response: Dict[str, Any] = None, ) -> List[Dict[str, Any]]: """ From 1b3f9bd0d81677d789f5f03fe30e8839bed28452 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 04:45:01 +0200 Subject: [PATCH 04/86] add List dtype --- viadot/sources/customer_gauge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index 317a35876..7b581bfe3 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Dict, Literal +from typing import Any, Dict, Literal, List import pandas as pd from prefect.utilities import logging From d2c453f52d0040b81aeec2b8fdc92a155a8ea7cc Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 05:04:32 +0200 Subject: [PATCH 05/86] remove extracting data function --- viadot/sources/customer_gauge.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index 7b581bfe3..6ceeccd02 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Dict, Literal, List +from typing import Any, Dict, Literal import pandas as pd from prefect.utilities import 
logging @@ -165,27 +165,3 @@ def get_cursor(self, json_response: Dict[str, Any] = None) -> int: ) return cur - - def to_list_of_jsons(self, - json_response: Dict[str, Any] = None, - ) -> List[Dict[str, Any]]: - """ - Extract and return the 'data' part of a JSON response as a list of dictionaries. - - Args: - json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary that contains data and cursor parameter value. Defaults to None. - - Raises: - ValueError: If the 'data' key is not present in the provided JSON response. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' part of the JSON response. - """ - try: - response_json = json_response["data"] - except: - raise ValueError( - "Provided argument doesn't contain 'data' value. Pass json returned from the endpoint." - ) - - return response_json From 70d2d4064b54c2fc85ebbcb956ee3f746e2ad419 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 08:06:48 +0200 Subject: [PATCH 06/86] add cleaning functions to task class --- viadot/tasks/customer_gauge.py | 217 ++++++++++++++++++++++++++++++--- 1 file changed, 200 insertions(+), 17 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 4f1f26bbd..373ca1501 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -1,6 +1,6 @@ import json from datetime import datetime -from typing import Literal +from typing import Any, Dict, Literal, List import pandas as pd from prefect import Task @@ -31,18 +31,26 @@ def __init__( **kwargs, ): """ - Task CustomerGaugeToDF for downloading the selected range of data from Customer Gauge endpoint and return as one pandas DataFrame. + Task CustomerGaugeToDF for downloading the selected range of data from Customer Gauge + endpoint and return as one pandas DataFrame. Args: - endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). Defaults to True. + endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint + to connect. Defaults to None. + total_load (bool, optional): Indicate whether to download the data to the latest. + If 'False', only one API call is executed (up to 1000 records). Defaults to True. endpoint_url (str, optional): Endpoint URL. Defaults to None. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. - start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. - timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + Defaults to 1000. + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], + optional): Specifies the date type which filter date range. Defaults to None. 
+ start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. + Defaults to None. + end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. + Defaults to None. + timeout (int, optional): The time (in seconds) to wait while running this task before + a timeout occurs. Defaults to 3600. """ self.endpoint = endpoint self.total_load = total_load @@ -59,6 +67,175 @@ def __init__( *args, **kwargs, ) + def get_data(self, + json_response: Dict[str, Any] = None, + ) -> List[Dict[str, Any]]: + """ + Extract and return the 'data' part of a JSON response as a list of dictionaries. + + Args: + json_response (Dict[str, Any], optional): JSON object represented as a nested + dictionary that contains data and cursor parameter value. Defaults to None. + + Raises: + ValueError: If the 'data' key is not present in the provided JSON response. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' + part of the JSON response. + """ + try: + jsons_list = json_response["data"] + except: + raise ValueError( + "Provided argument doesn't contain 'data' value. Pass json returned from the endpoint." + ) + + return jsons_list + + def _field_reference_unpacker( + self, + json_response: Dict[str, Any], + field: str, + ) -> Dict[str, Any]: + """ + Unpack and modify dictionaries within the specified field of a JSON response. + + This function takes a JSON response and a field name. It processes dictionaries + within the specified field, checking if each dictionary contains exactly two items. + If a dictionary meets this criteria, it is transformed into a new dictionary, + where the first key becomes a key, and the second key becomes its associated value + + Args: + json_response (Dict[str, Any], optional): JSON response with data. + field (str): The key (column) of the dictionary to be modified. + + Returns: + Dict[str, Any]: The JSON response with modified nested dictionaries + within the specified field. + """ + + result = {} + for i, dictionary in enumerate(json_response[field]): + if isinstance(dictionary, dict) and len(dictionary.items()) == 2: + list_properties = list(dictionary.values()) + result[list_properties[0]] = list_properties[1] + if result: + # print(f"All elements in '{field}' are unpacked successfully.") + json_response[field] = result + + return json_response + + def _nested_dict_transformer( + self, + json_response: Dict[str, Any], + field: str, + ) -> Dict[str, Any]: + """ + Modify nested dictionaries within the specified field of a JSON response. + + This function takes a JSON response and a field name. It modifies nested + dictionaries within the specified field by adding an index and underscore + to the keys. The modified dictionary is then updated in the JSON response. + + Args: + json_response (Dict[str, Any], optional): JSON response with data. + field (str): The key (column) of the dictionary to be modified. + + Returns: + Dict[str, Any]: The JSON response with modified nested dictionaries + within the specified field. + """ + d={} + for i, dictionary in enumerate(json_response[field], start=1): + for key, value in dictionary.items(): + d[f'{i}_{key}'] = value + + json_response[field] = d + + return json_response + + def column_unpacker( + self, + json_list: List[Dict[str, Any]] = None, + method1_cols: List[str] = None, + method2_cols: List[str] = None, + ) -> List[Dict[str, Any]]: + + """ + Unpack and modify specific columns in a list of dictionaries using two methods, chosen by the user. 
+ If user wants to use field_reference_unpacker, he needs to provide list of fields in `method1_cols` + argument, if user wants to use nested_dict_transformer - uses 'method2_cols' argument. + + Args: + json_list (List[Dict[str, Any]): A list of dictionaries containing the data. + method1_cols (List[str]): Columns to unpack and modify using field_reference_unpacker. + method2_cols (List[str]): Columns to unpack and modify using nested_dict_transformer. + + Raises: + ValueError: _description_ + + Returns: + List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. + """ + + if json_list is None: + raise ValueError("Input 'json_list' is required.") + + def unpack_columns(columns, unpack_function): + for field in columns: + if field in json_list[0]: + print(f"Unpacking column '{field}'...") + try: + json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) + print(f"All elements in '{field}' are unpacked successfully.") + except: + print(f"No transformation were made in '{field}', because didn't contain list of key-value data.") + else: + print(f"Column '{field}' not found.") + return json_list_clean + + if method1_cols is not None: + json_list = unpack_columns(columns = method1_cols, unpack_function = self._field_reference_unpacker) + + if method2_cols is not None: + json_list = unpack_columns(columns = method2_cols, unpack_function = self._nested_dict_transformer) + + return json_list + + + def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Function that flattens a nested structure of the JSON object into + a single-level dictionary.Uses a nested `flatten()` function to recursively + combine nested keys in the JSON object with '_' to create the flattened keys. + + Args: + json_response (Dict[str, Any], optional): JSON object represented as + a nested dictionary. Defaults to None. + + Returns: + Dict[str, Any]: The flattened dictionary. + """ + result = {} + + if not isinstance(json_response, dict): + raise TypeError("Input must be a dictionary.") + + def flattify(x, key="", out = None): + if out is None: + out = result + + if isinstance(x, dict): + for a in x: + flattify(x[a], key + a + "_", out) + else: + out[key[:-1]] = x + + flattify(json_response) + + return result + def __call__(self): """Download Customer Gauge data to a DF""" @@ -86,6 +263,8 @@ def run( ] = None, start_date: datetime = None, end_date: datetime = None, + method1_cols: List[str] = None, + method2_cols: List[str] = None, credentials_secret: str = "CUSTOMER-GAUGE", vault_name: str = None, ) -> pd.DataFrame: @@ -115,7 +294,7 @@ def run( except (ValueError, TypeError) as e: logger.error(e) - df_list = [] + total_json = [] customer_gauge = CustomerGauge( endpoint=endpoint, url=endpoint_url, credentials=credentials @@ -131,8 +310,10 @@ def run( end_date=end_date, ) cur = customer_gauge.get_cursor(json_data) - df = customer_gauge.to_df(json_data) - df_list.append(df) + + jsn = self.get_data(json_data) + total_json += jsn + if total_load == True: if cursor is None: logger.info( @@ -142,12 +323,14 @@ def run( logger.info( f"Downloading starting from the {cursor} cursor. Process might take a few minutes..." 
) - while df.empty == False: + while jsn: json_data = customer_gauge.get_json_response(cursor=cur) cur = customer_gauge.get_cursor(json_data) - df = customer_gauge.to_df(json_data) - df_list.append(df) + jsn = self.get_data(json_data) + total_json += jsn - df_total = pd.concat(df_list, ignore_index=True) + clean_json = self.column_unpacker(json_list = total_json, method1_cols = method1_cols, method2_cols = method2_cols) + df = pd.DataFrame(list(map(self.flatten_json, clean_json))) + df.columns = df.columns.str.lower().str.replace(" ", "_") - return df_total + return df From d1ecde2447060bc8d3ccd83cd953ac80c4b4a108 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Wed, 25 Oct 2023 08:39:11 +0200 Subject: [PATCH 07/86] cleaning data from empty sqaure brackets --- viadot/tasks/customer_gauge.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 373ca1501..854721fb5 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -236,6 +236,24 @@ def flattify(x, key="", out = None): return result + def square_brackets_remover( + self, + df: pd.DataFrame = None + ) -> pd.DataFrame: + """ + Replace square brackets "[]" with an empty string in a pandas DataFrame. + + Args: + df (pd.DataFrame, optional): Replace square brackets "[]" with an empty string + in a pandas DataFrame. Defaults to None. + + Returns: + pd.DataFrame: The modified DataFrame with square brackets replaced by an empty string. + """ + + df = df.astype(str) + df = df.applymap(lambda x: x.strip("[]")) + return df def __call__(self): """Download Customer Gauge data to a DF""" @@ -331,6 +349,7 @@ def run( clean_json = self.column_unpacker(json_list = total_json, method1_cols = method1_cols, method2_cols = method2_cols) df = pd.DataFrame(list(map(self.flatten_json, clean_json))) + df = self.square_brackets_remover(df) df.columns = df.columns.str.lower().str.replace(" ", "_") return df From a1f145dc1de88309ca6a3a5945888023690615c4 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 09:35:01 +0200 Subject: [PATCH 08/86] replace print with logger --- viadot/tasks/customer_gauge.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 854721fb5..01c2f47ae 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -121,7 +121,6 @@ def _field_reference_unpacker( list_properties = list(dictionary.values()) result[list_properties[0]] = list_properties[1] if result: - # print(f"All elements in '{field}' are unpacked successfully.") json_response[field] = result return json_response @@ -185,14 +184,14 @@ def column_unpacker( def unpack_columns(columns, unpack_function): for field in columns: if field in json_list[0]: - print(f"Unpacking column '{field}'...") + logger.info(f"Unpacking column '{field}'...") try: json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) - print(f"All elements in '{field}' are unpacked successfully.") + logger.info(f"All elements in '{field}' are unpacked successfully.") except: - print(f"No transformation were made in '{field}', because didn't contain list of key-value data.") + logger.info(f"No transformation were made in '{field}', because didn't contain list of key-value data.") else: - print(f"Column '{field}' not found.") + logger.info(f"Column '{field}' not found.") return json_list_clean if method1_cols is not None: From 
0b35c83e0f55d49e69ff48d9213c533a63898051 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 09:52:54 +0200 Subject: [PATCH 09/86] add new args to docstrings --- viadot/tasks/customer_gauge.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 01c2f47ae..bc8bfb521 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -26,6 +26,8 @@ def __init__( ] = None, start_date: datetime = None, end_date: datetime = None, + method1_cols: List[str] = None, + method2_cols: List[str] = None, timeout: int = 3600, *args, **kwargs, @@ -49,6 +51,8 @@ def __init__( Defaults to None. end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ @@ -60,6 +64,8 @@ def __init__( self.date_field = date_field self.start_date = start_date self.end_date = end_date + self.method1_cols = method1_cols + self.method2_cols = method2_cols super().__init__( name="customer_gauge_to_df", @@ -168,8 +174,8 @@ def column_unpacker( Args: json_list (List[Dict[str, Any]): A list of dictionaries containing the data. - method1_cols (List[str]): Columns to unpack and modify using field_reference_unpacker. - method2_cols (List[str]): Columns to unpack and modify using nested_dict_transformer. + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Raises: ValueError: _description_ @@ -267,6 +273,8 @@ def __call__(self): "date_field", "start_date", "end_date", + "method1_cols", + "method2_cols", ) def run( self, @@ -297,6 +305,8 @@ def run( date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. 
From 28109846833553c3b9e25e929edd02c5d23392a2 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 09:58:39 +0200 Subject: [PATCH 10/86] add info about method used --- viadot/tasks/customer_gauge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index bc8bfb521..ec75e0445 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -190,7 +190,7 @@ def column_unpacker( def unpack_columns(columns, unpack_function): for field in columns: if field in json_list[0]: - logger.info(f"Unpacking column '{field}'...") + logger.info(f"Unpacking column '{field}' with {unpack_function.__name__} method...") try: json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) logger.info(f"All elements in '{field}' are unpacked successfully.") From a32c185c23616f857f8098208641fe7364af29b9 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 10:10:47 +0200 Subject: [PATCH 11/86] add final loggers --- viadot/tasks/customer_gauge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index ec75e0445..3d16af20d 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -357,8 +357,10 @@ def run( total_json += jsn clean_json = self.column_unpacker(json_list = total_json, method1_cols = method1_cols, method2_cols = method2_cols) + logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) df.columns = df.columns.str.lower().str.replace(" ", "_") + logger.info("DataFrame: Ready. Data: Inserted. Let the magic happen!") return df From e9187e1e0afbb0c9e83350a8ce53cc5c0534e0b6 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 10:16:55 +0200 Subject: [PATCH 12/86] adjust docstrings --- viadot/tasks/customer_gauge.py | 55 +++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 3d16af20d..bc95bc136 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -38,23 +38,25 @@ def __init__( Args: endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint - to connect. Defaults to None. + to connect. Defaults to None. total_load (bool, optional): Indicate whether to download the data to the latest. - If 'False', only one API call is executed (up to 1000 records). Defaults to True. + If 'False', only one API call is executed (up to 1000 records). Defaults to True. endpoint_url (str, optional): Endpoint URL. Defaults to None. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. - Defaults to 1000. + Defaults to 1000. date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], - optional): Specifies the date type which filter date range. Defaults to None. + optional): Specifies the date type which filter date range. Defaults to None. start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. - Defaults to None. + Defaults to None. end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. - Defaults to None. + Defaults to None. method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + Defaults to None. 
method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + Defaults to None. timeout (int, optional): The time (in seconds) to wait while running this task before - a timeout occurs. Defaults to 3600. + a timeout occurs. Defaults to 3600. """ self.endpoint = endpoint self.total_load = total_load @@ -174,8 +176,10 @@ def column_unpacker( Args: json_list (List[Dict[str, Any]): A list of dictionaries containing the data. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + Defaults to None. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + Defaults to None. Raises: ValueError: _description_ @@ -294,21 +298,32 @@ def run( vault_name: str = None, ) -> pd.DataFrame: """ - Run method. Downloading the selected range of data from Customer Gauge endpoint and return as one pandas DataFrame. + Run method. Downloading the selected range of data from Customer Gauge endpoint and return + as one pandas DataFrame. Args: - endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). Defaults to True. + endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. + Defaults to None. + total_load (bool, optional): Indicate whether to download the data to the latest. If + 'False', only one API call is executed (up to 1000 records). Defaults to True. endpoint_url (str, optional): Endpoint URL. Defaults to None. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. - start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. - credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". - vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + Defaults to 1000. + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], + optional): Specifies the date type which filter date range. Defaults to None. + start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. + Defaults to None. + end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. + Defaults to None. + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + Defaults to None. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. 
+ Defaults to None. + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a + dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". + vault_name (str, optional): The name of the vault from which to obtain the secret. + Defaults to None. Returns: pd.DataFrame: Final pandas DataFrame. From 618666d37cd007eae460a8db1745669a0a062870 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 10:23:27 +0200 Subject: [PATCH 13/86] update docstrings --- viadot/flows/customer_gauge_to_adls.py | 48 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index 8053aeda3..e1bfd7108 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -37,6 +37,8 @@ def __init__( ] = None, start_date: datetime = None, end_date: datetime = None, + method1_cols: List[str] = None, + method2_cols: List[str] = None, customer_gauge_credentials_secret: str = "CUSTOMER-GAUGE", anonymize: bool = False, columns_to_anonymize: List[str] = None, @@ -57,42 +59,50 @@ def __init__( **kwargs: Dict[str, Any] ): """ - Flow for downloading data from the Customer Gauge's endpoints (Responses and Non-Responses) via API to a CSV or Parquet file. - The data anonimization is optional.Then upload it to Azure Data Lake. + Flow for downloading data from the Customer Gauge's endpoints (Responses and Non-Responses) via API + to a CSV or Parquet file.The data anonimization is optional.Then upload it to Azure Data Lake. Args: name (str): The name of the flow. - endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. Defaults to None. + endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. + Defaults to None. endpoint_url (str, optional): Full URL for pointing to specific endpoint. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). - Defaults to True. + total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', + only one API call is executed (up to 1000 records). Defaults to True. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. - Defaults to None. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + Defaults to 1000. + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): + Specifies the date type which filter date range. Defaults to None. start_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. end_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - customer_gauge_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. - Defaults to "CUSTOMER-GAUGE". + method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. + method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. 
Defaults to None. + customer_gauge_credentials_secret (str, optional): The name of the Azure Key Vault secret containing + a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. anonymize (bool, optional): Indicates if anonymize selected columns. Defaults to False. columns_to_anonymize (List[str], optional): List of columns to anonymize. Defaults to None. - anonymize_method (Literal["mask", "hash"], optional): Method of anonymizing data. "mask" -> replace the data with "value" arg. - "hash" -> replace the data with the hash value of an object (using `hash()` method). Defaults to "mask". + anonymize_method (Literal["mask", "hash"], optional): Method of anonymizing data. "mask" -> replace the + data with "value" arg. "hash" -> replace the data with the hash value of an object (using `hash()` + method). Defaults to "mask". anonymize_value (str, optional): Value to replace the data. Defaults to "***". - date_column (str, optional): Name of the date column used to identify rows that are older than a specified number of days. Defaults to None. - days (int, optional): The number of days beyond which we want to anonymize the data, e.g. older that 2 years can be: 2*365. Defaults to None. + date_column (str, optional): Name of the date column used to identify rows that are older than a specified + number of days. Defaults to None. + days (int, optional): The number of days beyond which we want to anonymize the data, e.g. older than + 2 years can be: 2*365. Defaults to None. output_file_extension (str, optional): Output file extension - to allow selection of .csv for data which is not easy to handle with parquet. Defaults to ".parquet". adls_dir_path (str, optional): Azure Data Lake destination folder/catalog path. Defaults to None. local_file_path (str, optional): Local destination path. Defaults to None. adls_file_name (str, optional): Name of file in ADLS. Defaults to None. - adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. - Defaults to None. + adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure + Data Lake. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". - timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. + timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. + Defaults to 3600. 
""" # CustomerGaugeToDF self.endpoint = endpoint @@ -103,6 +113,8 @@ def __init__( self.date_field = date_field self.start_date = start_date self.end_date = end_date + self.method1_cols = method1_cols + self.method2_cols = method2_cols self.customer_gauge_credentials_secret = customer_gauge_credentials_secret # anonymize_df From 7732545f33e4b63c0e4319c9cb9a53aa068d9e3f Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 10:27:51 +0200 Subject: [PATCH 14/86] adjust readability --- viadot/tasks/customer_gauge.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index bc95bc136..4cf328332 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -199,16 +199,23 @@ def unpack_columns(columns, unpack_function): json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) logger.info(f"All elements in '{field}' are unpacked successfully.") except: - logger.info(f"No transformation were made in '{field}', because didn't contain list of key-value data.") + logger.info(f"No transformation were made in '{field}', + because didn't contain list of key-value data.") else: logger.info(f"Column '{field}' not found.") return json_list_clean if method1_cols is not None: - json_list = unpack_columns(columns = method1_cols, unpack_function = self._field_reference_unpacker) + json_list = unpack_columns( + columns = method1_cols, + unpack_function = self._field_reference_unpacker + ) if method2_cols is not None: - json_list = unpack_columns(columns = method2_cols, unpack_function = self._nested_dict_transformer) + json_list = unpack_columns( + columns = method2_cols, + unpack_function = self._nested_dict_transformer + ) return json_list @@ -359,7 +366,8 @@ def run( if total_load == True: if cursor is None: logger.info( - f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint. Process might take a few minutes..." + f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint. + Process might take a few minutes..." 
) else: logger.info( @@ -371,7 +379,10 @@ def run( jsn = self.get_data(json_data) total_json += jsn - clean_json = self.column_unpacker(json_list = total_json, method1_cols = method1_cols, method2_cols = method2_cols) + clean_json = self.column_unpacker( + json_list = total_json, + method1_cols = method1_cols, + method2_cols = method2_cols) logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) From f23bb2da7ae6001edaa84ca9f60fe7386a139ca6 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 26 Oct 2023 11:05:32 +0200 Subject: [PATCH 15/86] fix loggers --- viadot/tasks/customer_gauge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 4cf328332..cd22ca392 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -199,8 +199,8 @@ def unpack_columns(columns, unpack_function): json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) logger.info(f"All elements in '{field}' are unpacked successfully.") except: - logger.info(f"No transformation were made in '{field}', - because didn't contain list of key-value data.") + logger.info(f"No transformation were made in '{field}'," + "because didn't contain list of key-value data.") else: logger.info(f"Column '{field}' not found.") return json_list_clean @@ -366,8 +366,8 @@ def run( if total_load == True: if cursor is None: logger.info( - f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint. - Process might take a few minutes..." + f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint." + "Process might take a few minutes..." ) else: logger.info( From d08612432ab2317a8d0ffdf30ba16ff2a9214889 Mon Sep 17 00:00:00 2001 From: m-paz Date: Thu, 26 Oct 2023 16:58:40 +0100 Subject: [PATCH 16/86] =?UTF-8?q?=F0=9F=93=9D=20Bumped=20version=20after?= =?UTF-8?q?=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_viadot.py | 2 +- viadot/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_viadot.py b/tests/test_viadot.py index 1f0874453..675dbfbdc 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.4.21" + assert __version__ == "0.4.22" diff --git a/viadot/__init__.py b/viadot/__init__.py index e427a5547..ece529aa1 100644 --- a/viadot/__init__.py +++ b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.4.21" +__version__ = "0.4.22" From 2714f33b0a6e7d5356186d936ae08ab97a7b1c57 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 27 Oct 2023 08:21:33 +0200 Subject: [PATCH 17/86] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20typos=20in=20docum?= =?UTF-8?q?entation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- viadot/sources/sap_bw.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76eb3280b..507c590cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `SharepointListToDF` task class. - Added `SharepointListToADLS` flow class. - Added tests for `SharepointList`. -- Added `get_nested_dict` to untils.py. +- Added `get_nested_dict` to utils.py. 
### Fixed diff --git a/viadot/sources/sap_bw.py b/viadot/sources/sap_bw.py index 8f4fb0583..94e3347a9 100644 --- a/viadot/sources/sap_bw.py +++ b/viadot/sources/sap_bw.py @@ -101,7 +101,7 @@ def get_output_data(self, mdx_query: str) -> dict: { "COLUMN": 0, "ROW": 0, - "DATA": "VELUX Deutschland GmbH", + "DATA": "DATA", "VALUE_DATA_TYPE": "CHAR", "CELL_STATUS": "" },... From cecebf971230bef747f08efb2bc7fc3e8a566730 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 31 Oct 2023 14:40:15 +0100 Subject: [PATCH 18/86] add drivers cleaner --- viadot/tasks/customer_gauge.py | 40 +++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index cd22ca392..03b30b286 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -121,6 +121,9 @@ def _field_reference_unpacker( Returns: Dict[str, Any]: The JSON response with modified nested dictionaries within the specified field. + + Raises: + ValueError: If a dictionary within the specified field doesn't contain exactly two items. """ result = {} @@ -128,6 +131,8 @@ def _field_reference_unpacker( if isinstance(dictionary, dict) and len(dictionary.items()) == 2: list_properties = list(dictionary.values()) result[list_properties[0]] = list_properties[1] + else: + raise ValueError() if result: json_response[field] = result @@ -154,11 +159,11 @@ def _nested_dict_transformer( within the specified field. """ d={} - for i, dictionary in enumerate(json_response[field], start=1): + for i, dictionary in enumerate(json_response[field], start=1): for key, value in dictionary.items(): d[f'{i}_{key}'] = value - - json_response[field] = d + if d: + json_response[field] = d return json_response @@ -192,11 +197,12 @@ def column_unpacker( raise ValueError("Input 'json_list' is required.") def unpack_columns(columns, unpack_function): + json_list_clean = json_list.copy() for field in columns: - if field in json_list[0]: + if field in json_list_clean[0]: logger.info(f"Unpacking column '{field}' with {unpack_function.__name__} method...") try: - json_list_clean = list(map(lambda x: unpack_function(x, field), json_list)) + json_list_clean = list(map(lambda x: unpack_function(x, field), json_list_clean)) logger.info(f"All elements in '{field}' are unpacked successfully.") except: logger.info(f"No transformation were made in '{field}'," @@ -270,6 +276,28 @@ def square_brackets_remover( df = df.astype(str) df = df.applymap(lambda x: x.strip("[]")) return df + + def _drivers_cleaner( + self, + drivers: str = None + ) -> str: + """ + Clean and format the 'drivers' data. + + Args: + drivers (str, optional): Column name of the data to be cleaned. Defaults to None. + + Returns: + str: A cleaned and formatted string of driver data. + """ + + drivers = drivers.split("}, {") + cleaned_drivers = [] + for driver in drivers: + driver = driver.replace("{", "").replace("}", "") + driver = driver.replace("'", "").replace("label: ", "") + cleaned_drivers.append(driver) + return ', '.join(cleaned_drivers) def __call__(self): """Download Customer Gauge data to a DF""" @@ -386,6 +414,8 @@ def run( logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) + if endpoint == "responses": + df["drivers"] = df["drivers"].apply(self._drivers_cleaner) df.columns = df.columns.str.lower().str.replace(" ", "_") logger.info("DataFrame: Ready. Data: Inserted. 
Let the magic happen!") From 9894f965e555c856d060dcc07efa759af2ca636d Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 31 Oct 2023 15:53:01 +0100 Subject: [PATCH 19/86] add new args --- viadot/flows/customer_gauge_to_adls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index e1bfd7108..080dda23d 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -176,6 +176,8 @@ def gen_flow(self) -> Flow: date_field=self.date_field, start_date=self.start_date, end_date=self.end_date, + method1_cols=self.method1_cols, + method2_cols=self.method2_cols, vault_name=self.vault_name, credentials_secret=self.customer_gauge_credentials_secret, flow=self, From 24898c8027b8de2513a63962e550592e6b2469b5 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 31 Oct 2023 18:28:24 +0100 Subject: [PATCH 20/86] temp tests comment --- tests/integration/test_customer_gauge.py | 88 ++++++++++++------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/tests/integration/test_customer_gauge.py b/tests/integration/test_customer_gauge.py index 666a73251..119615100 100644 --- a/tests/integration/test_customer_gauge.py +++ b/tests/integration/test_customer_gauge.py @@ -17,50 +17,50 @@ def test_get_json_content(): assert isinstance(json_response["cursor"], dict) -def test_properties_cleaning(): - json_response = CG.get_json_response() - data = json_response["data"][2].copy() - cleaned_data = CG.properties_cleaning(data.copy()) - assert isinstance(data["properties"], list) - assert isinstance(cleaned_data["properties"], dict) - - -def test_flatten_json(): - nested_json = { - "user": { - "name": "Jane", - "address": { - "street": "456 Elm St", - "city": "San Francisco", - "state": "CA", - "zip": "94109", - "country": {"name": "United States", "code": "US"}, - }, - "phone_numbers": {"type": "home", "number": "555-4321"}, - } - } - - expected_output = { - "user_name": "Jane", - "user_address_street": "456 Elm St", - "user_address_city": "San Francisco", - "user_address_state": "CA", - "user_address_zip": "94109", - "user_address_country_name": "United States", - "user_address_country_code": "US", - "user_phone_numbers_type": "home", - "user_phone_numbers_number": "555-4321", - } - - output = CG.flatten_json(nested_json) - assert output == expected_output - - -def test_pagesize_and_to_df(): - json_response = CG.get_json_response(pagesize=1) - df = CG.to_df(json_response) - assert isinstance(df, pd.DataFrame) - assert len(df) == 1 +# def test_properties_cleaning(): +# json_response = CG.get_json_response() +# data = json_response["data"][2].copy() +# cleaned_data = CG.properties_cleaning(data.copy()) +# assert isinstance(data["properties"], list) +# assert isinstance(cleaned_data["properties"], dict) + + +# def test_flatten_json(): +# nested_json = { +# "user": { +# "name": "Jane", +# "address": { +# "street": "456 Elm St", +# "city": "San Francisco", +# "state": "CA", +# "zip": "94109", +# "country": {"name": "United States", "code": "US"}, +# }, +# "phone_numbers": {"type": "home", "number": "555-4321"}, +# } +# } + +# expected_output = { +# "user_name": "Jane", +# "user_address_street": "456 Elm St", +# "user_address_city": "San Francisco", +# "user_address_state": "CA", +# "user_address_zip": "94109", +# "user_address_country_name": "United States", +# "user_address_country_code": "US", +# "user_phone_numbers_type": "home", +# "user_phone_numbers_number": "555-4321", +# } + +# output 
= CG.flatten_json(nested_json) +# assert output == expected_output + + +# def test_pagesize_and_to_df(): +# json_response = CG.get_json_response(pagesize=1) +# df = CG.to_df(json_response) +# assert isinstance(df, pd.DataFrame) +# assert len(df) == 1 def test_pass_specific_cursor(): From 7b6556f171744d4bdc466a56c3bb2f9cc17c177a Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 31 Oct 2023 18:47:20 +0100 Subject: [PATCH 21/86] changelog update --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 507c590cf..2bcf45aef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added + +### Fixed + +### Changed +- Modified `CustomerGauge` source class with simplified logic to return json structure. +- Expand `CustomerGaugeToDF` task class with separate cleaning functions and handling nested json structure flattening with two new methods `_field_reference_unpacker` and `_nested_dict_transformer`. +- Change `CustomerGaugeToADLS` to containg new arguments. ## [0.4.21] - 2023-10-26 ### Added From ac7f63bd0e380a92902ebdb8b15b307f09443b4d Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 1 Nov 2023 15:37:44 +0100 Subject: [PATCH 22/86] multichoice fields and polish letters extension for sharepoint list --- viadot/flows/sharepoint_to_adls.py | 9 +++-- viadot/sources/sharepoint.py | 58 +++++++++++++++++++----------- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index eaf747bab..410538e7b 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -215,8 +215,13 @@ def __init__( site_url (str): URL to set of Sharepoint Lists. Default to None. required_fields (List[str]): Required fields(columns) need to be extracted from Sharepoint List. Default to None. - field_property (List[str]): Property to expand with expand query method. - All propertys can be found under list.item.properties. + field_property (List[str]): Property to expand fields with expand query method. + For example: User fields could be expanded and "Title" + or "ID" could be extracted + -> usefull to get user name instead of ID + All properties can be found under list.item.properties. + WARNING! Field types and properties might change which could + lead to errors - extension of sp connector would be required. Default to ["Title"] filters (dict): Dictionary with operators which filters the SharepointList output. 
allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 096de825b..7f57bd658 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -18,6 +18,7 @@ logger = logging.get_logger() + # Print out how many rows was extracted in specific iteration def log_of_progress(items): logger.info("Items read: {0}".format(len(items))) @@ -112,7 +113,6 @@ def get_connection( self, site_url: str = None, ): - # Connecting into Sharepoint with AuthenticationContext try: auth_context = AuthenticationContext(site_url) @@ -137,24 +137,33 @@ def _unpack_fields( self, list_item, selected_fields: dict = None, - ): - + ) -> dict: # Creating the body of dictionary new_dict = dict() - # For loop scanning the propertys of searching fields item_values_dict = list_item.properties for field, val in item_values_dict.items(): nested_dict = get_nested_dict(val) - # Check if the dictionary is nested - if nested_dict != None: - # It might be that there are different field properties than expected - nested_value = nested_dict.get(selected_fields["FieldProperty"]) - if nested_value != None: - new_dict[field] = nested_value + # Check if field has expandable type + if field in selected_fields["FieldToExpand"]: + # Check if the values are nested + if nested_dict != None: + # It might be that there are different field properties than expected + nested_value = nested_dict.get( + selected_fields["FieldExpandProperty"] + ) + if nested_value != None: + new_dict[field] = nested_value + else: + logger.info("Property of the extandable field not recognized!") + raise ValueError("Check if given field property is valid!") + elif field in selected_fields["MultiChoiceField"]: + # Field type of multi choice could have more than 1 selection. + new_dict[field] = ";".join(nested_dict.values()) else: - logger.info("I'm not the right value") - raise ValueError + raise ValueError( + "Get nested dict for not recognized type of field! Check field types in the source" + ) else: new_dict[field] = val @@ -166,7 +175,6 @@ def get_fields( site_url: str = None, required_fields: List[str] = None, ): - ctx = self.get_connection(site_url=site_url) # Get list of lists object by List Title @@ -182,22 +190,25 @@ def get_fields( else: list_fields_required = [ - list_fields_all.get_by_internal_name_or_title(field).get() + list_fields_all.get_by_internal_name_or_title(field) + .get() + .execute_query() for field in required_fields ] - ctx.execute_batch() return list_fields_required - def select_expandable_user_fields( + def select_fields( self, list_title: str = None, site_url: str = None, required_fields: List[str] = None, field_property: str = "Title", - ): + ) -> dict: """ - Method to expand fields and get more informations. + Method to create a data structure for handling info about + selection of fields with details about possible expansion for more data or details. + Field types to extract more values can be: "User*", "MultiChoice" field_property to expand can be: ID, Title, FieldTypeKind, TypeAsString and many more. -> more properties can be discovered by getting list.item.properties. 
Default to "Title" @@ -220,12 +231,17 @@ def select_expandable_user_fields( for field in list_fields if fnmatch(field.properties["TypeAsString"], f"User*") ] - + multi_choice_fields = [ + field.properties["InternalName"] + for field in list_fields + if fnmatch(field.properties["TypeAsString"], "MultiChoice") + ] # Creating the body of the function output selected_fields = { "FieldInternalNames": fields_to_select, "FieldToExpand": fields_to_expand, - "FieldProperty": field_property, + "FieldExpandProperty": field_property, + "MultiChoiceField": multi_choice_fields, } return selected_fields @@ -508,7 +524,7 @@ def list_item_to_df( download_all = False # extracting requeird_fields SP_List objects - selected_fields = self.select_expandable_user_fields( + selected_fields = self.select_fields( list_title=list_title, site_url=site_url, required_fields=required_fields, From 0af6efba1f256385f144afa727bfbebe62f8e1c0 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 10:22:17 +0100 Subject: [PATCH 23/86] =?UTF-8?q?=E2=9C=A8=20Added=20TM1=20connector?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/__init__.py | 1 + viadot/sources/tm1.py | 75 ++++++++++++++++++++++++++++++++++++ viadot/tasks/__init__.py | 1 + viadot/tasks/tm1.py | 78 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 viadot/sources/tm1.py create mode 100644 viadot/tasks/tm1.py diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py index c0d96abe2..7f0bf6f51 100644 --- a/viadot/sources/__init__.py +++ b/viadot/sources/__init__.py @@ -30,6 +30,7 @@ from .mindful import Mindful from .sql_server import SQLServer from .sqlite import SQLite +from .tm1 import TM1 # APIS from .uk_carbon_intensity import UKCarbonIntensity diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py new file mode 100644 index 000000000..c921e8c87 --- /dev/null +++ b/viadot/sources/tm1.py @@ -0,0 +1,75 @@ +import pandas as pd + +from typing import Any, Dict, Literal +from TM1py.Services import TM1Service +from prefect.utilities import logging + + +from ..config import local_config +from ..exceptions import CredentialError +from .base import Source + +logger = logging.get_logger(__name__) + + +class TM1(Source): + def __init__( + self, + credentials: Dict[str, Any] = None, + config_key: str = "TM1", + cube: str = None, + view: str = None, + limit: int = None, + private: bool = False, + verify: bool = False, + *args, + **kwargs, + ): + DEFAULT_CREDENTIALS = local_config.get(config_key) + credentials = credentials or DEFAULT_CREDENTIALS + + required_credentials = ["address", "port", "username", "password"] + if any([cred_key not in credentials for cred_key in required_credentials]): + not_found = [c for c in required_credentials if c not in credentials] + raise CredentialError(f"Missing credential(s): '{not_found}'.") + + self.config_key = config_key + self.cube = cube + self.view = view + self.limit = limit + self.private = private + self.verify = verify + + super().__init__(*args, credentials=credentials, **kwargs) + + def get_connection(self) -> TM1Service: + return TM1Service( + address=self.credentials["address"], + port=self.credentials["port"], + user=self.credentials["username"], + password=self.credentials["password"], + ssl=self.verify, + ) + + def get_cubes_names(self) -> list: + conn = self.get_connection + return conn.cubes.get_all_names() + + def get_views_names(self) -> list: + conn = self.get_connection + return 
conn.views.get_all_names(self.cube) + + def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: + conn = self.get_connection() + df = conn.cubes.cells.execute_view_dataframe( + cube_name=self.cube, + view_name=self.view, + private=self.private, + top=self.limit, + ) + logger.info( + f"Data was successfully transformed into DataFrame: {len(df.columns)} columns and {len(df)} rows." + ) + if df.empty is True: + self._handle_if_empty(if_empty) + return df diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index ecba1d5c5..e70c89540 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -58,3 +58,4 @@ from .vid_club import VidClubToDF from .git import CloneRepo from .luma import LumaIngest +from .tm1 import TM1ToParquet diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py new file mode 100644 index 000000000..b7be64b9d --- /dev/null +++ b/viadot/tasks/tm1.py @@ -0,0 +1,78 @@ +from prefect import Task +from typing import Any, Dict +from prefect.utilities.tasks import defaults_from_attrs + +from ..sources import TM1 + + +class TM1ToParquet(Task): + def __init__( + self, + credentials: Dict[str, Any] = None, + config_key: str = "TM1", + cube: str = None, + view: str = None, + limit: int = None, + private: bool = False, + verify: bool = False, + path: str = None, + if_empty: str = "skip", + timeout=3600, + *args, + **kwargs, + ): + self.credentials = credentials + self.config_key = config_key + self.cube = cube + self.view = view + self.limit = limit + self.private = private + self.verify = verify + self.path = path + self.if_empty = if_empty + + super().__init__( + name="tm1_to_parquet", + timeout=timeout, + *args, + **kwargs, + ) + + def __call__(self, *args, **kwargs): + """Load TM1 data to Parquet""" + return super().__call__(*args, **kwargs) + + @defaults_from_attrs( + "credentials", + "config_key", + "cube", + "view", + "limit", + "private", + "verify", + "if_empty", + "path", + ) + def run( + self, + credentials: Dict[str, Any] = None, + config_key: str = None, + cube: str = None, + view: str = None, + limit: int = None, + private: bool = None, + verify: bool = None, + path: str = None, + if_empty: str = None, + ): + tm1 = TM1( + credentials=credentials, + config_key=config_key, + cube=cube, + view=view, + limit=limit, + private=private, + verify=verify, + ) + df = tm1.to_df() + return df.to_parquet(path=path, if_empty=if_empty) From 6894b79b29ecc39b4f7a5f8c04a013563969a580 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 11:27:31 +0100 Subject: [PATCH 24/86] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20to=5Fparquet=20t?= =?UTF-8?q?o=20to=5Fdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/__init__.py | 2 +- viadot/tasks/tm1.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index e70c89540..541be70ab 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -58,4 +58,4 @@ from .vid_club import VidClubToDF from .git import CloneRepo from .luma import LumaIngest -from .tm1 import TM1ToParquet +from .tm1 import TM1ToDF diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index b7be64b9d..cde043e0a 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -5,7 +5,7 @@ from ..sources import TM1 -class TM1ToParquet(Task): +class TM1ToDF(Task): def __init__( self, credentials: Dict[str, Any] = None, @@ -15,7 +15,6 @@ def __init__( limit: int = None, private: 
bool = False, verify: bool = False, - path: str = None, if_empty: str = "skip", timeout=3600, *args, @@ -28,18 +27,17 @@ def __init__( self.limit = limit self.private = private self.verify = verify - self.path = path self.if_empty = if_empty super().__init__( - name="tm1_to_parquet", + name="tm1_to_df", timeout=timeout, *args, **kwargs, ) def __call__(self, *args, **kwargs): - """Load TM1 data to Parquet""" + """Load TM1 data to pandas DataFrame""" return super().__call__(*args, **kwargs) @defaults_from_attrs( @@ -51,7 +49,6 @@ def __call__(self, *args, **kwargs): "private", "verify", "if_empty", - "path", ) def run( self, @@ -62,7 +59,6 @@ def run( limit: int = None, private: bool = None, verify: bool = None, - path: str = None, if_empty: str = None, ): tm1 = TM1( @@ -74,5 +70,4 @@ def run( private=private, verify=verify, ) - df = tm1.to_df() - return df.to_parquet(path=path, if_empty=if_empty) + return tm1.to_df(if_empty=if_empty) From b439d503ec6959baa101de45baad49a842c7f3b1 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 14:35:49 +0100 Subject: [PATCH 25/86] =?UTF-8?q?=F0=9F=93=9D=20Added=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 51 +++++++++++++++++++++++++++++++++++++++++++ viadot/tasks/tm1.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index c921e8c87..93522fda1 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -13,6 +13,10 @@ class TM1(Source): + """ + Class for downloading data from TM1 Software using TM1py library + """ + def __init__( self, credentials: Dict[str, Any] = None, @@ -25,6 +29,24 @@ def __init__( *args, **kwargs, ): + """ + Creating an instance of TM1 source class. + + Args: + credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, + password, address, port. Defaults to None. + config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to "TM1". + cube (str, optional): Cube name from which data will be downloaded. Defaults to None. + view (str, optional): View name from which data will be downloaded. Defaults to None. + limit (str, optional): How many rows should be extracted. If None all the avaiable rows will + be downloaded. Defaults to None. + private (bool, optional): Whether or not data download shoulb be private. Defaults to False. + verify (bool, optional): Whether or not verify SSL certificates while. Defaults to False. + + + Raises: + CredentialError: When credentials are not found. + """ DEFAULT_CREDENTIALS = local_config.get(config_key) credentials = credentials or DEFAULT_CREDENTIALS @@ -43,6 +65,12 @@ def __init__( super().__init__(*args, credentials=credentials, **kwargs) def get_connection(self) -> TM1Service: + """ + Start a connection to TM1 instance. + + Returns: + TM1Service: Service instance if connection is succesfull. + """ return TM1Service( address=self.credentials["address"], port=self.credentials["port"], @@ -52,14 +80,37 @@ def get_connection(self) -> TM1Service: ) def get_cubes_names(self) -> list: + """ + Get list of avaiable cubes in TM1 instance. + + Returns: + list: List containing avaiable cubes names. + + """ conn = self.get_connection return conn.cubes.get_all_names() def get_views_names(self) -> list: + """ + Get list of avaiable views in TM1 instance. + + Returns: + list: List containing avaiable views names. 
+ + """ conn = self.get_connection return conn.views.get_all_names(self.cube) def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: + """ + Function for downloading data from TM1 to pd.DataFrame. + + Args: + if_empty (Literal["warn", "fail", "skip"], optional): What to do if output DataFrame is empty. Defaults to "skip". + + Returns: + pd.DataFrame: DataFrame with data downloaded from TM1 view. + """ conn = self.get_connection() df = conn.cubes.cells.execute_view_dataframe( cube_name=self.cube, diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index cde043e0a..1ea659252 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -1,3 +1,5 @@ +import pandas as pd + from prefect import Task from typing import Any, Dict from prefect.utilities.tasks import defaults_from_attrs @@ -20,6 +22,22 @@ def __init__( *args, **kwargs, ): + """ + Task for downloading data from TM1 view to pandas DataFrame. + + Args: + credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, + password, address, port. Defaults to None. + config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to "TM1". + cube (str, optional): Cube name from which data will be downloaded. Defaults to None. + view (str, optional): View name from which data will be downloaded. Defaults to None. + limit (str, optional): How many rows should be extracted. If None all the avaiable rows will + be downloaded. Defaults to None. + private (bool, optional): Whether or not data download shoulb be private. Defaults to False. + verify (bool, optional): Whether or not verify SSL certificates while. Defaults to False. + if_empty (Literal["warn", "fail", "skip"], optional): What to do if output DataFrame is empty. Defaults to "skip". + + """ self.credentials = credentials self.config_key = config_key self.cube = cube @@ -60,7 +78,26 @@ def run( private: bool = None, verify: bool = None, if_empty: str = None, - ): + ) -> pd.DataFrame: + """ + Run method for TM1ToDF class. + + Args: + credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, + password, address, port. Defaults to None. + config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to None. + cube (str, optional): Cube name from which data will be downloaded. Defaults to None. + view (str, optional): View name from which data will be downloaded. Defaults to None. + limit (str, optional): How many rows should be extracted. If None all the avaiable rows will + be downloaded. Defaults to None. + private (bool, optional): Whether or not data download shoulb be private. Defaults to None. + verify (bool, optional): Whether or not verify SSL certificates while. Defaults to None. + if_empty (Literal["warn", "fail", "skip"], optional): What to do if output DataFrame is empty. Defaults to None. + + Returns: + pd.DataFrame: DataFrame with data downloaded from TM1 view. 
+ + """ tm1 = TM1( credentials=credentials, config_key=config_key, From 200b05f5848738dc4cb49b2aeda903a43fc8405b Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 14:36:08 +0100 Subject: [PATCH 26/86] =?UTF-8?q?=E2=9C=85=20Added=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_tm1.py | 15 ++++++++++++ tests/integration/test_tm1.py | 36 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/integration/tasks/test_tm1.py create mode 100644 tests/integration/test_tm1.py diff --git a/tests/integration/tasks/test_tm1.py b/tests/integration/tasks/test_tm1.py new file mode 100644 index 000000000..96dd58dfb --- /dev/null +++ b/tests/integration/tasks/test_tm1.py @@ -0,0 +1,15 @@ +import pandas as pd + +from viadot.tasks import TM1ToDF +from viadot.config import local_config + +CUBE = local_config.get("test_cube") +VIEW = local_config.get("test_view") + + +def test_tm1_to_df(): + tm1 = TM1ToDF(CUBE, VIEW) + df = tm1.run() + + assert isinstance(df, pd.DataFrame) + assert df.empty is False diff --git a/tests/integration/test_tm1.py b/tests/integration/test_tm1.py new file mode 100644 index 000000000..3676a8ef6 --- /dev/null +++ b/tests/integration/test_tm1.py @@ -0,0 +1,36 @@ +import pandas as pd + +from viadot.sources import TM1 +from viadot.config import local_config + +CUBE = local_config.get("test_cube") +VIEW = local_config.get("test_view") + + +def test_get_connection(): + tm1_source = TM1() + connection = tm1_source.get_connection() + + assert connection is not None + + +def test_get_cubes_names(): + tm1_source = TM1() + cubes = tm1_source.get_cubes_names() + + assert len(cubes) > 0 + + +def test_get_cubes_names(): + tm1_source = TM1(cube=CUBE) + views = tm1_source.get_views_names() + + assert len(views) > 0 + + +def test_to_df(): + tm1_source = TM1(cube=CUBE, view=VIEW) + df = tm1_source.to_df() + + assert isinstance(df, pd.DataFrame) + assert df.empty is False From 6ec73fbd96e1cf7be1fc9480b968aee3132c95c3 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 14:37:10 +0100 Subject: [PATCH 27/86] =?UTF-8?q?=F0=9F=93=9D=20Updated=20changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 507c590cf..61fd6633e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added `TM1` source class. +- Added `TM1ToDF` task class. 
## [0.4.21] - 2023-10-26 ### Added From 3401db7c5d27ac73ccc1706fb45b6ca43525d665 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Thu, 2 Nov 2023 14:41:09 +0100 Subject: [PATCH 28/86] =?UTF-8?q?=E2=9C=A8=20Added=20TM1py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 768887e4a..896b11d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,3 +43,4 @@ dbt-core==1.3.2 dbt-sqlserver==1.3.1 lumaCLI==0.0.19 Office365-REST-Python-Client==2.4.4 +TM1py==1.11.3 From 2be81098299df13ec684ceae64c52709f5185395 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Fri, 3 Nov 2023 08:26:29 +0100 Subject: [PATCH 29/86] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20get=20views=20an?= =?UTF-8?q?d=20get=20cubes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 93522fda1..54ab010c4 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -87,7 +87,7 @@ def get_cubes_names(self) -> list: list: List containing avaiable cubes names. """ - conn = self.get_connection + conn = self.get_connection() return conn.cubes.get_all_names() def get_views_names(self) -> list: @@ -98,7 +98,7 @@ def get_views_names(self) -> list: list: List containing avaiable views names. """ - conn = self.get_connection + conn = self.get_connection() return conn.views.get_all_names(self.cube) def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: From d9320cb917d7024bc35dbcd5912678c3e8b1084e Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 7 Nov 2023 15:32:05 +0100 Subject: [PATCH 30/86] =?UTF-8?q?=E2=9C=A8=20Added=20mdx=20option?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 29 ++++++++++++++++++++++------- viadot/tasks/tm1.py | 7 +++++++ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 54ab010c4..c9a1d875c 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -6,7 +6,7 @@ from ..config import local_config -from ..exceptions import CredentialError +from ..exceptions import CredentialError,ValidationError from .base import Source logger = logging.get_logger(__name__) @@ -21,6 +21,7 @@ def __init__( self, credentials: Dict[str, Any] = None, config_key: str = "TM1", + mdx_query: str = None, cube: str = None, view: str = None, limit: int = None, @@ -36,6 +37,7 @@ def __init__( credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, password, address, port. Defaults to None. config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to "TM1". + mdx_query (str, optional): MDX select query needed to download the data. Defaults to None. cube (str, optional): Cube name from which data will be downloaded. Defaults to None. view (str, optional): View name from which data will be downloaded. Defaults to None. limit (str, optional): How many rows should be extracted. 
If None all the avaiable rows will @@ -56,6 +58,7 @@ def __init__( raise CredentialError(f"Missing credential(s): '{not_found}'.") self.config_key = config_key + self.mdx_query = mdx_query self.cube = cube self.view = view self.limit = limit @@ -110,14 +113,26 @@ def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFr Returns: pd.DataFrame: DataFrame with data downloaded from TM1 view. + + Raises: + ValidationError: When mdx and cube + view are not specified or when combination of both is specified. """ conn = self.get_connection() - df = conn.cubes.cells.execute_view_dataframe( - cube_name=self.cube, - view_name=self.view, - private=self.private, - top=self.limit, - ) + + if self.mdx_query is None and (self.cube is None or self.view is None): + raise ValidationError("MDX query or cube and view are required.") + if self.cube is not None and self.view is not None: + df = conn.cubes.cells.execute_view_dataframe( + cube_name=self.cube, + view_name=self.view, + private=self.private, + top=self.limit, + ) + elif self.mdx_query is not None: + df = conn.cubes.cells.execute_mdx_dataframe(self.mdx_query) + else: + raise ValidationError("Specify only one: MDX query or cube and view.") + logger.info( f"Data was successfully transformed into DataFrame: {len(df.columns)} columns and {len(df)} rows." ) diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index 1ea659252..06b96ccd2 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -12,6 +12,7 @@ def __init__( self, credentials: Dict[str, Any] = None, config_key: str = "TM1", + mdx_query: str = None, cube: str = None, view: str = None, limit: int = None, @@ -29,6 +30,7 @@ def __init__( credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, password, address, port. Defaults to None. config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to "TM1". + mdx_query (str, optional): MDX select query needed to download the data. Defaults to None. cube (str, optional): Cube name from which data will be downloaded. Defaults to None. view (str, optional): View name from which data will be downloaded. Defaults to None. limit (str, optional): How many rows should be extracted. If None all the avaiable rows will @@ -40,6 +42,7 @@ def __init__( """ self.credentials = credentials self.config_key = config_key + self.mdx_query = mdx_query self.cube = cube self.view = view self.limit = limit @@ -61,6 +64,7 @@ def __call__(self, *args, **kwargs): @defaults_from_attrs( "credentials", "config_key", + "mdx_query", "cube", "view", "limit", @@ -72,6 +76,7 @@ def run( self, credentials: Dict[str, Any] = None, config_key: str = None, + mdx_query: str = None, cube: str = None, view: str = None, limit: int = None, @@ -86,6 +91,7 @@ def run( credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, password, address, port. Defaults to None. config_key (str, optional): Credential key to dictionary where credentials are stored. Defaults to None. + mdx_query (str, optional): MDX select query needed to download the data. Defaults to None. cube (str, optional): Cube name from which data will be downloaded. Defaults to None. view (str, optional): View name from which data will be downloaded. Defaults to None. limit (str, optional): How many rows should be extracted. 
If None all the avaiable rows will @@ -101,6 +107,7 @@ def run( tm1 = TM1( credentials=credentials, config_key=config_key, + mdx_query=mdx_query, cube=cube, view=view, limit=limit, From fb3f981977079b29563a0da5b4572736ec606ce6 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 7 Nov 2023 15:50:20 +0100 Subject: [PATCH 31/86] =?UTF-8?q?=E2=9C=A8=20Added=20extra=20functions=20t?= =?UTF-8?q?o=20check=20structure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index c9a1d875c..25c1d9487 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -24,6 +24,8 @@ def __init__( mdx_query: str = None, cube: str = None, view: str = None, + dimension: str = None, + hierarchy: str =None, limit: int = None, private: bool = False, verify: bool = False, @@ -40,6 +42,8 @@ def __init__( mdx_query (str, optional): MDX select query needed to download the data. Defaults to None. cube (str, optional): Cube name from which data will be downloaded. Defaults to None. view (str, optional): View name from which data will be downloaded. Defaults to None. + dimension (str, optional): Diemension name. Defaults to None. + hierarchy (str, optional): Hierarchy name. Defaults to None. limit (str, optional): How many rows should be extracted. If None all the avaiable rows will be downloaded. Defaults to None. private (bool, optional): Whether or not data download shoulb be private. Defaults to False. @@ -61,6 +65,8 @@ def __init__( self.mdx_query = mdx_query self.cube = cube self.view = view + self.dimension = dimension + self.hierarchy = hierarchy self.limit = limit self.private = private self.verify = verify @@ -95,7 +101,7 @@ def get_cubes_names(self) -> list: def get_views_names(self) -> list: """ - Get list of avaiable views in TM1 instance. + Get list of avaiable views in TM1 cube instance. Returns: list: List containing avaiable views names. @@ -103,6 +109,39 @@ def get_views_names(self) -> list: """ conn = self.get_connection() return conn.views.get_all_names(self.cube) + + def get_diemensions_names(self) -> list: + """ + Get list of avaiable dimensions in TM1 instance. + + Returns: + list: List containing avaiable dimensions names. + + """ + conn = self.get_connection() + return conn.dimensions.get_all_names() + + def get_hierarchies_names(self) -> list: + """ + Get list of avaiable hierarchies in TM1 dimension instance. + + Returns: + list: List containing avaiable hierarchies names. + + """ + conn = self.get_connection() + return conn.hierarchies.get_all_names(self.dimension) + + def get_available_elements(self) -> list: + """ + Get list of avaiable elements in TM1 instance based on hierarchy and diemension. + + Returns: + list: List containing avaiable elements names. 
+ + """ + conn = self.get_connection() + return conn.elements.get_element_names(dimension_name= self.dimension, hierarchy_name = self.hierarchy) def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: """ From a9cd4ae49ab6e16314d0018152fb16655f4edf73 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 7 Nov 2023 15:57:43 +0100 Subject: [PATCH 32/86] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 1 - viadot/tasks/tm1.py | 1 - 2 files changed, 2 deletions(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 25c1d9487..0fac5e098 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -11,7 +11,6 @@ logger = logging.get_logger(__name__) - class TM1(Source): """ Class for downloading data from TM1 Software using TM1py library diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index 06b96ccd2..a4926dc55 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -6,7 +6,6 @@ from ..sources import TM1 - class TM1ToDF(Task): def __init__( self, From 92e621e034621927ae1c741cd91c089a33be03ea Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Tue, 7 Nov 2023 16:04:25 +0100 Subject: [PATCH 33/86] =?UTF-8?q?=F0=9F=8E=A8=20Formatted=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/tm1.py | 17 ++++++++++------- viadot/tasks/tm1.py | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 0fac5e098..77155b07c 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -6,11 +6,12 @@ from ..config import local_config -from ..exceptions import CredentialError,ValidationError +from ..exceptions import CredentialError, ValidationError from .base import Source logger = logging.get_logger(__name__) + class TM1(Source): """ Class for downloading data from TM1 Software using TM1py library @@ -24,7 +25,7 @@ def __init__( cube: str = None, view: str = None, dimension: str = None, - hierarchy: str =None, + hierarchy: str = None, limit: int = None, private: bool = False, verify: bool = False, @@ -108,7 +109,7 @@ def get_views_names(self) -> list: """ conn = self.get_connection() return conn.views.get_all_names(self.cube) - + def get_diemensions_names(self) -> list: """ Get list of avaiable dimensions in TM1 instance. @@ -119,7 +120,7 @@ def get_diemensions_names(self) -> list: """ conn = self.get_connection() return conn.dimensions.get_all_names() - + def get_hierarchies_names(self) -> list: """ Get list of avaiable hierarchies in TM1 dimension instance. @@ -130,7 +131,7 @@ def get_hierarchies_names(self) -> list: """ conn = self.get_connection() return conn.hierarchies.get_all_names(self.dimension) - + def get_available_elements(self) -> list: """ Get list of avaiable elements in TM1 instance based on hierarchy and diemension. @@ -140,7 +141,9 @@ def get_available_elements(self) -> list: """ conn = self.get_connection() - return conn.elements.get_element_names(dimension_name= self.dimension, hierarchy_name = self.hierarchy) + return conn.elements.get_element_names( + dimension_name=self.dimension, hierarchy_name=self.hierarchy + ) def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: """ @@ -151,7 +154,7 @@ def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFr Returns: pd.DataFrame: DataFrame with data downloaded from TM1 view. 
- + Raises: ValidationError: When mdx and cube + view are not specified or when combination of both is specified. """ diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index a4926dc55..06b96ccd2 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -6,6 +6,7 @@ from ..sources import TM1 + class TM1ToDF(Task): def __init__( self, From 96b66cad68f054bbe4eb55943e0e677c1f82b66f Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 6 Nov 2023 15:15:52 +0100 Subject: [PATCH 34/86] add keyerror to except --- .../flows/test_customer_gauge_to_adls.py | 2 - viadot/sources/vid_club.py | 315 ------------------ viadot/tasks/customer_gauge.py | 4 +- 3 files changed, 2 insertions(+), 319 deletions(-) delete mode 100644 viadot/sources/vid_club.py diff --git a/tests/integration/flows/test_customer_gauge_to_adls.py b/tests/integration/flows/test_customer_gauge_to_adls.py index 0e7afd3e2..34c7336bc 100644 --- a/tests/integration/flows/test_customer_gauge_to_adls.py +++ b/tests/integration/flows/test_customer_gauge_to_adls.py @@ -91,5 +91,3 @@ def test_customer_gauge_to_adls_run_flow_validation_failure(mocked_class): except ValidationError: pass - os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.parquet") - os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.json") diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py deleted file mode 100644 index e7819577a..000000000 --- a/viadot/sources/vid_club.py +++ /dev/null @@ -1,315 +0,0 @@ -import json -import os -import urllib -from datetime import date, datetime, timedelta -from typing import Any, Dict, List, Literal, Tuple - -import pandas as pd -from prefect.utilities import logging - -from ..exceptions import CredentialError, ValidationError -from ..utils import handle_api_response -from .base import Source - -logger = logging.get_logger() - - -class VidClub(Source): - """ - A class implementing the Vid Club API. - - Documentation for this API is located at: https://evps01.envoo.net/vipapi/ - There are 4 endpoints where to get the data. - """ - - def __init__(self, credentials: Dict[str, Any], *args, **kwargs): - """ - Create an instance of VidClub. - - Args: - credentials (Dict[str, Any]): Credentials to Vid Club APIs containing token. - - Raises: - CredentialError: If credentials are not provided as a parameter. - """ - self.headers = { - "Authorization": "Bearer " + credentials["token"], - "Content-Type": "application/json", - } - - super().__init__(*args, credentials=credentials, **kwargs) - - def build_query( - self, - from_date: str, - to_date: str, - api_url: str, - items_per_page: int, - source: Literal["jobs", "product", "company", "survey"] = None, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", - ) -> str: - """ - Builds the query from the inputs. - - Args: - from_date (str): Start date for the query. - to_date (str): End date for the query, if empty, will be executed as datetime.today().strftime("%Y-%m-%d"). - api_url (str): Generic part of the URL to Vid Club API. - items_per_page (int): number of entries per page. - source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] - - Returns: - str: Final query with all filters added. - - Raises: - ValidationError: If any source different than the ones in the list are used. 
- """ - if source in ["jobs", "product", "company"]: - url = f"{api_url}{source}?from={from_date}&to={to_date}®ion={region}&limit={items_per_page}" - elif source == "survey": - url = f"{api_url}{source}?language=en&type=question" - else: - raise ValidationError( - "Pick one these sources: jobs, product, company, survey" - ) - return url - - def intervals( - self, from_date: str, to_date: str, days_interval: int - ) -> Tuple[List[str], List[str]]: - """ - Breaks dates range into smaller by provided days interval. - - Args: - from_date (str): Start date for the query in "%Y-%m-%d" format. - to_date (str): End date for the query, if empty, will be executed as datetime.today().strftime("%Y-%m-%d"). - days_interval (int): Days specified in date range per api call (test showed that 30-40 is optimal for performance). - - Returns: - List[str], List[str]: Starts and Ends lists that contains information about date ranges for specific period and time interval. - - Raises: - ValidationError: If the final date of the query is before the start date. - """ - - if to_date == None: - to_date = datetime.today().strftime("%Y-%m-%d") - - end_date = datetime.strptime(to_date, "%Y-%m-%d").date() - start_date = datetime.strptime(from_date, "%Y-%m-%d").date() - - from_date_obj = datetime.strptime(from_date, "%Y-%m-%d") - - to_date_obj = datetime.strptime(to_date, "%Y-%m-%d") - delta = to_date_obj - from_date_obj - - if delta.days < 0: - raise ValidationError("to_date cannot be earlier than from_date.") - - interval = timedelta(days=days_interval) - starts = [] - ends = [] - - period_start = start_date - while period_start < end_date: - period_end = min(period_start + interval, end_date) - starts.append(period_start.strftime("%Y-%m-%d")) - ends.append(period_end.strftime("%Y-%m-%d")) - period_start = period_end - if len(starts) == 0 and len(ends) == 0: - starts.append(from_date) - ends.append(to_date) - return starts, ends - - def check_connection( - self, - source: Literal["jobs", "product", "company", "survey"] = None, - from_date: str = "2022-03-22", - to_date: str = None, - items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", - url: str = None, - ) -> Tuple[Dict[str, Any], str]: - """ - Initiate first connection to API to retrieve piece of data with information about type of pagination in API URL. - This option is added because type of pagination for endpoints is being changed in the future from page number to 'next' id. - - Args: - source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. - to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. - items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] - url (str, optional): Generic part of the URL to Vid Club API. Defaults to None. - - Returns: - Tuple[Dict[str, Any], str]: Dictionary with first response from API with JSON containing data and used URL string. - - Raises: - ValidationError: If from_date is earlier than 2022-03-22. - ValidationError: If to_date is earlier than from_date. 
- """ - - if from_date < "2022-03-22": - raise ValidationError("from_date cannot be earlier than 2022-03-22.") - - if to_date < from_date: - raise ValidationError("to_date cannot be earlier than from_date.") - - if url is None: - url = self.credentials["url"] - - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - - return (response, first_url) - - def get_response( - self, - source: Literal["jobs", "product", "company", "survey"] = None, - from_date: str = "2022-03-22", - to_date: str = None, - items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", - ) -> pd.DataFrame: - """ - Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. - - Args: - source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. - to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. - items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] - - Returns: - pd.DataFrame: Table of the data carried in the response. - - Raises: - ValidationError: If any source different than the ones in the list are used. - """ - headers = self.headers - if source not in ["jobs", "product", "company", "survey"]: - raise ValidationError( - "The source has to be: jobs, product, company or survey" - ) - if to_date == None: - to_date = datetime.today().strftime("%Y-%m-%d") - - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) - - if isinstance(response, dict): - keys_list = list(response.keys()) - elif isinstance(response, list): - keys_list = list(response[0].keys()) - else: - keys_list = [] - - if "next" in keys_list: - ind = True - else: - ind = False - - if "data" in keys_list: - df = pd.DataFrame(response["data"]) - length = df.shape[0] - page = 1 - - while length == items_per_page: - if ind == True: - next = response["next"] - url = f"{first_url}&next={next}" - else: - page += 1 - url = f"{first_url}&page={page}" - r = handle_api_response( - url=url, headers=headers, method="GET", verify=False - ) - response = r.json() - df_page = pd.DataFrame(response["data"]) - if source == "product": - df_page = df_page.transpose() - length = df_page.shape[0] - df = pd.concat((df, df_page), axis=0) - else: - df = pd.DataFrame(response) - - return df - - def total_load( - self, - source: Literal["jobs", "product", "company", "survey"] = None, - from_date: str = "2022-03-22", - to_date: str = None, - items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", - days_interval: int = 30, - ) -> pd.DataFrame: - """ - Looping get_response and iterating by date ranges defined in intervals. Stores outputs as DataFrames in a list. 
- At the end, daframes are concatenated in one and dropped duplicates that would appear when quering. - - Args: - source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. - to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. - items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] - days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. - - Returns: - pd.DataFrame: Dataframe of the concatanated data carried in the responses. - """ - - starts, ends = self.intervals( - from_date=from_date, to_date=to_date, days_interval=days_interval - ) - - dfs_list = [] - if len(starts) > 0 and len(ends) > 0: - for start, end in zip(starts, ends): - logger.info(f"ingesting data for dates [{start}]-[{end}]...") - df = self.get_response( - source=source, - from_date=start, - to_date=end, - items_per_page=items_per_page, - region=region, - ) - dfs_list.append(df) - if len(dfs_list) > 1: - df = pd.concat(dfs_list, axis=0, ignore_index=True) - else: - df = pd.DataFrame(dfs_list[0]) - else: - df = self.get_response( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) - df.drop_duplicates(inplace=True) - - if df.empty: - logger.error("No data for this date range") - - return df diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 03b30b286..abaa1fc2e 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -94,8 +94,8 @@ def get_data(self, """ try: jsons_list = json_response["data"] - except: - raise ValueError( + except KeyError: + logger.info( "Provided argument doesn't contain 'data' value. Pass json returned from the endpoint." ) From e9b4fcc65ae62bbe886054849f4c01a9fdab5f2d Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 6 Nov 2023 15:20:02 +0100 Subject: [PATCH 35/86] add valuerror log --- viadot/tasks/customer_gauge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index abaa1fc2e..aac47337b 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -132,7 +132,7 @@ def _field_reference_unpacker( list_properties = list(dictionary.values()) result[list_properties[0]] = list_properties[1] else: - raise ValueError() + raise ValueError(f"Dictionary within the specified field doesn't contain exactly two items.") if result: json_response[field] = result From 56b01823397693858e9ad1a5b7492dcfed941f2d Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 6 Nov 2023 15:22:08 +0100 Subject: [PATCH 36/86] rename variable --- viadot/tasks/customer_gauge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index aac47337b..c2782670d 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -158,12 +158,12 @@ def _nested_dict_transformer( Dict[str, Any]: The JSON response with modified nested dictionaries within the specified field. 
""" - d={} + result={} for i, dictionary in enumerate(json_response[field], start=1): for key, value in dictionary.items(): - d[f'{i}_{key}'] = value - if d: - json_response[field] = d + result[f'{i}_{key}'] = value + if result: + json_response[field] = result return json_response From 082ac69d6fb842647bddd14d6cba3f8e95afaad5 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 6 Nov 2023 15:27:21 +0100 Subject: [PATCH 37/86] Update docstrings for column_unpacker --- viadot/tasks/customer_gauge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index c2782670d..15ee94d15 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -175,9 +175,11 @@ def column_unpacker( ) -> List[Dict[str, Any]]: """ - Unpack and modify specific columns in a list of dictionaries using two methods, chosen by the user. - If user wants to use field_reference_unpacker, he needs to provide list of fields in `method1_cols` - argument, if user wants to use nested_dict_transformer - uses 'method2_cols' argument. + Function to unpack and modify specific columns in a list of dictionaries by using one of two methods, + chosen by the user. + If user would like to use field_reference_unpacker, he/she needs to provide list of fields as strings in + `method1_cols` parameter, if user would like to use nested_dict_transformer he/she needs to provide list of + fields as strings in method2_cols parameter. Args: json_list (List[Dict[str, Any]): A list of dictionaries containing the data. From 25c92f898a0cb84a8c2fa22d2cd36d2c45e77295 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 6 Nov 2023 15:35:07 +0100 Subject: [PATCH 38/86] update errors --- viadot/tasks/customer_gauge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 15ee94d15..28f26ae9e 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -189,7 +189,7 @@ def column_unpacker( Defaults to None. Raises: - ValueError: _description_ + ValueError: Input 'json_list' is required. Returns: List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. @@ -238,6 +238,9 @@ def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary. Defaults to None. + Raises: + TypeError: If the 'json_response' not a dictionary. + Returns: Dict[str, Any]: The flattened dictionary. 
""" From 8b96fb983faae41bf23a4d5244565a7879cb2fa1 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 13:52:52 +0100 Subject: [PATCH 39/86] precise exceptions --- viadot/tasks/customer_gauge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 28f26ae9e..5872e7fa7 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -206,9 +206,11 @@ def unpack_columns(columns, unpack_function): try: json_list_clean = list(map(lambda x: unpack_function(x, field), json_list_clean)) logger.info(f"All elements in '{field}' are unpacked successfully.") - except: + except ValueError as ve: logger.info(f"No transformation were made in '{field}'," "because didn't contain list of key-value data.") + except Exception as e: + logger.info(f"Error while unpacking {field}: {e}") else: logger.info(f"Column '{field}' not found.") return json_list_clean From c8c5bb071a27aae6a0fbbd69c1153d9a358653ee Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 14:25:37 +0100 Subject: [PATCH 40/86] checking duplicated columns --- viadot/tasks/customer_gauge.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 5872e7fa7..a1f9b2145 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -189,7 +189,9 @@ def column_unpacker( Defaults to None. Raises: - ValueError: Input 'json_list' is required. + ValueError: If 'json_list' is not provided. + ValueError: If specified columns do not exist in the JSON data. + ValueError: If columns are mentioned in both 'method1_cols' and 'method2_cols'. Returns: List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. @@ -215,17 +217,24 @@ def unpack_columns(columns, unpack_function): logger.info(f"Column '{field}' not found.") return json_list_clean - if method1_cols is not None: - json_list = unpack_columns( - columns = method1_cols, - unpack_function = self._field_reference_unpacker - ) - - if method2_cols is not None: - json_list = unpack_columns( - columns = method2_cols, - unpack_function = self._nested_dict_transformer + duplicated_cols = set(method1_cols).intersection(set(method2_cols)) + if duplicated_cols: + raise ValueError( + f"{duplicated_cols} were mentioned in both method1_cols and method2_cols." + " It's not possible to apply two methods to the same field." 
) + else: + if method1_cols is not None: + json_list = unpack_columns( + columns = method1_cols, + unpack_function = self._field_reference_unpacker + ) + + if method2_cols is not None: + json_list = unpack_columns( + columns = method2_cols, + unpack_function = self._nested_dict_transformer + ) return json_list From 21983216140985296ccdc0990e665a635a2f553e Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 14:52:49 +0100 Subject: [PATCH 41/86] adjust flattify nested function --- viadot/tasks/customer_gauge.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index a1f9b2145..7e88a59ee 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -242,7 +242,7 @@ def unpack_columns(columns, unpack_function): def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: """ Function that flattens a nested structure of the JSON object into - a single-level dictionary.Uses a nested `flatten()` function to recursively + a single-level dictionary. It uses a nested `flattify()` function to recursively combine nested keys in the JSON object with '_' to create the flattened keys. Args: @@ -260,15 +260,15 @@ def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: if not isinstance(json_response, dict): raise TypeError("Input must be a dictionary.") - def flattify(x, key="", out = None): + def flattify(field, key="", out = None): if out is None: out = result - if isinstance(x, dict): - for a in x: - flattify(x[a], key + a + "_", out) + if isinstance(field, dict): + for item in field.keys(): + flattify(field[item], key + item + "_", out) else: - out[key[:-1]] = x + out[key[:-1]] = field flattify(json_response) From c2d9dbaa6b38e65d12a27c79eb993c364389201a Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 15:08:39 +0100 Subject: [PATCH 42/86] improve if 'drivers' condition --- viadot/tasks/customer_gauge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 7e88a59ee..547c0e303 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -430,7 +430,7 @@ def run( logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) - if endpoint == "responses": + if "drivers" in list(df.columns): df["drivers"] = df["drivers"].apply(self._drivers_cleaner) df.columns = df.columns.str.lower().str.replace(" ", "_") logger.info("DataFrame: Ready. Data: Inserted. 
Let the magic happen!") From 01ec7388c1930ecd085c412352c72afb0b7374a6 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 15:37:52 +0100 Subject: [PATCH 43/86] update tests for source --- tests/integration/test_customer_gauge.py | 46 ------------------------ 1 file changed, 46 deletions(-) diff --git a/tests/integration/test_customer_gauge.py b/tests/integration/test_customer_gauge.py index 119615100..596cf029c 100644 --- a/tests/integration/test_customer_gauge.py +++ b/tests/integration/test_customer_gauge.py @@ -17,52 +17,6 @@ def test_get_json_content(): assert isinstance(json_response["cursor"], dict) -# def test_properties_cleaning(): -# json_response = CG.get_json_response() -# data = json_response["data"][2].copy() -# cleaned_data = CG.properties_cleaning(data.copy()) -# assert isinstance(data["properties"], list) -# assert isinstance(cleaned_data["properties"], dict) - - -# def test_flatten_json(): -# nested_json = { -# "user": { -# "name": "Jane", -# "address": { -# "street": "456 Elm St", -# "city": "San Francisco", -# "state": "CA", -# "zip": "94109", -# "country": {"name": "United States", "code": "US"}, -# }, -# "phone_numbers": {"type": "home", "number": "555-4321"}, -# } -# } - -# expected_output = { -# "user_name": "Jane", -# "user_address_street": "456 Elm St", -# "user_address_city": "San Francisco", -# "user_address_state": "CA", -# "user_address_zip": "94109", -# "user_address_country_name": "United States", -# "user_address_country_code": "US", -# "user_phone_numbers_type": "home", -# "user_phone_numbers_number": "555-4321", -# } - -# output = CG.flatten_json(nested_json) -# assert output == expected_output - - -# def test_pagesize_and_to_df(): -# json_response = CG.get_json_response(pagesize=1) -# df = CG.to_df(json_response) -# assert isinstance(df, pd.DataFrame) -# assert len(df) == 1 - - def test_pass_specific_cursor(): # for default pagesize=1000 returned cursor value should be bigger than passed cur = random.randint(1, 9999) From a0a9afc0384bb89643b121845625ad2b42091464 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 16:52:12 +0100 Subject: [PATCH 44/86] update get_data function --- viadot/tasks/customer_gauge.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 547c0e303..cad205479 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -86,18 +86,20 @@ def get_data(self, dictionary that contains data and cursor parameter value. Defaults to None. Raises: - ValueError: If the 'data' key is not present in the provided JSON response. + KeyError: If the 'data' key is not present in the provided JSON response. Returns: List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' part of the JSON response. """ + jsons_list=[] try: jsons_list = json_response["data"] except KeyError: - logger.info( + logger.error( "Provided argument doesn't contain 'data' value. Pass json returned from the endpoint." ) + raise return jsons_list @@ -216,8 +218,8 @@ def unpack_columns(columns, unpack_function): else: logger.info(f"Column '{field}' not found.") return json_list_clean - - duplicated_cols = set(method1_cols).intersection(set(method2_cols)) + if method1_cols and method2_cols: + duplicated_cols = set(method1_cols).intersection(set(method2_cols)) if duplicated_cols: raise ValueError( f"{duplicated_cols} were mentioned in both method1_cols and method2_cols." 
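
A minimal sketch of the unpacking pipeline introduced in the patches above, using the same data shape as the unit tests added further below. It exercises only the pure helper methods (`column_unpacker`, `flatten_json`, `square_brackets_remover`), not the API call itself; the import path and the `endpoint="responses"` constructor argument are assumptions based on how the task is used in the tests, not part of any commit in this series.

import pandas as pd
from viadot.tasks import CustomerGaugeToDF

# Constructor arguments are illustrative only; the task also accepts cursor/pagesize
# parameters, as shown in the integration tests below.
cg = CustomerGaugeToDF(endpoint="responses")

sample = {
    "number_customer": 266,
    "properties": [{"field": "City", "reference": "Eldorado"}],
    "drivers": [{"label": "Packaging"}, {"label": "Value for Money"}],
}

# 'properties' is unpacked with _field_reference_unpacker (method1_cols),
# 'drivers' with _nested_dict_transformer (method2_cols).
unpacked = cg.column_unpacker(
    json_list=[sample],
    method1_cols=["properties"],
    method2_cols=["drivers"],
)

flat = cg.flatten_json(unpacked[0])
# -> {'number_customer': 266, 'properties_City': 'Eldorado',
#     'drivers_1_label': 'Packaging', 'drivers_2_label': 'Value for Money'}

df = cg.square_brackets_remover(pd.DataFrame([flat]))
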
From 70ddfba80962c10868efbb9a33ac27309bf77e48 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 18:13:59 +0100 Subject: [PATCH 45/86] add typeerror handling --- viadot/tasks/customer_gauge.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index cad205479..cec4d4c5f 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -161,11 +161,14 @@ def _nested_dict_transformer( within the specified field. """ result={} - for i, dictionary in enumerate(json_response[field], start=1): - for key, value in dictionary.items(): - result[f'{i}_{key}'] = value - if result: - json_response[field] = result + try: + for i, dictionary in enumerate(json_response[field], start=1): + for key, value in dictionary.items(): + result[f'{i}_{key}'] = value + if result: + json_response[field] = result + except TypeError as te: + logger.error(te) return json_response @@ -198,7 +201,8 @@ def column_unpacker( Returns: List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. """ - + duplicated_cols = [] + if json_list is None: raise ValueError("Input 'json_list' is required.") From 1cda2da307f6aea0ea9513c4db5e3cee10ae1d37 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Tue, 7 Nov 2023 19:45:07 +0100 Subject: [PATCH 46/86] add tests to task --- .../integration/tasks/test_customer_gauge.py | 309 +++++++++++++++++ viadot/sources/vid_club.py | 315 ++++++++++++++++++ 2 files changed, 624 insertions(+) create mode 100644 viadot/sources/vid_club.py diff --git a/tests/integration/tasks/test_customer_gauge.py b/tests/integration/tasks/test_customer_gauge.py index 732205814..6cbd17078 100644 --- a/tests/integration/tasks/test_customer_gauge.py +++ b/tests/integration/tasks/test_customer_gauge.py @@ -8,10 +8,319 @@ CUR = 185000 PAGESIZE = 1000 +DATA_JSON = {'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 266, + 'date_email_sent': '2018-02-05 10:42:28', + 'properties': [{'field': 'Postal Code', 'reference': '999'}, + {'field': 'City', 'reference': 'Eldorado'}, + {'field': 'Currency', 'reference': None}, + {'field': 'Item Quantity', 'reference': '7'}, + {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': [{'label': 'Product Quality and Product Performance'}, + {'label': 'Function and Design'}, + {'label': 'Value for Money'}, + {'label': 'Packaging'}]} + +RAW_JSON = {'data': [{'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 266, + 'date_email_sent': '2018-02-05 10:42:28', + 'properties': [{'field': 'Postal Code', 'reference': '999'}, + {'field': 'City', 'reference': 'Eldorado'}, + {'field': 'Currency', 'reference': None}, + {'field': 'Item Quantity', 'reference': '7'}, + {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': [{'label': 'Product Quality and Product Performance'}, + {'label': 'Function and Design'}, + {'label': 'Value for Money'}, + {'label': 'Packaging'}]}, + {'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 206, + 'date_email_sent': '2018-02-05 10:41:01', + 'properties': [{'field': 'Postal Code', 'reference': '0000'}, + {'field': 'City', 'reference': 'Neverland'}, + {'field': 'Currency', 'reference': None}, + {'field': 'Item Quantity', 'reference': '1'}, + {'field': 'PostingDate', 
'reference': '2018-01-26 00:00:00'}], + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': [{'label': 'The website of the online shop (overall impression)'}, + {'label': 'Waiting period'}]}], + 'cursor': {'next': 37}} + +WRONG_DATA = {'cols':[ + {'field': 'City', 'reference': 'Eldorado'}, + {'field': 'Currency', 'reference': None}, + {'field': 'Item Quantity', 'reference': '7'}, + {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}]} @pytest.mark.looping_api_calls def test_customer_gauge_to_df_loop(): + """ + Test the 'run' method with looping API calls. + """ df = CG.run(total_load=True, cursor=CUR, pagesize=PAGESIZE) assert isinstance(df, pd.DataFrame) assert len(df) > PAGESIZE + + +@pytest.mark.get_data +def test_get_data(): + """ + Test the 'get_data' method with valid JSON data. + """ + json_data = CG.get_data(RAW_JSON) + assert isinstance(json_data, list) + + +@pytest.mark.get_data_error +def test_get_data_error_raising(): + """ + Test the 'get_data' method with invalid JSON data that raises a KeyError. + """ + with pytest.raises(KeyError): + CG.get_data(WRONG_DATA) + + +@pytest.mark.field_reference_unpacker_success +def test_field_reference_unpacker(): + """ + Test the '_field_reference_unpacker' method with valid data. It should unpack and modify dictionaries within the specified field and return the expected result. + """ + data = DATA_JSON.copy() + field = 'properties' + expected_result = { + 'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 266, + 'date_email_sent': '2018-02-05 10:42:28', + 'properties': {'Postal Code': '999', + 'City': 'Eldorado', + 'Currency': None, + 'Item Quantity': '7', + 'PostingDate': '2018-01-10 00:00:00'}, + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': [{'label': 'Product Quality and Product Performance'}, + {'label': 'Function and Design'}, + {'label': 'Value for Money'}, + {'label': 'Packaging'}] + } + + result = CG._field_reference_unpacker(json_response=data, field=field) + + assert result == expected_result + +@pytest.mark.field_reference_unpacker_value_error +def test_field_reference_unpacker_invalid_data_format(): + """ + Test the '_field_reference_unpacker' method with invalid data format that should raise a ValueError. It should raise a ValueError exception. + """ + data = DATA_JSON.copy() + field='contact' + with pytest.raises(ValueError, match=r"Dictionary within the specified field doesn't contain exactly two items."): + CG._field_reference_unpacker(json_response=data, field=field) + + +@pytest.mark.field_reference_unpacker_key_error +def test_field_reference_unpacker_missing_field(): + """ + Test the '_field_reference_unpacker' method with a missing field that should raise a KeyError. It should raise a KeyError exception. + """ + data = DATA_JSON.copy() + field = "non_existent_field" + with pytest.raises(KeyError): + CG._field_reference_unpacker(json_response=data, field=field) + + +@pytest.mark.nested_dict_transformer_success +def test_nested_dict_transformer(): + """ + Test the '_nested_dict_transformer' method with valid data. It should modify nested dictionaries within the specified field and return the expected result. 
+ """ + data = DATA_JSON.copy() + field = 'drivers' + expected_result = {'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 266, + 'date_email_sent': '2018-02-05 10:42:28', + 'properties': [{'field': 'Postal Code', 'reference': '999'}, + {'field': 'City', 'reference': 'Eldorado'}, + {'field': 'Currency', 'reference': None}, + {'field': 'Item Quantity', 'reference': '7'}, + {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': {'1_label': 'Product Quality and Product Performance', + '2_label': 'Function and Design', + '3_label': 'Value for Money', + '4_label': 'Packaging'}} + + result = CG._nested_dict_transformer(json_response=data, field=field) + + assert result == expected_result + + +@pytest.mark.nested_dict_transformer_type_error +def test_nested_dict_transformer_invalid_data_format(): + """ + Test the '_nested_dict_transformer' method with invalid data format. It should return the same data without modification. + """ + data = DATA_JSON.copy() + field='number_customer' + result = CG._nested_dict_transformer(json_response=data, field=field) + + assert result == data + + +@pytest.mark.nested_dict_transformer_key_error +def test_nested_dict_transformer_missing_field(): + """ + Test the '_nested_dict_transformer' method with a missing field that should raise a KeyError. + """ + data = DATA_JSON.copy() + field = "non_existent_field" + with pytest.raises(KeyError): + CG._nested_dict_transformer(json_response=data, field=field) + + +@pytest.mark.column_unpacker_success +def test_column_unpacker_success_method1_and_method2(): + """ + Test the 'column_unpacker' method with valid data and both Method 1 and Method 2 columns specified. It should return the expected result. + """ + data = RAW_JSON['data'].copy() + method1_cols = ['properties'] + method2_cols = ['drivers'] + + expected_result = [ + {'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 266, + 'date_email_sent': '2018-02-05 10:42:28', + 'properties': { + 'Postal Code': '999', + 'City': 'Eldorado', + 'Currency': None, + 'Item Quantity': '7', + 'PostingDate': '2018-01-10 00:00:00' + }, + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': {'1_label': 'Product Quality and Product Performance', + '2_label': 'Function and Design', + '3_label': 'Value for Money', + '4_label': 'Packaging'}}, + {'contact': {'first_name': '***', 'last_name': '***'}, + 'number_customer': 206, + 'date_email_sent': '2018-02-05 10:41:01', + 'properties': { + 'Postal Code': '0000', + 'City': 'Neverland', + 'Currency': None, + 'Item Quantity': '1', + 'PostingDate': '2018-01-26 00:00:00' + }, + 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], + 'drivers': {'1_label': 'The website of the online shop (overall impression)', + '2_label': 'Waiting period'}} + ] + + result = CG.column_unpacker(json_list=data, method1_cols=method1_cols, method2_cols=method2_cols) + + assert result == expected_result + + +@pytest.mark.test_column_unpacker_missing_json_argument +def test_column_unpacker_missing_json_list(): + """ + Test the 'column_unpacker' method with missing 'json_list' argument. It should raise a ValueError. 
+ """ + method1_cols = ['properties'] + method2_cols = ['drivers'] + with pytest.raises(ValueError, match="Input 'json_list' is required."): + CG.column_unpacker(json_list=None, method1_cols=method1_cols, method2_cols=method2_cols) + + +@pytest.mark.test_column_unpacker_duplicate_columns +def test_column_unpacker_duplicate_columns(): + """ + Test the 'column_unpacker' method with duplicate columns specified in both Method 1 and Method 2. It should raise a ValueError. + """ + data = RAW_JSON['data'].copy() + method1_cols = ['properties'] + method2_cols = ['properties'] + with pytest.raises(ValueError, match="{'properties'} were mentioned in both method1_cols and method2_cols. It's not possible to apply two methods to the same field."): + CG.column_unpacker(json_list=data, method1_cols=method1_cols, method2_cols=method2_cols) + + +@pytest.mark.test_flatten_json +def test_flatten_json(): + """ + Test the 'flatten_json' method with nested JSON data. It should return a flattened dictionary with expected keys and values. + """ + nested_json = { + "user": { + "name": "Jane", + "address": { + "street": "456 Elm St", + "city": "San Francisco", + "state": "CA", + "zip": "94109", + "country": {"name": "United States", "code": "US"}, + }, + "phone_numbers": {"type": "home", "number": "555-4321"}, + } + } + + expected_output = { + "user_name": "Jane", + "user_address_street": "456 Elm St", + "user_address_city": "San Francisco", + "user_address_state": "CA", + "user_address_zip": "94109", + "user_address_country_name": "United States", + "user_address_country_code": "US", + "user_phone_numbers_type": "home", + "user_phone_numbers_number": "555-4321", + } + + output = CG.flatten_json(nested_json) + assert output == expected_output + + +@pytest.mark.flatten_json_non_dict_input +def test_flatten_json_non_dict_input(): + """ + Test the 'flatten_json' method with non-dictionary input. It should raise a TypeError. + """ + input_json = [1, 2, 3] + with pytest.raises(TypeError): + CG.flatten_json(input_json) + + +@pytest.mark.square_brackets_remover +def test_square_brackets_remover_success(): + """ + Test the 'square_brackets_remover' method with a DataFrame containing square brackets. It should remove square brackets from the DataFrame. + """ + data = { + "Column1": ["Value1", "[Value2]", "Value3", "[Value4]"], + "Column2": ["1", "[2]", "3", "[4]"], + } + sample_df = pd.DataFrame(data) + + expected_data = { + "Column1": ["Value1", "Value2", "Value3", "Value4"], + "Column2": ["1", "2", "3", "4"], + } + expected_df = pd.DataFrame(expected_data) + + result = CG.square_brackets_remover(sample_df) + pd.testing.assert_frame_equal(result, expected_df) + + +@pytest.mark.drivers_cleaner +def test_drivers_cleaner_success(): + """ + Test the '_drivers_cleaner' method with valid 'drivers' data. It should clean and format the 'drivers' data and return the expected result. 
+ """ + data = "{'label': 'Driver1'}, {'label': 'Driver2'}, {'label': 'Driver3'}" + expected_result = "Driver1, Driver2, Driver3" + result = CG._drivers_cleaner(data) + assert result == expected_result \ No newline at end of file diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py new file mode 100644 index 000000000..e7819577a --- /dev/null +++ b/viadot/sources/vid_club.py @@ -0,0 +1,315 @@ +import json +import os +import urllib +from datetime import date, datetime, timedelta +from typing import Any, Dict, List, Literal, Tuple + +import pandas as pd +from prefect.utilities import logging + +from ..exceptions import CredentialError, ValidationError +from ..utils import handle_api_response +from .base import Source + +logger = logging.get_logger() + + +class VidClub(Source): + """ + A class implementing the Vid Club API. + + Documentation for this API is located at: https://evps01.envoo.net/vipapi/ + There are 4 endpoints where to get the data. + """ + + def __init__(self, credentials: Dict[str, Any], *args, **kwargs): + """ + Create an instance of VidClub. + + Args: + credentials (Dict[str, Any]): Credentials to Vid Club APIs containing token. + + Raises: + CredentialError: If credentials are not provided as a parameter. + """ + self.headers = { + "Authorization": "Bearer " + credentials["token"], + "Content-Type": "application/json", + } + + super().__init__(*args, credentials=credentials, **kwargs) + + def build_query( + self, + from_date: str, + to_date: str, + api_url: str, + items_per_page: int, + source: Literal["jobs", "product", "company", "survey"] = None, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + ) -> str: + """ + Builds the query from the inputs. + + Args: + from_date (str): Start date for the query. + to_date (str): End date for the query, if empty, will be executed as datetime.today().strftime("%Y-%m-%d"). + api_url (str): Generic part of the URL to Vid Club API. + items_per_page (int): number of entries per page. + source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + + Returns: + str: Final query with all filters added. + + Raises: + ValidationError: If any source different than the ones in the list are used. + """ + if source in ["jobs", "product", "company"]: + url = f"{api_url}{source}?from={from_date}&to={to_date}®ion={region}&limit={items_per_page}" + elif source == "survey": + url = f"{api_url}{source}?language=en&type=question" + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) + return url + + def intervals( + self, from_date: str, to_date: str, days_interval: int + ) -> Tuple[List[str], List[str]]: + """ + Breaks dates range into smaller by provided days interval. + + Args: + from_date (str): Start date for the query in "%Y-%m-%d" format. + to_date (str): End date for the query, if empty, will be executed as datetime.today().strftime("%Y-%m-%d"). + days_interval (int): Days specified in date range per api call (test showed that 30-40 is optimal for performance). + + Returns: + List[str], List[str]: Starts and Ends lists that contains information about date ranges for specific period and time interval. + + Raises: + ValidationError: If the final date of the query is before the start date. 
+ """ + + if to_date == None: + to_date = datetime.today().strftime("%Y-%m-%d") + + end_date = datetime.strptime(to_date, "%Y-%m-%d").date() + start_date = datetime.strptime(from_date, "%Y-%m-%d").date() + + from_date_obj = datetime.strptime(from_date, "%Y-%m-%d") + + to_date_obj = datetime.strptime(to_date, "%Y-%m-%d") + delta = to_date_obj - from_date_obj + + if delta.days < 0: + raise ValidationError("to_date cannot be earlier than from_date.") + + interval = timedelta(days=days_interval) + starts = [] + ends = [] + + period_start = start_date + while period_start < end_date: + period_end = min(period_start + interval, end_date) + starts.append(period_start.strftime("%Y-%m-%d")) + ends.append(period_end.strftime("%Y-%m-%d")) + period_start = period_end + if len(starts) == 0 and len(ends) == 0: + starts.append(from_date) + ends.append(to_date) + return starts, ends + + def check_connection( + self, + source: Literal["jobs", "product", "company", "survey"] = None, + from_date: str = "2022-03-22", + to_date: str = None, + items_per_page: int = 100, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + url: str = None, + ) -> Tuple[Dict[str, Any], str]: + """ + Initiate first connection to API to retrieve piece of data with information about type of pagination in API URL. + This option is added because type of pagination for endpoints is being changed in the future from page number to 'next' id. + + Args: + source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. + from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. + to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. + items_per_page (int, optional): Number of entries per page. 100 entries by default. + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + url (str, optional): Generic part of the URL to Vid Club API. Defaults to None. + + Returns: + Tuple[Dict[str, Any], str]: Dictionary with first response from API with JSON containing data and used URL string. + + Raises: + ValidationError: If from_date is earlier than 2022-03-22. + ValidationError: If to_date is earlier than from_date. + """ + + if from_date < "2022-03-22": + raise ValidationError("from_date cannot be earlier than 2022-03-22.") + + if to_date < from_date: + raise ValidationError("to_date cannot be earlier than from_date.") + + if url is None: + url = self.credentials["url"] + + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + + return (response, first_url) + + def get_response( + self, + source: Literal["jobs", "product", "company", "survey"] = None, + from_date: str = "2022-03-22", + to_date: str = None, + items_per_page: int = 100, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + ) -> pd.DataFrame: + """ + Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. 
+ + Args: + source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. + from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. + to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. + items_per_page (int, optional): Number of entries per page. 100 entries by default. + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + + Returns: + pd.DataFrame: Table of the data carried in the response. + + Raises: + ValidationError: If any source different than the ones in the list are used. + """ + headers = self.headers + if source not in ["jobs", "product", "company", "survey"]: + raise ValidationError( + "The source has to be: jobs, product, company or survey" + ) + if to_date == None: + to_date = datetime.today().strftime("%Y-%m-%d") + + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + + if isinstance(response, dict): + keys_list = list(response.keys()) + elif isinstance(response, list): + keys_list = list(response[0].keys()) + else: + keys_list = [] + + if "next" in keys_list: + ind = True + else: + ind = False + + if "data" in keys_list: + df = pd.DataFrame(response["data"]) + length = df.shape[0] + page = 1 + + while length == items_per_page: + if ind == True: + next = response["next"] + url = f"{first_url}&next={next}" + else: + page += 1 + url = f"{first_url}&page={page}" + r = handle_api_response( + url=url, headers=headers, method="GET", verify=False + ) + response = r.json() + df_page = pd.DataFrame(response["data"]) + if source == "product": + df_page = df_page.transpose() + length = df_page.shape[0] + df = pd.concat((df, df_page), axis=0) + else: + df = pd.DataFrame(response) + + return df + + def total_load( + self, + source: Literal["jobs", "product", "company", "survey"] = None, + from_date: str = "2022-03-22", + to_date: str = None, + items_per_page: int = 100, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + days_interval: int = 30, + ) -> pd.DataFrame: + """ + Looping get_response and iterating by date ranges defined in intervals. Stores outputs as DataFrames in a list. + At the end, daframes are concatenated in one and dropped duplicates that would appear when quering. + + Args: + source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. + from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. + to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. + items_per_page (int, optional): Number of entries per page. 100 entries by default. + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. + + Returns: + pd.DataFrame: Dataframe of the concatanated data carried in the responses. 
+ """ + + starts, ends = self.intervals( + from_date=from_date, to_date=to_date, days_interval=days_interval + ) + + dfs_list = [] + if len(starts) > 0 and len(ends) > 0: + for start, end in zip(starts, ends): + logger.info(f"ingesting data for dates [{start}]-[{end}]...") + df = self.get_response( + source=source, + from_date=start, + to_date=end, + items_per_page=items_per_page, + region=region, + ) + dfs_list.append(df) + if len(dfs_list) > 1: + df = pd.concat(dfs_list, axis=0, ignore_index=True) + else: + df = pd.DataFrame(dfs_list[0]) + else: + df = self.get_response( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + df.drop_duplicates(inplace=True) + + if df.empty: + logger.error("No data for this date range") + + return df From 3fca2c30ea2a23c5796ef4778e8ec4201563a99b Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 8 Nov 2023 13:18:32 +0100 Subject: [PATCH 47/86] =?UTF-8?q?=F0=9F=8E=A8=20added=20new=20agent=20inte?= =?UTF-8?q?raction=20view=20type=20to=20Genesys.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/genesys.py | 1 + viadot/tasks/genesys.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/viadot/sources/genesys.py b/viadot/sources/genesys.py index 6be907a66..71dfa5209 100644 --- a/viadot/sources/genesys.py +++ b/viadot/sources/genesys.py @@ -322,6 +322,7 @@ def download_all_reporting_exports( "queue_performance_detail_view", "queue_interaction_detail_view", "agent_status_detail_view", + "agent_interaction_detail_view", ]: file_name = f"{self.view_type.upper()}_{next(self.count)}_{date}" elif single_report[4].lower() in [ diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index de47ddebf..7e67dab07 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -10,8 +10,8 @@ from prefect.engine import signals from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs -from viadot.task_utils import * +from viadot.task_utils import validate_df from viadot.exceptions import APIError from viadot.sources import Genesys @@ -385,6 +385,7 @@ def run( "agent_performance_summary_view", "agent_status_summary_view", "agent_status_detail_view", + "agent_interaction_detail_view", ]: genesys.genesys_api_connection( post_data_list=post_data_list, end_point=end_point From 2e596b980f0a19bb83538e9bf657d1615c236440 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 8 Nov 2023 13:20:17 +0100 Subject: [PATCH 48/86] =?UTF-8?q?=F0=9F=93=9D=20updated=20CHANGELOG.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 507c590cf..2bea72907 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added new view type `agent_interaction_view_type` in `Genesys`source. + ## [0.4.21] - 2023-10-26 ### Added - Added `validate_df` task to task_utils. 
From 2470fdfa4e9be043cb8697a306007cc942cfc8f0 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Wed, 8 Nov 2023 14:14:43 +0100 Subject: [PATCH 49/86] =?UTF-8?q?=F0=9F=93=9D=20updated=20import.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 7e67dab07..428e699a0 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -11,7 +11,7 @@ from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs -from viadot.task_utils import validate_df +from viadot.task_utils import * from viadot.exceptions import APIError from viadot.sources import Genesys From c9c3c500a4ddea07837d71f394565556d472ce82 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 9 Nov 2023 12:24:48 +0100 Subject: [PATCH 50/86] fix typos in changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bcf45aef..c1127a079 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,8 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Modified `CustomerGauge` source class with simplified logic to return json structure. -- Expand `CustomerGaugeToDF` task class with separate cleaning functions and handling nested json structure flattening with two new methods `_field_reference_unpacker` and `_nested_dict_transformer`. -- Change `CustomerGaugeToADLS` to containg new arguments. +- Expanded `CustomerGaugeToDF` task class with separate cleaning functions and handling nested json structure flattening with two new methods `_field_reference_unpacker` and `_nested_dict_transformer`. +- Changed `CustomerGaugeToADLS` to containing new arguments. ## [0.4.21] - 2023-10-26 ### Added From b46b3e9f3a4b270d52442da73e10ca4c65c6305e Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Thu, 9 Nov 2023 13:04:20 +0100 Subject: [PATCH 51/86] simplify cleaning drivers --- viadot/tasks/customer_gauge.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index cec4d4c5f..b24a09f75 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -313,13 +313,9 @@ def _drivers_cleaner( str: A cleaned and formatted string of driver data. 
""" - drivers = drivers.split("}, {") - cleaned_drivers = [] - for driver in drivers: - driver = driver.replace("{", "").replace("}", "") - driver = driver.replace("'", "").replace("label: ", "") - cleaned_drivers.append(driver) - return ', '.join(cleaned_drivers) + cleaned_drivers = drivers.replace("{", "").replace("}", "").replace("'", "").replace("label: ", "") + + return cleaned_drivers def __call__(self): """Download Customer Gauge data to a DF""" From f10ddfed4fd3b140c6dbdd26f3465936bc87e51a Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Fri, 10 Nov 2023 14:49:30 +0100 Subject: [PATCH 52/86] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Splitted=20test=20fo?= =?UTF-8?q?r=20Eurostat=20on=20source=20tests=20and=20task=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 + tests/integration/tasks/test_eurostat.py | 132 --------------------- tests/integration/test_eurostat.py | 140 +++++++++++++++++++++++ 3 files changed, 143 insertions(+), 132 deletions(-) create mode 100644 tests/integration/test_eurostat.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 76eb3280b..2e3709821 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Splitted test for Eurostat on source tests and task tests + ## [0.4.21] - 2023-10-26 ### Added - Added `validate_df` task to task_utils. diff --git a/tests/integration/tasks/test_eurostat.py b/tests/integration/tasks/test_eurostat.py index 475d60190..7fa01dd58 100644 --- a/tests/integration/tasks/test_eurostat.py +++ b/tests/integration/tasks/test_eurostat.py @@ -6,138 +6,6 @@ from viadot.tasks import eurostat -def test_and_validate_dataset_code_without_params(caplog): - """This function is designed to test the accuracy of the data retrieval feature in a program. - Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct - dataset code is provided without any parameters. The function is intended to be used in software - development to verify that the program is correctly retrieving data from the appropriate dataset. - """ - task = eurostat.EurostatToDF(dataset_code="ILC_DI04").run() - assert isinstance(task, pd.DataFrame) - assert not task.empty - assert caplog.text == "" - - -def test_wrong_dataset_code_logger(caplog): - """This function is designed to test the accuracy of the error logging feature in a program. - Specifically, it tests to ensure that the program is able to correctly identify and log errors - when provided with only incorrect dataset code. - The function is intended to be used in software development to identify correct type errors - and messages in the program's handling of codes. - """ - task = eurostat.EurostatToDF(dataset_code="ILC_DI04E") - - with pytest.raises(ValueError, match="DataFrame is empty!"): - with caplog.at_level(logging.ERROR): - task.run() - assert ( - f"Failed to fetch data for ILC_DI04E, please check correctness of dataset code!" - in caplog.text - ) - - -def test_wrong_parameters_codes_logger(caplog): - """This function is designed to test the accuracy of the error logging feature in a program. - Specifically, it tests to ensure that the program is able to correctly identify and log errors - when provided with a correct dataset_code and correct parameters are provided, but both parameters codes are incorrect. 
- The function is intended to be used in software development to identify correct type errors - and messages in the program's handling of codes. - """ - task = eurostat.EurostatToDF( - dataset_code="ILC_DI04", - params={"hhtyp": "total1", "indic_il": "non_existing_code"}, - ) - - with pytest.raises(ValueError, match="DataFrame is empty!"): - with caplog.at_level(logging.ERROR): - task.run() - assert ( - f"Parameters codes: 'total1 | non_existing_code' are not available. Please check your spelling!" - in caplog.text - ) - assert ( - f"You can find everything via link: https://ec.europa.eu/eurostat/databrowser/view/ILC_DI04/default/table?lang=en" - in caplog.text - ) - - -def test_parameter_codes_as_list_logger(caplog): - """This function is designed to test the accuracy of the error logging feature in a program. - Specifically, it tests to ensure that the program is able to correctly identify and log errors - when provided with a correct dataset code, correct parameters, but incorrect parameters codes structure - (as a list with strings, instead of single string). - The function is intended to be used in software development to identify correct type errors - and messages in the program's handling of codes. - """ - - task = eurostat.EurostatToDF( - dataset_code="ILC_DI04", - params={"hhtyp": ["totale", "nottotale"], "indic_il": "med_e"}, - ) - with pytest.raises(ValueError, match="Wrong structure of params!"): - with caplog.at_level(logging.ERROR): - task.run() - assert ( - "You can provide only one code per one parameter as 'str' in params!\n" - in caplog.text - ) - assert ( - "CORRECT: params = {'unit': 'EUR'} | INCORRECT: params = {'unit': ['EUR', 'USD', 'PLN']}" - in caplog.text - ) - - -def test_wrong_parameters(caplog): - """This function is designed to test the accuracy of the error logging feature in a program. - Specifically, it tests to ensure that the program is able to correctly identify and log errors - when provided with a correct dataset_code, but incorrect parameters keys. - The function is intended to be used in software development to identify correct type errors - and messages in the program's handling of codes. - """ - - task = eurostat.EurostatToDF( - dataset_code="ILC_DI04", params={"hhhtyp": "total", "indic_ilx": "med_e"} - ) - with pytest.raises(ValueError, match="DataFrame is empty!"): - with caplog.at_level(logging.ERROR): - task.run() - assert ( - f"Parameters: 'hhhtyp | indic_ilx' are not in dataset. Please check your spelling!\n" - in caplog.text - ) - assert ( - f"Possible parameters: freq | hhtyp | indic_il | unit | geo | time" - in caplog.text - ) - - -def test_params_as_list(): - """This function is designed to test the accuracy of the error logging feature in a program. - Specifically, it tests to ensure that the program is able to correctly identify and log error - when provided with a correct dataset_code, but incorrect params structure (as list instead of dict). - The function is intended to be used in software development to identify correct type errors - and messages in the program's handling of codes. - """ - with pytest.raises(TypeError, match="Params should be a dictionary."): - eurostat.EurostatToDF(dataset_code="ILC_DI04", params=["total", "med_e"]).run() - - -def test_correct_params_and_dataset_code(caplog): - """This function is designed to test the accuracy of the data retrieval feature in a program. - Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct - dataset code is provided with correct params. 
The function is intended to be used in software - development to verify that the program is correctly retrieving data from the appropriate dataset. - """ - - task = eurostat.EurostatToDF( - dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"} - ).run() - - assert isinstance(task, pd.DataFrame) - assert not task.empty - assert caplog.text == "" - - def task_correct_requested_columns(caplog): """This function is designed to test the accuracy of the data retrieval feature in a program. Specifically, it tests to ensure that the program is able to correctly identify and log error diff --git a/tests/integration/test_eurostat.py b/tests/integration/test_eurostat.py new file mode 100644 index 000000000..6fb64cbea --- /dev/null +++ b/tests/integration/test_eurostat.py @@ -0,0 +1,140 @@ +import logging + +import pandas as pd +import pytest + +from viadot.sources import Eurostat + + +def test_and_validate_dataset_code_without_params(caplog): + """This function is designed to test the accuracy of the data retrieval feature in a program. + Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct + dataset code is provided without any parameters. The function is intended to be used in software + development to verify that the program is correctly retrieving data from the appropriate dataset. + """ + source = Eurostat(dataset_code="ILC_DI04").get_data_frame_from_response() + assert isinstance(source, pd.DataFrame) + assert not source.empty + assert caplog.text == "" + + +def test_wrong_dataset_code_logger(caplog): + """This function is designed to test the accuracy of the error logging feature in a program. + Specifically, it tests to ensure that the program is able to correctly identify and log errors + when provided with only incorrect dataset code. + The function is intended to be used in software development to identify correct type errors + and messages in the program's handling of codes. + """ + source = Eurostat(dataset_code="ILC_DI04E") + + with pytest.raises(ValueError, match="DataFrame is empty!"): + with caplog.at_level(logging.ERROR): + source.get_data_frame_from_response() + assert ( + f"Failed to fetch data for ILC_DI04E, please check correctness of dataset code!" + in caplog.text + ) + + +def test_wrong_parameters_codes_logger(caplog): + """This function is designed to test the accuracy of the error logging feature in a program. + Specifically, it tests to ensure that the program is able to correctly identify and log errors + when provided with a correct dataset_code and correct parameters are provided, but both parameters codes are incorrect. + The function is intended to be used in software development to identify correct type errors + and messages in the program's handling of codes. + """ + source = Eurostat( + dataset_code="ILC_DI04", + params={"hhtyp": "total1", "indic_il": "non_existing_code"}, + ) + + with pytest.raises(ValueError, match="DataFrame is empty!"): + with caplog.at_level(logging.ERROR): + source.get_data_frame_from_response() + assert ( + f"Parameters codes: 'total1 | non_existing_code' are not available. Please check your spelling!" + in caplog.text + ) + assert ( + f"You can find everything via link: https://ec.europa.eu/eurostat/databrowser/view/ILC_DI04/default/table?lang=en" + in caplog.text + ) + + +def test_parameter_codes_as_list_logger(caplog): + """This function is designed to test the accuracy of the error logging feature in a program. 
+ Specifically, it tests to ensure that the program is able to correctly identify and log errors + when provided with a correct dataset code, correct parameters, but incorrect parameters codes structure + (as a list with strings, instead of single string). + The function is intended to be used in software development to identify correct type errors + and messages in the program's handling of codes. + """ + + source = Eurostat( + dataset_code="ILC_DI04", + params={"hhtyp": ["totale", "nottotale"], "indic_il": "med_e"}, + ) + with pytest.raises(ValueError, match="Wrong structure of params!"): + with caplog.at_level(logging.ERROR): + source.get_data_frame_from_response() + assert ( + "You can provide only one code per one parameter as 'str' in params!\n" + in caplog.text + ) + assert ( + "CORRECT: params = {'unit': 'EUR'} | INCORRECT: params = {'unit': ['EUR', 'USD', 'PLN']}" + in caplog.text + ) + + +def test_wrong_parameters(caplog): + """This function is designed to test the accuracy of the error logging feature in a program. + Specifically, it tests to ensure that the program is able to correctly identify and log errors + when provided with a correct dataset_code, but incorrect parameters keys. + The function is intended to be used in software development to identify correct type errors + and messages in the program's handling of codes. + """ + + source = Eurostat( + dataset_code="ILC_DI04", params={"hhhtyp": "total", "indic_ilx": "med_e"} + ) + with pytest.raises(ValueError, match="DataFrame is empty!"): + with caplog.at_level(logging.ERROR): + source.get_data_frame_from_response() + assert ( + f"Parameters: 'hhhtyp | indic_ilx' are not in dataset. Please check your spelling!\n" + in caplog.text + ) + assert ( + f"Possible parameters: freq | hhtyp | indic_il | unit | geo | time" + in caplog.text + ) + + +def test_params_as_list(): + """This function is designed to test the accuracy of the error logging feature in a program. + Specifically, it tests to ensure that the program is able to correctly identify and log error + when provided with a correct dataset_code, but incorrect params structure (as list instead of dict). + The function is intended to be used in software development to identify correct type errors + and messages in the program's handling of codes. + """ + with pytest.raises(TypeError, match="Params should be a dictionary."): + Eurostat( + dataset_code="ILC_DI04", params=["total", "med_e"] + ).get_data_frame_from_response() + + +def test_correct_params_and_dataset_code(caplog): + """This function is designed to test the accuracy of the data retrieval feature in a program. + Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct + dataset code is provided with correct params. The function is intended to be used in software + development to verify that the program is correctly retrieving data from the appropriate dataset. 
+ """ + + source = Eurostat( + dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"} + ).get_data_frame_from_response() + + assert isinstance(source, pd.DataFrame) + assert not source.empty + assert caplog.text == "" From 5fe4fe14cc5d35eb12c47ab8ac8c81f6574655dc Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 10 Nov 2023 16:31:21 +0100 Subject: [PATCH 53/86] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20for?= =?UTF-8?q?=20Mediatool?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_mediatool.py | 37 +++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_mediatool.py b/tests/integration/test_mediatool.py index f5a2d81a2..45b9da48b 100644 --- a/tests/integration/test_mediatool.py +++ b/tests/integration/test_mediatool.py @@ -5,7 +5,7 @@ import pytest from prefect.tasks.secrets import PrefectSecret -from viadot.exceptions import APIError +from viadot.exceptions import APIError, CredentialError from viadot.sources import Mediatool from viadot.task_utils import credentials_loader @@ -13,6 +13,11 @@ MTOOL = Mediatool(credentials=CREDENTIALS) +def test_init_empty_credentials(): + with pytest.raises(CredentialError, match=r"Missing credentials."): + Mediatool(credentials={}) + + def test_get_campaigns_df(): camps = MTOOL.get_campaigns(CREDENTIALS["ORG"]) assert isinstance(camps, pd.DataFrame) @@ -29,6 +34,13 @@ def test_get_organizations(): assert isinstance(orgs, pd.DataFrame) +def test_get_organizations_return_list(): + orgs = MTOOL.get_organizations( + user_id=CREDENTIALS["USER_ID"], return_dataframe=False + ) + assert isinstance(orgs, list) + + def test_get_media_entries(): media_entries = MTOOL.get_media_entries( organization_id=CREDENTIALS["ORG"], columns=["_id"] @@ -36,6 +48,13 @@ def test_get_media_entries(): assert isinstance(media_entries, pd.DataFrame) +def test_get_media_entries_wrong_columns(caplog): + MTOOL.get_media_entries( + organization_id=CREDENTIALS["ORG"], columns=["wrong_column", "random_column"] + ) + assert "Columns ['wrong_column', 'random_column'] are incorrect." 
in caplog.text + + def test_get_media_types_correct_id(): media_types = MTOOL.get_media_types(media_type_ids=[CREDENTIALS["MEDIA_TYPE_ID"]]) assert isinstance(media_types, pd.DataFrame) @@ -48,11 +67,25 @@ def test_get_media_types_wrong_id(): _ = MTOOL.get_media_types(["040404"]) -def test_get_vehicles(caplog): +def test_get_media_types_return_list(): + media_types = MTOOL.get_media_types( + media_type_ids=[CREDENTIALS["MEDIA_TYPE_ID"]], return_dataframe=False + ) + assert isinstance(media_types, list) + + +def test_get_vehicles_wrong_ids(caplog): _ = MTOOL.get_vehicles(vehicle_ids=["100000", "200000"]) assert "Vehicle were not found for: ['100000', '200000']" in caplog.text +def test_get_vehicles_return_dict(): + vehicles = MTOOL.get_vehicles( + vehicle_ids=[CREDENTIALS["VEHICLE_ID"]], return_dataframe=False + ) + assert isinstance(vehicles, dict) + + def test_rename_columns_correct(): data = {"id": [1, 2], "amount": [3, 4]} df = pd.DataFrame(data=data) From 5972e27650ce33d1392f71efa3549e47569ab7da Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 10 Nov 2023 16:34:01 +0100 Subject: [PATCH 54/86] =?UTF-8?q?=F0=9F=90=9B=20Fixed=20return=20types=20a?= =?UTF-8?q?nd=20error=20handling=20for=20Mediatool=20source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/mediatool.py | 40 +++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/viadot/sources/mediatool.py b/viadot/sources/mediatool.py index 88e358ee7..4b40c8739 100644 --- a/viadot/sources/mediatool.py +++ b/viadot/sources/mediatool.py @@ -1,7 +1,7 @@ import inspect import json from datetime import date, timedelta -from typing import List +from typing import List, Union import pandas as pd from prefect.utilities import logging @@ -35,14 +35,14 @@ def __init__( organization_id (str, optional): Organization ID. Defaults to None. user_id (str, optional): User ID. Defaults to None. """ - if credentials is not None: - try: - self.header = {"Authorization": f"Bearer {credentials.get('TOKEN')}"} - except: - raise CredentialError("Credentials not found.") + if any([rq not in credentials for rq in ["TOKEN", "USER_ID"]]): + raise CredentialError( + "Missing credentials. 'TOKEN' and 'USER_ID' are required." + ) super().__init__(*args, credentials=credentials, **kwargs) + self.header = {"Authorization": f"Bearer {self.credentials.get('TOKEN')}"} self.organization_id = organization_id or self.credentials.get( "ORGANIZATION_ID" ) @@ -80,7 +80,7 @@ def get_media_entries( end_date: str = None, time_delta: int = 360, return_dataframe: bool = True, - ) -> pd.DataFrame: + ) -> Union[pd.DataFrame, dict]: """ Get data for media entries. This is a main function. Media entries contain IDs for most of the fields for other endpoints.Returns DataFrame or Dict. @@ -95,7 +95,7 @@ def get_media_entries( Defaults to True. Returns: - pd.DataFrame: Default return dataframe If 'return_daframe=False' then return list of dicts. + Union[pd.DataFrame, dict]: Default return dataframe If 'return_daframe=False' then return list of dicts. """ today = date.today() @@ -119,9 +119,11 @@ def get_media_entries( columns = df.columns try: df_filtered = df[columns] - except KeyError as e: - logger.info(e) - return df_filtered + return df_filtered + except KeyError: + logger.error( + f"Columns {columns} are incorrect. Whole dictionary for 'mediaEntries' will be returned." + ) return response_dict["mediaEntries"] @@ -137,7 +139,7 @@ def get_campaigns( Defaults to True. 
Returns: - pd.DataFrame: Default return dataframe If 'return_daframe=False' then return list of dicts. + pd.DataFrame: Default return dataframe If 'return_daframe=False' then return dictionary. """ url_campaigns = ( f"https://api.mediatool.com/organizations/{organization_id}/campaigns" @@ -168,7 +170,7 @@ def get_vehicles( self, vehicle_ids: List[str], return_dataframe: bool = True, - ) -> pd.DataFrame: + ) -> Union[pd.DataFrame, dict]: """ Get vehicles data based on the organization IDs. Returns DataFrame or Dict. @@ -178,7 +180,7 @@ def get_vehicles( Defaults to True. Returns: - pd.DataFrame: Default return dataframe. If 'return_daframe=False' then return list of dicts. + Union[pd.DataFrame, dict]: Default return dataframe. If 'return_daframe=False' then return dictionary. """ response_dict = {} dfs = [] @@ -211,11 +213,11 @@ def get_vehicles( return df_updated return None - return response_dict["vehicles"] + return response_dict["vehicle"] def get_organizations( self, user_id: str = None, return_dataframe: bool = True - ) -> pd.DataFrame: + ) -> Union[pd.DataFrame, List[dict]]: """ Get organizations data based on the user ID. Returns DataFrame or Dict. @@ -225,7 +227,7 @@ def get_organizations( Defaults to True. Returns: - pd.DataFrame: Default return dataframe. If 'return_daframe=False' then return list of dicts. + Union[pd.DataFrame, List[dict]]: Default return dataframe. If 'return_daframe=False' then return list of dicts. """ user_id = user_id or self.user_id url_organizations = f"https://api.mediatool.com/users/{user_id}/organizations" @@ -258,7 +260,7 @@ def get_organizations( def get_media_types( self, media_type_ids: List[str], return_dataframe: bool = True - ) -> pd.DataFrame: + ) -> Union[pd.DataFrame, List[dict]]: """ Get media types data based on the media types ID. User have to provide list of media type IDs. Returns DataFrame or Dict. @@ -269,7 +271,7 @@ def get_media_types( Defaults to True. Returns: - pd.DataFrame: Default return dataframe. If 'return_daframe=False' then return list of dicts. + Union[pd.DataFrame, List[dict]]: Default return dataframe. If 'return_daframe=False' then return list of dicts. 
""" list_media_types = [] for id_media_type in media_type_ids: From 047e9cd8a2247ca794e67915411baad75bdcc042 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 10 Nov 2023 16:35:50 +0100 Subject: [PATCH 55/86] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20for?= =?UTF-8?q?=20Hubspot=20source=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_hubspot.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration/test_hubspot.py b/tests/integration/test_hubspot.py index 0907e031a..5963df3ee 100644 --- a/tests/integration/test_hubspot.py +++ b/tests/integration/test_hubspot.py @@ -2,6 +2,7 @@ import pandas as pd import pytest +from viadot.exceptions import CredentialError from viadot.sources import Hubspot from viadot.task_utils import credentials_loader @@ -40,6 +41,11 @@ def var_dictionary(): yield variables +def test_credentials_not_provided(): + with pytest.raises(CredentialError, match="Credentials not found."): + Hubspot(credentials={}) + + def test_clean_special_characters(): test_value = "762##28cd7-e$69d-4708-be31-726bb!859befd" clean_chars = HUBSPOT.clean_special_characters(value=test_value) @@ -79,3 +85,8 @@ def test_to_json(var_dictionary): trigger = HUBSPOT.to_json(url=api_url, body=api_body, method="POST") assert isinstance(trigger, dict) + + +def test_get_properties_url(var_dictionary): + url = HUBSPOT.get_properties_url(endpoint=var_dictionary["endpoint"]) + assert isinstance(url, str) From b2d7ad07594e6d91d9d043efd382ab0ae823bf6a Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 10 Nov 2023 16:36:31 +0100 Subject: [PATCH 56/86] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20for?= =?UTF-8?q?=20Genesys=20source=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_genesys.py | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_genesys.py b/tests/integration/test_genesys.py index 817e590b5..8508978f1 100644 --- a/tests/integration/test_genesys.py +++ b/tests/integration/test_genesys.py @@ -1,6 +1,7 @@ from unittest import mock import pytest +import logging from viadot.sources import Genesys @@ -138,6 +139,13 @@ def test_default_credential_param(): assert g.credentials != None and type(g.credentials) == dict +@pytest.mark.init +def test_default_credentials_provided(caplog): + with caplog.at_level(logging.INFO): + Genesys(credentials_genesys={"CREDENTIALS_KEY": "value"}) + assert "Credentials provided by user" in caplog.text + + @pytest.mark.init def test_environment_param(): g = Genesys() @@ -169,15 +177,35 @@ def test_generate_api_connection(mock_api_response, var_dictionary): mock_api_response.assert_called() +def test_api_connection_return_type(): + conn_dict = Genesys().genesys_api_connection(post_data_list=["test_value_to_post"]) + assert isinstance(conn_dict, dict) + + +def test_load_reporting_exports_return_type(caplog): + with caplog.at_level(logging.INFO): + load_return = Genesys().load_reporting_exports() + assert isinstance(load_return, dict) + + assert "loaded" in caplog.text + + @mock.patch.object(Genesys, "download_report") @pytest.mark.dependency(depends=["test_generate_api_connection"]) @pytest.mark.download -def test_download_reports(mock_download_files, var_dictionary): +def test_download_reports(mock_download_files, var_dictionary, caplog): g = Genesys() g.ids_mapping = var_dictionary["ids_mapping"] g.report_data = 
var_dictionary["report_data"] g.start_date = var_dictionary["start_date"] - file_name_list = g.download_all_reporting_exports() + with caplog.at_level(logging.INFO): + file_name_list = g.download_all_reporting_exports() + assert "IDS_MAPPING loaded" in caplog.text + + g.ids_mapping = None + with caplog.at_level(logging.WARNING): + file_name_list = g.download_all_reporting_exports() + assert "IDS_MAPPING is not provided" in caplog.text assert type(file_name_list) == list and len(file_name_list) > 0 mock_download_files.assert_called() From 4e5c6c2b95f44015fea87a7914a775f69e3ad0a0 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Fri, 10 Nov 2023 16:37:54 +0100 Subject: [PATCH 57/86] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20for?= =?UTF-8?q?=20CustomerGauge=20source=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_customer_gauge.py | 30 +++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_customer_gauge.py b/tests/integration/test_customer_gauge.py index 666a73251..2a24c87d1 100644 --- a/tests/integration/test_customer_gauge.py +++ b/tests/integration/test_customer_gauge.py @@ -9,6 +9,16 @@ CG = CustomerGauge(endpoint=ENDPOINT) +def test_wrong_endpoint(): + with pytest.raises(ValueError, match="Incorrect endpoint name"): + CustomerGauge(endpoint=["wrong_endpoint"]) + + +def test_endpoint_and_url_not_provided(): + with pytest.raises(ValueError, match="Provide endpoint name"): + CustomerGauge() + + def test_get_json_content(): json_response = CG.get_json_response() assert isinstance(json_response, dict) @@ -21,8 +31,10 @@ def test_properties_cleaning(): json_response = CG.get_json_response() data = json_response["data"][2].copy() cleaned_data = CG.properties_cleaning(data.copy()) + assert isinstance(data["properties"], list) assert isinstance(cleaned_data["properties"], dict) + assert r"{',':" or "label" or "}" in json_response["drivers"] def test_flatten_json(): @@ -63,6 +75,13 @@ def test_pagesize_and_to_df(): assert len(df) == 1 +def test_to_df_with_wrong_json_response(): + with pytest.raises( + ValueError, match="Provided argument doesn't contain 'data' value" + ): + CG.to_df(json_response={}) + + def test_pass_specific_cursor(): # for default pagesize=1000 returned cursor value should be bigger than passed cur = random.randint(1, 9999) @@ -71,11 +90,16 @@ def test_pass_specific_cursor(): assert cur_retrieved > cur +def test_cursor_is_not_provided(): + with pytest.raises( + ValueError, match="Provided argument doesn't contain 'cursor' value" + ): + CG.get_cursor(json_response={}) + + def test_uncomplete_date_arguments(): with pytest.raises(ValueError, match="Missing date arguments"): - json_response = CG.get_json_response( - date_field="date_sent", start_date="2012-01-03" - ) + CG.get_json_response(date_field="date_sent", start_date="2012-01-03") def test_endpoint_url_argument(): From d717a09474602e878f8d3e2c932a9b4cf281e966 Mon Sep 17 00:00:00 2001 From: Diego-H-S Date: Mon, 13 Nov 2023 10:04:00 +0100 Subject: [PATCH 58/86] =?UTF-8?q?=F0=9F=93=9D=20updated=20docstrings.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/genesys.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/genesys.py b/viadot/sources/genesys.py index 71dfa5209..d9109b313 100644 --- a/viadot/sources/genesys.py +++ b/viadot/sources/genesys.py @@ -344,7 +344,7 @@ def 
download_all_reporting_exports( if store_file_names is True: file_name_list.append(file_name + "." + self.file_extension) - self.logger.info("Al reports were successfully dowonload.") + self.logger.info("All reports were successfully downloaded.") if store_file_names is True: self.logger.info("Successfully genetared file names list.") From bbd0a256d9d6dc27f201aad3843e39f3202f50bc Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 13 Nov 2023 11:30:34 +0100 Subject: [PATCH 59/86] =?UTF-8?q?=E2=9C=85=20Added=20additional=20test=20f?= =?UTF-8?q?or=2064=20line=20from=20viadot.task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_eurostat.py | 26 +++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/integration/tasks/test_eurostat.py b/tests/integration/tasks/test_eurostat.py index 7fa01dd58..11297072e 100644 --- a/tests/integration/tasks/test_eurostat.py +++ b/tests/integration/tasks/test_eurostat.py @@ -23,7 +23,7 @@ def task_correct_requested_columns(caplog): assert isinstance(task, pd.DataFrame) assert not task.empty assert caplog.text == "" - assert list(task.columns) == task.needed_columns + assert list(task.columns) == task.requested_columns def test_wrong_needed_columns_names(caplog): @@ -90,3 +90,27 @@ def test_requested_columns_not_in_list(): params={"hhtyp": "total", "indic_il": "med_e"}, requested_columns="updated", ).run() + + +def test_requested_columns_not_provided(caplog): + """Test the behavior when 'requested_columns' are not provided to EurostatToDF. + + This test checks the behavior of the EurostatToDF class when 'requested_columns' are not provided. + It ensures that the resulting DataFrame is of the correct type, not empty, and that no error + messages are logged using the 'caplog' fixture. + + Parameters: + - caplog: pytest fixture for capturing log messages. + + Usage: + - Invoke this test function to check the behavior of EurostatToDF when 'requested_columns' are not provided. + """ + task = eurostat.EurostatToDF( + dataset_code="ILC_DI04", + params={"hhtyp": "total", "indic_il": "med_e"}, + ) + task.run() + + assert isinstance(task, pd.DataFrame) + assert not task.empty + assert caplog.text == "" From 52709cb35c73167bc052b8df9c82cda9ab69946d Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 13 Nov 2023 12:09:39 +0100 Subject: [PATCH 60/86] =?UTF-8?q?=F0=9F=8E=A8=20Changed=20Task=20object=20?= =?UTF-8?q?reference=20and=20fixed=20bug=20in=20new=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_eurostat.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/integration/tasks/test_eurostat.py b/tests/integration/tasks/test_eurostat.py index 11297072e..ed3f20596 100644 --- a/tests/integration/tasks/test_eurostat.py +++ b/tests/integration/tasks/test_eurostat.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from viadot.tasks import eurostat +from viadot.tasks import EurostatToDF def task_correct_requested_columns(caplog): @@ -13,7 +13,7 @@ def task_correct_requested_columns(caplog): The function is intended to be used in software development to verify that the program is correctly retrieving data from the appropriate dataset. 
""" - task = eurostat.EurostatToDF( + task = EurostatToDF( dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"}, requested_columns=["updated", "geo", "indicator"], @@ -33,7 +33,7 @@ def test_wrong_needed_columns_names(caplog): The function is intended to be used in software development to identify correct type errors and messages in the program's handling of codes. """ - task = eurostat.EurostatToDF( + task = EurostatToDF( dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"}, requested_columns=["updated1", "geo1", "indicator1"], @@ -56,7 +56,7 @@ def test_wrong_params_and_wrong_requested_columns_names(caplog): params validation. The function is intended to be used in software development to identify correct type errors and messages in the program's handling of codes. """ - task = eurostat.EurostatToDF( + task = EurostatToDF( dataset_code="ILC_DI04", params={"hhhtyp": "total", "indic_ilx": "med_e"}, requested_columns=["updated1", "geo1", "indicator1"], @@ -85,7 +85,7 @@ def test_requested_columns_not_in_list(): with pytest.raises( TypeError, match="Requested columns should be provided as list of strings." ): - eurostat.EurostatToDF( + EurostatToDF( dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"}, requested_columns="updated", @@ -105,12 +105,12 @@ def test_requested_columns_not_provided(caplog): Usage: - Invoke this test function to check the behavior of EurostatToDF when 'requested_columns' are not provided. """ - task = eurostat.EurostatToDF( + task = EurostatToDF( dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"}, ) - task.run() + df = task.run() - assert isinstance(task, pd.DataFrame) - assert not task.empty + assert isinstance(df, pd.DataFrame) + assert not df.empty assert caplog.text == "" From f69a8ef8416b33415d8c6cdb5f2e16c2d52cf6d5 Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 13 Nov 2023 14:39:47 +0100 Subject: [PATCH 61/86] rename unpacking cols parameters --- .../integration/tasks/test_customer_gauge.py | 20 +++---- viadot/flows/customer_gauge_to_adls.py | 16 +++--- viadot/tasks/customer_gauge.py | 56 +++++++++---------- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/tests/integration/tasks/test_customer_gauge.py b/tests/integration/tasks/test_customer_gauge.py index 6cbd17078..d95ea14cd 100644 --- a/tests/integration/tasks/test_customer_gauge.py +++ b/tests/integration/tasks/test_customer_gauge.py @@ -187,8 +187,8 @@ def test_column_unpacker_success_method1_and_method2(): Test the 'column_unpacker' method with valid data and both Method 1 and Method 2 columns specified. It should return the expected result. """ data = RAW_JSON['data'].copy() - method1_cols = ['properties'] - method2_cols = ['drivers'] + unpack_by_field_reference_cols = ['properties'] + unpack_by_nested_dict_transformer = ['drivers'] expected_result = [ {'contact': {'first_name': '***', 'last_name': '***'}, @@ -221,7 +221,7 @@ def test_column_unpacker_success_method1_and_method2(): '2_label': 'Waiting period'}} ] - result = CG.column_unpacker(json_list=data, method1_cols=method1_cols, method2_cols=method2_cols) + result = CG.column_unpacker(json_list=data, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) assert result == expected_result @@ -231,10 +231,10 @@ def test_column_unpacker_missing_json_list(): """ Test the 'column_unpacker' method with missing 'json_list' argument. It should raise a ValueError. 
""" - method1_cols = ['properties'] - method2_cols = ['drivers'] + unpack_by_field_reference_cols = ['properties'] + unpack_by_nested_dict_transformer = ['drivers'] with pytest.raises(ValueError, match="Input 'json_list' is required."): - CG.column_unpacker(json_list=None, method1_cols=method1_cols, method2_cols=method2_cols) + CG.column_unpacker(json_list=None, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) @pytest.mark.test_column_unpacker_duplicate_columns @@ -243,10 +243,10 @@ def test_column_unpacker_duplicate_columns(): Test the 'column_unpacker' method with duplicate columns specified in both Method 1 and Method 2. It should raise a ValueError. """ data = RAW_JSON['data'].copy() - method1_cols = ['properties'] - method2_cols = ['properties'] - with pytest.raises(ValueError, match="{'properties'} were mentioned in both method1_cols and method2_cols. It's not possible to apply two methods to the same field."): - CG.column_unpacker(json_list=data, method1_cols=method1_cols, method2_cols=method2_cols) + unpack_by_field_reference_cols = ['properties'] + unpack_by_nested_dict_transformer = ['properties'] + with pytest.raises(ValueError, match="{'properties'} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer. It's not possible to apply two methods to the same field."): + CG.column_unpacker(json_list=data, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) @pytest.mark.test_flatten_json diff --git a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index f314511eb..6af62a340 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -38,8 +38,8 @@ def __init__( ] = None, start_date: datetime = None, end_date: datetime = None, - method1_cols: List[str] = None, - method2_cols: List[str] = None, + unpack_by_field_reference_cols: List[str] = None, + unpack_by_nested_dict_transformer: List[str] = None, customer_gauge_credentials_secret: str = "CUSTOMER-GAUGE", anonymize: bool = False, columns_to_anonymize: List[str] = None, @@ -78,8 +78,8 @@ def __init__( Specifies the date type which filter date range. Defaults to None. start_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. end_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. + unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. customer_gauge_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. 
@@ -116,8 +116,8 @@ def __init__( self.date_field = date_field self.start_date = start_date self.end_date = end_date - self.method1_cols = method1_cols - self.method2_cols = method2_cols + self.unpack_by_field_reference_cols = unpack_by_field_reference_cols + self.unpack_by_nested_dict_transformer = unpack_by_nested_dict_transformer self.customer_gauge_credentials_secret = customer_gauge_credentials_secret # validate_df @@ -182,8 +182,8 @@ def gen_flow(self) -> Flow: date_field=self.date_field, start_date=self.start_date, end_date=self.end_date, - method1_cols=self.method1_cols, - method2_cols=self.method2_cols, + unpack_by_field_reference_cols=self.unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer=self.unpack_by_nested_dict_transformer, vault_name=self.vault_name, credentials_secret=self.customer_gauge_credentials_secret, flow=self, diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index b24a09f75..72a1a013f 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -26,8 +26,8 @@ def __init__( ] = None, start_date: datetime = None, end_date: datetime = None, - method1_cols: List[str] = None, - method2_cols: List[str] = None, + unpack_by_field_reference_cols: List[str] = None, + unpack_by_nested_dict_transformer: List[str] = None, timeout: int = 3600, *args, **kwargs, @@ -51,9 +51,9 @@ def __init__( Defaults to None. end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. @@ -66,8 +66,8 @@ def __init__( self.date_field = date_field self.start_date = start_date self.end_date = end_date - self.method1_cols = method1_cols - self.method2_cols = method2_cols + self.unpack_by_field_reference_cols = unpack_by_field_reference_cols + self.unpack_by_nested_dict_transformer = unpack_by_nested_dict_transformer super().__init__( name="customer_gauge_to_df", @@ -175,28 +175,28 @@ def _nested_dict_transformer( def column_unpacker( self, json_list: List[Dict[str, Any]] = None, - method1_cols: List[str] = None, - method2_cols: List[str] = None, + unpack_by_field_reference_cols: List[str] = None, + unpack_by_nested_dict_transformer: List[str] = None, ) -> List[Dict[str, Any]]: """ Function to unpack and modify specific columns in a list of dictionaries by using one of two methods, chosen by the user. If user would like to use field_reference_unpacker, he/she needs to provide list of fields as strings in - `method1_cols` parameter, if user would like to use nested_dict_transformer he/she needs to provide list of - fields as strings in method2_cols parameter. + `unpack_by_field_reference_cols` parameter, if user would like to use nested_dict_transformer he/she needs to provide list of + fields as strings in unpack_by_nested_dict_transformer parameter. Args: json_list (List[Dict[str, Any]): A list of dictionaries containing the data. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. 
+ unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. Raises: ValueError: If 'json_list' is not provided. ValueError: If specified columns do not exist in the JSON data. - ValueError: If columns are mentioned in both 'method1_cols' and 'method2_cols'. + ValueError: If columns are mentioned in both 'unpack_by_field_reference_cols' and 'unpack_by_nested_dict_transformer'. Returns: List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. @@ -222,23 +222,23 @@ def unpack_columns(columns, unpack_function): else: logger.info(f"Column '{field}' not found.") return json_list_clean - if method1_cols and method2_cols: - duplicated_cols = set(method1_cols).intersection(set(method2_cols)) + if unpack_by_field_reference_cols and unpack_by_nested_dict_transformer: + duplicated_cols = set(unpack_by_field_reference_cols).intersection(set(unpack_by_nested_dict_transformer)) if duplicated_cols: raise ValueError( - f"{duplicated_cols} were mentioned in both method1_cols and method2_cols." + f"{duplicated_cols} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer." " It's not possible to apply two methods to the same field." ) else: - if method1_cols is not None: + if unpack_by_field_reference_cols is not None: json_list = unpack_columns( - columns = method1_cols, + columns = unpack_by_field_reference_cols, unpack_function = self._field_reference_unpacker ) - if method2_cols is not None: + if unpack_by_nested_dict_transformer is not None: json_list = unpack_columns( - columns = method2_cols, + columns = unpack_by_nested_dict_transformer, unpack_function = self._nested_dict_transformer ) @@ -330,8 +330,8 @@ def __call__(self): "date_field", "start_date", "end_date", - "method1_cols", - "method2_cols", + "unpack_by_field_reference_cols", + "unpack_by_nested_dict_transformer", ) def run( self, @@ -345,8 +345,8 @@ def run( ] = None, start_date: datetime = None, end_date: datetime = None, - method1_cols: List[str] = None, - method2_cols: List[str] = None, + unpack_by_field_reference_cols: List[str] = None, + unpack_by_nested_dict_transformer: List[str] = None, credentials_secret: str = "CUSTOMER-GAUGE", vault_name: str = None, ) -> pd.DataFrame: @@ -369,9 +369,9 @@ def run( Defaults to None. end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. - method1_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - method2_cols (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". 
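# Sketch of invoking the task with the renamed keyword arguments, matching the run()
# signature in this patch. The `endpoint` argument is assumed from the CustomerGauge
# source class; the remaining values are illustrative placeholders.
from viadot.tasks import CustomerGaugeToDF  # assumed export name of this task class

df = CustomerGaugeToDF().run(
    endpoint="responses",  # assumed parameter name, mirroring the source class
    unpack_by_field_reference_cols=["properties"],
    unpack_by_nested_dict_transformer=["drivers"],
    credentials_secret="CUSTOMER-GAUGE",
)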
@@ -427,8 +427,8 @@ def run( clean_json = self.column_unpacker( json_list = total_json, - method1_cols = method1_cols, - method2_cols = method2_cols) + unpack_by_field_reference_cols = unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer = unpack_by_nested_dict_transformer) logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) From f967c82f9bdf20827dae4ee9dc985f2836c62b3a Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 13 Nov 2023 14:45:50 +0100 Subject: [PATCH 62/86] =?UTF-8?q?=E2=9C=85=20Added=20more=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_tm1.py | 77 ++++++++++++++++++++++++++++++++--- viadot/sources/tm1.py | 8 ++-- 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_tm1.py b/tests/integration/test_tm1.py index 3676a8ef6..ea21f0984 100644 --- a/tests/integration/test_tm1.py +++ b/tests/integration/test_tm1.py @@ -1,10 +1,13 @@ import pandas as pd - +import pytest from viadot.sources import TM1 from viadot.config import local_config +from viadot.exceptions import CredentialError,ValidationError -CUBE = local_config.get("test_cube") -VIEW = local_config.get("test_view") +CUBE = local_config.get("TM1").get("test_cube") +VIEW = local_config.get("TM1").get("test_view") +DIMENSION = local_config.get("TM1").get("test_dim") +HIERARCHY= local_config.get("TM1").get("test_hierarchy") def test_get_connection(): @@ -13,6 +16,12 @@ def test_get_connection(): assert connection is not None +def test_get_connection_fail(): + test_creds ={"address":"Addres", "port": 123, "username": "user", } + with pytest.raises(CredentialError): + tm1_source = TM1(credentials=test_creds) + + def test_get_cubes_names(): tm1_source = TM1() @@ -20,17 +29,75 @@ def test_get_cubes_names(): assert len(cubes) > 0 +def test_get_dimensions_names(): + tm1_source = TM1() + dim = tm1_source.get_dimensions_names() -def test_get_cubes_names(): + assert len(dim) > 0 + +def test_get_views_names(): tm1_source = TM1(cube=CUBE) views = tm1_source.get_views_names() assert len(views) > 0 +def test_get_hierarchies_names(): + tm1_source = TM1(dimension=DIMENSION) + hierarchies = tm1_source.get_hierarchies_names() + + assert len(hierarchies) >0 + +def test_get_available_elements(): + tm1_source = TM1(dimension=DIMENSION, hierarchy=HIERARCHY) + elements = tm1_source.get_available_elements() + + assert len(elements) >0 -def test_to_df(): +def test_to_df_view(): tm1_source = TM1(cube=CUBE, view=VIEW) df = tm1_source.to_df() assert isinstance(df, pd.DataFrame) assert df.empty is False + +def test_to_df_mdx(): + query =""" + select + { + [version].[version].[Budget] + } + on columns, + { + [company].[company].MEMBERS + } + on rows + + FROM """ + f"{CUBE}" + + tm1_source = TM1(mdx_query=query) + df = tm1_source.to_df(if_empty="pass") + + assert isinstance(df, pd.DataFrame) + +def test_to_df_fail_both(): + query =""" + select + { + [version].[version].[Budget] + } + on columns, + { + [company].[company].MEMBERS + } + on rows + + FROM """ + f"{CUBE}" + + tm1_source = TM1(mdx_query=query, cube=CUBE) + with pytest.raises(ValidationError, match="Specify only one: MDX query or cube and view."): + tm1_source.to_df(if_empty="pass") + +def test_to_df_fail_no(): + tm1_source = TM1() + with pytest.raises(ValidationError, match="MDX query or cube and view are required."): + tm1_source.to_df(if_empty="pass") diff --git 
a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 77155b07c..cd91ba369 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -110,7 +110,7 @@ def get_views_names(self) -> list: conn = self.get_connection() return conn.views.get_all_names(self.cube) - def get_diemensions_names(self) -> list: + def get_dimensions_names(self) -> list: """ Get list of avaiable dimensions in TM1 instance. @@ -162,7 +162,9 @@ def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFr if self.mdx_query is None and (self.cube is None or self.view is None): raise ValidationError("MDX query or cube and view are required.") - if self.cube is not None and self.view is not None: + elif self.mdx_query is not None and (self.cube is not None or self.view is not None): + raise ValidationError("Specify only one: MDX query or cube and view.") + elif self.cube is not None and self.view is not None: df = conn.cubes.cells.execute_view_dataframe( cube_name=self.cube, view_name=self.view, @@ -171,8 +173,6 @@ def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFr ) elif self.mdx_query is not None: df = conn.cubes.cells.execute_mdx_dataframe(self.mdx_query) - else: - raise ValidationError("Specify only one: MDX query or cube and view.") logger.info( f"Data was successfully transformed into DataFrame: {len(df.columns)} columns and {len(df)} rows." From f245dbb55fa2f60d6551941a13d7f0472a56da23 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Mon, 13 Nov 2023 14:52:11 +0100 Subject: [PATCH 63/86] =?UTF-8?q?=F0=9F=93=9D=20updated=20documentation=20?= =?UTF-8?q?+=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_tm1.py | 50 +++++++++++++++++++++++++---------- viadot/sources/tm1.py | 12 ++++++--- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/tests/integration/test_tm1.py b/tests/integration/test_tm1.py index ea21f0984..ae2b321b9 100644 --- a/tests/integration/test_tm1.py +++ b/tests/integration/test_tm1.py @@ -2,12 +2,12 @@ import pytest from viadot.sources import TM1 from viadot.config import local_config -from viadot.exceptions import CredentialError,ValidationError +from viadot.exceptions import CredentialError, ValidationError CUBE = local_config.get("TM1").get("test_cube") VIEW = local_config.get("TM1").get("test_view") DIMENSION = local_config.get("TM1").get("test_dim") -HIERARCHY= local_config.get("TM1").get("test_hierarchy") +HIERARCHY = local_config.get("TM1").get("test_hierarchy") def test_get_connection(): @@ -16,42 +16,51 @@ def test_get_connection(): assert connection is not None + def test_get_connection_fail(): - test_creds ={"address":"Addres", "port": 123, "username": "user", } + test_creds = { + "address": "Addres", + "port": 123, + "username": "user", + } with pytest.raises(CredentialError): tm1_source = TM1(credentials=test_creds) - def test_get_cubes_names(): tm1_source = TM1() cubes = tm1_source.get_cubes_names() assert len(cubes) > 0 + def test_get_dimensions_names(): tm1_source = TM1() dim = tm1_source.get_dimensions_names() assert len(dim) > 0 + def test_get_views_names(): tm1_source = TM1(cube=CUBE) views = tm1_source.get_views_names() assert len(views) > 0 + def test_get_hierarchies_names(): tm1_source = TM1(dimension=DIMENSION) hierarchies = tm1_source.get_hierarchies_names() - assert len(hierarchies) >0 + assert len(hierarchies) > 0 + def test_get_available_elements(): tm1_source = TM1(dimension=DIMENSION, hierarchy=HIERARCHY) elements = 
tm1_source.get_available_elements() - assert len(elements) >0 + assert len(elements) > 0 + def test_to_df_view(): tm1_source = TM1(cube=CUBE, view=VIEW) @@ -60,8 +69,10 @@ def test_to_df_view(): assert isinstance(df, pd.DataFrame) assert df.empty is False + def test_to_df_mdx(): - query =""" + query = ( + """ select { [version].[version].[Budget] @@ -72,15 +83,19 @@ def test_to_df_mdx(): } on rows - FROM """ + f"{CUBE}" - + FROM """ + + f"{CUBE}" + ) + tm1_source = TM1(mdx_query=query) df = tm1_source.to_df(if_empty="pass") assert isinstance(df, pd.DataFrame) + def test_to_df_fail_both(): - query =""" + query = ( + """ select { [version].[version].[Budget] @@ -91,13 +106,20 @@ def test_to_df_fail_both(): } on rows - FROM """ + f"{CUBE}" - + FROM """ + + f"{CUBE}" + ) + tm1_source = TM1(mdx_query=query, cube=CUBE) - with pytest.raises(ValidationError, match="Specify only one: MDX query or cube and view."): + with pytest.raises( + ValidationError, match="Specify only one: MDX query or cube and view." + ): tm1_source.to_df(if_empty="pass") + def test_to_df_fail_no(): tm1_source = TM1() - with pytest.raises(ValidationError, match="MDX query or cube and view are required."): + with pytest.raises( + ValidationError, match="MDX query or cube and view are required." + ): tm1_source.to_df(if_empty="pass") diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index cd91ba369..9a182bb97 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -33,7 +33,8 @@ def __init__( **kwargs, ): """ - Creating an instance of TM1 source class. + Creating an instance of TM1 source class. To download the data to the dataframe user needs to specify MDX query or + combination of cube and view. Args: credentials (Dict[str, Any], optional): Credentials stored in a dictionary. Required credentials: username, @@ -42,7 +43,7 @@ def __init__( mdx_query (str, optional): MDX select query needed to download the data. Defaults to None. cube (str, optional): Cube name from which data will be downloaded. Defaults to None. view (str, optional): View name from which data will be downloaded. Defaults to None. - dimension (str, optional): Diemension name. Defaults to None. + dimension (str, optional): Dimension name. Defaults to None. hierarchy (str, optional): Hierarchy name. Defaults to None. limit (str, optional): How many rows should be extracted. If None all the avaiable rows will be downloaded. Defaults to None. @@ -147,7 +148,8 @@ def get_available_elements(self) -> list: def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFrame: """ - Function for downloading data from TM1 to pd.DataFrame. + Function for downloading data from TM1 to pd.DataFrame. To download the data to the dataframe user needs to specify MDX query or + combination of cube and view. Args: if_empty (Literal["warn", "fail", "skip"], optional): What to do if output DataFrame is empty. Defaults to "skip". 
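# Small sketch of the validation rules these TM1 tests exercise, assuming the TM1 source
# class from this patch: pass either an MDX query or a cube/view pair to to_df(), never
# both. Cube, view and query strings below are placeholders.
from viadot.sources import TM1
from viadot.exceptions import ValidationError

df_view = TM1(cube="my_cube", view="my_view").to_df()  # cube + view variant

mdx = "SELECT {[version].[version].[Budget]} ON COLUMNS, {[company].[company].MEMBERS} ON ROWS FROM my_cube"
df_mdx = TM1(mdx_query=mdx).to_df()  # MDX variant

try:
    TM1(mdx_query=mdx, cube="my_cube").to_df()  # both at once is rejected
except ValidationError as exc:
    print(exc)  # "Specify only one: MDX query or cube and view."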
@@ -162,7 +164,9 @@ def to_df(self, if_empty: Literal["warn", "fail", "skip"] = "skip") -> pd.DataFr if self.mdx_query is None and (self.cube is None or self.view is None): raise ValidationError("MDX query or cube and view are required.") - elif self.mdx_query is not None and (self.cube is not None or self.view is not None): + elif self.mdx_query is not None and ( + self.cube is not None or self.view is not None + ): raise ValidationError("Specify only one: MDX query or cube and view.") elif self.cube is not None and self.view is not None: df = conn.cubes.cells.execute_view_dataframe( From 4277c1b840496eba5a41a17fa02ff25d525e5a0e Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 13 Nov 2023 16:26:49 +0100 Subject: [PATCH 64/86] add credentials_secret parameter --- viadot/sources/customer_gauge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index 6ceeccd02..5fff4387e 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -20,6 +20,7 @@ def __init__( endpoint: Literal["responses", "non-responses"] = None, url: str = None, credentials: Dict[str, Any] = None, + credentials_secret: str = "CUSTOMER-GAUGE", ): """ A class to connect and download data using Customer Gauge API. @@ -31,7 +32,8 @@ def __init__( endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. Defaults to None. url (str, optional): Endpoint URL. Defaults to None. credentials (Dict[str, Any], optional): Credentials to connect with API containing client_id, client_secret. Defaults to None. - + credentials_secret (str, optional): The name of the secret stored in local_config containing a + dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". Raises: ValueError: If endpoint is not provided or incorect. CredentialError: If credentials are not provided in local_config or directly as a parameter @@ -50,11 +52,12 @@ def __init__( raise ValueError( "Provide endpoint name. Choose: 'responses' or 'non-responses'. Otherwise, provide URL" ) + self.credentials_secret = credentials_secret if credentials is not None: self.credentials = credentials else: - self.credentials = local_config.get("CustomerGauge") + self.credentials = local_config.get(credentials_secret) if self.credentials is None: raise CredentialError("Credentials not provided.") From 49ecccb15d0ddc543db04a4420d63a349e97fd4d Mon Sep 17 00:00:00 2001 From: "hha.ext" Date: Mon, 13 Nov 2023 16:35:28 +0100 Subject: [PATCH 65/86] add error raising tests --- tests/integration/test_customer_gauge.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/test_customer_gauge.py b/tests/integration/test_customer_gauge.py index 596cf029c..a20ea006e 100644 --- a/tests/integration/test_customer_gauge.py +++ b/tests/integration/test_customer_gauge.py @@ -4,6 +4,7 @@ import pytest from viadot.sources import CustomerGauge +from viadot.exceptions import CredentialError ENDPOINT = random.choice(["responses", "non-responses"]) CG = CustomerGauge(endpoint=ENDPOINT) @@ -38,3 +39,27 @@ def test_endpoint_url_argument(): CG = CustomerGauge(url=ENDPOINT_URL) json_response = CG.get_json_response() assert isinstance(json_response, dict) + +@pytest.mark.endpoint_valueerror +def test_wrong_endpoint_valueerror_raising(): + with pytest.raises(ValueError, match=r"Incorrect endpoint name. 
Choose: 'responses' or 'non-responses'"): + wrong_endpoint_name = "wrong-endpoint" + CG = CustomerGauge(endpoint = wrong_endpoint_name) + +@pytest.mark.endpoint_valueerror +def test_no_endpoint_valueerror_raising(): + with pytest.raises(ValueError, match=r"Provide endpoint name. Choose: 'responses' or 'non-responses'. Otherwise, provide URL"): + CG = CustomerGauge() + +@pytest.mark.endpoint_credentialserror +def test_credentialserror_raising(): + wrong_secret="wrong" + with pytest.raises(CredentialError, match=r"Credentials not provided."): + CG = CustomerGauge(endpoint=ENDPOINT, credentials_secret=wrong_secret) + +@pytest.mark.get_cursor_valueerror +def test_get_cursor_valueerror_raising(): + wrong_json = {} + with pytest.raises(ValueError, match=r"Provided argument doesn't contain 'cursor' value. Pass json returned from the endpoint."): + CG = CustomerGauge(endpoint=ENDPOINT) + CG.get_cursor(json_response=wrong_json) \ No newline at end of file From d2fb298a626537ad51037276df3c5f6ff512fd9e Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 13 Nov 2023 16:58:50 +0100 Subject: [PATCH 66/86] =?UTF-8?q?=F0=9F=93=9D=20Added=20missing=20docstrin?= =?UTF-8?q?gs=20and=20return=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/epicor.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/viadot/sources/epicor.py b/viadot/sources/epicor.py index a3e59c432..ad28019d0 100644 --- a/viadot/sources/epicor.py +++ b/viadot/sources/epicor.py @@ -104,6 +104,7 @@ def parse_orders_xml(xml_data: str) -> pd.DataFrame: Args: xml_data (str, required): Response from Epicor API in form of xml + Returns: pd.DataFrame: DataFrame containing parsed orders data. """ @@ -221,7 +222,11 @@ def __init__( super().__init__(*args, credentials=credentials, **kwargs) def generate_token(self) -> str: - "Function to generate API access token that is valid for 24 hours" + """Function to generate API access token that is valid for 24 hours. + + Returns: + str: Generated token. + """ url = ( "http://" @@ -243,7 +248,11 @@ def generate_token(self) -> str: return token def generate_url(self) -> str: - "Function to generate url to download data" + """Function to generate url to download data + + Returns: + str: Output url string. + """ return ( "http://" @@ -282,8 +291,12 @@ def get_xml_response(self): ) return response - def to_df(self): - "Function for creating pandas DataFrame from Epicor API response" + def to_df(self) -> pd.DataFrame: + """Function for creating pandas DataFrame from Epicor API response + + Returns: + pd.DataFrame: Output DataFrame. 
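# Hedged usage sketch for the Epicor source documented above. The credential keys other
# than "username" and "port" are assumptions, and the filters XML structure is not
# reproduced here because it is only partially shown in this patch.
from viadot.config import local_config
from viadot.sources import Epicor

filters_xml = "..."  # your Epicor orders filter XML goes here (structure omitted)

epicor = Epicor(
    base_url=local_config.get("EPICOR").get("test_url"),
    credentials={"username": "user", "password": "***", "port": 443},  # placeholder values
    filters_xml=filters_xml,
)
epicor.validate_filter()  # raises DataRangeError when the date range filter is invalid
df = epicor.to_df()       # token generation and URL building happen inside the source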
+ """ data = self.get_xml_response() df = parse_orders_xml(data) return df From 738f2a4bcdea18710877df721e8199ca1aeeb289 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Mon, 13 Nov 2023 17:01:08 +0100 Subject: [PATCH 67/86] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20for?= =?UTF-8?q?=20Epicor=20source=20class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_epicor.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_epicor.py b/tests/integration/test_epicor.py index 8e9155059..77c338a88 100644 --- a/tests/integration/test_epicor.py +++ b/tests/integration/test_epicor.py @@ -1,7 +1,8 @@ import pytest +import pandas as pd from viadot.config import local_config -from viadot.exceptions import DataRangeError +from viadot.exceptions import CredentialError, DataRangeError from viadot.sources import Epicor @@ -48,3 +49,25 @@ def test_connection(epicor): def test_validate_filter(epicor_error): with pytest.raises(DataRangeError): epicor_error.validate_filter() + + +def test_credentials_not_provided(): + with pytest.raises(CredentialError): + Epicor( + base_url=local_config.get("EPICOR").get("test_url"), + credentials={"username": "user12", "port": 1111}, + filters_xml=""" + + + 001 + + 2022-05-16 + 3 + + """, + ) + + +def test_to_df_return_type(epicor): + df = epicor.to_df() + assert isinstance(df, pd.DataFrame) From cdb879f43c3f04222507303ba2b289133db763eb Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Tue, 14 Nov 2023 12:21:53 +0100 Subject: [PATCH 68/86] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Added=20`set=5Fprefe?= =?UTF-8?q?ct=5Fkv`=20parameter=20to=20`BigQueryToADLS`=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/flows/bigquery_to_adls.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/viadot/flows/bigquery_to_adls.py b/viadot/flows/bigquery_to_adls.py index e09981ebe..cd092066f 100644 --- a/viadot/flows/bigquery_to_adls.py +++ b/viadot/flows/bigquery_to_adls.py @@ -43,6 +43,7 @@ def __init__( if_exists: str = "replace", validate_df_dict: dict = None, timeout: int = 3600, + set_prefect_kv: bool = False, *args: List[Any], **kwargs: Dict[str, Any], ): @@ -84,6 +85,7 @@ def __init__( When passed, `validate_df` task validation tests are triggered. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. + set_prefect_kv(int, optional): Specifies whether to set a key-value pair in the Prefect KV Store. Defaults to False. 
""" # BigQueryToDF self.query = query @@ -125,6 +127,8 @@ def __init__( adls_dir_path, "schema", self.now + ".json" ) + self.set_prefect_kv = set_prefect_kv + super().__init__(*args, name=name, **kwargs) self.gen_flow() @@ -205,4 +209,5 @@ def gen_flow(self) -> Flow: df_to_be_loaded.set_upstream(dtypes_dict, flow=self) file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + if self.set_prefect_kv is True: + set_key_value(key=self.adls_dir_path, value=self.adls_file_path) From e514deb4b2736d2a25c3fd351b7d0515d781470e Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Tue, 14 Nov 2023 12:31:42 +0100 Subject: [PATCH 69/86] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb9154c7c..1ebe1047d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added `TM1` source class. - Added `TM1ToDF` task class. +- Added `set_prefect_kv` parameter to `BigQueryToADLS` with `False` as a default. If there is a need to create new pair in KV Store the parameter can be changed to `True`. ### Changed - Splitted test for Eurostat on source tests and task tests From 3991b5f0ef4854b2f730a5ea8ae63ec4e540a301 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Fri, 10 Nov 2023 16:09:29 +0100 Subject: [PATCH 70/86] Sharepoint list connector extension for multichoice fields with some small fixes and docstring update --- CHANGELOG.md | 19 ++ tests/integration/test_sharepoint.py | 138 +++++++++++--- viadot/flows/sharepoint_to_adls.py | 80 ++++---- viadot/sources/sharepoint.py | 263 ++++++++++++++++++++------- viadot/tasks/sharepoint.py | 100 ++++++++-- 5 files changed, 457 insertions(+), 143 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 507c590cf..208908bb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +- Modified `SharepointList` source class: + -> docstrings update +- Modified `SharepointToADLS` flow class: + -> docstrings update + -> changed key_value_param: bool = False to prevent forced KV store append +- Modified `SharepointListToADLS` flow class: + -> changed key_value_param: bool = False to prevent forced KV store append +- Modified `SharepointList` source class: + -> docstrings update + -> Changed `_unpack_fields` method to handle Sharepoint MultiChoiceField type + small improvements + -> Changed `get_fields` method to handle special characters - different approach to call get() and execute_query() + -> Renamed method from `select_expandable_user_fields` to `select_fields` + update for MultiChoiceField type + -> Changed `check_filters` method errors messages and more checks added + -> Changed `operators_mapping` method errors messages + -> Changed `make_filter_for_df` method errors messages +- Modified `SharepointListToDF` task class: + -> docstrings update + -> Added `_rename_duplicated_fields` method to find and rename duplicated columns + ## [0.4.21] - 2023-10-26 ### Added diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index 502ffded0..38fdfa8a7 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -2,7 +2,6 @@ import re import pandas as pd -from copy import deepcopy import pytest from prefect.tasks.secrets import PrefectSecret @@ -10,7 +9,7 @@ from viadot.exceptions import CredentialError from viadot.sources import Sharepoint from viadot.task_utils import df_get_data_types_task -from viadot.tasks.sharepoint import SharepointToDF +from viadot.tasks.sharepoint import SharepointToDF, SharepointListToDF from viadot.sources import SharepointList @@ -168,10 +167,11 @@ def test_get_data_types(file_name): assert "String" in dtypes +### SECTION FOR TESTING SHAREPOINT LIST CONNECTOR ### @pytest.fixture(scope="session") def sharepoint_list(): """ - Fixture for creating a Sharepoint class instance. + Fixture for creating a SharepointList class instance. The class instance can be used within a test functions to interact with Sharepoint. """ spl = SharepointList() @@ -187,15 +187,31 @@ def test_valid_filters(sharepoint_list): assert result is True -def test_invalid_dtype(sharepoint_list): +def test_filters_missing_dtype(sharepoint_list): + filters = { + "filter1": {"operator1": ">", "value1": 10}, + } + with pytest.raises( + ValueError, + match=re.escape("dtype for filter1 is missing!"), + ): + sharepoint_list.check_filters(filters) + + +def test_filters_invalid_dtype(sharepoint_list): filters = { "filter1": {"dtype": "list", "operator1": ">", "value1": 10}, } - with pytest.raises(ValueError, match="dtype not allowed!"): + with pytest.raises( + ValueError, + match=re.escape( + "dtype not allowed! Expected: ['datetime', 'date', 'bool', 'int', 'float', 'complex', 'str'] got: list ." 
+ ), + ): sharepoint_list.check_filters(filters) -def test_missing_operator1(sharepoint_list): +def test_filters_missing_operator1(sharepoint_list): filters = { "filter1": {"dtype": "int", "value1": 10}, } @@ -203,23 +219,28 @@ def test_missing_operator1(sharepoint_list): sharepoint_list.check_filters(filters) -def test_invalid_operator1(sharepoint_list): +def test_filters_invalid_operator1(sharepoint_list): filters = { "filter1": {"dtype": "int", "operator1": "*", "value1": 10}, } - with pytest.raises(ValueError, match="Operator type not allowed!"): + with pytest.raises( + ValueError, + match=re.escape( + "Operator1 type not allowed! Expected: ['<', '>', '<=', '>=', '==', '!='] got: * ." + ), + ): sharepoint_list.check_filters(filters) -def test_missing_value1(sharepoint_list): +def test_filters_missing_value1(sharepoint_list): filters = { "filter1": {"dtype": "int", "operator1": ">", "value1": None}, } - with pytest.raises(ValueError, match="Value for operator1 is missing!"): + with pytest.raises(ValueError, match="Value1 for operator1 is missing!"): sharepoint_list.check_filters(filters) -def test_missing_operators_conjuction(sharepoint_list): +def test_filters_missing_operators_conjuction(sharepoint_list): filters = { "filter1": { "dtype": "int", @@ -229,11 +250,16 @@ def test_missing_operators_conjuction(sharepoint_list): "value2": 20, }, } - with pytest.raises(ValueError, match="Operators for conjuction is missing!"): + with pytest.raises( + ValueError, + match=re.escape( + "Operator for conjuction is missing! Expected: ['&', '|'] got empty." + ), + ): sharepoint_list.check_filters(filters) -def test_invalid_operators_conjuction(sharepoint_list): +def test_filters_invalid_operators_conjuction(sharepoint_list): filters = { "filter1": { "dtype": "int", @@ -244,11 +270,16 @@ def test_invalid_operators_conjuction(sharepoint_list): "operators_conjuction": "!", }, } - with pytest.raises(ValueError, match="Operators for conjuction not allowed!"): + with pytest.raises( + ValueError, + match=re.escape( + "Operator for conjuction not allowed! Expected: ['&', '|'] got ! ." + ), + ): sharepoint_list.check_filters(filters) -def test_invalid_filters_conjuction(sharepoint_list): +def test_filters_conjuction_not_allowed(sharepoint_list): filters = { "filter1": { "dtype": "int", @@ -258,7 +289,32 @@ def test_invalid_filters_conjuction(sharepoint_list): }, } with pytest.raises( - ValueError, match="Filters operators for conjuction not allowed!" + ValueError, + match=re.escape( + "Filters conjuction allowed only when more then one filter provided!" + ), + ): + sharepoint_list.check_filters(filters) + + +def test_filters_invalid_conjuction(sharepoint_list): + filters = { + "filter1": { + "dtype": "int", + "value1": 10, + "operator1": ">", + "filters_conjuction": "!", + }, + "filter2": { + "dtype": "int", + "operator1": "==", + }, + } + with pytest.raises( + ValueError, + match=re.escape( + "Filter operator for conjuction not allowed! Expected: ['&', '|'] got ! ." 
+ ), ): sharepoint_list.check_filters(filters) @@ -266,27 +322,47 @@ def test_invalid_filters_conjuction(sharepoint_list): def test_valid_mapping(sharepoint_list): filters = { "filter1": { + "dtype": "int", + "value1": 10, + "value2": 20, "operator1": ">", "operator2": "<=", "operators_conjuction": "&", "filters_conjuction": "|", }, - "filter2": {"operator1": "==", "operator2": "!=", "operators_conjuction": "|"}, + "filter2": { + "dtype": "int", + "value1": 30, + "value2": 0, + "operator1": "==", + "operator2": "!=", + "operators_conjuction": "|", + }, } expected_result = { "filter1": { + "dtype": "int", + "value1": 10, + "value2": 20, "operator1": "gt", "operator2": "le", "operators_conjuction": "and", "filters_conjuction": "or", }, - "filter2": {"operator1": "eq", "operator2": "ne", "operators_conjuction": "or"}, + "filter2": { + "dtype": "int", + "value1": 30, + "value2": 0, + "operator1": "eq", + "operator2": "ne", + "operators_conjuction": "or", + }, } - result = sharepoint_list.operators_mapping(deepcopy(filters)) + result = sharepoint_list.operators_mapping(filters) assert result == expected_result -def test_invalid_comparison_operator(sharepoint_list): +def test_operators_mapping_invalid_comparison_operator(sharepoint_list): filters = { "filter1": { "operator1": "*", @@ -297,10 +373,10 @@ def test_invalid_comparison_operator(sharepoint_list): } error_message = "This comparison operator: * is not allowed. Please read the function documentation for details!" with pytest.raises(ValueError, match=re.escape(error_message)): - sharepoint_list.operators_mapping(deepcopy(filters)) + sharepoint_list.operators_mapping(filters) -def test_invalid_logical_operator(sharepoint_list): +def test_operators_mapping_invalid_logical_operator(sharepoint_list): filters = { "filter1": { "operator1": ">", @@ -309,9 +385,23 @@ def test_invalid_logical_operator(sharepoint_list): "filters_conjuction": "|", }, } - error_message = "This conjuction(logical) operator: ! is not allowed. Please read the function documentation for details!" + error_message = "This conjuction (logical) operator: ! is not allowed. Please read the function documentation for details!" + with pytest.raises(ValueError, match=re.escape(error_message)): + sharepoint_list.operators_mapping(filters) + + +def test_operators_mapping_invalid_filters_logical_operator(sharepoint_list): + filters = { + "filter1": { + "operator1": ">", + "operator2": "<=", + "operators_conjuction": "&", + "filters_conjuction": "!", + }, + } + error_message = "This filters conjuction (logical) operator: ! is not allowed. Please read the function documentation for details!" with pytest.raises(ValueError, match=re.escape(error_message)): - sharepoint_list.operators_mapping(deepcopy(filters)) + sharepoint_list.operators_mapping(filters) def test_single_filter_datetime_api(sharepoint_list): diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 410538e7b..6191317d0 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -42,6 +42,7 @@ def __init__( if_exists: str = "replace", validate_df_dict: dict = None, timeout: int = 3600, + key_value_param: bool = False, *args: List[any], **kwargs: Dict[str, Any], ): @@ -69,6 +70,7 @@ def __init__( dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. 
+ key_value_param (bool, optional): Wheter to do key-value parameters in KV Store or not. Defaults to False. """ # SharepointToDF self.if_empty = if_empty @@ -86,6 +88,7 @@ def __init__( self.adls_sp_credentials_secret = adls_sp_credentials_secret self.if_exists = if_exists self.output_file_extension = output_file_extension + self.key_value_param = key_value_param self.now = str(pendulum.now("utc")) if self.local_dir_path is not None: self.local_file_path = ( @@ -177,7 +180,8 @@ def gen_flow(self) -> Flow: file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + if self.key_value_param == True: + set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod def slugify(name): @@ -188,42 +192,35 @@ class SharepointListToADLS(Flow): def __init__( self, name: str, - list_title: str = None, - site_url: str = None, + list_title: str, + site_url: str, + path: str, + adls_dir_path: str, + adls_file_name: str, + filters: dict = None, required_fields: List[str] = None, field_property: str = "Title", - filters: dict = None, row_count: int = 5000, + adls_sp_credentials_secret: str = None, sp_cert_credentials_secret: str = None, vault_name: str = None, - path: str = None, - adls_dir_path: str = None, - adls_file_name: str = None, - adls_sp_credentials_secret: str = None, overwrite_adls: bool = True, output_file_extension: str = ".parquet", validate_df_dict: dict = None, + key_value_param: bool = False, *args: List[any], **kwargs: Dict[str, Any], ): - """ - Run Flow SharepointListToADLS. + """_summary_ Args: - name (str): Prefect flow name. - list_title (str): Title of Sharepoint List. Default to None. - site_url (str): URL to set of Sharepoint Lists. Default to None. - required_fields (List[str]): Required fields(columns) need to be extracted from - Sharepoint List. Default to None. - field_property (List[str]): Property to expand fields with expand query method. - For example: User fields could be expanded and "Title" - or "ID" could be extracted - -> usefull to get user name instead of ID - All properties can be found under list.item.properties. - WARNING! Field types and properties might change which could - lead to errors - extension of sp connector would be required. - Default to ["Title"] - filters (dict): Dictionary with operators which filters the SharepointList output. + name (str): Prefect flow name. + list_title (str): Title of Sharepoint List. + site_url (str): URL to set of Sharepoint Lists. + path (str): Local file path. Default to None. + adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. + adls_file_name (str): Name of file in ADLS. Defaults to None. + filters (dict, optional): Dictionary with operators which filters the SharepointList output. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') allowed conjuction: ('&','|') allowed operators: ('<','>','<=','>=','==','!=') @@ -247,16 +244,27 @@ def __init__( 'operator1':'==', }, } - row_count (int): Number of downloaded rows in single request. Default to 5000. - sp_cert_credentials_secret (str): Credentials to verify Sharepoint connection. Default to None. - vault_name (str): KeyVaultSecret name. Default to None. - path (str): Local file path. Default to None. - adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. - adls_file_name (str, optional): Name of file in ADLS. Defaults to None. 
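# Sketch of running the reworked SharepointListToADLS flow with the filter-dictionary
# format documented above. List title, site URL, paths and filter values are placeholders;
# key_value_param stays False so nothing is appended to the Prefect KV Store.
from viadot.flows import SharepointListToADLS  # assumed import path for this flow class

flow = SharepointListToADLS(
    name="sharepoint_list_to_adls",
    list_title="My List",                                  # placeholder
    site_url="https://tenant.sharepoint.com/sites/site/",  # placeholder, trailing "/" included
    path="my_list.parquet",
    adls_dir_path="raw/sharepoint",
    adls_file_name="my_list.parquet",
    required_fields=["Title", "Created", "Factory"],       # placeholder columns
    filters={"Factory": {"dtype": "str", "value1": "NM-PL", "operator1": "=="}},
    row_count=5000,
    key_value_param=False,
)
flow.run()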
- adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with - ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, - CLIENT_SECRET) for the Azure Data Lake. Defaults to None. - overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. + Defaults to None. + required_fields (List[str], optional): Required fields(columns) need to be extracted from + Sharepoint List. Defaults to None. + field_property (str, optional): Property to expand fields with expand query method. + For example: User fields could be expanded and "Title" + or "ID" could be extracted + -> usefull to get user name instead of ID + All properties can be found under list.item.properties. + WARNING! Field types and properties might change which could + lead to errors - extension of sp connector would be required. + Default to ["Title"]. Defaults to "Title". + row_count (int, optional): Number of downloaded rows in single request.Defaults to 5000. + adls_sp_credentials_secret (str, optional): Credentials to connect to Azure ADLS + If not passed it will take cred's from your .config/credentials.json Defaults to None. + sp_cert_credentials_secret (str, optional): Credentials to verify Sharepoint connection. + If not passed it will take cred's from your .config/credentials.json Default to None. + vault_name (str, optional): KeyVaultSecret name. Default to None. + overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. + output_file_extension (str, optional): _description_. Defaults to ".parquet". + validate_df_dict (dict, optional): Wheter to do an extra df validation before ADLS upload or not to do. Defaults to None. + key_value_param (bool, optional): Wheter to do key-value parameters in KV Store or not. Defaults to False. Returns: .parquet file inside ADLS. @@ -280,6 +288,7 @@ def __init__( self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret self.output_file_extension = output_file_extension + self.key_value_param = key_value_param self.now = str(pendulum.now("utc")) if self.path is not None: self.local_file_path = ( @@ -370,7 +379,8 @@ def gen_flow(self) -> Flow: file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + if self.key_value_param == True: + set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod def slugify(name): diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 7f57bd658..7f1bc523c 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -85,8 +85,10 @@ def download_file( class SharepointList(Source): """ - A Sharepoint_List class to connect and download data from sharpoint lists. - + A Sharepoint_List class to connect and download data from Sharepoint lists. + Warning! + Please be carefull with selection of the column names because once sharepoint list is opened inside a browser it may display columns in different languages. + Because of that the resulting file or output might have different column names then the one which u see in the browser. 
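# Sketch of the lower-level SharepointList source calls touched in this patch, assuming
# certificate credentials are available in local_config under "SHAREPOINT_CERT". Site URL,
# list title and column names are placeholders.
from viadot.sources import SharepointList

spl = SharepointList()
site_url = "https://tenant.sharepoint.com/sites/site/"  # placeholder, with trailing "/"
spl.get_connection(site_url=site_url)

selected_fields = spl.select_fields(
    list_title="My List",                             # placeholder
    site_url=site_url,
    required_fields=["Title", "Created", "Factory"],  # placeholder columns
    field_property="Title",
)
# selected_fields now holds the internal field names, the expandable fields with the
# property to extract, and any MultiChoice fields, as described in select_fields() above.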
Args: credentials (dict): Credentials should include: - "tenant" @@ -102,6 +104,20 @@ def __init__( *args, **kwargs, ): + """_summary_ + + Args: + credentials (Dict[str, Any], optional): Credentials should include: + - "tenant" + - "client_id" + - "scopes" + - "thumbprint" + - "private_key" + + Raises: + CredentialError: If no credentials are pased + and local config doesn't contain them neiter + """ DEFAULT_CREDENTIALS = local_config.get("SHAREPOINT_CERT") credentials = credentials or DEFAULT_CREDENTIALS if credentials is None: @@ -109,11 +125,16 @@ def __init__( super().__init__(*args, credentials=credentials, **kwargs) - def get_connection( - self, - site_url: str = None, - ): - # Connecting into Sharepoint with AuthenticationContext + def get_connection(self, site_url: str): + """Function for connecting into Sharepoint with AuthenticationContext + + Args: + site_url (str): url of the sharepoint list + + Returns: + ctx: authentication context + """ + logger.info("Connecting into Sharepoint with AuthenticationContexts") try: auth_context = AuthenticationContext(site_url) auth_context.with_client_certificate( @@ -132,51 +153,80 @@ def get_connection( return self.ctx - # Function for extracting list items from search fields def _unpack_fields( self, list_item, - selected_fields: dict = None, + selected_fields: dict, ) -> dict: + """Function for extracting and unpacking list items from the search fields + + Args: + list_items (office365 list item): A list with office365 list item objects (rows) + selected_fields (dict): A dict with fields selected for ingestion, generated by SharepointList.select_fields() + + Raises: + ValueError: "Check if given field property is valid!" + ValueError: "Get nested dict for not recognized type of field! Check field types in the source" + ValueError: "Get empty properties for list items" + + Returns: + dict: A dictionary with Column: Value pairs for each row from the list + """ # Creating the body of dictionary new_dict = dict() # For loop scanning the propertys of searching fields item_values_dict = list_item.properties - for field, val in item_values_dict.items(): - nested_dict = get_nested_dict(val) - # Check if field has expandable type - if field in selected_fields["FieldToExpand"]: + if item_values_dict: + for field, val in item_values_dict.items(): + nested_dict = get_nested_dict(val) # Check if the values are nested if nested_dict != None: - # It might be that there are different field properties than expected - nested_value = nested_dict.get( - selected_fields["FieldExpandProperty"] - ) - if nested_value != None: - new_dict[field] = nested_value + # Check if field has expandable type + if field in selected_fields["FieldToExpand"]: + # It might be that there are different field properties than expected + nested_value = nested_dict.get( + selected_fields["FieldExpandProperty"] + ) + if nested_value != None: + new_dict[field] = nested_value + else: + raise ValueError("Check if given field property is valid!") + elif field in selected_fields["MultiChoiceField"]: + # Field type of multi choice could have more than 1 selection. + new_dict[field] = ";".join(nested_dict.values()) else: - logger.info("Property of the extandable field not recognized!") - raise ValueError("Check if given field property is valid!") - elif field in selected_fields["MultiChoiceField"]: - # Field type of multi choice could have more than 1 selection. - new_dict[field] = ";".join(nested_dict.values()) + raise ValueError( + "Get nested dict for not recognized type of field! 
Check field types in the source" + ) else: - raise ValueError( - "Get nested dict for not recognized type of field! Check field types in the source" - ) - else: - new_dict[field] = val - + new_dict[field] = val + else: + raise ValueError( + "Get empty properties for list items. Check if parameter list_item collection containes any data -> item objects." + ) return new_dict def get_fields( self, - list_title: str = None, - site_url: str = None, + list_title: str, + site_url: str, required_fields: List[str] = None, - ): - ctx = self.get_connection(site_url=site_url) + ) -> List: + """ + Function for geting list of fields objects from the sharepoint list. + It can get all fields available if required_fields not passed + or just the one which are in the list required_fields. + + Args: + list_title (str): name of the sharepoint list + site_url (str): url to the sharepoint list with "/" at the end + required_fields (List[str], optional ): List of required fields to ingest. It will get all fields if not passed. + + Returns: + List: list with office365 sharepoint list field objects + """ + ctx = self.get_connection(site_url=site_url) # Get list of lists object by List Title self.list_object = ctx.web.lists.get_by_title(list_title) list_fields_all = self.list_object.fields @@ -200,18 +250,32 @@ def get_fields( def select_fields( self, - list_title: str = None, - site_url: str = None, + list_title: str, + site_url: str, required_fields: List[str] = None, field_property: str = "Title", ) -> dict: """ Method to create a data structure for handling info about - selection of fields with details about possible expansion for more data or details. + selection of fields with details about possible expansion for more data or details. Field types to extract more values can be: "User*", "MultiChoice" field_property to expand can be: ID, Title, FieldTypeKind, TypeAsString and many more. -> more properties can be discovered by getting list.item.properties. - Default to "Title" + + Args: + list_title (str): _description_. Defaults to None. + site_url (str): _description_. Defaults to None. + required_fields (List[str], optional): _description_. Defaults to None. + field_property (str, optional): Property to extract from nested fields + like column with type User*. Defaults to "Title". + + Returns: + dict: selected_fields = { + "FieldInternalNames": List of fields to select with its InternalNames (from api), + "FieldToExpand": fields_to_expand,-> fields which could be expanded to get more data from API + "FieldExpandProperty": field_property, property of the expandable field which will be extracted + "MultiChoiceField": List of fields which can have multiple values in 1 row + } """ list_fields = self.get_fields( @@ -248,30 +312,73 @@ def select_fields( def check_filters( self, - filters: dict = None, + filters: dict, ) -> bool: """ Function to check if filters dict is valid. - example1: if operator2 is present value2 must be in place as well - example2: if dtype is not on allowed list it will throw an error + Please check and apply only allowed filter settings: + allowed_dtypes = ["datetime", "date", "bool", "int", "float", "complex", "str"] + allowed_conjuction = ["&", "|"] + allowed_operators = ["<", ">", "<=", ">=", "==", "!="] + Operator conjuction is only possible if there are 2 values like: value <= 1 | value == 5 + Filter conjuction is only possible if there are more then 1 filters for ex. 
date and creator + + Args: + filters (dict): A dictionary containing filter settings + Example: + filters = { + "Created": { + "dtype": "datetime", + "value1": yesterday_date, + "value2": today_date, + "operator1": ">=", + "operator2": "<=", + "operators_conjuction": "&", + "filters_conjuction": "&", + }, + "Factory": { + "dtype": "str", + "value1": "NM-PL", + "operator1": "==", + }, + } + + Raises: + ValueError: If dtype not in allowed list + ValueError: If comparison operator1 not in allowed list + ValueError: If value for operator1 is missing + ValueError: If comparison operator1 for the first value is missing + ValueError: If comparison operator2 not in allowed list + ValueError: If value for operator2 is missing + ValueError: If comparison operator2 for the first value is missing + ValueError: If operator conjuction is missing while there are 2 values and 2 operators passed + ValueError: If operator conjuction is not in the allowed list + ValueError: If operator conjuction provided why only one filter value is given + ValueError: If filter conjuction provided without 2nd filter + ValueError: If filter conjuction not in the allowed list + + Returns: + bool: True if all checks passed """ allowed_dtypes = ["datetime", "date", "bool", "int", "float", "complex", "str"] allowed_conjuction = ["&", "|"] allowed_operators = ["<", ">", "<=", ">=", "==", "!="] - for parameters in filters.values(): + for filter_name, parameters in filters.items(): + if not parameters.get("dtype"): + raise ValueError(f"dtype for {filter_name} is missing!") if parameters.get("dtype") not in allowed_dtypes: raise ValueError( - f"dtype not allowed! Expected {allowed_dtypes} got: {parameters.get('dtype')}." + f"dtype not allowed! Expected: {allowed_dtypes} got: {parameters.get('dtype')} ." ) if parameters.get("operator1"): if parameters.get("operator1") not in allowed_operators: raise ValueError( - f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator1')}." + f"Operator1 type not allowed! Expected: {allowed_operators} got: {parameters.get('operator1')} ." ) if not parameters.get("value1"): - raise ValueError("Value for operator1 is missing!") + raise ValueError("Value1 for operator1 is missing!") elif not parameters.get("operator1"): raise ValueError("Operator1 is missing!") if ( @@ -279,22 +386,22 @@ def check_filters( and parameters.get("operators_conjuction") is not None ): raise ValueError( - f"Operator conjuction allowed only with more than one filter operator!" + f"Operator conjuction allowed only with more then one filter operator!" ) if parameters.get("operator2"): if parameters.get("operator2") not in allowed_operators: raise ValueError( - f"Operator type not allowed! Expected {allowed_operators} got: {parameters.get('operator2')}." + f"Operator2 type not allowed! Expected: {allowed_operators} got: {parameters.get('operator2')} ." ) if not parameters.get("value2"): - raise ValueError("Value for operator2 is missing!") + raise ValueError("Value2 for operator2 is missing!") if not parameters.get("operators_conjuction"): raise ValueError( - f"Operators for conjuction is missing! Expected {allowed_conjuction} got empty." + f"Operator for conjuction is missing! Expected: {allowed_conjuction} got empty." ) if parameters.get("operators_conjuction") not in allowed_conjuction: raise ValueError( - f"Operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('operators_conjuction')}." + f"Operator for conjuction not allowed! 
Expected: {allowed_conjuction} got {parameters.get('operators_conjuction')} ." ) if parameters.get("filters_conjuction"): if ( @@ -302,27 +409,42 @@ def check_filters( and parameters.get("filters_conjuction") is not None ): raise ValueError( - f"Filters conjuction allowed only with more than one filter column!" + f"Filters conjuction allowed only when more then one filter provided!" ) if parameters.get("filters_conjuction") not in allowed_conjuction: raise ValueError( - f"Filters operators for conjuction not allowed! Expected {allowed_conjuction} got {parameters.get('filters_conjuction')}." + f"Filter operator for conjuction not allowed! Expected: {allowed_conjuction} got {parameters.get('filters_conjuction')} ." ) return True def operators_mapping( self, - filters: dict = None, + filters: dict, ) -> dict: """ Function for mapping comparison and conjuction(logical) operators of filters to the format which is recognized by Microsoft API. + Allowed operators: + < + > + <= + >= + == + != + "&" + "|" Args: - filters (dict): A dictionar which contains operators. + filters (dict): A dictionary which contains operators. + + Raises: + ValueError: If operator1 not allowed + ValueError: If operator2 not allowed + ValueError: If operators conjuction not allowed + ValueError: If filters conjuction not allowed Returns: - New modified dict. + dict: New modified dict with mapped operators. """ filters_dict = deepcopy(filters) @@ -361,7 +483,7 @@ def operators_mapping( ] else: raise ValueError( - f"This conjuction(logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!" + f"This conjuction (logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!" ) if parameters.get("filters_conjuction"): logical_fl_to_change = parameters.get("filters_conjuction") @@ -369,12 +491,12 @@ def operators_mapping( parameters["filters_conjuction"] = logical_op[logical_fl_to_change] else: raise ValueError( - f"This conjuction(logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!" + f"This filters conjuction (logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!" ) return filters_dict - def make_filter_for_api(self, filters: dict) -> "str": + def make_filter_for_api(self, filters: dict) -> str: """ Function changing type of operators to match MS API style as 'str' passing to URL call. @@ -382,7 +504,7 @@ def make_filter_for_api(self, filters: dict) -> "str": filters (dict): A dictionar which contains operators. Returns: - Output as string to pass as filter parameter to API. + str: Output as filtering string to pass as filter parameter to API. """ filter_text = "" @@ -422,16 +544,16 @@ def make_filter_for_api(self, filters: dict) -> "str": def make_filter_for_df( self, - filters: dict = None, - ) -> "str": + filters: dict, + ) -> str: """ - Function changing dict operators into pandas DataFrame filters. + Function changing filters into pandas DataFrame filtering string used later for filtering the DF. Args: - filters (dict): A dictionar which contains operators. + filters (dict): A dictionary which contains operators. Returns: - Output as string to pass as filter to DataFrame. + str: Output as string to pass as filter to DataFrame. """ filter_in_df = "df.loc[" @@ -469,6 +591,9 @@ def list_item_to_df( ): """ Method to extract data from Sharepoint List into DataFrame. 
+ If filters are passed, function will try to extract only filtered data to reduce the amount of data to transfer. + If there is no filter or there is an throttling (max rows returned limit reached) + exception ,then 2nd workflow will start and download all data which will be filtered later in the data frame. Args: list_title (str): Title of Sharepoint List. Default to None. @@ -504,6 +629,10 @@ def list_item_to_df( } row_count (int): Number of downloaded rows in single request. Default to 5000. + Raises: + AttributeError: If filter column not included inside required fields list. + ValueError: If there is no filter passed - > will extract all fields and filter later. + Returns: pd.DataFrame """ @@ -515,7 +644,7 @@ def list_item_to_df( for key in filters: if key not in required_fields: raise AttributeError( - f"Filter '{key}' not included inside required fields. It is obligatory to extract data which is filtered!" + f"Filter '{key}' column not included inside required fields. It is obligatory to extract data which is filtered!" ) # changing the body of the filter for MS API call @@ -523,7 +652,7 @@ def list_item_to_df( download_all = False - # extracting requeird_fields SP_List objects + # extracting required_fields SP_List objects selected_fields = self.select_fields( list_title=list_title, site_url=site_url, @@ -534,7 +663,7 @@ def list_item_to_df( try: # Extract data below 5k rows or max limitation of the specific SP List with basic filtering. if filters is None: - raise ValueError("There is no filter. Starting extraxction all data") + raise ValueError("There is no filter. Switching to extract all fields.") else: list_items = ( self.list_object.items.filter(filter_text) diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index 2a1cb0bc4..635f9a5ae 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -245,7 +245,7 @@ class SharepointListToDF(Task): field_property (List[str]): Property to expand with expand query method. All propertys can be found under list.item.properties. Default to ["Title"] - filters (dict): Dictionary with operators which filters the SharepointList output. + filters (dict, optional): Dictionary with operators which filters the SharepointList output. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') allowed conjuction: ('&','|') allowed operators: ('<','>','<=','>=','==','!=') @@ -277,9 +277,9 @@ class SharepointListToDF(Task): def __init__( self, - path: str = None, - list_title: str = None, - site_url: str = None, + path: str, + list_title: str, + site_url: str, required_fields: List[str] = None, field_property: str = "Title", filters: dict = None, @@ -289,7 +289,6 @@ def __init__( *args, **kwargs, ): - self.path = path self.list_title = list_title self.site_url = site_url @@ -300,6 +299,11 @@ def __init__( self.vault_name = vault_name self.credentials_secret = credentials_secret + super().__init__( + *args, + **kwargs, + ) + if not credentials_secret: # Attempt to read a default for the service principal secret name try: @@ -313,16 +317,65 @@ def __init__( ).run() self.credentials = json.loads(credentials_str) - super().__init__( - *args, - **kwargs, - ) - def __call__(self): """Download Sharepoint_List data to a .parquet file""" super().__call__(self) + def _rename_duplicated_fields(self, df): + """ + Renames duplicated columns in a DataFrame by appending a numerical suffix. 
+ Function to check if there are fields with + the same name but in different style (lower, upper) + It might happen that fields returned by get_fields() will be different + than actual list items fields ( from it's properties) + It is specific to sharepoint lists. + MS allowed users to create fields with simillar names (but with different letters style) + fields with same values. For example Id and ID - > office select function doesn't + recognize upper/lower cases. + + Args: + df (pd.DataFrame): The input DataFrame with potentially duplicated columns. + required_fields (list): List of fields that should not be considered for renaming. + + Returns: + pd.DataFrame: DataFrame with duplicated columns renamed to ensure uniqueness. + + Example: + Given DataFrame df: + ``` + A B C B D + 0 1 2 3 4 5 + ``` + + Required fields = ['A', 'B'] + After calling _rename_duplicated_fields(df, required_fields): + ``` + A B C B2 D + 0 1 2 3 4 5 + ``` + """ + col_to_compare = df.columns.tolist() + i = 1 + for column in df.columns.tolist(): + if not column in self.required_fields: + col_to_compare.remove(column) + if column.lower() in [to_cmp.lower() for to_cmp in col_to_compare]: + i += 1 + logger.info(f"Found duplicated column: {column} !") + logger.info(f"Renaming from {column} to {column}{i}") + df = df.rename(columns={f"{column}": f"{column}{i}"}) + return df + def _convert_camel_case_to_words(self, input_str: str) -> str: + """ + Function for converting internal names joined as camelCase column names to regular words + + Args: + input_str (str): Column name + + Returns: + str: Converted column name + """ self.input_str = input_str @@ -331,11 +384,23 @@ def _convert_camel_case_to_words(self, input_str: str) -> str: return converted - def change_column_name( - self, - df: pd.DataFrame = None, - ): - s = SharepointList() + def change_column_name(self, df: pd.DataFrame, credentials: str = None): + """ + Function for changing coded internal column names (Unicode style) to human readable names. + !Warning! + Names are taken from field properties Title! + Because of that the resulting column name might have different then initial name. + + Args: + df (pd.DataFrame): A data frame with loaded column names from sharepoint list. + credentials (str): Credentials str for sharepoint connection establishing. Defaults to None. 
+ + Returns: + pd.DataFrame: Data frame with changed column names + """ + s = SharepointList( + credentials=self.credentials, + ) list_fields = s.get_fields( list_title=self.list_title, site_url=self.site_url, @@ -364,7 +429,7 @@ def change_column_name( # Rename columns names inside DataFrame df = df.rename(columns=dictionary) - + # Check again for duplicates return df def run( @@ -389,7 +454,8 @@ def run( row_count=self.row_count, ) - df = self.change_column_name(df=df_raw) + df_col_changed = self.change_column_name(df=df_raw) + df = self._rename_duplicated_fields(df=df_col_changed) self.logger.info("Successfully changed structure of the DataFrame") return df From ffe6078ad94d00104504cc539fecfd22b28a8ab4 Mon Sep 17 00:00:00 2001 From: Marcin Purtak <44641138+marcinpurtak@users.noreply.github.com> Date: Tue, 14 Nov 2023 15:38:23 +0100 Subject: [PATCH 71/86] removed obsolete comment sharepoint.py --- viadot/tasks/sharepoint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index 635f9a5ae..d6535356b 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -429,7 +429,6 @@ def change_column_name(self, df: pd.DataFrame, credentials: str = None): # Rename columns names inside DataFrame df = df.rename(columns=dictionary) - # Check again for duplicates return df def run( From c3fb0a7038ffec2323bbda15ea2d199f30b3871f Mon Sep 17 00:00:00 2001 From: burzekj Date: Tue, 14 Nov 2023 15:56:45 +0100 Subject: [PATCH 72/86] =?UTF-8?q?=E2=9C=A8=20new=20logic=20to=20extracting?= =?UTF-8?q?=20users=20from=20genesys?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 83 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 428e699a0..96d5bdd03 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -10,8 +10,8 @@ from prefect.engine import signals from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs - from viadot.task_utils import * + from viadot.exceptions import APIError from viadot.sources import Genesys @@ -385,7 +385,6 @@ def run( "agent_performance_summary_view", "agent_status_summary_view", "agent_status_detail_view", - "agent_interaction_detail_view", ]: genesys.genesys_api_connection( post_data_list=post_data_list, end_point=end_point @@ -510,3 +509,83 @@ def run( logger.info("Downloaded the data from the Genesys into the CSV.") return [file_name] + + elif view_type is None and end_point == "users": + # First call to API to get information about amount of pages to extract + temp_json = genesys.genesys_api_connection( + post_data_list=post_data_list, + end_point=f"{end_point}/?pageSize=500&pageNumber=1&expand=presence,dateLastLogin,groups,employerInfo,lasttokenissued&state=any", + method="GET", + ) + last_page = temp_json["pageCount"] + 1 + + # Function to extract nested data from json file + def check_value(base, lvls): + for lvl in lvls: + if isinstance(base, dict): + base = base.get(lvl) + if base is None: + return None + else: + return base + return base + + data_list = [] + + # For loop to donwload all pages from Genesys GET API + for n in range(1, last_page): + json_file = genesys.genesys_api_connection( + post_data_list=post_data_list, + end_point=f"{end_point}/?pageSize=500&pageNumber={n}&expand=presence,dateLastLogin,groups,employerInfo,lasttokenissued&state=any", + method="GET", + ) + 
logger.info(f"Downloaded: {n} page") + + num_ids = len(json_file["entities"]) + + # For loop to extract data from specific page + for id in range(0, num_ids): + record_dict = {} + record_dict["Id"] = check_value(json_file["entities"][id], ["id"]) + record_dict["Name"] = check_value( + json_file["entities"][id], ["name"] + ) + record_dict["DivisionName"] = check_value( + json_file["entities"][id], ["division", "name"] + ) + record_dict["Email"] = check_value( + json_file["entities"][id], ["email"] + ) + record_dict["State"] = check_value( + json_file["entities"][id], ["state"] + ) + record_dict["Title"] = check_value( + json_file["entities"][id], ["title"] + ) + record_dict["Username"] = check_value( + json_file["entities"][id], ["username"] + ) + record_dict["SystemPresence"] = check_value( + json_file["entities"][id], + ["presence", "presenceDefinition", "systemPresence"], + ) + record_dict["DateLastLogin"] = check_value( + json_file["entities"][id], ["dateLastLogin"] + ) + + data_list.append(record_dict) + + df = pd.DataFrame(data_list) + + # data validation function (optional) + if validate_df_dict: + validate_df.run(df=df, tests=validate_df_dict) + + file_name = "All_Genesys_Users.csv" + df.to_csv( + os.path.join(file_name), + index=False, + sep="\t", + ) + + return [file_name] From 826f729ad82f83e3ac2be6d0887e3cbf9fcd3959 Mon Sep 17 00:00:00 2001 From: burzekj Date: Tue, 14 Nov 2023 16:04:05 +0100 Subject: [PATCH 73/86] =?UTF-8?q?=F0=9F=94=8A=20Added=20CHANGELOG.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ebe1047d..f128c8c00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added new view type `agent_interaction_view_type` in `Genesys`source. +- Added new logic for endpoint `users` in `Genesys`task. ## [0.4.21] - 2023-10-26 From 53eb33eb1fb0a518067a24594034c3de4a90ea20 Mon Sep 17 00:00:00 2001 From: burzekj Date: Tue, 14 Nov 2023 16:34:10 +0100 Subject: [PATCH 74/86] =?UTF-8?q?=F0=9F=90=9B=20fixed=20bug=20in=20extract?= =?UTF-8?q?ing=20data=20from=20json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + viadot/tasks/genesys.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ebe1047d..96982445e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Splitted test for Eurostat on source tests and task tests +- Fixed bug for endpoint `conversations` in GET method in `Genesys` Task. ### Added - Added new view type `agent_interaction_view_type` in `Genesys`source. 
diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 428e699a0..e1819f1d6 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -487,9 +487,10 @@ def run( temp_dict = { key: value for (key, value) in attributes.items() if key in key_list } - temp_dict["conversationId"] = json_file["id"] - temp_dict["startTime"] = json_file["startTime"] - temp_dict["endTime"] = json_file["endTime"] + temp_dict["conversationId"] = json_file.get("id") + temp_dict["startTime"] = json_file.get("startTime") + temp_dict["endTime"] = json_file.get("endTime") + data_list.append(temp_dict) df = pd.DataFrame(data_list) From 5235f32398d1c51eb711d39bad8b22cecd65826b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Nov 2023 15:48:48 +0000 Subject: [PATCH 75/86] =?UTF-8?q?=F0=9F=8E=A8=20Format=20Python=20code=20w?= =?UTF-8?q?ith=20Black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flows/test_bigquery_to_adls.py | 9 +- ...test_cloud_for_customers_report_to_adls.py | 2 +- .../flows/test_customer_gauge_to_adls.py | 3 +- .../integration/flows/test_hubspot_to_adls.py | 2 +- .../flows/test_mediatool_to_adls.py | 2 +- tests/integration/flows/test_mysql_to_adls.py | 1 + .../flows/test_salesforce_to_adls.py | 2 +- .../integration/flows/test_sap_bw_to_adls.py | 2 +- .../integration/flows/test_sap_rfc_to_adls.py | 2 +- .../flows/test_supermetrics_to_adls.py | 2 +- .../integration/flows/test_vidclub_to_adls.py | 2 +- .../integration/tasks/test_customer_gauge.py | 297 +++++++++++------- tests/integration/tasks/test_tm1.py | 2 +- tests/integration/test_customer_gauge.py | 27 +- tests/integration/test_epicor.py | 2 +- tests/integration/test_genesys.py | 2 +- tests/integration/test_hubspot.py | 2 +- tests/integration/test_sharepoint.py | 5 +- tests/integration/test_tm1.py | 3 +- tests/unit/test_task_utils.py | 2 +- tests/unit/test_utils.py | 2 +- viadot/flows/__init__.py | 2 +- viadot/flows/customer_gauge_to_adls.py | 24 +- viadot/flows/sharepoint_to_adls.py | 3 +- viadot/flows/supermetrics_to_adls.py | 2 +- viadot/flows/transform_and_catalog.py | 4 +- viadot/sources/bigquery.py | 2 +- viadot/sources/customer_gauge.py | 2 +- viadot/sources/mindful.py | 4 +- viadot/sources/sharepoint.py | 20 +- viadot/sources/tm1.py | 7 +- viadot/task_utils.py | 2 +- viadot/tasks/__init__.py | 8 +- viadot/tasks/customer_gauge.py | 176 ++++++----- viadot/tasks/genesys.py | 2 +- viadot/tasks/luma.py | 2 + viadot/tasks/sap_bw.py | 2 +- viadot/tasks/sharepoint.py | 6 +- viadot/tasks/tm1.py | 4 +- 39 files changed, 367 insertions(+), 278 deletions(-) diff --git a/tests/integration/flows/test_bigquery_to_adls.py b/tests/integration/flows/test_bigquery_to_adls.py index de793344a..b4503c6e9 100644 --- a/tests/integration/flows/test_bigquery_to_adls.py +++ b/tests/integration/flows/test_bigquery_to_adls.py @@ -1,15 +1,14 @@ import os +from unittest import mock +import pandas as pd import pendulum import pytest -from unittest import mock -import pandas as pd - from prefect.tasks.secrets import PrefectSecret -from viadot.flows import BigQueryToADLS -from viadot.tasks import AzureDataLakeRemove from viadot.exceptions import ValidationError +from viadot.flows import BigQueryToADLS +from viadot.tasks import AzureDataLakeRemove ADLS_DIR_PATH = "raw/tests/" ADLS_FILE_NAME = str(pendulum.now("utc")) + ".parquet" diff --git a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py index 
f0661e314..b0c3128c5 100644 --- a/tests/integration/flows/test_cloud_for_customers_report_to_adls.py +++ b/tests/integration/flows/test_cloud_for_customers_report_to_adls.py @@ -1,6 +1,6 @@ from viadot.config import local_config -from viadot.flows import CloudForCustomersReportToADLS from viadot.exceptions import ValidationError +from viadot.flows import CloudForCustomersReportToADLS def test_cloud_for_customers_report_to_adls(): diff --git a/tests/integration/flows/test_customer_gauge_to_adls.py b/tests/integration/flows/test_customer_gauge_to_adls.py index 34c7336bc..6da0bf8b7 100644 --- a/tests/integration/flows/test_customer_gauge_to_adls.py +++ b/tests/integration/flows/test_customer_gauge_to_adls.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -from viadot.flows import CustomerGaugeToADLS from viadot.exceptions import ValidationError +from viadot.flows import CustomerGaugeToADLS DATA = { "user_name": ["Jane", "Bob"], @@ -90,4 +90,3 @@ def test_customer_gauge_to_adls_run_flow_validation_failure(mocked_class): flow.run() except ValidationError: pass - diff --git a/tests/integration/flows/test_hubspot_to_adls.py b/tests/integration/flows/test_hubspot_to_adls.py index d960fc079..e0c06c20f 100644 --- a/tests/integration/flows/test_hubspot_to_adls.py +++ b/tests/integration/flows/test_hubspot_to_adls.py @@ -5,8 +5,8 @@ import pandas as pd import pytest -from viadot.flows import HubspotToADLS from viadot.exceptions import ValidationError +from viadot.flows import HubspotToADLS DATA = { "id": {"0": "820306930"}, diff --git a/tests/integration/flows/test_mediatool_to_adls.py b/tests/integration/flows/test_mediatool_to_adls.py index d7b5b2658..65cfadf8f 100644 --- a/tests/integration/flows/test_mediatool_to_adls.py +++ b/tests/integration/flows/test_mediatool_to_adls.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -from viadot.flows import MediatoolToADLS from viadot.exceptions import ValidationError +from viadot.flows import MediatoolToADLS DATA = {"country": ["DK", "DE"], "sales": [3, 4]} ADLS_FILE_NAME = "test_mediatool.parquet" diff --git a/tests/integration/flows/test_mysql_to_adls.py b/tests/integration/flows/test_mysql_to_adls.py index 942bab99d..c968d48a3 100644 --- a/tests/integration/flows/test_mysql_to_adls.py +++ b/tests/integration/flows/test_mysql_to_adls.py @@ -1,4 +1,5 @@ from unittest import mock + from viadot.flows.mysql_to_adls import MySqlToADLS query = """SELECT * FROM `example-views`.`sales`""" diff --git a/tests/integration/flows/test_salesforce_to_adls.py b/tests/integration/flows/test_salesforce_to_adls.py index ec68a1227..8c032f308 100644 --- a/tests/integration/flows/test_salesforce_to_adls.py +++ b/tests/integration/flows/test_salesforce_to_adls.py @@ -2,9 +2,9 @@ from prefect.tasks.secrets import PrefectSecret +from viadot.exceptions import ValidationError from viadot.flows import SalesforceToADLS from viadot.tasks import AzureDataLakeRemove -from viadot.exceptions import ValidationError ADLS_FILE_NAME = "test_salesforce.parquet" ADLS_DIR_PATH = "raw/tests/" diff --git a/tests/integration/flows/test_sap_bw_to_adls.py b/tests/integration/flows/test_sap_bw_to_adls.py index 2c01049e8..4259e5c16 100644 --- a/tests/integration/flows/test_sap_bw_to_adls.py +++ b/tests/integration/flows/test_sap_bw_to_adls.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -from viadot.flows import SAPBWToADLS from viadot.exceptions import ValidationError +from viadot.flows import SAPBWToADLS DATA = { "[0CALMONTH].[LEVEL01].[DESCRIPTION]": ["January 2023"], diff --git 
a/tests/integration/flows/test_sap_rfc_to_adls.py b/tests/integration/flows/test_sap_rfc_to_adls.py index ed33fa320..5503b4684 100644 --- a/tests/integration/flows/test_sap_rfc_to_adls.py +++ b/tests/integration/flows/test_sap_rfc_to_adls.py @@ -1,8 +1,8 @@ from viadot.config import local_config +from viadot.exceptions import ValidationError from viadot.flows import SAPRFCToADLS from viadot.sources import AzureDataLake from viadot.tasks import AzureDataLakeRemove -from viadot.exceptions import ValidationError try: import pyrfc diff --git a/tests/integration/flows/test_supermetrics_to_adls.py b/tests/integration/flows/test_supermetrics_to_adls.py index 9738ddeb1..15deaa01a 100644 --- a/tests/integration/flows/test_supermetrics_to_adls.py +++ b/tests/integration/flows/test_supermetrics_to_adls.py @@ -4,8 +4,8 @@ import pytest from prefect.storage import Local -from viadot.flows import SupermetricsToADLS from viadot.exceptions import ValidationError +from viadot.flows import SupermetricsToADLS CWD = os.getcwd() adls_dir_path = "raw/tests/supermetrics" diff --git a/tests/integration/flows/test_vidclub_to_adls.py b/tests/integration/flows/test_vidclub_to_adls.py index c18eaad10..0f6705579 100644 --- a/tests/integration/flows/test_vidclub_to_adls.py +++ b/tests/integration/flows/test_vidclub_to_adls.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -from viadot.flows import VidClubToADLS from viadot.exceptions import ValidationError +from viadot.flows import VidClubToADLS DATA = {"col1": ["aaa", "bbb", "ccc"], "col2": [11, 22, 33]} ADLS_FILE_NAME = "test_vid_club.parquet" diff --git a/tests/integration/tasks/test_customer_gauge.py b/tests/integration/tasks/test_customer_gauge.py index d95ea14cd..0c524fd0a 100644 --- a/tests/integration/tasks/test_customer_gauge.py +++ b/tests/integration/tasks/test_customer_gauge.py @@ -8,51 +8,77 @@ CUR = 185000 PAGESIZE = 1000 -DATA_JSON = {'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 266, - 'date_email_sent': '2018-02-05 10:42:28', - 'properties': [{'field': 'Postal Code', 'reference': '999'}, - {'field': 'City', 'reference': 'Eldorado'}, - {'field': 'Currency', 'reference': None}, - {'field': 'Item Quantity', 'reference': '7'}, - {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': [{'label': 'Product Quality and Product Performance'}, - {'label': 'Function and Design'}, - {'label': 'Value for Money'}, - {'label': 'Packaging'}]} - -RAW_JSON = {'data': [{'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 266, - 'date_email_sent': '2018-02-05 10:42:28', - 'properties': [{'field': 'Postal Code', 'reference': '999'}, - {'field': 'City', 'reference': 'Eldorado'}, - {'field': 'Currency', 'reference': None}, - {'field': 'Item Quantity', 'reference': '7'}, - {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': [{'label': 'Product Quality and Product Performance'}, - {'label': 'Function and Design'}, - {'label': 'Value for Money'}, - {'label': 'Packaging'}]}, - {'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 206, - 'date_email_sent': '2018-02-05 10:41:01', - 'properties': [{'field': 'Postal Code', 'reference': '0000'}, - {'field': 'City', 'reference': 'Neverland'}, - {'field': 'Currency', 'reference': None}, - {'field': 'Item Quantity', 'reference': '1'}, - {'field': 'PostingDate', 'reference': 
'2018-01-26 00:00:00'}], - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': [{'label': 'The website of the online shop (overall impression)'}, - {'label': 'Waiting period'}]}], - 'cursor': {'next': 37}} - -WRONG_DATA = {'cols':[ - {'field': 'City', 'reference': 'Eldorado'}, - {'field': 'Currency', 'reference': None}, - {'field': 'Item Quantity', 'reference': '7'}, - {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}]} +DATA_JSON = { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 266, + "date_email_sent": "2018-02-05 10:42:28", + "properties": [ + {"field": "Postal Code", "reference": "999"}, + {"field": "City", "reference": "Eldorado"}, + {"field": "Currency", "reference": None}, + {"field": "Item Quantity", "reference": "7"}, + {"field": "PostingDate", "reference": "2018-01-10 00:00:00"}, + ], + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": [ + {"label": "Product Quality and Product Performance"}, + {"label": "Function and Design"}, + {"label": "Value for Money"}, + {"label": "Packaging"}, + ], +} + +RAW_JSON = { + "data": [ + { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 266, + "date_email_sent": "2018-02-05 10:42:28", + "properties": [ + {"field": "Postal Code", "reference": "999"}, + {"field": "City", "reference": "Eldorado"}, + {"field": "Currency", "reference": None}, + {"field": "Item Quantity", "reference": "7"}, + {"field": "PostingDate", "reference": "2018-01-10 00:00:00"}, + ], + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": [ + {"label": "Product Quality and Product Performance"}, + {"label": "Function and Design"}, + {"label": "Value for Money"}, + {"label": "Packaging"}, + ], + }, + { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 206, + "date_email_sent": "2018-02-05 10:41:01", + "properties": [ + {"field": "Postal Code", "reference": "0000"}, + {"field": "City", "reference": "Neverland"}, + {"field": "Currency", "reference": None}, + {"field": "Item Quantity", "reference": "1"}, + {"field": "PostingDate", "reference": "2018-01-26 00:00:00"}, + ], + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": [ + {"label": "The website of the online shop (overall impression)"}, + {"label": "Waiting period"}, + ], + }, + ], + "cursor": {"next": 37}, +} + +WRONG_DATA = { + "cols": [ + {"field": "City", "reference": "Eldorado"}, + {"field": "Currency", "reference": None}, + {"field": "Item Quantity", "reference": "7"}, + {"field": "PostingDate", "reference": "2018-01-10 00:00:00"}, + ] +} + @pytest.mark.looping_api_calls def test_customer_gauge_to_df_loop(): @@ -87,39 +113,47 @@ def test_get_data_error_raising(): def test_field_reference_unpacker(): """ Test the '_field_reference_unpacker' method with valid data. It should unpack and modify dictionaries within the specified field and return the expected result. 
- """ + """ data = DATA_JSON.copy() - field = 'properties' + field = "properties" expected_result = { - 'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 266, - 'date_email_sent': '2018-02-05 10:42:28', - 'properties': {'Postal Code': '999', - 'City': 'Eldorado', - 'Currency': None, - 'Item Quantity': '7', - 'PostingDate': '2018-01-10 00:00:00'}, - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': [{'label': 'Product Quality and Product Performance'}, - {'label': 'Function and Design'}, - {'label': 'Value for Money'}, - {'label': 'Packaging'}] + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 266, + "date_email_sent": "2018-02-05 10:42:28", + "properties": { + "Postal Code": "999", + "City": "Eldorado", + "Currency": None, + "Item Quantity": "7", + "PostingDate": "2018-01-10 00:00:00", + }, + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": [ + {"label": "Product Quality and Product Performance"}, + {"label": "Function and Design"}, + {"label": "Value for Money"}, + {"label": "Packaging"}, + ], } result = CG._field_reference_unpacker(json_response=data, field=field) assert result == expected_result + @pytest.mark.field_reference_unpacker_value_error def test_field_reference_unpacker_invalid_data_format(): """ Test the '_field_reference_unpacker' method with invalid data format that should raise a ValueError. It should raise a ValueError exception. """ data = DATA_JSON.copy() - field='contact' - with pytest.raises(ValueError, match=r"Dictionary within the specified field doesn't contain exactly two items."): + field = "contact" + with pytest.raises( + ValueError, + match=r"Dictionary within the specified field doesn't contain exactly two items.", + ): CG._field_reference_unpacker(json_response=data, field=field) - + @pytest.mark.field_reference_unpacker_key_error def test_field_reference_unpacker_missing_field(): @@ -138,20 +172,26 @@ def test_nested_dict_transformer(): Test the '_nested_dict_transformer' method with valid data. It should modify nested dictionaries within the specified field and return the expected result. 
""" data = DATA_JSON.copy() - field = 'drivers' - expected_result = {'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 266, - 'date_email_sent': '2018-02-05 10:42:28', - 'properties': [{'field': 'Postal Code', 'reference': '999'}, - {'field': 'City', 'reference': 'Eldorado'}, - {'field': 'Currency', 'reference': None}, - {'field': 'Item Quantity', 'reference': '7'}, - {'field': 'PostingDate', 'reference': '2018-01-10 00:00:00'}], - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': {'1_label': 'Product Quality and Product Performance', - '2_label': 'Function and Design', - '3_label': 'Value for Money', - '4_label': 'Packaging'}} + field = "drivers" + expected_result = { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 266, + "date_email_sent": "2018-02-05 10:42:28", + "properties": [ + {"field": "Postal Code", "reference": "999"}, + {"field": "City", "reference": "Eldorado"}, + {"field": "Currency", "reference": None}, + {"field": "Item Quantity", "reference": "7"}, + {"field": "PostingDate", "reference": "2018-01-10 00:00:00"}, + ], + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": { + "1_label": "Product Quality and Product Performance", + "2_label": "Function and Design", + "3_label": "Value for Money", + "4_label": "Packaging", + }, + } result = CG._nested_dict_transformer(json_response=data, field=field) @@ -164,7 +204,7 @@ def test_nested_dict_transformer_invalid_data_format(): Test the '_nested_dict_transformer' method with invalid data format. It should return the same data without modification. """ data = DATA_JSON.copy() - field='number_customer' + field = "number_customer" result = CG._nested_dict_transformer(json_response=data, field=field) assert result == data @@ -186,42 +226,54 @@ def test_column_unpacker_success_method1_and_method2(): """ Test the 'column_unpacker' method with valid data and both Method 1 and Method 2 columns specified. It should return the expected result. 
""" - data = RAW_JSON['data'].copy() - unpack_by_field_reference_cols = ['properties'] - unpack_by_nested_dict_transformer = ['drivers'] + data = RAW_JSON["data"].copy() + unpack_by_field_reference_cols = ["properties"] + unpack_by_nested_dict_transformer = ["drivers"] expected_result = [ - {'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 266, - 'date_email_sent': '2018-02-05 10:42:28', - 'properties': { - 'Postal Code': '999', - 'City': 'Eldorado', - 'Currency': None, - 'Item Quantity': '7', - 'PostingDate': '2018-01-10 00:00:00' + { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 266, + "date_email_sent": "2018-02-05 10:42:28", + "properties": { + "Postal Code": "999", + "City": "Eldorado", + "Currency": None, + "Item Quantity": "7", + "PostingDate": "2018-01-10 00:00:00", + }, + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": { + "1_label": "Product Quality and Product Performance", + "2_label": "Function and Design", + "3_label": "Value for Money", + "4_label": "Packaging", }, - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': {'1_label': 'Product Quality and Product Performance', - '2_label': 'Function and Design', - '3_label': 'Value for Money', - '4_label': 'Packaging'}}, - {'contact': {'first_name': '***', 'last_name': '***'}, - 'number_customer': 206, - 'date_email_sent': '2018-02-05 10:41:01', - 'properties': { - 'Postal Code': '0000', - 'City': 'Neverland', - 'Currency': None, - 'Item Quantity': '1', - 'PostingDate': '2018-01-26 00:00:00' + }, + { + "contact": {"first_name": "***", "last_name": "***"}, + "number_customer": 206, + "date_email_sent": "2018-02-05 10:41:01", + "properties": { + "Postal Code": "0000", + "City": "Neverland", + "Currency": None, + "Item Quantity": "1", + "PostingDate": "2018-01-26 00:00:00", }, - 'custom_fields': [{'field': 'Assignment_ID', 'reference': None}], - 'drivers': {'1_label': 'The website of the online shop (overall impression)', - '2_label': 'Waiting period'}} + "custom_fields": [{"field": "Assignment_ID", "reference": None}], + "drivers": { + "1_label": "The website of the online shop (overall impression)", + "2_label": "Waiting period", + }, + }, ] - result = CG.column_unpacker(json_list=data, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) + result = CG.column_unpacker( + json_list=data, + unpack_by_field_reference_cols=unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer, + ) assert result == expected_result @@ -231,10 +283,14 @@ def test_column_unpacker_missing_json_list(): """ Test the 'column_unpacker' method with missing 'json_list' argument. It should raise a ValueError. 
""" - unpack_by_field_reference_cols = ['properties'] - unpack_by_nested_dict_transformer = ['drivers'] + unpack_by_field_reference_cols = ["properties"] + unpack_by_nested_dict_transformer = ["drivers"] with pytest.raises(ValueError, match="Input 'json_list' is required."): - CG.column_unpacker(json_list=None, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) + CG.column_unpacker( + json_list=None, + unpack_by_field_reference_cols=unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer, + ) @pytest.mark.test_column_unpacker_duplicate_columns @@ -242,11 +298,18 @@ def test_column_unpacker_duplicate_columns(): """ Test the 'column_unpacker' method with duplicate columns specified in both Method 1 and Method 2. It should raise a ValueError. """ - data = RAW_JSON['data'].copy() - unpack_by_field_reference_cols = ['properties'] - unpack_by_nested_dict_transformer = ['properties'] - with pytest.raises(ValueError, match="{'properties'} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer. It's not possible to apply two methods to the same field."): - CG.column_unpacker(json_list=data, unpack_by_field_reference_cols=unpack_by_field_reference_cols, unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer) + data = RAW_JSON["data"].copy() + unpack_by_field_reference_cols = ["properties"] + unpack_by_nested_dict_transformer = ["properties"] + with pytest.raises( + ValueError, + match="{'properties'} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer. It's not possible to apply two methods to the same field.", + ): + CG.column_unpacker( + json_list=data, + unpack_by_field_reference_cols=unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer, + ) @pytest.mark.test_flatten_json @@ -323,4 +386,4 @@ def test_drivers_cleaner_success(): data = "{'label': 'Driver1'}, {'label': 'Driver2'}, {'label': 'Driver3'}" expected_result = "Driver1, Driver2, Driver3" result = CG._drivers_cleaner(data) - assert result == expected_result \ No newline at end of file + assert result == expected_result diff --git a/tests/integration/tasks/test_tm1.py b/tests/integration/tasks/test_tm1.py index 96dd58dfb..68527b5f7 100644 --- a/tests/integration/tasks/test_tm1.py +++ b/tests/integration/tasks/test_tm1.py @@ -1,7 +1,7 @@ import pandas as pd -from viadot.tasks import TM1ToDF from viadot.config import local_config +from viadot.tasks import TM1ToDF CUBE = local_config.get("test_cube") VIEW = local_config.get("test_view") diff --git a/tests/integration/test_customer_gauge.py b/tests/integration/test_customer_gauge.py index ea22569c4..a29ff3585 100644 --- a/tests/integration/test_customer_gauge.py +++ b/tests/integration/test_customer_gauge.py @@ -3,8 +3,8 @@ import pandas as pd import pytest -from viadot.sources import CustomerGauge from viadot.exceptions import CredentialError +from viadot.sources import CustomerGauge ENDPOINT = random.choice(["responses", "non-responses"]) CG = CustomerGauge(endpoint=ENDPOINT) @@ -55,26 +55,39 @@ def test_endpoint_url_argument(): json_response = CG.get_json_response() assert isinstance(json_response, dict) + @pytest.mark.endpoint_valueerror def test_wrong_endpoint_valueerror_raising(): - with pytest.raises(ValueError, match=r"Incorrect endpoint name. 
Choose: 'responses' or 'non-responses'"): + with pytest.raises( + ValueError, + match=r"Incorrect endpoint name. Choose: 'responses' or 'non-responses'", + ): wrong_endpoint_name = "wrong-endpoint" - CG = CustomerGauge(endpoint = wrong_endpoint_name) + CG = CustomerGauge(endpoint=wrong_endpoint_name) + @pytest.mark.endpoint_valueerror def test_no_endpoint_valueerror_raising(): - with pytest.raises(ValueError, match=r"Provide endpoint name. Choose: 'responses' or 'non-responses'. Otherwise, provide URL"): + with pytest.raises( + ValueError, + match=r"Provide endpoint name. Choose: 'responses' or 'non-responses'. Otherwise, provide URL", + ): CG = CustomerGauge() + @pytest.mark.endpoint_credentialserror def test_credentialserror_raising(): - wrong_secret="wrong" + wrong_secret = "wrong" with pytest.raises(CredentialError, match=r"Credentials not provided."): CG = CustomerGauge(endpoint=ENDPOINT, credentials_secret=wrong_secret) + @pytest.mark.get_cursor_valueerror def test_get_cursor_valueerror_raising(): wrong_json = {} - with pytest.raises(ValueError, match=r"Provided argument doesn't contain 'cursor' value. Pass json returned from the endpoint."): + with pytest.raises( + ValueError, + match=r"Provided argument doesn't contain 'cursor' value. Pass json returned from the endpoint.", + ): CG = CustomerGauge(endpoint=ENDPOINT) - CG.get_cursor(json_response=wrong_json) \ No newline at end of file + CG.get_cursor(json_response=wrong_json) diff --git a/tests/integration/test_epicor.py b/tests/integration/test_epicor.py index 77c338a88..60c1f3410 100644 --- a/tests/integration/test_epicor.py +++ b/tests/integration/test_epicor.py @@ -1,5 +1,5 @@ -import pytest import pandas as pd +import pytest from viadot.config import local_config from viadot.exceptions import CredentialError, DataRangeError diff --git a/tests/integration/test_genesys.py b/tests/integration/test_genesys.py index 8508978f1..f91318b96 100644 --- a/tests/integration/test_genesys.py +++ b/tests/integration/test_genesys.py @@ -1,7 +1,7 @@ +import logging from unittest import mock import pytest -import logging from viadot.sources import Genesys diff --git a/tests/integration/test_hubspot.py b/tests/integration/test_hubspot.py index 5963df3ee..c3f303b4c 100644 --- a/tests/integration/test_hubspot.py +++ b/tests/integration/test_hubspot.py @@ -2,8 +2,8 @@ import pandas as pd import pytest -from viadot.exceptions import CredentialError +from viadot.exceptions import CredentialError from viadot.sources import Hubspot from viadot.task_utils import credentials_loader diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index 502ffded0..82090b6a5 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -1,17 +1,16 @@ import os import re +from copy import deepcopy import pandas as pd -from copy import deepcopy import pytest from prefect.tasks.secrets import PrefectSecret from viadot.config import local_config from viadot.exceptions import CredentialError -from viadot.sources import Sharepoint +from viadot.sources import Sharepoint, SharepointList from viadot.task_utils import df_get_data_types_task from viadot.tasks.sharepoint import SharepointToDF -from viadot.sources import SharepointList def get_url() -> str: diff --git a/tests/integration/test_tm1.py b/tests/integration/test_tm1.py index ae2b321b9..c0d887a61 100644 --- a/tests/integration/test_tm1.py +++ b/tests/integration/test_tm1.py @@ -1,8 +1,9 @@ import pandas as pd import pytest -from viadot.sources import 
TM1 + from viadot.config import local_config from viadot.exceptions import CredentialError, ValidationError +from viadot.sources import TM1 CUBE = local_config.get("TM1").get("test_cube") VIEW = local_config.get("TM1").get("test_view") diff --git a/tests/unit/test_task_utils.py b/tests/unit/test_task_utils.py index e77c24fdd..969b699a4 100644 --- a/tests/unit/test_task_utils.py +++ b/tests/unit/test_task_utils.py @@ -19,8 +19,8 @@ df_to_parquet, dtypes_to_json_task, union_dfs_task, - write_to_json, validate_df, + write_to_json, ) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index a94eaff9f..777617244 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -6,9 +6,9 @@ from viadot.signals import SKIP from viadot.utils import ( + add_viadot_metadata_columns, check_if_empty_file, gen_bulk_insert_query_from_df, - add_viadot_metadata_columns, ) EMPTY_CSV_PATH = "empty.csv" diff --git a/viadot/flows/__init__.py b/viadot/flows/__init__.py index e138735d6..2f30c04d8 100644 --- a/viadot/flows/__init__.py +++ b/viadot/flows/__init__.py @@ -11,7 +11,7 @@ from .genesys_to_adls import GenesysToADLS from .outlook_to_adls import OutlookToADLS from .salesforce_to_adls import SalesforceToADLS -from .sharepoint_to_adls import SharepointToADLS, SharepointListToADLS +from .sharepoint_to_adls import SharepointListToADLS, SharepointToADLS from .supermetrics_to_adls import SupermetricsToADLS from .supermetrics_to_azure_sql import SupermetricsToAzureSQL diff --git a/viadot/flows/customer_gauge_to_adls.py b/viadot/flows/customer_gauge_to_adls.py index 6af62a340..82e14d5b4 100644 --- a/viadot/flows/customer_gauge_to_adls.py +++ b/viadot/flows/customer_gauge_to_adls.py @@ -66,40 +66,40 @@ def __init__( Args: name (str): The name of the flow. - endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. + endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. Defaults to None. endpoint_url (str, optional): Full URL for pointing to specific endpoint. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', + total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). Defaults to True. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. start_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. end_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. 
- customer_gauge_credentials_secret (str, optional): The name of the Azure Key Vault secret containing + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. + customer_gauge_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. anonymize (bool, optional): Indicates if anonymize selected columns. Defaults to False. columns_to_anonymize (List[str], optional): List of columns to anonymize. Defaults to None. - anonymize_method (Literal["mask", "hash"], optional): Method of anonymizing data. "mask" -> replace the - data with "value" arg. "hash" -> replace the data with the hash value of an object (using `hash()` + anonymize_method (Literal["mask", "hash"], optional): Method of anonymizing data. "mask" -> replace the + data with "value" arg. "hash" -> replace the data with the hash value of an object (using `hash()` method). Defaults to "mask". anonymize_value (str, optional): Value to replace the data. Defaults to "***". - date_column (str, optional): Name of the date column used to identify rows that are older than a specified + date_column (str, optional): Name of the date column used to identify rows that are older than a specified number of days. Defaults to None. - days (int, optional): The number of days beyond which we want to anonymize the data, e.g. older than + days (int, optional): The number of days beyond which we want to anonymize the data, e.g. older than 2 years can be: 2*365. Defaults to None. output_file_extension (str, optional): Output file extension - to allow selection of .csv for data which is not easy to handle with parquet. Defaults to ".parquet". adls_dir_path (str, optional): Azure Data Lake destination folder/catalog path. Defaults to None. local_file_path (str, optional): Local destination path. Defaults to None. adls_file_name (str, optional): Name of file in ADLS. Defaults to None. - adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary - with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure + adls_sp_credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary + with ACCOUNT_NAME and Service Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the Azure Data Lake. Defaults to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to False. if_exists (str, optional): What to do if the file exists. Defaults to "replace". 
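The two unpack_* parameters documented above correspond to two different transformations of a raw Customer Gauge record. A minimal sketch of what each one does to a single record, simplified from the CustomerGaugeToDF task methods (error handling omitted) and consistent with the unit tests earlier in this series:

    def field_reference_unpack(record, field):
        # [{'field': 'City', 'reference': 'Eldorado'}, ...] -> {'City': 'Eldorado', ...}
        record[field] = {d["field"]: d["reference"] for d in record[field]}
        return record

    def nested_dict_transform(record, field):
        # [{'label': 'A'}, {'label': 'B'}] -> {'1_label': 'A', '2_label': 'B'}
        record[field] = {
            f"{i}_{key}": value
            for i, d in enumerate(record[field], start=1)
            for key, value in d.items()
        }
        return record

    record = {
        "number_customer": 266,
        "properties": [{"field": "City", "reference": "Eldorado"}],
        "drivers": [{"label": "Packaging"}, {"label": "Value for Money"}],
    }

    field_reference_unpack(record, "properties")
    nested_dict_transform(record, "drivers")
    # record is now:
    # {'number_customer': 266,
    #  'properties': {'City': 'Eldorado'},
    #  'drivers': {'1_label': 'Packaging', '2_label': 'Value for Money'}}
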
diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index eaf747bab..c9e131361 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -17,8 +17,7 @@ validate_df, ) from viadot.tasks import AzureDataLakeUpload -from viadot.tasks.sharepoint import SharepointToDF, SharepointListToDF - +from viadot.tasks.sharepoint import SharepointListToDF, SharepointToDF logger = logging.get_logger() diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 80253eb88..cff39fc89 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -18,8 +18,8 @@ dtypes_to_json_task, union_dfs_task, update_dtypes_dict, - write_to_json, validate_df, + write_to_json, ) from viadot.tasks import ( AzureDataLakeUpload, diff --git a/viadot/flows/transform_and_catalog.py b/viadot/flows/transform_and_catalog.py index 1de5c4430..08ac6b895 100644 --- a/viadot/flows/transform_and_catalog.py +++ b/viadot/flows/transform_and_catalog.py @@ -1,13 +1,13 @@ import os -from pathlib import Path import shutil +from pathlib import Path from typing import Dict, List, Union from prefect import Flow, task from prefect.tasks.shell import ShellTask from prefect.triggers import any_successful -from viadot.tasks import CloneRepo, AzureKeyVaultSecret, LumaIngest +from viadot.tasks import AzureKeyVaultSecret, CloneRepo, LumaIngest @task(trigger=any_successful) diff --git a/viadot/sources/bigquery.py b/viadot/sources/bigquery.py index 1be69e866..32d1dac2c 100644 --- a/viadot/sources/bigquery.py +++ b/viadot/sources/bigquery.py @@ -6,8 +6,8 @@ from ..config import local_config from ..exceptions import CredentialError, DBDataAccessError -from .base import Source from ..utils import add_viadot_metadata_columns +from .base import Source class BigQuery(Source): diff --git a/viadot/sources/customer_gauge.py b/viadot/sources/customer_gauge.py index 5fff4387e..819f92a90 100644 --- a/viadot/sources/customer_gauge.py +++ b/viadot/sources/customer_gauge.py @@ -32,7 +32,7 @@ def __init__( endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. Defaults to None. url (str, optional): Endpoint URL. Defaults to None. credentials (Dict[str, Any], optional): Credentials to connect with API containing client_id, client_secret. Defaults to None. - credentials_secret (str, optional): The name of the secret stored in local_config containing a + credentials_secret (str, optional): The name of the secret stored in local_config containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". Raises: ValueError: If endpoint is not provided or incorect. 
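A brief usage sketch of the CustomerGauge source whose docstring is reformatted above, assuming viadot is installed and a "CUSTOMER-GAUGE" secret with client_id/client_secret is available in local_config; it only mirrors behaviour asserted by the integration tests earlier in this series.

    from viadot.sources import CustomerGauge

    cg = CustomerGauge(endpoint="responses")   # or endpoint="non-responses"
    response = cg.get_json_response()          # dict containing "data" and "cursor"
    next_cursor = cg.get_cursor(json_response=response)

    # An unknown endpoint name is rejected up front:
    # CustomerGauge(endpoint="wrong-endpoint")
    # -> ValueError: Incorrect endpoint name. Choose: 'responses' or 'non-responses'
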
diff --git a/viadot/sources/mindful.py b/viadot/sources/mindful.py index 254eecb9d..2698adb15 100644 --- a/viadot/sources/mindful.py +++ b/viadot/sources/mindful.py @@ -1,12 +1,12 @@ import os -from io import StringIO from datetime import datetime, timedelta +from io import StringIO from typing import Any, Dict, Literal, Tuple import pandas as pd import prefect -from requests.models import Response from requests.auth import HTTPBasicAuth +from requests.models import Response from viadot.exceptions import APIError from viadot.sources.base import Source diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 096de825b..6e935eee2 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -1,20 +1,20 @@ -from ..config import local_config -from ..exceptions import CredentialError -from .base import Source -from viadot.utils import get_nested_dict - -from typing import Any, Dict, List -from fnmatch import fnmatch -from datetime import datetime from copy import deepcopy -import pandas as pd +from datetime import datetime +from fnmatch import fnmatch +from typing import Any, Dict, List +import pandas as pd import sharepy from office365.runtime.auth.authentication_context import AuthenticationContext -from office365.sharepoint.client_context import ClientContext from office365.runtime.client_request_exception import ClientRequestException +from office365.sharepoint.client_context import ClientContext from prefect.utilities import logging +from viadot.utils import get_nested_dict + +from ..config import local_config +from ..exceptions import CredentialError +from .base import Source logger = logging.get_logger() diff --git a/viadot/sources/tm1.py b/viadot/sources/tm1.py index 9a182bb97..fcb1dae7a 100644 --- a/viadot/sources/tm1.py +++ b/viadot/sources/tm1.py @@ -1,9 +1,8 @@ -import pandas as pd - from typing import Any, Dict, Literal -from TM1py.Services import TM1Service -from prefect.utilities import logging +import pandas as pd +from prefect.utilities import logging +from TM1py.Services import TM1Service from ..config import local_config from ..exceptions import CredentialError, ValidationError diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6173e2994..6a532f932 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -1,8 +1,8 @@ import copy import json import os -import shutil import re +import shutil from datetime import datetime, timedelta, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, List, Literal, Union, cast diff --git a/viadot/tasks/__init__.py b/viadot/tasks/__init__.py index 541be70ab..7dc3d61cd 100644 --- a/viadot/tasks/__init__.py +++ b/viadot/tasks/__init__.py @@ -31,7 +31,7 @@ from .outlook import OutlookToDF from .prefect_date_range import GetFlowNewDateRange from .salesforce import SalesforceBulkUpsert, SalesforceToDF, SalesforceUpsert -from .sharepoint import SharepointToDF, SharepointListToDF +from .sharepoint import SharepointListToDF, SharepointToDF from .sqlite import SQLiteInsert, SQLiteQuery, SQLiteSQLtoDF from .supermetrics import SupermetricsToCSV, SupermetricsToDF @@ -50,12 +50,12 @@ from .duckdb import DuckDBCreateTableFromParquet, DuckDBQuery, DuckDBToDF from .epicor import EpicorOrdersToDF from .eurostat import EurostatToDF +from .git import CloneRepo from .hubspot import HubspotToDF +from .luma import LumaIngest from .mediatool import MediatoolToDF from .mindful import MindfulToCSV from .sftp import SftpList, SftpToDF from .sql_server import SQLServerCreateTable, 
SQLServerQuery, SQLServerToDF -from .vid_club import VidClubToDF -from .git import CloneRepo -from .luma import LumaIngest from .tm1 import TM1ToDF +from .vid_club import VidClubToDF diff --git a/viadot/tasks/customer_gauge.py b/viadot/tasks/customer_gauge.py index 72a1a013f..ecb5e0de5 100644 --- a/viadot/tasks/customer_gauge.py +++ b/viadot/tasks/customer_gauge.py @@ -1,6 +1,6 @@ import json from datetime import datetime -from typing import Any, Dict, Literal, List +from typing import Any, Dict, List, Literal import pandas as pd from prefect import Task @@ -33,29 +33,29 @@ def __init__( **kwargs, ): """ - Task CustomerGaugeToDF for downloading the selected range of data from Customer Gauge + Task CustomerGaugeToDF for downloading the selected range of data from Customer Gauge endpoint and return as one pandas DataFrame. Args: - endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint + endpoint (Literal["responses", "non-responses"], optional): Indicate which endpoint to connect. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. + total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). Defaults to True. endpoint_url (str, optional): Endpoint URL. Defaults to None. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. - start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. + start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. + end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. - timeout (int, optional): The time (in seconds) to wait while running this task before + timeout (int, optional): The time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. """ self.endpoint = endpoint @@ -75,24 +75,26 @@ def __init__( *args, **kwargs, ) - def get_data(self, + + def get_data( + self, json_response: Dict[str, Any] = None, ) -> List[Dict[str, Any]]: """ Extract and return the 'data' part of a JSON response as a list of dictionaries. Args: - json_response (Dict[str, Any], optional): JSON object represented as a nested + json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary that contains data and cursor parameter value. Defaults to None. Raises: KeyError: If the 'data' key is not present in the provided JSON response. Returns: - List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' + List[Dict[str, Any]]: A list of dictionaries containing data from the 'data' part of the JSON response. 
""" - jsons_list=[] + jsons_list = [] try: jsons_list = json_response["data"] except KeyError: @@ -104,7 +106,7 @@ def get_data(self, return jsons_list def _field_reference_unpacker( - self, + self, json_response: Dict[str, Any], field: str, ) -> Dict[str, Any]: @@ -113,7 +115,7 @@ def _field_reference_unpacker( This function takes a JSON response and a field name. It processes dictionaries within the specified field, checking if each dictionary contains exactly two items. - If a dictionary meets this criteria, it is transformed into a new dictionary, + If a dictionary meets this criteria, it is transformed into a new dictionary, where the first key becomes a key, and the second key becomes its associated value Args: @@ -123,7 +125,7 @@ def _field_reference_unpacker( Returns: Dict[str, Any]: The JSON response with modified nested dictionaries within the specified field. - + Raises: ValueError: If a dictionary within the specified field doesn't contain exactly two items. """ @@ -134,14 +136,16 @@ def _field_reference_unpacker( list_properties = list(dictionary.values()) result[list_properties[0]] = list_properties[1] else: - raise ValueError(f"Dictionary within the specified field doesn't contain exactly two items.") + raise ValueError( + f"Dictionary within the specified field doesn't contain exactly two items." + ) if result: json_response[field] = result return json_response def _nested_dict_transformer( - self, + self, json_response: Dict[str, Any], field: str, ) -> Dict[str, Any]: @@ -160,49 +164,49 @@ def _nested_dict_transformer( Dict[str, Any]: The JSON response with modified nested dictionaries within the specified field. """ - result={} + result = {} try: for i, dictionary in enumerate(json_response[field], start=1): for key, value in dictionary.items(): - result[f'{i}_{key}'] = value + result[f"{i}_{key}"] = value if result: json_response[field] = result except TypeError as te: logger.error(te) return json_response - + def column_unpacker( - self, + self, json_list: List[Dict[str, Any]] = None, unpack_by_field_reference_cols: List[str] = None, unpack_by_nested_dict_transformer: List[str] = None, - ) -> List[Dict[str, Any]]: + ) -> List[Dict[str, Any]]: """ - Function to unpack and modify specific columns in a list of dictionaries by using one of two methods, - chosen by the user. - If user would like to use field_reference_unpacker, he/she needs to provide list of fields as strings in - `unpack_by_field_reference_cols` parameter, if user would like to use nested_dict_transformer he/she needs to provide list of - fields as strings in unpack_by_nested_dict_transformer parameter. + Function to unpack and modify specific columns in a list of dictionaries by using one of two methods, + chosen by the user. + If user would like to use field_reference_unpacker, he/she needs to provide list of fields as strings in + `unpack_by_field_reference_cols` parameter, if user would like to use nested_dict_transformer he/she needs to provide list of + fields as strings in unpack_by_nested_dict_transformer parameter. Args: json_list (List[Dict[str, Any]): A list of dictionaries containing the data. - unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. Defaults to None. - unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. 
+ unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. Raises: ValueError: If 'json_list' is not provided. ValueError: If specified columns do not exist in the JSON data. - ValueError: If columns are mentioned in both 'unpack_by_field_reference_cols' and 'unpack_by_nested_dict_transformer'. + ValueError: If columns are mentioned in both 'unpack_by_field_reference_cols' and 'unpack_by_nested_dict_transformer'. Returns: List[Dict[str, Any]]: The updated list of dictionaries after column unpacking and modification. """ duplicated_cols = [] - + if json_list is None: raise ValueError("Input 'json_list' is required.") @@ -210,49 +214,59 @@ def unpack_columns(columns, unpack_function): json_list_clean = json_list.copy() for field in columns: if field in json_list_clean[0]: - logger.info(f"Unpacking column '{field}' with {unpack_function.__name__} method...") + logger.info( + f"Unpacking column '{field}' with {unpack_function.__name__} method..." + ) try: - json_list_clean = list(map(lambda x: unpack_function(x, field), json_list_clean)) - logger.info(f"All elements in '{field}' are unpacked successfully.") + json_list_clean = list( + map(lambda x: unpack_function(x, field), json_list_clean) + ) + logger.info( + f"All elements in '{field}' are unpacked successfully." + ) except ValueError as ve: - logger.info(f"No transformation were made in '{field}'," - "because didn't contain list of key-value data.") + logger.info( + f"No transformation were made in '{field}'," + "because didn't contain list of key-value data." + ) except Exception as e: logger.info(f"Error while unpacking {field}: {e}") else: logger.info(f"Column '{field}' not found.") return json_list_clean + if unpack_by_field_reference_cols and unpack_by_nested_dict_transformer: - duplicated_cols = set(unpack_by_field_reference_cols).intersection(set(unpack_by_nested_dict_transformer)) + duplicated_cols = set(unpack_by_field_reference_cols).intersection( + set(unpack_by_nested_dict_transformer) + ) if duplicated_cols: raise ValueError( - f"{duplicated_cols} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer." + f"{duplicated_cols} were mentioned in both unpack_by_field_reference_cols and unpack_by_nested_dict_transformer." " It's not possible to apply two methods to the same field." - ) + ) else: if unpack_by_field_reference_cols is not None: json_list = unpack_columns( - columns = unpack_by_field_reference_cols, - unpack_function = self._field_reference_unpacker - ) + columns=unpack_by_field_reference_cols, + unpack_function=self._field_reference_unpacker, + ) if unpack_by_nested_dict_transformer is not None: json_list = unpack_columns( - columns = unpack_by_nested_dict_transformer, - unpack_function = self._nested_dict_transformer - ) - - return json_list + columns=unpack_by_nested_dict_transformer, + unpack_function=self._nested_dict_transformer, + ) + return json_list def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: """ - Function that flattens a nested structure of the JSON object into - a single-level dictionary. It uses a nested `flattify()` function to recursively + Function that flattens a nested structure of the JSON object into + a single-level dictionary. It uses a nested `flattify()` function to recursively combine nested keys in the JSON object with '_' to create the flattened keys. 
Args: - json_response (Dict[str, Any], optional): JSON object represented as + json_response (Dict[str, Any], optional): JSON object represented as a nested dictionary. Defaults to None. Raises: @@ -266,7 +280,7 @@ def flatten_json(self, json_response: Dict[str, Any] = None) -> Dict[str, Any]: if not isinstance(json_response, dict): raise TypeError("Input must be a dictionary.") - def flattify(field, key="", out = None): + def flattify(field, key="", out=None): if out is None: out = result @@ -279,16 +293,13 @@ def flattify(field, key="", out = None): flattify(json_response) return result - - def square_brackets_remover( - self, - df: pd.DataFrame = None - ) -> pd.DataFrame: + + def square_brackets_remover(self, df: pd.DataFrame = None) -> pd.DataFrame: """ Replace square brackets "[]" with an empty string in a pandas DataFrame. Args: - df (pd.DataFrame, optional): Replace square brackets "[]" with an empty string + df (pd.DataFrame, optional): Replace square brackets "[]" with an empty string in a pandas DataFrame. Defaults to None. Returns: @@ -298,11 +309,8 @@ def square_brackets_remover( df = df.astype(str) df = df.applymap(lambda x: x.strip("[]")) return df - - def _drivers_cleaner( - self, - drivers: str = None - ) -> str: + + def _drivers_cleaner(self, drivers: str = None) -> str: """ Clean and format the 'drivers' data. @@ -313,8 +321,13 @@ def _drivers_cleaner( str: A cleaned and formatted string of driver data. """ - cleaned_drivers = drivers.replace("{", "").replace("}", "").replace("'", "").replace("label: ", "") - + cleaned_drivers = ( + drivers.replace("{", "") + .replace("}", "") + .replace("'", "") + .replace("label: ", "") + ) + return cleaned_drivers def __call__(self): @@ -351,31 +364,31 @@ def run( vault_name: str = None, ) -> pd.DataFrame: """ - Run method. Downloading the selected range of data from Customer Gauge endpoint and return + Run method. Downloading the selected range of data from Customer Gauge endpoint and return as one pandas DataFrame. Args: - endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. + endpoint (Literal["responses", "non-responses"]): Indicate which endpoint to connect. Defaults to None. - total_load (bool, optional): Indicate whether to download the data to the latest. If + total_load (bool, optional): Indicate whether to download the data to the latest. If 'False', only one API call is executed (up to 1000 records). Defaults to True. endpoint_url (str, optional): Endpoint URL. Defaults to None. cursor (int, optional): Cursor value to navigate to the page. Defaults to None. - pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. + pagesize (int, optional): Number of responses (records) returned per page, max value = 1000. Defaults to 1000. - date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], + date_field (Literal["date_creation", "date_order", "date_sent", "date_survey_response"], optional): Specifies the date type which filter date range. Defaults to None. - start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. + start_date (datetime, optional): Defines the period end date in yyyy-mm-dd format. Defaults to None. - end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. + end_date (datetime, optional): Defines the period start date in yyyy-mm-dd format. Defaults to None. - unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. 
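# A self-contained sketch of the cleaning steps implemented above: flatten a
# nested record with '_'-joined keys, load it into pandas, strip the square
# brackets left by stringified lists, and clean a 'drivers'-style string.
# The sample record is illustrative only, not a real Customer Gauge response.
from typing import Any, Dict

import pandas as pd


def flatten(record: Any, key: str = "", out: Dict[str, Any] = None) -> Dict[str, Any]:
    # Same idea as `flatten_json`/`flattify` above: nested keys are joined with '_'.
    if out is None:
        out = {}
    if isinstance(record, dict):
        for name in record:
            flatten(record[name], key + name + "_", out)
    else:
        out[key[:-1]] = record
    return out


sample = {
    "contact": {"first_name": "Jane", "country": {"code": "PL"}},
    "tags": ["vip", "newsletter"],
}
flat = flatten(sample)
# {'contact_first_name': 'Jane', 'contact_country_code': 'PL', 'tags': ['vip', 'newsletter']}

df = pd.DataFrame([flat]).astype(str)
df = df.applymap(lambda x: x.strip("[]"))  # same trick as `square_brackets_remover`

drivers = "{'label': 'Price'} {'label': 'Delivery'}"
cleaned = (
    drivers.replace("{", "").replace("}", "").replace("'", "").replace("label: ", "")
)
print(df.iloc[0].to_dict())
print(cleaned)  # Price Delivery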
+ unpack_by_field_reference_cols (List[str]): Columns to unpack and modify using `_field_reference_unpacker`. + Defaults to None. + unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. Defaults to None. - unpack_by_nested_dict_transformer (List[str]): Columns to unpack and modify using `_nested_dict_transformer`. - Defaults to None. - credentials_secret (str, optional): The name of the Azure Key Vault secret containing a + credentials_secret (str, optional): The name of the Azure Key Vault secret containing a dictionary with ['client_id', 'client_secret']. Defaults to "CUSTOMER-GAUGE". - vault_name (str, optional): The name of the vault from which to obtain the secret. + vault_name (str, optional): The name of the vault from which to obtain the secret. Defaults to None. Returns: @@ -412,7 +425,7 @@ def run( if total_load == True: if cursor is None: logger.info( - f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint." + f"Downloading all the data from the {self.endpoint or self.endpoint_url} endpoint." "Process might take a few minutes..." ) else: @@ -426,9 +439,10 @@ def run( total_json += jsn clean_json = self.column_unpacker( - json_list = total_json, - unpack_by_field_reference_cols = unpack_by_field_reference_cols, - unpack_by_nested_dict_transformer = unpack_by_nested_dict_transformer) + json_list=total_json, + unpack_by_field_reference_cols=unpack_by_field_reference_cols, + unpack_by_nested_dict_transformer=unpack_by_nested_dict_transformer, + ) logger.info("Inserting data into the DataFrame...") df = pd.DataFrame(list(map(self.flatten_json, clean_json))) df = self.square_brackets_remover(df) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 428e699a0..bf69db2e0 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -11,9 +11,9 @@ from prefect.utilities import logging from prefect.utilities.tasks import defaults_from_attrs -from viadot.task_utils import * from viadot.exceptions import APIError from viadot.sources import Genesys +from viadot.task_utils import * logger = logging.get_logger() diff --git a/viadot/tasks/luma.py b/viadot/tasks/luma.py index 5b78ebc27..11eb91e45 100644 --- a/viadot/tasks/luma.py +++ b/viadot/tasks/luma.py @@ -1,5 +1,7 @@ import json + from prefect.tasks.shell import ShellTask + from .azure_key_vault import AzureKeyVaultSecret diff --git a/viadot/tasks/sap_bw.py b/viadot/tasks/sap_bw.py index acc92c246..0d8d7b2e3 100644 --- a/viadot/tasks/sap_bw.py +++ b/viadot/tasks/sap_bw.py @@ -1,12 +1,12 @@ import pandas as pd from prefect import Task from prefect.tasks.secrets import PrefectSecret -from viadot.tasks import AzureKeyVaultSecret from prefect.utilities import logging from viadot.exceptions import ValidationError from viadot.sources import SAPBW from viadot.task_utils import * +from viadot.tasks import AzureKeyVaultSecret logger = logging.get_logger() diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index 2a1cb0bc4..c4d670617 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -1,10 +1,10 @@ -from typing import List -import pandas as pd import copy import json import os import re +from typing import List +import pandas as pd from prefect import Task from prefect.tasks.secrets import PrefectSecret from prefect.utilities import logging @@ -12,8 +12,8 @@ from ..exceptions import ValidationError from ..sources import Sharepoint, SharepointList -from .azure_key_vault import AzureKeyVaultSecret from 
..utils import add_viadot_metadata_columns +from .azure_key_vault import AzureKeyVaultSecret logger = logging.get_logger() diff --git a/viadot/tasks/tm1.py b/viadot/tasks/tm1.py index 06b96ccd2..56d4401f0 100644 --- a/viadot/tasks/tm1.py +++ b/viadot/tasks/tm1.py @@ -1,7 +1,7 @@ -import pandas as pd +from typing import Any, Dict +import pandas as pd from prefect import Task -from typing import Any, Dict from prefect.utilities.tasks import defaults_from_attrs from ..sources import TM1 From b9bcdd1c2ea7c251fbf46982845a796fd448bc8b Mon Sep 17 00:00:00 2001 From: Jakub Burzec <125436423+burzekj@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:25:01 +0100 Subject: [PATCH 76/86] Update viadot/tasks/genesys.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: RafaÅ‚ Ziemianek <49795849+Rafalz13@users.noreply.github.com> --- viadot/tasks/genesys.py | 1 + 1 file changed, 1 insertion(+) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 96d5bdd03..187ba1150 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -385,6 +385,7 @@ def run( "agent_performance_summary_view", "agent_status_summary_view", "agent_status_detail_view", + "agent_interaction_detail_view", ]: genesys.genesys_api_connection( post_data_list=post_data_list, end_point=end_point From ce1cb0e5165df628e009476452643ae1082ea57d Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 15 Nov 2023 09:39:01 +0100 Subject: [PATCH 77/86] Typos, spelling and docstring update --- viadot/flows/sharepoint_to_adls.py | 36 +++--- viadot/sources/sharepoint.py | 174 ++++++++++++++--------------- viadot/tasks/sharepoint.py | 20 ++-- 3 files changed, 112 insertions(+), 118 deletions(-) diff --git a/viadot/flows/sharepoint_to_adls.py b/viadot/flows/sharepoint_to_adls.py index 6191317d0..9baab7d34 100644 --- a/viadot/flows/sharepoint_to_adls.py +++ b/viadot/flows/sharepoint_to_adls.py @@ -42,7 +42,7 @@ def __init__( if_exists: str = "replace", validate_df_dict: dict = None, timeout: int = 3600, - key_value_param: bool = False, + set_prefect_kv: bool = False, *args: List[any], **kwargs: Dict[str, Any], ): @@ -70,7 +70,7 @@ def __init__( dataframe. If defined, triggers the `validate_df` task from task_utils. Defaults to None. timeout(int, optional): The amount of time (in seconds) to wait while running this task before a timeout occurs. Defaults to 3600. - key_value_param (bool, optional): Wheter to do key-value parameters in KV Store or not. Defaults to False. + set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. 
""" # SharepointToDF self.if_empty = if_empty @@ -88,7 +88,7 @@ def __init__( self.adls_sp_credentials_secret = adls_sp_credentials_secret self.if_exists = if_exists self.output_file_extension = output_file_extension - self.key_value_param = key_value_param + self.set_prefect_kv = set_prefect_kv self.now = str(pendulum.now("utc")) if self.local_dir_path is not None: self.local_file_path = ( @@ -180,7 +180,7 @@ def gen_flow(self) -> Flow: file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - if self.key_value_param == True: + if self.set_prefect_kv == True: set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod @@ -207,11 +207,14 @@ def __init__( overwrite_adls: bool = True, output_file_extension: str = ".parquet", validate_df_dict: dict = None, - key_value_param: bool = False, + set_prefect_kv: bool = False, *args: List[any], **kwargs: Dict[str, Any], ): - """_summary_ + """ + Flow for ingesting sharepoint list items(rows) with a given (or all) columns. + It allows to filter the output by column values. + Data is ingested from MS Sharepoint list (with given name and url ) and stored in MS Azure ADLS. Args: name (str): Prefect flow name. @@ -220,9 +223,9 @@ def __init__( path (str): Local file path. Default to None. adls_dir_path (str): Azure Data Lake destination folder/catalog path. Defaults to None. adls_file_name (str): Name of file in ADLS. Defaults to None. - filters (dict, optional): Dictionary with operators which filters the SharepointList output. + filters (dict, optional): Dictionary with operators which filters the SharepointList output. Defaults to None. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') - allowed conjuction: ('&','|') + allowed conjunction: ('&','|') allowed operators: ('<','>','<=','>=','==','!=') Example how to build the dict: filters = { @@ -233,8 +236,8 @@ def __init__( 'value2':'YYYY-MM-DD', 'operator1':'>=', 'operator2':'<=', - 'operators_conjuction':'&', # conjuction operators allowed only when 2 values passed - 'filters_conjuction':'&', # conjuction filters allowed only when 2 columns passed + 'operators_conjunction':'&', # conjunction operators allowed only when 2 values passed + 'filters_conjunction':'&', # conjunction filters allowed only when 2 columns passed } , 'Column_name_2' : @@ -244,13 +247,12 @@ def __init__( 'operator1':'==', }, } - Defaults to None. required_fields (List[str], optional): Required fields(columns) need to be extracted from Sharepoint List. Defaults to None. field_property (str, optional): Property to expand fields with expand query method. For example: User fields could be expanded and "Title" or "ID" could be extracted - -> usefull to get user name instead of ID + -> useful to get user name instead of ID All properties can be found under list.item.properties. WARNING! Field types and properties might change which could lead to errors - extension of sp connector would be required. @@ -262,9 +264,9 @@ def __init__( If not passed it will take cred's from your .config/credentials.json Default to None. vault_name (str, optional): KeyVaultSecret name. Default to None. overwrite_adls (bool, optional): Whether to overwrite files in the lake. Defaults to True. - output_file_extension (str, optional): _description_. Defaults to ".parquet". - validate_df_dict (dict, optional): Wheter to do an extra df validation before ADLS upload or not to do. Defaults to None. 
- key_value_param (bool, optional): Wheter to do key-value parameters in KV Store or not. Defaults to False. + output_file_extension (str, optional): Extension of the resulting file to be stored. Defaults to ".parquet". + validate_df_dict (dict, optional): Whether to do an extra df validation before ADLS upload or not to do. Defaults to None. + set_prefect_kv (bool, optional): Whether to do key-value parameters in KV Store or not. Defaults to False. Returns: .parquet file inside ADLS. @@ -288,7 +290,7 @@ def __init__( self.overwrite = overwrite_adls self.adls_sp_credentials_secret = adls_sp_credentials_secret self.output_file_extension = output_file_extension - self.key_value_param = key_value_param + self.set_prefect_kv = set_prefect_kv self.now = str(pendulum.now("utc")) if self.path is not None: self.local_file_path = ( @@ -379,7 +381,7 @@ def gen_flow(self) -> Flow: file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) - if self.key_value_param == True: + if self.set_prefect_kv == True: set_key_value(key=self.adls_dir_path, value=self.adls_file_path) @staticmethod diff --git a/viadot/sources/sharepoint.py b/viadot/sources/sharepoint.py index 7f1bc523c..832633806 100644 --- a/viadot/sources/sharepoint.py +++ b/viadot/sources/sharepoint.py @@ -84,27 +84,17 @@ def download_file( class SharepointList(Source): - """ - A Sharepoint_List class to connect and download data from Sharepoint lists. - Warning! - Please be carefull with selection of the column names because once sharepoint list is opened inside a browser it may display columns in different languages. - Because of that the resulting file or output might have different column names then the one which u see in the browser. - Args: - credentials (dict): Credentials should include: - - "tenant" - - "client_id" - - "scopes" - - "thumbprint" - - "private_key" - """ - def __init__( self, credentials: Dict[str, Any] = None, *args, **kwargs, ): - """_summary_ + """ + A Sharepoint_List class to connect and download data from Sharepoint lists. + Warning! + Please be careful with selection of the column names because once sharepoint list is opened inside a browser it may display columns in different languages. + Because of that the resulting file or output might have different column names then the one which u see in the browser. Args: credentials (Dict[str, Any], optional): Credentials should include: @@ -115,8 +105,8 @@ def __init__( - "private_key" Raises: - CredentialError: If no credentials are pased - and local config doesn't contain them neiter + CredentialError: If no credentials are passed + and local config doesn't contain them neither """ DEFAULT_CREDENTIALS = local_config.get("SHAREPOINT_CERT") credentials = credentials or DEFAULT_CREDENTIALS @@ -126,15 +116,15 @@ def __init__( super().__init__(*args, credentials=credentials, **kwargs) def get_connection(self, site_url: str): - """Function for connecting into Sharepoint with AuthenticationContext + """Function for connecting into Sharepoint with AuthenticationContext. Args: - site_url (str): url of the sharepoint list + site_url (str): URL of the sharepoint list. Returns: - ctx: authentication context + ctx: Authentication context. 
""" - logger.info("Connecting into Sharepoint with AuthenticationContexts") + logger.info("Connecting into Sharepoint with AuthenticationContexts.") try: auth_context = AuthenticationContext(site_url) auth_context.with_client_certificate( @@ -158,7 +148,7 @@ def _unpack_fields( list_item, selected_fields: dict, ) -> dict: - """Function for extracting and unpacking list items from the search fields + """Function for extracting and unpacking list items from the search fields. Args: list_items (office365 list item): A list with office365 list item objects (rows) @@ -166,15 +156,15 @@ def _unpack_fields( Raises: ValueError: "Check if given field property is valid!" - ValueError: "Get nested dict for not recognized type of field! Check field types in the source" - ValueError: "Get empty properties for list items" + ValueError: "Get nested dict for not recognized type of field! Check field types in the source." + ValueError: "Get empty properties for list items." Returns: - dict: A dictionary with Column: Value pairs for each row from the list + dict: A dictionary with Column: Value pairs for each row from the list. """ # Creating the body of dictionary new_dict = dict() - # For loop scanning the propertys of searching fields + # For loop scanning the properties of searching fields item_values_dict = list_item.properties if item_values_dict: for field, val in item_values_dict.items(): @@ -196,13 +186,13 @@ def _unpack_fields( new_dict[field] = ";".join(nested_dict.values()) else: raise ValueError( - "Get nested dict for not recognized type of field! Check field types in the source" + "Get nested dict for not recognized type of field! Check field types in the source." ) else: new_dict[field] = val else: raise ValueError( - "Get empty properties for list items. Check if parameter list_item collection containes any data -> item objects." + "Get empty properties for list items. Check if parameter list_item collection contains any data -> item objects." ) return new_dict @@ -213,17 +203,17 @@ def get_fields( required_fields: List[str] = None, ) -> List: """ - Function for geting list of fields objects from the sharepoint list. + Function for getting list of fields objects from the sharepoint list. It can get all fields available if required_fields not passed or just the one which are in the list required_fields. Args: - list_title (str): name of the sharepoint list - site_url (str): url to the sharepoint list with "/" at the end + list_title (str): Name of the sharepoint list. + site_url (str): URL to the sharepoint list with "/" at the end. required_fields (List[str], optional ): List of required fields to ingest. It will get all fields if not passed. Returns: - List: list with office365 sharepoint list field objects + List: List with office365 sharepoint list field objects. """ ctx = self.get_connection(site_url=site_url) @@ -263,9 +253,9 @@ def select_fields( -> more properties can be discovered by getting list.item.properties. Args: - list_title (str): _description_. Defaults to None. - site_url (str): _description_. Defaults to None. - required_fields (List[str], optional): _description_. Defaults to None. + list_title (str): A title of the sharepoint list. Defaults to None. + site_url (str): A sharepoint list URL. Defaults to None. + required_fields (List[str], optional): List of fields(columns) to be ingested. Defaults to None. field_property (str, optional): Property to extract from nested fields like column with type User*. Defaults to "Title". 
@@ -318,10 +308,10 @@ def check_filters( Function to check if filters dict is valid. Please check and apply only allowed filter settings: allowed_dtypes = ["datetime", "date", "bool", "int", "float", "complex", "str"] - allowed_conjuction = ["&", "|"] + allowed_conjunction = ["&", "|"] allowed_operators = ["<", ">", "<=", ">=", "==", "!="] - Operator conjuction is only possible if there are 2 values like: value <= 1 | value == 5 - Filter conjuction is only possible if there are more then 1 filters for ex. date and creator + Operator conjunction is only possible if there are 2 values like: value <= 1 | value == 5 + Filter conjunction is only possible if there are more then 1 filters for ex. date and creator Args: filters (dict): A dictionary containing filter settings @@ -333,8 +323,8 @@ def check_filters( "value2": today_date, "operator1": ">=", "operator2": "<=", - "operators_conjuction": "&", - "filters_conjuction": "&", + "operators_conjunction": "&", + "filters_conjunction": "&", }, "Factory": { "dtype": "str", @@ -344,25 +334,25 @@ def check_filters( } Raises: - ValueError: If dtype not in allowed list - ValueError: If comparison operator1 not in allowed list - ValueError: If value for operator1 is missing - ValueError: If comparison operator1 for the first value is missing - ValueError: If comparison operator2 not in allowed list - ValueError: If value for operator2 is missing - ValueError: If comparison operator2 for the first value is missing - ValueError: If operator conjuction is missing while there are 2 values and 2 operators passed - ValueError: If operator conjuction is not in the allowed list - ValueError: If operator conjuction provided why only one filter value is given - ValueError: If filter conjuction provided without 2nd filter - ValueError: If filter conjuction not in the allowed list + ValueError: If dtype not in allowed list. + ValueError: If comparison operator1 not in allowed list. + ValueError: If value for operator1 is missing. + ValueError: If comparison operator1 for the first value is missing. + ValueError: If comparison operator2 not in allowed list. + ValueError: If value for operator2 is missing. + ValueError: If comparison operator2 for the first value is missing. + ValueError: If operator conjunction is missing while there are 2 values and 2 operators passed. + ValueError: If operator conjunction is not in the allowed list. + ValueError: If operator conjunction provided why only one filter value is given. + ValueError: If filter conjunction provided without 2nd filter. + ValueError: If filter conjunction not in the allowed list. Returns: - bool: True if all checks passed + bool: True if all checks passed. """ allowed_dtypes = ["datetime", "date", "bool", "int", "float", "complex", "str"] - allowed_conjuction = ["&", "|"] + allowed_conjunction = ["&", "|"] allowed_operators = ["<", ">", "<=", ">=", "==", "!="] for filter_name, parameters in filters.items(): @@ -383,10 +373,10 @@ def check_filters( raise ValueError("Operator1 is missing!") if ( not parameters.get("operator2") - and parameters.get("operators_conjuction") is not None + and parameters.get("operators_conjunction") is not None ): raise ValueError( - f"Operator conjuction allowed only with more then one filter operator!" + f"Operator conjunction allowed only with more then one filter operator!" 
) if parameters.get("operator2"): if parameters.get("operator2") not in allowed_operators: @@ -395,25 +385,25 @@ def check_filters( ) if not parameters.get("value2"): raise ValueError("Value2 for operator2 is missing!") - if not parameters.get("operators_conjuction"): + if not parameters.get("operators_conjunction"): raise ValueError( - f"Operator for conjuction is missing! Expected: {allowed_conjuction} got empty." + f"Operator for conjunction is missing! Expected: {allowed_conjunction} got empty." ) - if parameters.get("operators_conjuction") not in allowed_conjuction: + if parameters.get("operators_conjunction") not in allowed_conjunction: raise ValueError( - f"Operator for conjuction not allowed! Expected: {allowed_conjuction} got {parameters.get('operators_conjuction')} ." + f"Operator for conjunction not allowed! Expected: {allowed_conjunction} got {parameters.get('operators_conjunction')} ." ) - if parameters.get("filters_conjuction"): + if parameters.get("filters_conjunction"): if ( len(filters.keys()) == 1 - and parameters.get("filters_conjuction") is not None + and parameters.get("filters_conjunction") is not None ): raise ValueError( - f"Filters conjuction allowed only when more then one filter provided!" + f"Filters conjunction allowed only when more then one filter provided!" ) - if parameters.get("filters_conjuction") not in allowed_conjuction: + if parameters.get("filters_conjunction") not in allowed_conjunction: raise ValueError( - f"Filter operator for conjuction not allowed! Expected: {allowed_conjuction} got {parameters.get('filters_conjuction')} ." + f"Filter operator for conjunction not allowed! Expected: {allowed_conjunction} got {parameters.get('filters_conjunction')} ." ) return True @@ -423,7 +413,7 @@ def operators_mapping( filters: dict, ) -> dict: """ - Function for mapping comparison and conjuction(logical) operators of filters to the format which is recognized by Microsoft API. + Function for mapping comparison and conjunction(logical) operators of filters to the format which is recognized by Microsoft API. Allowed operators: < > @@ -438,10 +428,10 @@ def operators_mapping( filters (dict): A dictionary which contains operators. Raises: - ValueError: If operator1 not allowed - ValueError: If operator2 not allowed - ValueError: If operators conjuction not allowed - ValueError: If filters conjuction not allowed + ValueError: If operator1 not allowed. + ValueError: If operator2 not allowed. + ValueError: If operators conjunction not allowed. + ValueError: If filters conjunction not allowed. Returns: dict: New modified dict with mapped operators. @@ -475,23 +465,23 @@ def operators_mapping( raise ValueError( f"This comparison operator: {operator2_to_change} is not allowed. Please read the function documentation for details!" ) - if parameters.get("operators_conjuction"): - logical_op_to_change = parameters.get("operators_conjuction") + if parameters.get("operators_conjunction"): + logical_op_to_change = parameters.get("operators_conjunction") if logical_op_to_change in logical_op.keys(): - parameters["operators_conjuction"] = logical_op[ + parameters["operators_conjunction"] = logical_op[ logical_op_to_change ] else: raise ValueError( - f"This conjuction (logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!" + f"This conjunction (logical) operator: {logical_op_to_change} is not allowed. Please read the function documentation for details!" 
) - if parameters.get("filters_conjuction"): - logical_fl_to_change = parameters.get("filters_conjuction") + if parameters.get("filters_conjunction"): + logical_fl_to_change = parameters.get("filters_conjunction") if logical_fl_to_change in logical_op.keys(): - parameters["filters_conjuction"] = logical_op[logical_fl_to_change] + parameters["filters_conjunction"] = logical_op[logical_fl_to_change] else: raise ValueError( - f"This filters conjuction (logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!" + f"This filters conjunction (logical) operator: {logical_fl_to_change} is not allowed. Please read the function documentation for details!" ) return filters_dict @@ -501,7 +491,7 @@ def make_filter_for_api(self, filters: dict) -> str: Function changing type of operators to match MS API style as 'str' passing to URL call. Args: - filters (dict): A dictionar which contains operators. + filters (dict): A dictionary which contains operators. Returns: str: Output as filtering string to pass as filter parameter to API. @@ -525,7 +515,7 @@ def make_filter_for_api(self, filters: dict) -> str: ).isoformat() filter_text = ( filter_text - + f" {parameters.get('operators_conjuction')} {column} {parameters.get('operator2')} datetime'{from_date2}' " + + f" {parameters.get('operators_conjunction')} {column} {parameters.get('operator2')} datetime'{from_date2}' " ) elif parameters.get("dtype") not in ["datetime", "date"]: filter_text = ( @@ -537,8 +527,8 @@ def make_filter_for_api(self, filters: dict) -> str: filter_text + f"{column} {parameters.get('operator2')} '{parameters.get('value2')}'" ) - if parameters.get("filters_conjuction"): - filter_text = filter_text + f"{parameters.get('filters_conjuction')} " + if parameters.get("filters_conjunction"): + filter_text = filter_text + f"{parameters.get('filters_conjunction')} " return filter_text @@ -567,11 +557,13 @@ def make_filter_for_df( if parameters.get("operator2"): filter_in_df = ( filter_in_df - + f") {parameters.get('operators_conjuction')} (df.{column} {parameters.get('operator2', '')} '{parameters.get('value2', '')}'" + + f") {parameters.get('operators_conjunction')} (df.{column} {parameters.get('operator2', '')} '{parameters.get('value2', '')}'" ) - if parameters.get("filters_conjuction"): - filter_in_df = filter_in_df + ")" + parameters.get("filters_conjuction") + if parameters.get("filters_conjunction"): + filter_in_df = ( + filter_in_df + ")" + parameters.get("filters_conjunction") + ) else: filter_in_df = filter_in_df + ")" @@ -601,11 +593,11 @@ def list_item_to_df( required_fields (List[str]): Required fields(columns) need to be extracted from Sharepoint List. Default to None. field_property (List[str]): Property to expand with expand query method. - All propertys can be found under list.item.properties. + All properties can be found under list.item.properties. Default to ["Title"] filters (dict): Dictionary with operators which filters the SharepointList output. 
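# A small sketch of the operator handling shown above. `operators_mapping` first
# rewrites the user-facing symbols into tokens the Microsoft API understands
# (the mapping mirrors the pattern in this class's unit tests; logical '&' and '|'
# map to 'and'/'or' the same way), and for non-date columns the API filter is then
# rendered as e.g. "Status eq 'Closed'". The filter dict below is illustrative only.
comparison_map = {"<": "lt", ">": "gt", "<=": "le", ">=": "ge", "==": "eq", "!=": "ne"}

filters = {
    "Status": {"dtype": "str", "operator1": "==", "value1": "Closed"},
}

# Step 1: map the comparison operator (what `operators_mapping` does).
mapped = {
    column: {**params, "operator1": comparison_map[params["operator1"]]}
    for column, params in filters.items()
}

# Step 2: render the API-style filter string for a non-date column,
# following the same pattern as `make_filter_for_api`.
filter_text = " ".join(
    f"{column} {params['operator1']} '{params['value1']}'"
    for column, params in mapped.items()
)
print(filter_text)  # Status eq 'Closed'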
allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') - allowed conjuction: ('&','|') + allowed conjunction: ('&','|') allowed operators: ('<','>','<=','>=','==','!=') Example how to build the dict: filters = { @@ -616,8 +608,8 @@ def list_item_to_df( 'value2':'YYYY-MM-DD', 'operator1':'>=', 'operator2':'<=', - 'operators_conjuction':'&', - 'filters_conjuction':'&', + 'operators_conjunction':'&', + 'filters_conjunction':'&', } , 'Column_name_2' : @@ -675,7 +667,7 @@ def list_item_to_df( self.ctx.execute_query() except (ClientRequestException, ValueError) as e: - # Extract all data from specific SP List without basic filtering. Additional logic for filtering applied on DataFreame level. + # Extract all data from specific SP List without basic filtering. Additional logic for filtering applied on DataFrame level. logger.info(f"Exception SPQueryThrottledException occurred: {e}") list_items = ( self.list_object.items.get_all(row_count, log_of_progress) @@ -691,7 +683,7 @@ def list_item_to_df( ) if download_all == True and filters is not None: - # Filter for desired range of created date and for factory Namyslow PL + # Apply filters to the data frame -> accordingly to the filter dict passed as na parameter self.logger.info("Filtering df with all data output") filter_for_df = self.make_filter_for_df(filters) df = eval(filter_for_df) diff --git a/viadot/tasks/sharepoint.py b/viadot/tasks/sharepoint.py index 635f9a5ae..f87134bb7 100644 --- a/viadot/tasks/sharepoint.py +++ b/viadot/tasks/sharepoint.py @@ -243,11 +243,11 @@ class SharepointListToDF(Task): required_fields (List[str]): Required fields(columns) need to be extracted from Sharepoint List. Default to None. field_property (List[str]): Property to expand with expand query method. - All propertys can be found under list.item.properties. + All properties can be found under list.item.properties. Default to ["Title"] - filters (dict, optional): Dictionary with operators which filters the SharepointList output. + filters (dict, optional): Dictionary with operators which filters the SharepointList output. Default to None. allowed dtypes: ('datetime','date','bool','int', 'float', 'complex', 'str') - allowed conjuction: ('&','|') + allowed conjunction: ('&','|') allowed operators: ('<','>','<=','>=','==','!=') Example how to build the dict: filters = { @@ -258,8 +258,8 @@ class SharepointListToDF(Task): 'value2':'YYYY-MM-DD', 'operator1':'>=', 'operator2':'<=', - 'operators_conjuction':'&', - 'filters_conjuction':'&', + 'operators_conjunction':'&', + 'filters_conjunction':'&', } , 'Column_name_2' : @@ -329,7 +329,7 @@ def _rename_duplicated_fields(self, df): It might happen that fields returned by get_fields() will be different than actual list items fields ( from it's properties) It is specific to sharepoint lists. - MS allowed users to create fields with simillar names (but with different letters style) + MS allowed users to create fields with similar names (but with different letters style) fields with same values. For example Id and ID - > office select function doesn't recognize upper/lower cases. @@ -368,13 +368,13 @@ def _rename_duplicated_fields(self, df): def _convert_camel_case_to_words(self, input_str: str) -> str: """ - Function for converting internal names joined as camelCase column names to regular words + Function for converting internal names joined as camelCase column names to regular words. Args: - input_str (str): Column name + input_str (str): Column name. 
Returns: - str: Converted column name + str: Converted column name. """ self.input_str = input_str @@ -396,7 +396,7 @@ def change_column_name(self, df: pd.DataFrame, credentials: str = None): credentials (str): Credentials str for sharepoint connection establishing. Defaults to None. Returns: - pd.DataFrame: Data frame with changed column names + pd.DataFrame: Data frame with changed column names. """ s = SharepointList( credentials=self.credentials, From 7d4941929984cbadbf982523fe5b942f19ecbcef Mon Sep 17 00:00:00 2001 From: mgwinner Date: Wed, 15 Nov 2023 09:58:32 +0100 Subject: [PATCH 78/86] =?UTF-8?q?=E2=9C=A8=20Add=20new=20requirements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 1 + requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8a63ab1..a71c3496e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `TM1` source class. - Added `TM1ToDF` task class. - Added `set_prefect_kv` parameter to `BigQueryToADLS` with `False` as a default. If there is a need to create new pair in KV Store the parameter can be changed to `True`. +- Added libraries `nltk` and `sklearn` to `requirements`. ### Fixed diff --git a/requirements.txt b/requirements.txt index 896b11d1a..4d6c3a15f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,3 +44,5 @@ dbt-sqlserver==1.3.1 lumaCLI==0.0.19 Office365-REST-Python-Client==2.4.4 TM1py==1.11.3 +nltk==3.8.1 +scikit-learn==1.3.2 \ No newline at end of file From e1e49df7f8c5babdd885e2be8e22cd1c416002c8 Mon Sep 17 00:00:00 2001 From: burzekj Date: Wed, 15 Nov 2023 10:17:48 +0100 Subject: [PATCH 79/86] =?UTF-8?q?=E2=9C=85=20=20Added=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/tasks/test_task_utils.py | 51 +++++++++++++++++++++- viadot/task_utils.py | 25 +++++++++++ viadot/tasks/genesys.py | 13 +----- 3 files changed, 76 insertions(+), 13 deletions(-) diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py index f22d55022..63887d40a 100644 --- a/tests/integration/tasks/test_task_utils.py +++ b/tests/integration/tasks/test_task_utils.py @@ -3,7 +3,7 @@ from prefect.engine.state import Failed, Success from prefect.tasks.secrets import PrefectSecret -from viadot.task_utils import custom_mail_state_handler, set_new_kv +from viadot.task_utils import custom_mail_state_handler, set_new_kv, check_value def test_custom_state_handler(): @@ -28,3 +28,52 @@ def test_set_new_kv(): result = get_key_value("test_for_setting_kv") assert result == "72" set_key_value(key="test_for_setting_kv", value=None) + + +# Sample test checking the correctness of the function when the key is found +def test_check_value_found(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "searched_phrase": "phrase" + } + } + } + } + result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) + assert result == "phrase" + +# Sample test checking the correctness of the function when the key is not found +def test_check_value_not_found(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "other_phrase": "This won't be found" + } + } + } + } + result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], 
["searched_phrase"]) + assert result is None + +# Sample test checking the correctness of the function with an empty dictionary +def test_check_value_empty_dict(): + json_data = {} + result = check_value(json_data, ["searched_phrase"]) + assert result is None + +# Sample test checking the correctness of the function with a nonexistent key +def test_check_value_nonexistent_key(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "searched_phrase": "phrase" + } + } + } + } + result = check_value(json_data, ["nonexistent_key"]) + assert result is None \ No newline at end of file diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 6173e2994..41494a929 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -792,3 +792,28 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: raise ValidationError( f"Validation failed for {failed_tests} test/tests: {failed_tests_msg}" ) + + +def check_value(base, lvls: List): + """ + Task to extract data from nested json file if there is any under passed parameters. + Otherwise return None. + + Args: + base: variable with base lvl of the json, fo example: + json_file["first_known_lvl"]["second_known_lvl"]["third_known_lvl"] + lvls (List): List of potential lower levels of nested json for data retrieval. For example: + ["first_lvl_below_base", "second_lvl_below_base", "searched_phrase"] + + Return: + Searched value for the lowest level, in example data under "searched_phrase" key. + """ + + for lvl in lvls: + if isinstance(base, dict): + base = base.get(lvl) + if base is None: + return None + else: + return base + return base \ No newline at end of file diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 187ba1150..7986d9b0c 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -520,20 +520,9 @@ def run( ) last_page = temp_json["pageCount"] + 1 - # Function to extract nested data from json file - def check_value(base, lvls): - for lvl in lvls: - if isinstance(base, dict): - base = base.get(lvl) - if base is None: - return None - else: - return base - return base - data_list = [] - # For loop to donwload all pages from Genesys GET API + # For loop to download all pages from Genesys GET API for n in range(1, last_page): json_file = genesys.genesys_api_connection( post_data_list=post_data_list, From 1e37dcd8eb099173812ed2b0462c520a4c0c7e48 Mon Sep 17 00:00:00 2001 From: Marcin Purtak <44641138+marcinpurtak@users.noreply.github.com> Date: Wed, 15 Nov 2023 10:20:13 +0100 Subject: [PATCH 80/86] Update CHANGELOG.md --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fd6fedf8..4201189c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,9 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 -> docstrings update - Modified `SharepointToADLS` flow class: -> docstrings update - -> changed key_value_param: bool = False to prevent forced KV store append + -> changed set_prefect_kv: bool = False to prevent forced KV store append - Modified `SharepointListToADLS` flow class: - -> changed key_value_param: bool = False to prevent forced KV store append + -> changed set_prefect_kv: bool = False to prevent forced KV store append - Modified `SharepointList` source class: -> docstrings update -> Changed `_unpack_fields` method to handle Sharepoint MultiChoiceField type + small improvements @@ -662,4 +662,4 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret - Moved from 
poetry to pip ### Fixed -- Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part \ No newline at end of file +- Fix `AzureBlobStorage`'s `to_storage()` method is missing the final upload blob part From f31406705eea2196603b8c13727e5d84af66379f Mon Sep 17 00:00:00 2001 From: burzekj Date: Wed, 15 Nov 2023 10:41:27 +0100 Subject: [PATCH 81/86] =?UTF-8?q?=F0=9F=8E=A8=20output=20dataframe=20corre?= =?UTF-8?q?ctions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/tasks/genesys.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index cf470c170..4268e830e 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -490,11 +490,24 @@ def run( temp_dict["conversationId"] = json_file.get("id") temp_dict["startTime"] = json_file.get("startTime") temp_dict["endTime"] = json_file.get("endTime") - data_list.append(temp_dict) + desired_order = [ + "startTime", + "endTime", + "LOB", + "CustomerOutcomeResult", + "CustomerOutcomeTrack", + "LastUtterance", + "Final Sub Intent", + "SubIntent", + "Final Main Intent", + "conversationId", + ] + df = pd.DataFrame(data_list) - df = df[df.columns[-1:]].join(df[df.columns[:-1]]) + df = df[desired_order] + df.rename(columns={"LastUtterance": "CustomerTextInput"}, inplace=True) start = start_date.replace("-", "") end = end_date.replace("-", "") From 82cbd3f3409343f7bc18d0e94870cd94518d5b3c Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 15 Nov 2023 11:12:32 +0100 Subject: [PATCH 82/86] Fixed typo in tests --- tests/integration/test_sharepoint.py | 52 ++++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/integration/test_sharepoint.py b/tests/integration/test_sharepoint.py index dbbbf1e70..c45925ccb 100644 --- a/tests/integration/test_sharepoint.py +++ b/tests/integration/test_sharepoint.py @@ -240,7 +240,7 @@ def test_filters_missing_value1(sharepoint_list): sharepoint_list.check_filters(filters) -def test_filters_missing_operators_conjuction(sharepoint_list): +def test_filters_missing_operators_conjunction(sharepoint_list): filters = { "filter1": { "dtype": "int", @@ -253,13 +253,13 @@ def test_filters_missing_operators_conjuction(sharepoint_list): with pytest.raises( ValueError, match=re.escape( - "Operator for conjuction is missing! Expected: ['&', '|'] got empty." + "Operator for conjunction is missing! Expected: ['&', '|'] got empty." ), ): sharepoint_list.check_filters(filters) -def test_filters_invalid_operators_conjuction(sharepoint_list): +def test_filters_invalid_operators_conjunction(sharepoint_list): filters = { "filter1": { "dtype": "int", @@ -267,43 +267,43 @@ def test_filters_invalid_operators_conjuction(sharepoint_list): "value1": 10, "operator2": "<", "value2": 20, - "operators_conjuction": "!", + "operators_conjunction": "!", }, } with pytest.raises( ValueError, match=re.escape( - "Operator for conjuction not allowed! Expected: ['&', '|'] got ! ." + "Operator for conjunction not allowed! Expected: ['&', '|'] got ! ." 
), ): sharepoint_list.check_filters(filters) -def test_filters_conjuction_not_allowed(sharepoint_list): +def test_filters_conjunction_not_allowed(sharepoint_list): filters = { "filter1": { "dtype": "int", "operator1": ">", "value1": 10, - "filters_conjuction": "!", + "filters_conjunction": "!", }, } with pytest.raises( ValueError, match=re.escape( - "Filters conjuction allowed only when more then one filter provided!" + "Filters conjunction allowed only when more then one filter provided!" ), ): sharepoint_list.check_filters(filters) -def test_filters_invalid_conjuction(sharepoint_list): +def test_filters_invalid_conjunction(sharepoint_list): filters = { "filter1": { "dtype": "int", "value1": 10, "operator1": ">", - "filters_conjuction": "!", + "filters_conjunction": "!", }, "filter2": { "dtype": "int", @@ -313,7 +313,7 @@ def test_filters_invalid_conjuction(sharepoint_list): with pytest.raises( ValueError, match=re.escape( - "Filter operator for conjuction not allowed! Expected: ['&', '|'] got ! ." + "Filter operator for conjunction not allowed! Expected: ['&', '|'] got ! ." ), ): sharepoint_list.check_filters(filters) @@ -327,8 +327,8 @@ def test_valid_mapping(sharepoint_list): "value2": 20, "operator1": ">", "operator2": "<=", - "operators_conjuction": "&", - "filters_conjuction": "|", + "operators_conjunction": "&", + "filters_conjunction": "|", }, "filter2": { "dtype": "int", @@ -336,7 +336,7 @@ def test_valid_mapping(sharepoint_list): "value2": 0, "operator1": "==", "operator2": "!=", - "operators_conjuction": "|", + "operators_conjunction": "|", }, } expected_result = { @@ -346,8 +346,8 @@ def test_valid_mapping(sharepoint_list): "value2": 20, "operator1": "gt", "operator2": "le", - "operators_conjuction": "and", - "filters_conjuction": "or", + "operators_conjunction": "and", + "filters_conjunction": "or", }, "filter2": { "dtype": "int", @@ -355,7 +355,7 @@ def test_valid_mapping(sharepoint_list): "value2": 0, "operator1": "eq", "operator2": "ne", - "operators_conjuction": "or", + "operators_conjunction": "or", }, } result = sharepoint_list.operators_mapping(filters) @@ -367,8 +367,8 @@ def test_operators_mapping_invalid_comparison_operator(sharepoint_list): "filter1": { "operator1": "*", "operator2": "<=", - "operators_conjuction": "&", - "filters_conjuction": "|", + "operators_conjunction": "&", + "filters_conjunction": "|", }, } error_message = "This comparison operator: * is not allowed. Please read the function documentation for details!" @@ -381,11 +381,11 @@ def test_operators_mapping_invalid_logical_operator(sharepoint_list): "filter1": { "operator1": ">", "operator2": "<=", - "operators_conjuction": "!", - "filters_conjuction": "|", + "operators_conjunction": "!", + "filters_conjunction": "|", }, } - error_message = "This conjuction (logical) operator: ! is not allowed. Please read the function documentation for details!" + error_message = "This conjunction (logical) operator: ! is not allowed. Please read the function documentation for details!" with pytest.raises(ValueError, match=re.escape(error_message)): sharepoint_list.operators_mapping(filters) @@ -395,11 +395,11 @@ def test_operators_mapping_invalid_filters_logical_operator(sharepoint_list): "filter1": { "operator1": ">", "operator2": "<=", - "operators_conjuction": "&", - "filters_conjuction": "!", + "operators_conjunction": "&", + "filters_conjunction": "!", }, } - error_message = "This filters conjuction (logical) operator: ! is not allowed. Please read the function documentation for details!" 
+ error_message = "This filters conjunction (logical) operator: ! is not allowed. Please read the function documentation for details!" with pytest.raises(ValueError, match=re.escape(error_message)): sharepoint_list.operators_mapping(filters) @@ -438,7 +438,7 @@ def test_single_df_filter(sharepoint_list): def test_multiple_df_filters(sharepoint_list): filters = { - "column1": {"operator1": ">", "value1": 10, "filters_conjuction": "&"}, + "column1": {"operator1": ">", "value1": 10, "filters_conjunction": "&"}, "column2": {"operator1": "<", "value1": 20}, } result = sharepoint_list.make_filter_for_df(filters) From 35b278311640e925523abe98b63f885737d83718 Mon Sep 17 00:00:00 2001 From: burzekj Date: Wed, 15 Nov 2023 11:25:20 +0100 Subject: [PATCH 83/86] moved fun from task_utils to utils --- tests/integration/tasks/test_task_utils.py | 53 +--------------------- tests/unit/test_utils.py | 49 ++++++++++++++++++++ viadot/task_utils.py | 27 +---------- viadot/tasks/genesys.py | 1 + viadot/utils.py | 27 ++++++++++- 5 files changed, 79 insertions(+), 78 deletions(-) diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py index 63887d40a..5d77ab7e9 100644 --- a/tests/integration/tasks/test_task_utils.py +++ b/tests/integration/tasks/test_task_utils.py @@ -3,7 +3,7 @@ from prefect.engine.state import Failed, Success from prefect.tasks.secrets import PrefectSecret -from viadot.task_utils import custom_mail_state_handler, set_new_kv, check_value +from viadot.task_utils import custom_mail_state_handler, set_new_kv def test_custom_state_handler(): @@ -27,53 +27,4 @@ def test_set_new_kv(): set_new_kv.run(kv_name="test_for_setting_kv", df=df, filter_column="col1") result = get_key_value("test_for_setting_kv") assert result == "72" - set_key_value(key="test_for_setting_kv", value=None) - - -# Sample test checking the correctness of the function when the key is found -def test_check_value_found(): - json_data = { - "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": { - "searched_phrase": "phrase" - } - } - } - } - result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) - assert result == "phrase" - -# Sample test checking the correctness of the function when the key is not found -def test_check_value_not_found(): - json_data = { - "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": { - "other_phrase": "This won't be found" - } - } - } - } - result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) - assert result is None - -# Sample test checking the correctness of the function with an empty dictionary -def test_check_value_empty_dict(): - json_data = {} - result = check_value(json_data, ["searched_phrase"]) - assert result is None - -# Sample test checking the correctness of the function with a nonexistent key -def test_check_value_nonexistent_key(): - json_data = { - "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": { - "searched_phrase": "phrase" - } - } - } - } - result = check_value(json_data, ["nonexistent_key"]) - assert result is None \ No newline at end of file + set_key_value(key="test_for_setting_kv", value=None) \ No newline at end of file diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 777617244..1ca967141 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -9,6 +9,7 @@ add_viadot_metadata_columns, check_if_empty_file, gen_bulk_insert_query_from_df, + 
check_value, ) EMPTY_CSV_PATH = "empty.csv" @@ -153,3 +154,51 @@ def test_add_viadot_metadata_columns_with_parameter(): assert df_base.columns.to_list() == ["a", "b"] assert df_decorated.columns.to_list() == ["a", "b", "_viadot_source"] assert df_decorated["_viadot_source"][0] == "Source_name" + +# Sample test checking the correctness of the function when the key is found +def test_check_value_found(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "searched_phrase": "phrase" + } + } + } + } + result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) + assert result == "phrase" + +# Sample test checking the correctness of the function when the key is not found +def test_check_value_not_found(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "other_phrase": "This won't be found" + } + } + } + } + result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) + assert result is None + +# Sample test checking the correctness of the function with an empty dictionary +def test_check_value_empty_dict(): + json_data = {} + result = check_value(json_data, ["searched_phrase"]) + assert result is None + +# Sample test checking the correctness of the function with a nonexistent key +def test_check_value_nonexistent_key(): + json_data = { + "first_known_lvl": { + "second_known_lvl": { + "third_known_lvl": { + "searched_phrase": "phrase" + } + } + } + } + result = check_value(json_data, ["nonexistent_key"]) + assert result is None diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 924b2e6a6..32be339af 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -791,29 +791,4 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: failed_tests_msg = ", ".join(failed_tests_list) raise ValidationError( f"Validation failed for {failed_tests} test/tests: {failed_tests_msg}" - ) - - -def check_value(base, lvls: List): - """ - Task to extract data from nested json file if there is any under passed parameters. - Otherwise return None. - - Args: - base: variable with base lvl of the json, fo example: - json_file["first_known_lvl"]["second_known_lvl"]["third_known_lvl"] - lvls (List): List of potential lower levels of nested json for data retrieval. For example: - ["first_lvl_below_base", "second_lvl_below_base", "searched_phrase"] - - Return: - Searched value for the lowest level, in example data under "searched_phrase" key. 
- """ - - for lvl in lvls: - if isinstance(base, dict): - base = base.get(lvl) - if base is None: - return None - else: - return base - return base \ No newline at end of file + ) \ No newline at end of file diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index cbc18292e..628af1177 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -13,6 +13,7 @@ from viadot.exceptions import APIError from viadot.sources import Genesys +from viadot.utils import check_value from viadot.task_utils import * logger = logging.get_logger() diff --git a/viadot/utils.py b/viadot/utils.py index d05cfdd95..e77323aca 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -2,7 +2,7 @@ import os import re from itertools import chain -from typing import Any, Callable, Dict, List, Literal +from typing import Union, Any, Callable, Dict, List, Literal import pandas as pd import prefect @@ -460,3 +460,28 @@ def get_nested_dict(d): return d else: return None + + +def check_value(base: Union[Dict, Any], levels: List) -> Union[None, Any]: + """ + Task to extract data from nested json file if there is any under passed parameters. + Otherwise return None. + + Args: + base (Dict, Any): variable with base lvl of the json, for example: + json_file["first_known_lvl"]["second_known_lvl"]["third_known_lvl"] + levels (List): List of potential lower levels of nested json for data retrieval. For example: + ["first_lvl_below_base", "second_lvl_below_base", "searched_phrase"] + + Returns: + Union[None, Any]: Searched value for the lowest level, in example data under "searched_phrase" key. + """ + + for lvl in levels: + if isinstance(base, dict): + base = base.get(lvl) + if base is None: + return None + else: + return base + return base \ No newline at end of file From ad02ad4ed2ea05d1c681b394650f2ca6df29c628 Mon Sep 17 00:00:00 2001 From: burzekj Date: Wed, 15 Nov 2023 11:48:28 +0100 Subject: [PATCH 84/86] blackformatter changes --- tests/integration/tasks/test_task_utils.py | 2 +- tests/unit/test_utils.py | 30 +++++++++++----------- viadot/task_utils.py | 2 +- viadot/utils.py | 6 ++--- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/integration/tasks/test_task_utils.py b/tests/integration/tasks/test_task_utils.py index 5d77ab7e9..f22d55022 100644 --- a/tests/integration/tasks/test_task_utils.py +++ b/tests/integration/tasks/test_task_utils.py @@ -27,4 +27,4 @@ def test_set_new_kv(): set_new_kv.run(kv_name="test_for_setting_kv", df=df, filter_column="col1") result = get_key_value("test_for_setting_kv") assert result == "72" - set_key_value(key="test_for_setting_kv", value=None) \ No newline at end of file + set_key_value(key="test_for_setting_kv", value=None) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 1ca967141..75ef30e97 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -155,49 +155,49 @@ def test_add_viadot_metadata_columns_with_parameter(): assert df_decorated.columns.to_list() == ["a", "b", "_viadot_source"] assert df_decorated["_viadot_source"][0] == "Source_name" + # Sample test checking the correctness of the function when the key is found def test_check_value_found(): json_data = { "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": { - "searched_phrase": "phrase" - } - } + "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} } } - result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) + result = check_value( + 
json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], + ["searched_phrase"], + ) assert result == "phrase" + # Sample test checking the correctness of the function when the key is not found def test_check_value_not_found(): json_data = { "first_known_lvl": { "second_known_lvl": { - "third_known_lvl": { - "other_phrase": "This won't be found" - } + "third_known_lvl": {"other_phrase": "This won't be found"} } } } - result = check_value(json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], ["searched_phrase"]) + result = check_value( + json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"], + ["searched_phrase"], + ) assert result is None + # Sample test checking the correctness of the function with an empty dictionary def test_check_value_empty_dict(): json_data = {} result = check_value(json_data, ["searched_phrase"]) assert result is None + # Sample test checking the correctness of the function with a nonexistent key def test_check_value_nonexistent_key(): json_data = { "first_known_lvl": { - "second_known_lvl": { - "third_known_lvl": { - "searched_phrase": "phrase" - } - } + "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}} } } result = check_value(json_data, ["nonexistent_key"]) diff --git a/viadot/task_utils.py b/viadot/task_utils.py index 32be339af..6a532f932 100644 --- a/viadot/task_utils.py +++ b/viadot/task_utils.py @@ -791,4 +791,4 @@ def validate_df(df: pd.DataFrame, tests: dict = None) -> None: failed_tests_msg = ", ".join(failed_tests_list) raise ValidationError( f"Validation failed for {failed_tests} test/tests: {failed_tests_msg}" - ) \ No newline at end of file + ) diff --git a/viadot/utils.py b/viadot/utils.py index e77323aca..5e3de784c 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -460,10 +460,10 @@ def get_nested_dict(d): return d else: return None - + def check_value(base: Union[Dict, Any], levels: List) -> Union[None, Any]: - """ + """ Task to extract data from nested json file if there is any under passed parameters. Otherwise return None. 
@@ -484,4 +484,4 @@ def check_value(base: Union[Dict, Any], levels: List) -> Union[None, Any]: return None else: return base - return base \ No newline at end of file + return base From e33cd9d13d78ca866646b7eba1c44f4150c71669 Mon Sep 17 00:00:00 2001 From: marcinpurtak Date: Wed, 15 Nov 2023 12:44:22 +0100 Subject: [PATCH 85/86] Revert of changes for desired columns --- viadot/tasks/genesys.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index 2655552bf..942249ac2 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -493,22 +493,8 @@ def run( temp_dict["endTime"] = json_file.get("endTime") data_list.append(temp_dict) - desired_order = [ - "startTime", - "endTime", - "LOB", - "CustomerOutcomeResult", - "CustomerOutcomeTrack", - "LastUtterance", - "Final Sub Intent", - "SubIntent", - "Final Main Intent", - "conversationId", - ] - df = pd.DataFrame(data_list) - df = df[desired_order] - df.rename(columns={"LastUtterance": "CustomerTextInput"}, inplace=True) + df = df[df.columns[-1:]].join(df[df.columns[:-1]]) start = start_date.replace("-", "") end = end_date.replace("-", "") From 9a8b3c936f7f2fb4e395620f38b6947929aaf864 Mon Sep 17 00:00:00 2001 From: Rafalz13 Date: Wed, 15 Nov 2023 12:53:54 +0100 Subject: [PATCH 86/86] =?UTF-8?q?=F0=9F=93=9D=20Updated=20Changelog=20befo?= =?UTF-8?q?re=20release?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce6cef40..2ef880c75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,37 +5,43 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added + +### Fixed + +### Changed + +## [0.4.22] - 2023-11-15 ### Added -- Added new view type `agent_interaction_view_type` in `Genesys`source. - Added `TM1` source class. - Added `TM1ToDF` task class. - Added `set_prefect_kv` parameter to `BigQueryToADLS` with `False` as a default. If there is a need to create new pair in KV Store the parameter can be changed to `True`. -- Added `_rename_duplicated_fields` method to `SharepointListToDF` task class for finding and rename duplicated columns +- Added `_rename_duplicated_fields` method to `SharepointListToDF` task class for finding and rename duplicated columns. - Added new view type `agent_interaction_view_type` in `Genesys`source. -- Added libraries `nltk` and `sklearn` to `requirements`. - Added new logic for endpoint `users` in `Genesys`task. +- Added libraries `nltk` and `sklearn` to `requirements`. ### Fixed - Fixed bug for endpoint `conversations` in GET method in `Genesys` Task. ### Changed -- Splitted test for Eurostat on source tests and task tests +- Splitted test for `Eurostat` on source tests and task tests. - Modified `SharepointList` source class: - -> docstrings update + -> docstrings update. - Modified `SharepointToADLS` flow class: - -> docstrings update - -> changed set_prefect_kv: bool = False to prevent forced KV store append + -> docstrings update. + -> changed set_prefect_kv: bool = False to prevent forced KV store append. 
- Modified `SharepointListToADLS` flow class: - -> changed set_prefect_kv: bool = False to prevent forced KV store append + -> changed set_prefect_kv: bool = False to prevent forced KV store append. - Modified `SharepointList` source class: - -> docstrings update - -> Changed `_unpack_fields` method to handle Sharepoint MultiChoiceField type + small improvements - -> Changed `get_fields` method to handle special characters - different approach to call get() and execute_query() - -> Renamed method from `select_expandable_user_fields` to `select_fields` + update for MultiChoiceField type - -> Changed `check_filters` method errors messages and more checks added - -> Changed `operators_mapping` method errors messages - -> Changed `make_filter_for_df` method errors messages + -> docstrings update. + -> Changed `_unpack_fields` method to handle Sharepoint MultiChoiceField type + small improvements. + -> Changed `get_fields` method to handle special characters - different approach to call get() and execute_query(). + -> Renamed method from `select_expandable_user_fields` to `select_fields` + update for MultiChoiceField type. + -> Changed `check_filters` method errors messages and more checks added. + -> Changed `operators_mapping` method errors messages. + -> Changed `make_filter_for_df` method errors messages. - Modified `SharepointListToDF` task class: -> docstrings update - Splitted test for Eurostat on source tests and task tests. @@ -43,8 +49,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Expanded `CustomerGaugeToDF` task class with separate cleaning functions and handling nested json structure flattening with two new methods `_field_reference_unpacker` and `_nested_dict_transformer`. - Changed `CustomerGaugeToADLS` to containing new arguments. -### Fixed - ## [0.4.21] - 2023-10-26 ### Added @@ -60,6 +64,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Changed `GenesysToCSV` logic for end_point == "conversations". Added new fields to extraction. + ## [0.4.20] - 2023-10-12 ### Added - Added `Office365-REST-Python-Client` library to `requirements`.
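
A minimal usage sketch of the `check_value` helper that this series relocates from `viadot/task_utils.py` to `viadot/utils.py`, mirroring the unit tests added in `tests/unit/test_utils.py` above. It assumes viadot is installed with these patches applied; the sample payload and variable names are illustrative only.

    from viadot.utils import check_value

    # Illustrative nested payload, shaped like the test fixtures above.
    json_data = {
        "first_known_lvl": {
            "second_known_lvl": {"third_known_lvl": {"searched_phrase": "phrase"}}
        }
    }

    # Walk the levels below the already-known part of the structure.
    value = check_value(
        json_data["first_known_lvl"]["second_known_lvl"]["third_known_lvl"],
        ["searched_phrase"],
    )
    print(value)  # "phrase"

    # A missing level returns None instead of raising a KeyError.
    print(check_value(json_data, ["nonexistent_key"]))  # None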