From e6c82e361bbfe169f730bcf23b2eb599f9085552 Mon Sep 17 00:00:00 2001 From: kiurieva Date: Fri, 24 Nov 2023 14:36:47 +0100 Subject: [PATCH 01/14] Updated api url in connector --- viadot/sources/vid_club.py | 84 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index e7819577a..327d9abf7 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -1,6 +1,7 @@ import json import os import urllib +from pandas.io.json import json_normalize from datetime import date, datetime, timedelta from typing import Any, Dict, List, Literal, Tuple @@ -46,7 +47,7 @@ def build_query( api_url: str, items_per_page: int, source: Literal["jobs", "product", "company", "survey"] = None, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> str: """ Builds the query from the inputs. @@ -128,7 +129,7 @@ def check_connection( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, url: str = None, ) -> Tuple[Dict[str, Any], str]: """ @@ -160,20 +161,37 @@ def check_connection( if url is None: url = self.credentials["url"] - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - + if source in ["jobs", "product", "company"]: + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + elif source == "survey": + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) return (response, first_url) def get_response( @@ -182,7 +200,7 @@ def get_response( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> pd.DataFrame: """ Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. @@ -207,14 +225,26 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") + if source in ["jobs", "product", "company"]: + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + ) - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) + elif source == "survey": + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -229,7 +259,8 @@ def get_response( ind = False if "data" in keys_list: - df = pd.DataFrame(response["data"]) + df = json_normalize(response["data"]) + df = pd.DataFrame(df) length = df.shape[0] page = 1 @@ -244,7 +275,8 @@ def get_response( url=url, headers=headers, method="GET", verify=False ) response = r.json() - df_page = pd.DataFrame(response["data"]) + df_page = json_normalize(response["data"]) + df_page = pd.DataFrame(df_page) if source == "product": df_page = df_page.transpose() length = df_page.shape[0] From a638571f35246c01e2216dd0a4c5774d5cf5522b Mon Sep 17 00:00:00 2001 From: kiurieva Date: Fri, 24 Nov 2023 14:52:45 +0100 Subject: [PATCH 02/14] Updated api url in connector --- viadot/sources/vid_club.py | 84 ++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index e7819577a..327d9abf7 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -1,6 +1,7 @@ import json import os import urllib +from pandas.io.json import json_normalize from datetime import date, datetime, timedelta from typing import Any, Dict, List, Literal, Tuple @@ -46,7 +47,7 @@ def build_query( api_url: str, items_per_page: int, source: Literal["jobs", "product", "company", "survey"] = None, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> str: """ Builds the query from the inputs. @@ -128,7 +129,7 @@ def check_connection( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, url: str = None, ) -> Tuple[Dict[str, Any], str]: """ @@ -160,20 +161,37 @@ def check_connection( if url is None: url = self.credentials["url"] - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - + if source in ["jobs", "product", "company"]: + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + elif source == "survey": + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) return (response, first_url) def get_response( @@ -182,7 +200,7 @@ def get_response( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, ) -> pd.DataFrame: """ Basing on the pagination type retrieved using check_connection function, gets the response from the API queried and transforms it into DataFrame. @@ -207,14 +225,26 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") + if source in ["jobs", "product", "company"]: + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + ) - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) + elif source == "survey": + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) + else: + raise ValidationError( + "Pick one these sources: jobs, product, company, survey" + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -229,7 +259,8 @@ def get_response( ind = False if "data" in keys_list: - df = pd.DataFrame(response["data"]) + df = json_normalize(response["data"]) + df = pd.DataFrame(df) length = df.shape[0] page = 1 @@ -244,7 +275,8 @@ def get_response( url=url, headers=headers, method="GET", verify=False ) response = r.json() - df_page = pd.DataFrame(response["data"]) + df_page = json_normalize(response["data"]) + df_page = pd.DataFrame(df_page) if source == "product": df_page = df_page.transpose() length = df_page.shape[0] From 5092684bb5450d6bd918c98e521efcbacdccdb34 Mon Sep 17 00:00:00 2001 From: Kateryna Iurieva Date: Mon, 27 Nov 2023 13:35:35 +0100 Subject: [PATCH 03/14] Update viadot/sources/vid_club.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wójcik <107313911+adrian-wojcik@users.noreply.github.com> --- viadot/sources/vid_club.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 327d9abf7..a4dacc4cb 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -190,7 +190,7 @@ def check_connection( response = response.json() else: raise ValidationError( - "Pick one these sources: jobs, product, company, survey" + "Pick one of these sources: jobs, product, company, survey" ) return (response, first_url) From 45250eadd91120c23e849afe2beffa53f9ead469 Mon Sep 17 00:00:00 2001 From: Kateryna Iurieva Date: Mon, 27 Nov 2023 13:35:40 +0100 Subject: [PATCH 04/14] Update viadot/sources/vid_club.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wójcik <107313911+adrian-wojcik@users.noreply.github.com> --- viadot/sources/vid_club.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index a4dacc4cb..497064c0b 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -243,7 +243,7 @@ def get_response( ) else: raise ValidationError( - "Pick one these sources: jobs, product, company, survey" + "Pick one of these sources: jobs, product, company, survey" ) if isinstance(response, dict): From c0ae042be3b8685109b37b1cd65a29bb75efc77e Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:32:59 +0100 Subject: [PATCH 05/14] =?UTF-8?q?=F0=9F=90=9B=20Added=20warning=20logger?= =?UTF-8?q?=20for=20credential?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 6432ac8e8..f39fee297 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd from prefect.utilities import logging +from prefect.engine.state import Failed try: import pyrfc @@ -257,7 +258,14 @@ def __init__( self._con = None DEFAULT_CREDENTIALS = local_config.get("SAP").get("DEV") - credentials = kwargs.pop("credentials", None) or DEFAULT_CREDENTIALS + + credentials = kwargs.pop("credentials", None) + if credentials is None: + credentials = DEFAULT_CREDENTIALS + logger.warning( + "WARNING!!! Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + ) + if credentials is None: raise CredentialError("Missing credentials.") From a999d91516b63647b593351629998bbfdb5e3096 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:40:35 +0100 Subject: [PATCH 06/14] Add changes to changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ef880c75..2ba7a52ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### Changed - +- Changed __init__ in SAPRFC class in source in order to raise warning in prefect when credentials will be taken from DEV. ## [0.4.22] - 2023-11-15 ### Added From d69733b631ae78c48167b2cef0f5ed6a76e253b8 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Mon, 4 Dec 2023 13:58:32 +0100 Subject: [PATCH 07/14] =?UTF-8?q?=F0=9F=8E=A8=20Delete=20"WARNING!!!"=20fr?= =?UTF-8?q?om=20warning=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index f39fee297..806e61250 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -263,7 +263,7 @@ def __init__( if credentials is None: credentials = DEFAULT_CREDENTIALS logger.warning( - "WARNING!!! Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." ) if credentials is None: From 91e9d0bbeb5ace04806801798bdff36aac48f36b Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Tue, 5 Dec 2023 10:01:27 +0100 Subject: [PATCH 08/14] =?UTF-8?q?=F0=9F=8E=A8=20Change=20structure=20of=20?= =?UTF-8?q?'if'=20instruction=20and=20added=20to=20SAPRFCV2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- viadot/sources/sap_rfc.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 806e61250..16cd5483d 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd from prefect.utilities import logging -from prefect.engine.state import Failed try: import pyrfc @@ -262,13 +261,12 @@ def __init__( credentials = kwargs.pop("credentials", None) if credentials is None: credentials = DEFAULT_CREDENTIALS + if credentials is None: + raise CredentialError("Missing credentials.") logger.warning( "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." ) - if credentials is None: - raise CredentialError("Missing credentials.") - super().__init__(*args, credentials=credentials, **kwargs) self.sep = sep @@ -702,9 +700,15 @@ def __init__( self._con = None DEFAULT_CREDENTIALS = local_config.get("SAP").get("DEV") - credentials = kwargs.pop("credentials", None) or DEFAULT_CREDENTIALS + + credentials = kwargs.pop("credentials", None) if credentials is None: - raise CredentialError("Missing credentials.") + credentials = DEFAULT_CREDENTIALS + if credentials is None: + raise CredentialError("Missing credentials.") + logger.warning( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + ) super().__init__(*args, credentials=credentials, **kwargs) From 6334ad427bafd4ffd29d8310694bba3e5bcca4b7 Mon Sep 17 00:00:00 2001 From: adrian-wojcik Date: Tue, 5 Dec 2023 10:07:12 +0100 Subject: [PATCH 09/14] =?UTF-8?q?=E2=9C=85=20Added=20tests=20for=20new=20f?= =?UTF-8?q?unctionalities=20for=20SAPRFC=20and=20SAPRFCV2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++-- tests/integration/test_sap_rfc.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ba7a52ea..68c49c1c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - +- Added tests for new functionalities in SAPRFC and SAPRFCV2 regarding passing credentials ### Fixed ### Changed -- Changed __init__ in SAPRFC class in source in order to raise warning in prefect when credentials will be taken from DEV. +- Changed __init__ in SAPRFC and SAPRFCV2 class in source in order to raise warning in prefect when credentials will be taken from DEV. ## [0.4.22] - 2023-11-15 ### Added diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py index 20078d312..fd2298323 100644 --- a/tests/integration/test_sap_rfc.py +++ b/tests/integration/test_sap_rfc.py @@ -187,3 +187,19 @@ def test___build_pandas_filter_query_v2(): sap2._build_pandas_filter_query(sap2.client_side_filters) == "thirdlongcolname == 01234" ), sap2._build_pandas_filter_query(sap2.client_side_filters) + + +def test_default_credentials_warning_SAPRFC(caplog): + _ = SAPRFC() + assert ( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + in caplog.text + ) + + +def test_default_credentials_warning_SAPRFCV2(caplog): + _ = SAPRFCV2() + assert ( + "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + in caplog.text + ) From 4f3efc96d72830e2033452d1c9110b0b072a88ff Mon Sep 17 00:00:00 2001 From: kiurieva Date: Tue, 5 Dec 2023 12:02:09 +0100 Subject: [PATCH 10/14] cleaned check_connection and get_response methods --- viadot/flows/vid_club_to_adls.py | 4 +- viadot/sources/vid_club.py | 81 ++++++++++---------------------- viadot/tasks/vid_club.py | 4 +- 3 files changed, 30 insertions(+), 59 deletions(-) diff --git a/viadot/flows/vid_club_to_adls.py b/viadot/flows/vid_club_to_adls.py index 40f53d8ae..de7267479 100644 --- a/viadot/flows/vid_club_to_adls.py +++ b/viadot/flows/vid_club_to_adls.py @@ -31,7 +31,7 @@ def __init__( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, cols_to_drop: List[str] = None, vid_club_credentials: Dict[str, Any] = None, @@ -60,7 +60,7 @@ def __init__( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. Defaults to 100. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] days_interval (int, optional): Days specified in date range per API call (test showed that 30-40 is optimal for performance). Defaults to 30. cols_to_drop (List[str], optional): List of columns to drop. Defaults to None. vid_club_credentials (Dict[str, Any], optional): Stores the credentials information. Defaults to None. diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 327d9abf7..9aef751ad 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -58,7 +58,7 @@ def build_query( api_url (str): Generic part of the URL to Vid Club API. items_per_page (int): number of entries per page. source (Literal["jobs", "product", "company", "survey"], optional): The endpoint source to be accessed. Defaults to None. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] Returns: str: Final query with all filters added. @@ -67,7 +67,8 @@ def build_query( ValidationError: If any source different than the ones in the list are used. """ if source in ["jobs", "product", "company"]: - url = f"{api_url}{source}?from={from_date}&to={to_date}®ion={region}&limit={items_per_page}" + region_url_string = f"®ion={region}" if region else "" + url = f"{api_url}{source}?from={from_date}&to={to_date}{region_url_string}&limit={items_per_page}" elif source == "survey": url = f"{api_url}{source}?language=en&type=question" else: @@ -141,7 +142,7 @@ def check_connection( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] url (str, optional): Generic part of the URL to Vid Club API. Defaults to None. Returns: @@ -161,37 +162,19 @@ def check_connection( if url is None: url = self.credentials["url"] - if source in ["jobs", "product", "company"]: - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - elif source == "survey": - first_url = self.build_query( - source=source, - from_date=from_date, - to_date=to_date, - api_url=url, - items_per_page=items_per_page, - region=region, - ) - headers = self.headers - response = handle_api_response( - url=first_url, headers=headers, method="GET", verify=False - ) - response = response.json() - else: - raise ValidationError( - "Pick one these sources: jobs, product, company, survey" - ) + first_url = self.build_query( + source=source, + from_date=from_date, + to_date=to_date, + api_url=url, + items_per_page=items_per_page, + region=region, + ) + headers = self.headers + response = handle_api_response( + url=first_url, headers=headers, method="GET", verify=False + ) + response = response.json() return (response, first_url) def get_response( @@ -210,7 +193,7 @@ def get_response( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] Returns: pd.DataFrame: Table of the data carried in the response. @@ -225,26 +208,14 @@ def get_response( ) if to_date == None: to_date = datetime.today().strftime("%Y-%m-%d") - if source in ["jobs", "product", "company"]: - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - ) - elif source == "survey": - response, first_url = self.check_connection( - source=source, - from_date=from_date, - to_date=to_date, - items_per_page=items_per_page, - region=region, - ) - else: - raise ValidationError( - "Pick one these sources: jobs, product, company, survey" - ) + response, first_url = self.check_connection( + source=source, + from_date=from_date, + to_date=to_date, + items_per_page=items_per_page, + region=region, + ) if isinstance(response, dict): keys_list = list(response.keys()) @@ -304,7 +275,7 @@ def total_load( from_date (str, optional): Start date for the query, by default is the oldest date in the data 2022-03-22. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to "all". [July 2023 status: parameter works only for 'all' on API] + region (Literal["bg", "hu", "hr", "pl", "ro", "si", "all"], optional): Region filter for the query. Defaults to None (parameter is not used in url). [December 2023 status: value 'all' does not work for company and jobs] days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. Returns: diff --git a/viadot/tasks/vid_club.py b/viadot/tasks/vid_club.py index 0814a306f..aba7025dc 100644 --- a/viadot/tasks/vid_club.py +++ b/viadot/tasks/vid_club.py @@ -85,7 +85,7 @@ def run( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: str = "all", + region: str = None, days_interval: int = 30, cols_to_drop: List[str] = None, ) -> pd.DataFrame: @@ -98,7 +98,7 @@ def run( from_date (str, optional): Start date for the query, by default is the oldest date in the data, '2022-03-22'. to_date (str, optional): End date for the query. By default None, which will be executed as datetime.today().strftime("%Y-%m-%d") in code. items_per_page (int, optional): Number of entries per page. 100 entries by default. - region (str, optional): Region filter for the query. Valid inputs: ["bg", "hu", "hr", "pl", "ro", "si", "all"]. Defaults to "all". + region (str, optional): Region filter for the query. Valid inputs: ["bg", "hu", "hr", "pl", "ro", "si", "all"]. Defaults to None. days_interval (int, optional): Days specified in date range per api call (test showed that 30-40 is optimal for performance). Defaults to 30. cols_to_drop (List[str], optional): List of columns to drop. Defaults to None. From 48ddcd65336a60d2a8bd8fc6403aded2157398bd Mon Sep 17 00:00:00 2001 From: kiurieva Date: Tue, 5 Dec 2023 15:52:54 +0100 Subject: [PATCH 11/14] unify region parameter --- viadot/sources/vid_club.py | 2 +- viadot/tasks/vid_club.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index 9aef751ad..fe6e76098 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -263,7 +263,7 @@ def total_load( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = "all", + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, ) -> pd.DataFrame: """ diff --git a/viadot/tasks/vid_club.py b/viadot/tasks/vid_club.py index aba7025dc..aff0e09ea 100644 --- a/viadot/tasks/vid_club.py +++ b/viadot/tasks/vid_club.py @@ -85,7 +85,7 @@ def run( from_date: str = "2022-03-22", to_date: str = None, items_per_page: int = 100, - region: str = None, + region: Literal["bg", "hu", "hr", "pl", "ro", "si", "all"] = None, days_interval: int = 30, cols_to_drop: List[str] = None, ) -> pd.DataFrame: From 2f7ed2a1b6cb6fdf124a8322ff0a0d9ebf18cf48 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 6 Dec 2023 09:10:56 +0000 Subject: [PATCH 12/14] =?UTF-8?q?=F0=9F=8E=A8=20Format=20Python=20code=20w?= =?UTF-8?q?ith=20Black?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/flows/test_sharepoint_to_adls.py | 2 +- tests/integration/test_sap_rfc.py | 4 ++-- tests/unit/test_utils.py | 2 +- viadot/sources/sap_bw.py | 1 + viadot/tasks/genesys.py | 2 +- viadot/utils.py | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/integration/flows/test_sharepoint_to_adls.py b/tests/integration/flows/test_sharepoint_to_adls.py index 5603c49ee..f0597c41a 100644 --- a/tests/integration/flows/test_sharepoint_to_adls.py +++ b/tests/integration/flows/test_sharepoint_to_adls.py @@ -6,7 +6,7 @@ import pytest from prefect.tasks.secrets import PrefectSecret -from viadot.flows import SharepointToADLS, SharepointListToADLS +from viadot.flows import SharepointListToADLS, SharepointToADLS from viadot.tasks import AzureDataLakeRemove ADLS_FILE_NAME = str(pendulum.now("utc")) + ".csv" diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py index fd2298323..0ca9a2a1c 100644 --- a/tests/integration/test_sap_rfc.py +++ b/tests/integration/test_sap_rfc.py @@ -187,8 +187,8 @@ def test___build_pandas_filter_query_v2(): sap2._build_pandas_filter_query(sap2.client_side_filters) == "thirdlongcolname == 01234" ), sap2._build_pandas_filter_query(sap2.client_side_filters) - - + + def test_default_credentials_warning_SAPRFC(caplog): _ = SAPRFC() assert ( diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 75ef30e97..c29fbc014 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -8,8 +8,8 @@ from viadot.utils import ( add_viadot_metadata_columns, check_if_empty_file, - gen_bulk_insert_query_from_df, check_value, + gen_bulk_insert_query_from_df, ) EMPTY_CSV_PATH = "empty.csv" diff --git a/viadot/sources/sap_bw.py b/viadot/sources/sap_bw.py index 90b70dfec..e70f79b36 100644 --- a/viadot/sources/sap_bw.py +++ b/viadot/sources/sap_bw.py @@ -1,5 +1,6 @@ import textwrap from typing import List + import pyrfc from viadot.exceptions import CredentialError, ValidationError diff --git a/viadot/tasks/genesys.py b/viadot/tasks/genesys.py index d974dd587..feafbaccf 100644 --- a/viadot/tasks/genesys.py +++ b/viadot/tasks/genesys.py @@ -13,8 +13,8 @@ from viadot.exceptions import APIError from viadot.sources import Genesys -from viadot.utils import check_value from viadot.task_utils import * +from viadot.utils import check_value logger = logging.get_logger() diff --git a/viadot/utils.py b/viadot/utils.py index 5e3de784c..cd34adb8a 100644 --- a/viadot/utils.py +++ b/viadot/utils.py @@ -2,7 +2,7 @@ import os import re from itertools import chain -from typing import Union, Any, Callable, Dict, List, Literal +from typing import Any, Callable, Dict, List, Literal, Union import pandas as pd import prefect From 7aac8b0c7033dba90127e3ee1be89502670ddbc9 Mon Sep 17 00:00:00 2001 From: kiurieva Date: Wed, 6 Dec 2023 12:39:53 +0100 Subject: [PATCH 13/14] Fixed total_load method, updated tests --- tests/integration/test_vid_club.py | 1 - viadot/sources/vid_club.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_vid_club.py b/tests/integration/test_vid_club.py index 6c2bd4544..50c3015cf 100644 --- a/tests/integration/test_vid_club.py +++ b/tests/integration/test_vid_club.py @@ -66,7 +66,6 @@ def test_url_string(): expected_elements = [ f"from={from_date}", f"to={to_date}", - "region=all", f"limit={items_per_page}", api_url, ] diff --git a/viadot/sources/vid_club.py b/viadot/sources/vid_club.py index fe6e76098..4da4e4f45 100644 --- a/viadot/sources/vid_club.py +++ b/viadot/sources/vid_club.py @@ -310,6 +310,11 @@ def total_load( items_per_page=items_per_page, region=region, ) + list_columns = df.columns[ + df.applymap(lambda x: isinstance(x, list)).any() + ].tolist() + for i in list_columns: + df[i] = df[i].apply(lambda x: tuple(x) if isinstance(x, list) else x) df.drop_duplicates(inplace=True) if df.empty: From 8f447aa93ce4e0956070a03b6897d44633b00522 Mon Sep 17 00:00:00 2001 From: Angelika Tarnawa Date: Wed, 6 Dec 2023 14:41:50 +0100 Subject: [PATCH 14/14] =?UTF-8?q?=F0=9F=94=8A=20Updated=20logger=20warning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/integration/test_sap_rfc.py | 4 ++-- viadot/sources/sap_rfc.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_sap_rfc.py b/tests/integration/test_sap_rfc.py index 0ca9a2a1c..28ab044a2 100644 --- a/tests/integration/test_sap_rfc.py +++ b/tests/integration/test_sap_rfc.py @@ -192,7 +192,7 @@ def test___build_pandas_filter_query_v2(): def test_default_credentials_warning_SAPRFC(caplog): _ = SAPRFC() assert ( - "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + "Your credentials will use DEV environment. If you would like to use different one - please specified it." in caplog.text ) @@ -200,6 +200,6 @@ def test_default_credentials_warning_SAPRFC(caplog): def test_default_credentials_warning_SAPRFCV2(caplog): _ = SAPRFCV2() assert ( - "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + "Your credentials will use DEV environment. If you would like to use different one - please specified it." in caplog.text ) diff --git a/viadot/sources/sap_rfc.py b/viadot/sources/sap_rfc.py index 16cd5483d..a9d109148 100644 --- a/viadot/sources/sap_rfc.py +++ b/viadot/sources/sap_rfc.py @@ -264,7 +264,7 @@ def __init__( if credentials is None: raise CredentialError("Missing credentials.") logger.warning( - "Your credentials will use DEV environment. If you would like to use different one - please specified it in 'sap_credentials' variable inside the flow." + "Your credentials will use DEV environment. If you would like to use different one - please specified it." ) super().__init__(*args, credentials=credentials, **kwargs)