
Merge pull request #687 from dyvenia/dev
Release 0.4.15 PR
Rafalz13 authored May 11, 2023
2 parents ce91048 + 50964cb commit 204feb1
Showing 21 changed files with 1,941 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-2.0.yml
@@ -34,4 +34,4 @@ jobs:
file: docker/Dockerfile
platforms: linux/amd64
push: true
tags: ghcr.io/${{ github.repository }}/viadot:2.0-latest
tags: ghcr.io/${{ github.repository }}/viadot:2.0-latest
22 changes: 21 additions & 1 deletion CHANGELOG.md
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]


## [0.4.15] - 2023-05-11
### Added
- Added `BusinessCore` source class (see the usage sketch after this list)
- Added `BusinessCoreToParquet` task class
- Added `verify` parameter to `handle_api_response()`.
- Added `to_parquet()` in `base.py`
- Added new source class `SAPRFCV2` in `sap_rfc.py` with new approximation.
- Added new parameter `rfc_replacement` to `sap_rfc_to_adls.py` to replace
an extra separator character within a string column to avoid conflicts.
- Added `rfc_unique_id` in `SAPRFCV2` to merge chunks on this column.
- Added `close_connection()` to `SAPRFC` and `SAPRFCV2`
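
A minimal usage sketch for the new `BusinessCore` source, mirroring the integration test added in this PR. The endpoint URL, filter values, and credentials below are the test's placeholder values, and only the constructor arguments and `to_df()` shown in that test are assumed here:

```python
from viadot.sources import BusinessCore

# Placeholder endpoint, filters, and credentials taken from the integration test in this PR.
business_core = BusinessCore(
    url="https://api.businesscore.ae/api/GetCustomerData",
    filters_dict={
        "BucketCount": 10,
        "BucketNo": 1,
        "FromDate": None,
        "ToDate": None,
    },
    credentials={"username": "test", "password": "test123"},
)

# Load the API response into a pandas DataFrame.
df = business_core.to_df()
```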

### Fixed
- Removed the `try-except` block in the `sap_rfc.py` source file and added new logic to remove extra
  separators, to avoid a mismatch in column length between iterative connections to SAP tables.
- Fixed the case where SAP tables are updated while the `sap_rfc.py` script is running: when data is
  fetched in chunks, the columns of the next chunk could end up filled with unrelated rows.
- Fixed the `sap_rfc.py` source file so it no longer breaks down when both an extra separator appears
  in a row and new rows are added to the SAP table between iterations (see the illustrative snippet below).
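
For context, the snippet below is a minimal, hypothetical illustration of the separator failure mode described above; the separator character, column names, and sample rows are invented for the example, and this is not the actual `sap_rfc.py` implementation:

```python
# Hypothetical illustration: if a cell value contains the separator character,
# a naive split yields more fields than there are columns.
SEP = "|"
COLUMNS = ["MATNR", "MAKTX", "WERKS"]

rows = [
    "100017|Standard bolt|PL01",       # clean row
    "100018|Bolt | washer set|PL01",   # description contains the separator
]

for raw in rows:
    fields = raw.split(SEP)
    if len(fields) != len(COLUMNS):
        # This is the column-length mismatch the new logic guards against,
        # e.g. by replacing the in-value separator (cf. `rfc_replacement`) before splitting.
        print(f"Mismatch: expected {len(COLUMNS)} fields, got {len(fields)} -> {raw!r}")
    else:
        print(dict(zip(COLUMNS, fields)))
```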


## [0.4.14] - 2023-04-13
### Added
- Added `anonymize_df` task function to `task_utils.py` to anonymize data in the dataframe in selected columns.
@@ -56,7 +77,6 @@ This parameter enables the user to decide whether or not the filter should be validated.
### Changed
- Changed data extraction logic for `Outlook` data.


## [0.4.10] - 2022-11-16
### Added
- Added `credentials_loader` function in utils
29 changes: 29 additions & 0 deletions tests/integration/flows/test_eurostat_to_adls.py
@@ -0,0 +1,29 @@
from unittest import mock
import pytest
import pandas as pd
import os

from viadot.flows import EurostatToADLS

DATA = {"geo": ["PL", "DE", "NL"], "indicator": [35, 55, 77]}
ADLS_FILE_NAME = "test_eurostat.parquet"
ADLS_DIR_PATH = "raw/tests/"


@mock.patch(
    "viadot.tasks.EurostatToDF.run",
    return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_eurostat_to_adls_run_flow(mocked_class):
    flow = EurostatToADLS(
        "test_eurostat_to_adls_flow_run",
        dataset_code="ILC_DI04",
        overwrite_adls=True,
        adls_dir_path=ADLS_DIR_PATH,
        adls_file_name=ADLS_FILE_NAME,
    )
    result = flow.run()
    assert result.is_successful()
    os.remove("test_eurostat_to_adls_flow_run.parquet")
    os.remove("test_eurostat_to_adls_flow_run.json")
223 changes: 223 additions & 0 deletions tests/integration/tasks/test_eurostat.py
@@ -0,0 +1,223 @@
import pytest
import pandas as pd
import logging

from viadot.tasks import eurostat


def test_and_validate_dataset_code_without_params(caplog):
    """This function is designed to test the accuracy of the data retrieval feature in a program.
    Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct
    dataset code is provided without any parameters. The function is intended to be used in software
    development to verify that the program is correctly retrieving data from the appropriate dataset.
    """
    task = eurostat.EurostatToDF(dataset_code="ILC_DI04").run()
    assert isinstance(task, pd.DataFrame)
    assert not task.empty
    assert caplog.text == ""


def test_wrong_dataset_code_logger(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log errors
    when provided with only an incorrect dataset code.
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """
    task = eurostat.EurostatToDF(dataset_code="ILC_DI04E")

    with pytest.raises(ValueError, match="DataFrame is empty!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "Failed to fetch data for ILC_DI04E, please check correctness of dataset code!"
        in caplog.text
    )


def test_wrong_parameters_codes_logger(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log errors
    when a correct dataset_code and correct parameter keys are provided, but both parameter codes are incorrect.
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """
    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04",
        params={"hhtyp": "total1", "indic_il": "non_existing_code"},
    )

    with pytest.raises(ValueError, match="DataFrame is empty!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "Parameters codes: 'total1 | non_existing_code' are not available. Please check your spelling!"
        in caplog.text
    )
    assert (
        "You can find everything via link: https://ec.europa.eu/eurostat/databrowser/view/ILC_DI04/default/table?lang=en"
        in caplog.text
    )


def test_parameter_codes_as_list_logger(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log errors
    when provided with a correct dataset code and correct parameters, but an incorrect parameter code
    structure (a list of strings instead of a single string).
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """

    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04",
        params={"hhtyp": ["totale", "nottotale"], "indic_il": "med_e"},
    )
    with pytest.raises(ValueError, match="Wrong structure of params!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "You can provide only one code per one parameter as 'str' in params!\n"
        in caplog.text
    )
    assert (
        "CORRECT: params = {'unit': 'EUR'} | INCORRECT: params = {'unit': ['EUR', 'USD', 'PLN']}"
        in caplog.text
    )


def test_wrong_parameters(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log errors
    when provided with a correct dataset_code, but incorrect parameter keys.
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """

    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04", params={"hhhtyp": "total", "indic_ilx": "med_e"}
    )
    with pytest.raises(ValueError, match="DataFrame is empty!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "Parameters: 'hhhtyp | indic_ilx' are not in dataset. Please check your spelling!\n"
        in caplog.text
    )
    assert (
        "Possible parameters: freq | hhtyp | indic_il | unit | geo | time"
        in caplog.text
    )


def test_params_as_list():
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log an error
    when provided with a correct dataset_code, but an incorrect params structure (a list instead of a dict).
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """
    with pytest.raises(TypeError, match="Params should be a dictionary."):
        eurostat.EurostatToDF(dataset_code="ILC_DI04", params=["total", "med_e"]).run()


def test_correct_params_and_dataset_code(caplog):
    """This function is designed to test the accuracy of the data retrieval feature in a program.
    Specifically, it tests to ensure that the program returns a non-empty DataFrame when a correct
    dataset code is provided with correct params. The function is intended to be used in software
    development to verify that the program is correctly retrieving data from the appropriate dataset.
    """

    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04", params={"hhtyp": "total", "indic_il": "med_e"}
    ).run()

    assert isinstance(task, pd.DataFrame)
    assert not task.empty
    assert caplog.text == ""


def test_correct_requested_columns(caplog):
    """This function is designed to test the accuracy of the data retrieval feature in a program.
    Specifically, it tests to ensure that the program returns a non-empty DataFrame limited to the
    requested columns when provided with a correct dataset_code, correct params and correct requested_columns.
    The function is intended to be used in software development to verify that the program is correctly
    retrieving data from the appropriate dataset.
    """
    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04",
        params={"hhtyp": "total", "indic_il": "med_e"},
        requested_columns=["updated", "geo", "indicator"],
    )
    df = task.run()

    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert caplog.text == ""
    assert list(df.columns) == task.needed_columns


def test_wrong_needed_columns_names(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log an error
    when provided with a correct dataset_code and correct parameters, but incorrect names of requested columns.
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """
    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04",
        params={"hhtyp": "total", "indic_il": "med_e"},
        requested_columns=["updated1", "geo1", "indicator1"],
    )
    with pytest.raises(ValueError, match="Provided columns are not available!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "Name of the columns: 'updated1 | geo1 | indicator1' are not in DataFrame. Please check spelling!\n"
        in caplog.text
    )
    assert "Available columns: geo | time | indicator | label | updated" in caplog.text


def test_wrong_params_and_wrong_requested_columns_names(caplog):
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log an error
    when provided with a correct dataset_code, but incorrect parameters and incorrect names of requested columns.
    The test should log only the errors related to the wrong params, to check that the program stops after
    params validation. The function is intended to be used in software development to verify the correct
    error types and messages in the program's handling of codes.
    """
    task = eurostat.EurostatToDF(
        dataset_code="ILC_DI04",
        params={"hhhtyp": "total", "indic_ilx": "med_e"},
        requested_columns=["updated1", "geo1", "indicator1"],
    )
    with pytest.raises(ValueError, match="DataFrame is empty!"):
        with caplog.at_level(logging.ERROR):
            task.run()
    assert (
        "Parameters: 'hhhtyp | indic_ilx' are not in dataset. Please check your spelling!\n"
        in caplog.text
    )
    assert (
        "Possible parameters: freq | hhtyp | indic_il | unit | geo | time"
        in caplog.text
    )


def test_requested_columns_not_in_list():
    """This function is designed to test the accuracy of the error logging feature in a program.
    Specifically, it tests to ensure that the program is able to correctly identify and log an error
    when provided with a correct dataset_code and correct params, but an incorrect requested_columns
    structure (a single string instead of a list of strings).
    The function is intended to be used in software development to verify the correct error types
    and messages in the program's handling of codes.
    """
    with pytest.raises(
        TypeError, match="Requested columns should be provided as list of strings."
    ):
        eurostat.EurostatToDF(
            dataset_code="ILC_DI04",
            params={"hhtyp": "total", "indic_il": "med_e"},
            requested_columns="updated",
        ).run()
49 changes: 49 additions & 0 deletions tests/integration/test_business_core.py
@@ -0,0 +1,49 @@
import pytest
from unittest.mock import patch, Mock
import pandas as pd
from viadot.sources import BusinessCore


@pytest.fixture(scope="module")
def business_core():
    return BusinessCore(
        url="https://api.businesscore.ae/api/GetCustomerData",
        filters_dict={
            "BucketCount": 10,
            "BucketNo": 1,
            "FromDate": None,
            "ToDate": None,
        },
        credentials={"username": "test", "password": "test123"},
    )


@patch("viadot.sources.business_core.handle_api_response")
def test_generate_token(mock_api_response, business_core):
    mock_api_response.return_value = Mock(text='{"access_token": "12345"}')
    token = business_core.generate_token()
    assert token == "12345"


def test_clean_filters_dict(business_core):
    filters = business_core.clean_filters_dict()
    assert filters == {
        "BucketCount": 10,
        "BucketNo": 1,
        "FromDate": "&",
        "ToDate": "&",
    }


def test_to_df(business_core):
    with patch.object(
        business_core,
        "get_data",
        return_value={"MasterDataList": [{"id": 1, "name": "John Doe"}]},
    ):
        df = business_core.to_df()
        assert isinstance(df, pd.DataFrame)
        assert len(df.columns) == 2
        assert len(df) == 1
        assert df["id"].tolist() == [1]
        assert df["name"].tolist() == ["John Doe"]
