diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4872f19a..1636270b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} steps: @@ -40,15 +40,15 @@ jobs: strategy: matrix: include: -# - SPARK_VERSION: "2.4.8" -# HADOOP_VERSION: "2.7" -# JAVA_VERSION: "8" -# python: "3.7" -# os: ubuntu-latest - SPARK_VERSION: "3.3.2" HADOOP_VERSION: "3" JAVA_VERSION: "11" - python: "3.8" + python: "3.9" + os: ubuntu-latest + - SPARK_VERSION: "3.5.3" + HADOOP_VERSION: "3" + JAVA_VERSION: "11" + python: "3.9" os: ubuntu-latest runs-on: ${{ matrix.os }} name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }} diff --git a/docs/source/developing.rst b/docs/source/developing.rst index 15dc7fc2..99d8eb2e 100644 --- a/docs/source/developing.rst +++ b/docs/source/developing.rst @@ -27,8 +27,8 @@ For this you'll need to install our test requirements: .. code-block:: bash cd popmon/ - pip install -r requirements-test.txt - python setup.py test + pip install -e .[test] + pytest That's it! 
diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index 4dbe2aad..747abd3c 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -186,9 +186,7 @@ def replace(bl): if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: return np.nan if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): - if not np.all( - [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] - ): + if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]): return np.nan # all strings from hereon n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py index b1d0e0a6..f6606162 100644 --- a/popmon/analysis/profiling/pull_calculator.py +++ b/popmon/analysis/profiling/pull_calculator.py @@ -208,6 +208,11 @@ def transform(self, datastore): class ReferencePullCalculator(PullCalculator): """Pull calculation based on reference mean and standard deviations""" + @staticmethod + def mean(x): + """Column-wise mean version.""" + return np.mean(x, axis=0) + def __init__( self, reference_key, @@ -233,7 +238,7 @@ def __init__( :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ super().__init__( - np.mean, + ReferencePullCalculator.mean, np.std, reference_key, assign_to_key, diff --git a/pyproject.toml b/pyproject.toml index 70bf35d2..36a16753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,12 +17,12 @@ keywords = [ "ipython" ] readme = "README.rst" -requires-python = ">=3.7" +requires-python = ">=3.9" authors = [{name = "ING Analytics Wholesale Banking", email = "wbaa@ing.com"}] license = {type = "MIT", file = "LICENSE"} dependencies = [ "numpy>=1.18.0", - "pandas>=0.25.1,<2", + "pandas>=0.25.1", "scipy>=1.5.2", "histogrammar>=1.0.32", "phik", diff --git a/requirements.txt b/requirements.txt index 
08a9d220..a5d5b128 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,6 @@ tqdm plotly>=5.8.0 joblib>=0.14.0 htmlmin -pydantic -typing_extensions +pydantic>=2 +pydantic-settings +typing_extensions \ No newline at end of file diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py index 556a0e7d..52636913 100644 --- a/tests/popmon/analysis/profiling/test_apply_func.py +++ b/tests/popmon/analysis/profiling/test_apply_func.py @@ -13,6 +13,11 @@ from popmon.base import Pipeline +def mean(x): + """Column-wise mean version.""" + return np.mean(x, axis=0) + + def get_test_data(): df = pd.DataFrame() df["a"] = np.arange(100) @@ -25,7 +30,7 @@ def test_pull(): module1 = ApplyFunc(apply_to_key="to_profile") module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"]) module2.add_apply_func( @@ -57,7 +62,7 @@ def func(x): ) module.add_apply_func(np.std, entire=True) - module.add_apply_func(np.mean, entire=True) + module.add_apply_func(mean, entire=True) module.add_apply_func(func) datastore = module.transform(datastore) @@ -77,7 +82,7 @@ def test_variance_comparer(): apply_to_key="to_profile", features=["the_feature", "dummy_feature"] ) module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc( apply_to_key="to_profile", features=["the_feature", "dummy_feature"] @@ -171,7 +176,7 @@ def test_apply_func(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] d = 
apply_func( @@ -195,7 +200,7 @@ def test_apply_func_array(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] f, p = apply_func_array( diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py index d33da9de..477b27c1 100644 --- a/tests/popmon/analysis/test_hist_numpy.py +++ b/tests/popmon/analysis/test_hist_numpy.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import pytest +from conftest import make_mixed_dataframe from popmon.analysis.hist_numpy import ( assert_similar_hists, @@ -30,7 +31,7 @@ def get_test_histograms1(): """Get set 1 of test histograms""" # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) df["boolT"] = True df["boolF"] = False @@ -55,8 +56,7 @@ def get_test_histograms1(): def get_test_histograms2(): """Get set 2 of test histograms""" # dummy dataset with mixed types - # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() # building 1d-, 2d-histogram (iteratively) hist1 = hg.Categorize(unit("C")) @@ -351,7 +351,7 @@ def test_check_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) @@ -391,7 +391,7 @@ def test_assert_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) 
diff --git a/tests/popmon/conftest.py b/tests/popmon/conftest.py index b6b50b8b..dba9cd90 100644 --- a/tests/popmon/conftest.py +++ b/tests/popmon/conftest.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.indexes.datetimes import bdate_range from popmon import resources @@ -88,3 +89,14 @@ def pytest_configure(): df = pd.read_csv(resources.data(CSV_FILE)) df["date"] = pd.to_datetime(df["date"]) pytest.test_df = df + + +def make_mixed_dataframe() -> pd.DataFrame: + return pd.DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + ) diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py index 18d83e17..721bff07 100644 --- a/tests/popmon/hist/test_histogram.py +++ b/tests/popmon/hist/test_histogram.py @@ -1,6 +1,7 @@ import histogrammar as hg import numpy as np import pandas as pd +from conftest import make_mixed_dataframe from popmon.hist.hist_utils import ( is_numeric, @@ -15,7 +16,7 @@ def get_test_data(): - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value) return df