Skip to content

Commit

Permalink
- align requirements.txt with pyproject.toml
Browse files Browse the repository at this point in the history
- remove calls to np.string_ not existing in numpy >= 2.0.0
- remove calls to pd._testing.makeMixedDataFrame not existing in new pandas versions
- fix install and test commands in documentation for developers
- replace np.mean with column-wise version
- drop pandas dependency constraint <2
- require Python 3.9 in pyproject.toml
- add PySpark 3.5.3 to test pipeline matrix
- update test pipeline matrix: exclude Python 3.8, include Python 3.12
  • Loading branch information
mkopec87 committed Dec 16, 2024
1 parent ac79d21 commit 96483a2
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 28 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python: ['3.8', '3.9', '3.10', '3.11']
python: ['3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}

steps:
Expand Down Expand Up @@ -40,15 +40,15 @@ jobs:
strategy:
matrix:
include:
# - SPARK_VERSION: "2.4.8"
# HADOOP_VERSION: "2.7"
# JAVA_VERSION: "8"
# python: "3.7"
# os: ubuntu-latest
- SPARK_VERSION: "3.3.2"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.8"
python: "3.9"
os: ubuntu-latest
- SPARK_VERSION: "3.5.3"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.9"
os: ubuntu-latest
runs-on: ${{ matrix.os }}
name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }}
Expand Down
4 changes: 2 additions & 2 deletions docs/source/developing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
.. code-block:: bash
cd popmon/
pip install -r requirements-test.txt
python setup.py test
pip install -e ".[test]"
pytest
That's it!

Expand Down
4 changes: 1 addition & 3 deletions popmon/analysis/profiling/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,7 @@ def replace(bl):
if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
return np.nan
if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
if not np.all(
[isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
):
if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
return np.nan
# all strings from hereon
n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()
Expand Down
7 changes: 6 additions & 1 deletion popmon/analysis/profiling/pull_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,11 @@ def transform(self, datastore):
class ReferencePullCalculator(PullCalculator):
"""Pull calculation based on reference mean and standard deviations"""

@staticmethod
def mean(x, **kwargs):
    """Column-wise mean version of ``np.mean``.

    :param x: array-like input to average
    :param kwargs: (dict, optional): keyword arguments forwarded to ``np.mean``.
        ``axis`` defaults to 0 (column-wise) when it is not given explicitly.
    :return: the result of ``np.mean`` taken along the selected axis
    """
    # The constructor documents that residual kwargs are passed on to the mean
    # and std functions; accept them here so that forwarding them does not
    # raise a TypeError, as plain ``np.mean`` accepted them before.
    kwargs.setdefault("axis", 0)
    return np.mean(x, **kwargs)

def __init__(
self,
reference_key,
Expand All @@ -233,7 +238,7 @@ def __init__(
:param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
"""
super().__init__(
np.mean,
ReferencePullCalculator.mean,
np.std,
reference_key,
assign_to_key,
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ keywords = [
"ipython"
]
readme = "README.rst"
requires-python = ">=3.7"
requires-python = ">=3.9"
authors = [{name = "ING Analytics Wholesale Banking", email = "[email protected]"}]
license = {type = "MIT", file = "LICENSE"}
dependencies = [
"numpy>=1.18.0",
"pandas>=0.25.1,<2",
"pandas>=0.25.1",
"scipy>=1.5.2",
"histogrammar>=1.0.32",
"phik",
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ tqdm
plotly>=5.8.0
joblib>=0.14.0
htmlmin
pydantic
typing_extensions
pydantic>=2
pydantic-settings
typing_extensions
15 changes: 10 additions & 5 deletions tests/popmon/analysis/profiling/test_apply_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from popmon.base import Pipeline


def mean(x):
    """Column-wise mean version of ``np.mean``: averages ``x`` along axis 0."""
    return np.mean(x, axis=0)


def get_test_data():
df = pd.DataFrame()
df["a"] = np.arange(100)
Expand All @@ -25,7 +30,7 @@ def test_pull():

module1 = ApplyFunc(apply_to_key="to_profile")
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
module2.add_apply_func(
Expand Down Expand Up @@ -57,7 +62,7 @@ def func(x):
)

module.add_apply_func(np.std, entire=True)
module.add_apply_func(np.mean, entire=True)
module.add_apply_func(mean, entire=True)
module.add_apply_func(func)

datastore = module.transform(datastore)
Expand All @@ -77,7 +82,7 @@ def test_variance_comparer():
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
)
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
Expand Down Expand Up @@ -171,7 +176,7 @@ def test_apply_func():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

d = apply_func(
Expand All @@ -195,7 +200,7 @@ def test_apply_func_array():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

f, p = apply_func_array(
Expand Down
10 changes: 5 additions & 5 deletions tests/popmon/analysis/test_hist_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
import pytest
from conftest import make_mixed_dataframe

from popmon.analysis.hist_numpy import (
assert_similar_hists,
Expand Down Expand Up @@ -30,7 +31,7 @@ def get_test_histograms1():
"""Get set 1 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)
df["boolT"] = True
df["boolF"] = False
Expand All @@ -55,8 +56,7 @@ def get_test_histograms1():
def get_test_histograms2():
"""Get set 2 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()

# building 1d-, 2d-histogram (iteratively)
hist1 = hg.Categorize(unit("C"))
Expand Down Expand Up @@ -351,7 +351,7 @@ def test_check_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
Expand Down Expand Up @@ -391,7 +391,7 @@ def test_assert_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
Expand Down
12 changes: 12 additions & 0 deletions tests/popmon/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.core.indexes.datetimes import bdate_range

from popmon import resources

Expand Down Expand Up @@ -88,3 +89,14 @@ def pytest_configure():
df = pd.read_csv(resources.data(CSV_FILE))
df["date"] = pd.to_datetime(df["date"])
pytest.test_df = df


def make_mixed_dataframe() -> pd.DataFrame:
    """Return a new five-row DataFrame with columns of mixed types.

    Columns ``A`` and ``B`` are built from float lists, ``C`` from a list of
    strings, and ``D`` from five business dates starting at 2009-01-01.
    """
    business_days = bdate_range("1/1/2009", periods=5)
    columns = {
        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": business_days,
    }
    return pd.DataFrame(columns)
3 changes: 2 additions & 1 deletion tests/popmon/hist/test_histogram.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import histogrammar as hg
import numpy as np
import pandas as pd
from conftest import make_mixed_dataframe

from popmon.hist.hist_utils import (
is_numeric,
Expand All @@ -15,7 +16,7 @@


def get_test_data():
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value)
return df

Expand Down

0 comments on commit 96483a2

Please sign in to comment.