Skip to content

Commit

Permalink
Make EvalML compatible with Woodwork changes (#4066)
Browse files Browse the repository at this point in the history
* update file

* update release notes

* lint

* update latest dep

* update min

* update test

* revert woodwork versions

* set ww min to 0.22.0

* empty

* merge target leakage update

* update woodwork for meta.yaml

* remove print

* lint

* Update timeseries.ipynb

Reformat doc

* update release notes

* retrigger codecov

* lint

* trigger

---------

Co-authored-by: bchen1116 <[email protected]>
  • Loading branch information
ParthivNaresh and bchen1116 authored Mar 14, 2023
1 parent 2401d30 commit 1412fc3
Show file tree
Hide file tree
Showing 14 changed files with 70 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .github/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ outputs:
- click>=8.0.0
- shap >=0.40.0
- texttable >=1.6.2
- woodwork >=0.21.1
- woodwork >=0.22.0
- featuretools>=1.16.0
- nlp-primitives>=2.9.0
- python >=3.8.*
Expand Down
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Release Notes
* Testing Changes
* Use ``release.yaml`` for performance tests on merge to main :pr:`4007`
* Pin ``github-action-check-linked-issues`` at v1.4.5 :pr:`4042`
* Updated tests to support Woodwork's object dtype inference for numeric columns :pr:`4066`
* Updated ``TargetLeakageDataCheck`` tests to handle boolean targets properly :pr:`4066`

.. warning::

Expand Down
1 change: 1 addition & 0 deletions docs/source/user_guide/timeseries.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
"outputs": [],
"source": [
"X[\"Categorical\"] = [str(i % 4) for i in range(len(X))]\n",
"X[\"Categorical\"] = X[\"Categorical\"].astype(\"category\")\n",
"X[\"Numeric\"] = [i for i in range(len(X))]\n",
"\n",
"# Re-split the data since we modified X\n",
Expand Down
3 changes: 1 addition & 2 deletions evalml/pipelines/classification_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Pipeline subclass for all classification pipelines."""
import numpy as np
import pandas as pd
import woodwork as ww

from evalml.pipelines import PipelineBase
from evalml.problem_types import is_binary, is_multiclass
Expand Down Expand Up @@ -71,7 +70,7 @@ def fit(self, X, y):

# TODO: Added this in because numpy's unique() does not support pandas.NA
try:
self._classes_ = list(ww.init_series(np.unique(y)))
self._classes_ = list(np.unique(y))
except TypeError as e:
if "boolean value of NA is ambiguous" in str(e):
self._classes_ = y.unique()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_select_by_type_empty_X():
[
lambda X, X_t: X_t.empty,
lambda X, X_t: X_t.empty,
lambda X, X_t: X_t.equals(X[["three"]].astype("int64")),
lambda X, X_t: X_t.equals(X[["one", "three"]].astype("int64")),
lambda X, X_t: X_t.astype(str).equals(X.astype(str)),
],
),
Expand Down Expand Up @@ -135,7 +135,7 @@ def test_column_transformer_transform(class_to_test, checking_functions):
SelectByType,
[
lambda X, X_t: X_t.empty,
lambda X, X_t: X_t.equals(X[["three"]].astype("int64")),
lambda X, X_t: X_t.equals(X[["one", "three"]].astype("int64")),
lambda X, X_t: X_t.astype(str).equals(X.astype(str)),
],
),
Expand Down
15 changes: 9 additions & 6 deletions evalml/tests/component_tests/test_target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,29 +121,32 @@ def test_cols():
{
"col_1": [1, 2, 1, 1, 2] * 2,
"col_2": ["2", "1", "1", "1", "1"] * 2,
"col_3": ["a", "a", "a", "a", "a"] * 2,
"col_3": ["a", "a", "a", "a", "b"] * 2,
},
)
X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"})
X_expected = X.astype({"col_1": "int64", "col_2": "int64", "col_3": "category"})
y = pd.Series([0, 1, 1, 1, 0] * 2)
encoder = TargetEncoder(cols=[])
encoder.fit(X, y)
X_t = encoder.transform(X)
assert_frame_equal(X_expected, X_t)

encoder = TargetEncoder(cols=["col_2"])
encoder = TargetEncoder(cols=["col_3"])
encoder.fit(X, y)
X_t = encoder.transform(X)
X_expected = pd.DataFrame(
{
"col_1": pd.Series([1, 2, 1, 1, 2] * 2, dtype="int64"),
"col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863] * 2,
"col_3": pd.Series(["a", "a", "a", "a", "a"] * 2, dtype="category"),
"col_2": [2, 1, 1, 1, 1] * 2,
"col_3": pd.Series(
[0.749863, 0.749863, 0.749863, 0.749863, 0.161365] * 2,
dtype="float64",
),
},
)
assert_frame_equal(X_expected, X_t, check_less_precise=True)

encoder = TargetEncoder(cols=["col_2", "col_3"])
encoder = TargetEncoder(cols=["col_3"])
encoder.fit(X, y)
X_t = encoder.transform(X)
encoder2 = TargetEncoder()
Expand Down
18 changes: 12 additions & 6 deletions evalml/tests/data_checks_tests/test_id_columns_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,15 @@ def test_id_columns_strings():
id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
assert id_cols_check.validate(X) == [
DataCheckWarning(
message="Columns 'Id' are 100.0% or more likely to be an ID column",
message="Columns 'Id', 'col_3_id' are 100.0% or more likely to be an ID column",
data_check_name=id_data_check_name,
message_code=DataCheckMessageCode.HAS_ID_COLUMN,
details={"columns": ["Id"]},
details={"columns": ["Id", "col_3_id"]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.DROP_COL,
data_check_name=id_data_check_name,
metadata={"columns": ["Id"]},
metadata={"columns": ["Id", "col_3_id"]},
),
],
).to_dict(),
Expand Down Expand Up @@ -293,17 +293,23 @@ def test_unidentified_first_col_primary_key(
)

id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
if input_type == "string":
order = ["col_2", "col_3_id", "col_1_id"]
else:
order = ["col_2", "col_1_id", "col_3_id"]
order_msg = f"Columns '{order[0]}', '{order[1]}', '{order[2]}' are 95.0% or more likely to be an ID column"

assert id_cols_check.validate(X) == [
DataCheckWarning(
message="Columns 'col_2', 'col_1_id', 'col_3_id' are 95.0% or more likely to be an ID column",
message=order_msg,
data_check_name=id_data_check_name,
message_code=DataCheckMessageCode.HAS_ID_COLUMN,
details={"columns": ["col_2", "col_1_id", "col_3_id"]},
details={"columns": order},
action_options=[
DataCheckActionOption(
DataCheckActionCode.DROP_COL,
data_check_name=id_data_check_name,
metadata={"columns": ["col_2", "col_1_id", "col_3_id"]},
metadata={"columns": order},
),
],
).to_dict(),
Expand Down
26 changes: 18 additions & 8 deletions evalml/tests/data_checks_tests/test_target_leakage_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,15 @@ def test_target_leakage_types():

expected = [
DataCheckWarning(
message="Columns 'a', 'b' are 80.0% or more correlated with the target",
message="Columns 'a', 'b', 'c' are 80.0% or more correlated with the target",
data_check_name=target_leakage_data_check_name,
message_code=DataCheckMessageCode.TARGET_LEAKAGE,
details={"columns": ["a", "b"]},
details={"columns": ["a", "b", "c"]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.DROP_COL,
data_check_name=target_leakage_data_check_name,
metadata={"columns": ["a", "b"]},
metadata={"columns": ["a", "b", "c"]},
),
],
).to_dict(),
Expand Down Expand Up @@ -356,8 +356,21 @@ def test_target_leakage_data_check_warnings_pearson():
y = y.astype(bool)

leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method="pearson")
# pearsons does not support boolean columns
assert leakage_check.validate(X, y) == []
assert leakage_check.validate(X, y) == [
DataCheckWarning(
message="Columns 'a', 'b', 'c', 'd' are 50.0% or more correlated with the target",
data_check_name=target_leakage_data_check_name,
message_code=DataCheckMessageCode.TARGET_LEAKAGE,
details={"columns": ["a", "b", "c", "d"]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.DROP_COL,
data_check_name=target_leakage_data_check_name,
metadata={"columns": ["a", "b", "c", "d"]},
),
],
).to_dict(),
]

y = y.astype(int)
assert leakage_check.validate(X, y) == [
Expand Down Expand Up @@ -447,9 +460,6 @@ def test_target_leakage_none_measures(measures):
X["b"] = y
y = y.astype(bool)

if measures in ["pearson", "spearman"]:
assert leakage_check.validate(X, y) == []
return
assert len(leakage_check.validate(X, y))


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ statsmodels==0.13.5
texttable==1.6.7
tomli==2.0.1
vowpalwabbit==9.7.0
woodwork==0.21.2
woodwork==0.22.0
xgboost==1.7.4
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@ statsmodels==0.12.2
texttable==1.6.2
tomli==2.0.1
vowpalwabbit==8.11.0
woodwork==0.21.1
woodwork==0.22.0
xgboost==1.7.0
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ statsmodels==0.12.2
texttable==1.6.2
tomli==2.0.1
vowpalwabbit==8.11.0
woodwork==0.21.1
woodwork==0.22.0
xgboost==1.7.0
Original file line number Diff line number Diff line change
Expand Up @@ -1132,11 +1132,11 @@ def test_json_serialization(
pipeline = linear_regression_pipeline
elif problem_type == problem_type.BINARY:
X, y = X_y_binary
y = pd.Series(y).astype("str")
y = pd.Series(y).astype("string")
pipeline = logistic_regression_binary_pipeline
else:
X, y = X_y_multi
y = pd.Series(y).astype("str")
y = pd.Series(y).astype("string")
pipeline = logistic_regression_multiclass_pipeline

pipeline.fit(X, y)
Expand All @@ -1148,6 +1148,7 @@ def test_json_serialization(
num_to_explain=1,
output_format="dict",
)

assert json.loads(json.dumps(best_worst)) == best_worst

report = explain_predictions(
Expand Down
23 changes: 17 additions & 6 deletions evalml/tests/model_understanding_tests/test_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1425,10 +1425,14 @@ def test_graph_partial_dependence_regression_and_binary_categorical(

X = pd.DataFrame(X)
X.columns = [str(i) for i in range(X.shape[1])]
X["categorical_column"] = pd.Series([i % 3 for i in range(X.shape[0])]).astype(
X["categorical_column"] = pd.Series(
[f"cat_{i % 3}" for i in range(X.shape[0])],
).astype(
"str",
)
X["categorical_column_2"] = pd.Series([i % 6 for i in range(X.shape[0])]).astype(
X["categorical_column_2"] = pd.Series(
[f"cat_{i % 6}" for i in range(X.shape[0])],
).astype(
"str",
)

Expand All @@ -1442,7 +1446,7 @@ def test_graph_partial_dependence_regression_and_binary_categorical(
)
plot_data = fig.to_dict()["data"][0]
assert plot_data["type"] == "bar"
assert list(plot_data["x"]) == ["0", "1", "2"]
assert list(plot_data["x"]) == ["cat_0", "cat_1", "cat_2"]

fig = graph_partial_dependence(
pipeline,
Expand All @@ -1453,7 +1457,7 @@ def test_graph_partial_dependence_regression_and_binary_categorical(
fig_dict = fig.to_dict()
plot_data = fig_dict["data"][0]
assert plot_data["type"] == "contour"
assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2"]
assert fig_dict["layout"]["yaxis"]["ticktext"] == ["cat_0", "cat_1", "cat_2"]
assert (
fig_dict["layout"]["title"]["text"]
== "Partial Dependence of 'categorical_column' vs. '0'"
Expand All @@ -1468,8 +1472,15 @@ def test_graph_partial_dependence_regression_and_binary_categorical(
fig_dict = fig.to_dict()
plot_data = fig_dict["data"][0]
assert plot_data["type"] == "contour"
assert fig_dict["layout"]["xaxis"]["ticktext"] == ["0", "1", "2"]
assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2", "3", "4", "5"]
assert fig_dict["layout"]["xaxis"]["ticktext"] == ["cat_0", "cat_1", "cat_2"]
assert fig_dict["layout"]["yaxis"]["ticktext"] == [
"cat_0",
"cat_1",
"cat_2",
"cat_3",
"cat_4",
"cat_5",
]
assert (
fig_dict["layout"]["title"]["text"]
== "Partial Dependence of 'categorical_column_2' vs. 'categorical_column'"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies = [
"shap >= 0.40.0",
"statsmodels >= 0.12.2",
"texttable >= 1.6.2",
"woodwork >= 0.21.1",
"woodwork >= 0.22.0",
"dask >= 2022.2.0, != 2022.10.1",
"featuretools >= 1.16.0",
"nlp-primitives >= 2.9.0",
Expand Down

0 comments on commit 1412fc3

Please sign in to comment.