From 4e76e5de7bfe5ed45860531dfe66cdffa1582e6e Mon Sep 17 00:00:00 2001 From: bchen1116 Date: Thu, 9 Mar 2023 14:15:39 +0800 Subject: [PATCH 1/3] update file --- docs/source/release_notes.rst | 1 + .../data_checks/target_leakage_data_check.py | 5 +++++ .../test_target_leakage_data_check.py | 22 ++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 8623081feb..51a914ed3a 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,6 +9,7 @@ Release Notes * Add ``get_evalml_requirements_file`` :pr:`4034` * Pipelines with DFS Transformers will run fast permutation importance if DFS features pre-exist :pr:`4037` * Fixes + * Updated ``TargetLeakageDataCheck`` to handle boolean targets properly :pr:`` * Changes * Uncapped ``pmdarima`` and updated minimum version :pr:`4027` * Increase min catboost to 1.1.1 and xgboost to 1.7.0 to add nullable type support for those estimators :pr:`3996` diff --git a/evalml/data_checks/target_leakage_data_check.py b/evalml/data_checks/target_leakage_data_check.py index 4594cc23e2..620e896868 100644 --- a/evalml/data_checks/target_leakage_data_check.py +++ b/evalml/data_checks/target_leakage_data_check.py @@ -48,11 +48,16 @@ def _calculate_dependence(self, X, y): while target_str in list(X2.columns): target_str += "_y" X2.ww[target_str] = y + # bool_columns = X2.ww.select(["Boolean", "BooleanNullable"]).columns.values + # bool_to_int = {col: "IntegerNullable" for col in bool_columns} + # X2.ww.set_types(bool_to_int) + # print(X2.ww.types) try: dep_corr = X2.ww.dependence_dict( measures=self.method, target_col=target_str, ) + print(dep_corr) except KeyError: # keyError raised when the target does not appear due to incompatibility with the metric, return [] return [] diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py index b443fb669f..6b43aebb94 100644 --- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py +++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py @@ -188,7 +188,7 @@ def test_target_leakage_types(): ] * 6 + [datetime.strptime("2015", "%Y")] X["d"] = ~y X["e"] = np.zeros(len(y)) - y = y.astype(bool) + # y = y.astype(bool) X.ww.init(logical_types={"a": "categorical", "d": "Boolean", "b": "Boolean"}) expected = [ @@ -356,8 +356,21 @@ def test_target_leakage_data_check_warnings_pearson(): y = y.astype(bool) leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method="pearson") - # pearsons does not support boolean columns - assert leakage_check.validate(X, y) == [] + assert leakage_check.validate(X, y) == [ + DataCheckWarning( + message="Columns 'a', 'b', 'c', 'd' are 50.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"columns": ["a", "b", "c", "d"]}, + action_options=[ + DataCheckActionOption( + DataCheckActionCode.DROP_COL, + data_check_name=target_leakage_data_check_name, + metadata={"columns": ["a", "b", "c", "d"]}, + ), + ], + ).to_dict(), + ] y = y.astype(int) assert leakage_check.validate(X, y) == [ @@ -447,9 +460,6 @@ def test_target_leakage_none_measures(measures): X["b"] = y y = y.astype(bool) - if measures in ["pearson", "spearman"]: - assert leakage_check.validate(X, y) == [] - return assert len(leakage_check.validate(X, y)) From 4ed1c643a98690b86d2835b0408f5ed1ada5acc3 Mon Sep 17 00:00:00 2001 From: bchen1116 Date: Thu, 9 Mar 2023 14:17:41 +0800 Subject: [PATCH 2/3] update release notes --- docs/source/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 51a914ed3a..dffe486bb0 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,7 +9,7 @@ Release Notes * Add ``get_evalml_requirements_file`` :pr:`4034` * Pipelines with DFS Transformers will run fast permutation importance if DFS features pre-exist :pr:`4037` * Fixes - * Updated ``TargetLeakageDataCheck`` to handle boolean targets properly :pr:`` + * Updated ``TargetLeakageDataCheck`` to handle boolean targets properly :pr:`4061` * Changes * Uncapped ``pmdarima`` and updated minimum version :pr:`4027` * Increase min catboost to 1.1.1 and xgboost to 1.7.0 to add nullable type support for those estimators :pr:`3996` From 0141795106dd3fdbd53f4c409ad4667474f91ae7 Mon Sep 17 00:00:00 2001 From: bchen1116 Date: Fri, 10 Mar 2023 12:03:58 +0800 Subject: [PATCH 3/3] update test --- .../data_checks_tests/test_target_leakage_data_check.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py index 6b43aebb94..11d47d7945 100644 --- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py +++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py @@ -188,20 +188,20 @@ def test_target_leakage_types(): ] * 6 + [datetime.strptime("2015", "%Y")] X["d"] = ~y X["e"] = np.zeros(len(y)) - # y = y.astype(bool) + y = y.astype(bool) X.ww.init(logical_types={"a": "categorical", "d": "Boolean", "b": "Boolean"}) expected = [ DataCheckWarning( - message="Columns 'a', 'b' are 80.0% or more correlated with the target", + message="Columns 'a', 'b', 'c' are 80.0% or more correlated with the target", data_check_name=target_leakage_data_check_name, message_code=DataCheckMessageCode.TARGET_LEAKAGE, - details={"columns": ["a", "b"]}, + details={"columns": ["a", "b", "c"]}, action_options=[ DataCheckActionOption( DataCheckActionCode.DROP_COL, data_check_name=target_leakage_data_check_name, - metadata={"columns": ["a", "b"]}, + metadata={"columns": ["a", "b", "c"]}, ), ], ).to_dict(),