Updated statistical test for categorical features

evidentlyai · Jul 20, 2021 · 7893fc4 · 7893fc4
1 parent c77f072
commit 7893fc4
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 15 deletions.
diff --git a/evidently/_version.py b/evidently/_version.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-version_info = (0, 1, 19, 'dev0')
+version_info = (0, 1, 20, 'dev0')
 __version__ = ".".join(map(str, version_info))
diff --git a/evidently/analyzers/cat_target_drift_analyzer.py b/evidently/analyzers/cat_target_drift_analyzer.py
@@ -6,8 +6,31 @@
 from pandas.api.types import is_numeric_dtype
 import numpy as np
 
-from scipy.stats import ks_2samp, chisquare
-
+from scipy.stats import ks_2samp, chisquare, norm
+
+def proportions_diff_z_stat_ind(ref, curr):
+    """Return the pooled two-proportion z statistic for two 0/1 samples."""
+    n1, n2 = len(ref), len(curr)
+    p1 = float(sum(ref)) / n1
+    p2 = float(sum(curr)) / n2
+    pooled = (p1 * n1 + p2 * n2) / (n1 + n2)  # share of ones in both samples
+    std_err = np.sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))
+    # Zero spread means both samples are constant and equal: no difference.
+    return (p1 - p2) / std_err if std_err > 0 else 0.0
+
+def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
+    """Return the normal p-value of z_stat; raise ValueError on a bad alternative."""
+    if alternative not in ('two-sided', 'less', 'greater'):
+        raise ValueError("alternative not recognized\n"
+                         "should be 'two-sided', 'less' or 'greater'")
+    # norm.sf keeps tail precision where 1 - norm.cdf would round to 0.
+    if alternative == 'two-sided':
+        return 2 * norm.sf(np.abs(z_stat))
+
+    if alternative == 'less':
+        return norm.cdf(z_stat)
+
+    return norm.sf(z_stat)  # only 'greater' remains after validation
 
 class CatTargetDriftAnalyzer(Analyzer):
     def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
@@ -66,10 +89,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
             for key, item in zip(current_feature_vc.index, current_feature_vc.values):
                 current_feature_dict[key] = item
 
-            f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
+            if len(keys) > 2:
+                f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
+                f_obs = [value[1] for value in sorted(current_feature_dict.items())]
+                target_p_value = chisquare(f_exp, f_obs)[1]
+            else:
+                ordered_keys = sorted(list(keys))
+                target_p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[target_column].apply(lambda x : 0 if x == ordered_keys[0] else 1), 
+                    current_data[target_column].apply(lambda x : 0 if x == ordered_keys[0] else 1)))
 
-            target_p_value = chisquare(f_exp, f_obs)[1]
             result['metrics']["target_name"] = target_column
             result['metrics']["target_type"] = 'cat'
             result['metrics']["target_drift"] = target_p_value
@@ -97,10 +125,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
             for key, item in zip(current_feature_vc.index, current_feature_vc.values):
                 current_feature_dict[key] = item
 
-            f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
+            if len(keys) > 2:
+                f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
+                f_obs = [value[1] for value in sorted(current_feature_dict.items())]
+                pred_p_value = chisquare(f_exp, f_obs)[1]
+            else:
+                ordered_keys = sorted(list(keys))
+                pred_p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[prediction_column].apply(lambda x : 0 if x == ordered_keys[0] else 1), 
+                    current_data[prediction_column].apply(lambda x : 0 if x == ordered_keys[0] else 1)))
 
-            pred_p_value = chisquare(f_exp, f_obs)[1]
             result['metrics']["prediction_name"] = prediction_column
             result['metrics']["prediction_type"] = 'cat'
             result['metrics']["prediction_drift"] = pred_p_value

diff --git a/evidently/analyzers/data_drift_analyzer.py b/evidently/analyzers/data_drift_analyzer.py
@@ -6,7 +6,31 @@
 from pandas.api.types import is_numeric_dtype
 import numpy as np
 
-from scipy.stats import ks_2samp, chisquare
+from scipy.stats import ks_2samp, chisquare, norm
+
+def proportions_diff_z_stat_ind(ref, curr):
+    """Return the pooled two-proportion z statistic for two 0/1 samples."""
+    n1, n2 = len(ref), len(curr)
+    p1 = float(sum(ref)) / n1
+    p2 = float(sum(curr)) / n2
+    pooled = (p1 * n1 + p2 * n2) / (n1 + n2)  # share of ones in both samples
+    std_err = np.sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))
+    # Zero spread means both samples are constant and equal: no difference.
+    return (p1 - p2) / std_err if std_err > 0 else 0.0
+
+def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
+    """Return the normal p-value of z_stat; raise ValueError on a bad alternative."""
+    if alternative not in ('two-sided', 'less', 'greater'):
+        raise ValueError("alternative not recognized\n"
+                         "should be 'two-sided', 'less' or 'greater'")
+    # norm.sf keeps tail precision where 1 - norm.cdf would round to 0.
+    if alternative == 'two-sided':
+        return 2 * norm.sf(np.abs(z_stat))
+
+    if alternative == 'less':
+        return norm.cdf(z_stat)
+
+    return norm.sf(z_stat)  # only 'greater' remains after validation
 
 
 class DataDriftAnalyzer(Analyzer):
@@ -70,11 +94,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
             for key, item in zip(current_feature_vc.index, current_feature_vc.values):
                 current_feature_dict[key] = item
 
-            f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
-
-            # CHI2 to be implemented for cases with different categories
-            p_value = chisquare(f_exp, f_obs)[1]
+            if len(keys) > 2:
+                f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
+                f_obs = [value[1] for value in sorted(current_feature_dict.items())]
+                # CHI2 to be implemented for cases with different categories
+                p_value = chisquare(f_exp, f_obs)[1]
+            else:
+                ordered_keys = sorted(list(keys))
+                p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1), 
+                    current_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1)))
 
             result['metrics'][feature_name] = dict(
                 current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],