Skip to content

Commit

Permalink
Updated statistical test for categorical features
Browse files Browse the repository at this point in the history
  • Loading branch information
emeli-dral committed Jul 20, 2021
1 parent c77f072 commit 7893fc4
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 15 deletions.
2 changes: 1 addition & 1 deletion evidently/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# coding: utf-8

version_info = (0, 1, 19, 'dev0')
version_info = (0, 1, 20, 'dev0')
__version__ = ".".join(map(str, version_info))
49 changes: 41 additions & 8 deletions evidently/analyzers/cat_target_drift_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,31 @@
from pandas.api.types import is_numeric_dtype
import numpy as np

from scipy.stats import ks_2samp, chisquare

from scipy.stats import ks_2samp, chisquare, norm

def proportions_diff_z_stat_ind(ref, curr):
n1 = len(ref)
n2 = len(curr)

p1 = float(sum(ref)) / n1
p2 = float(sum(curr)) / n2
P = float(p1*n1 + p2*n2) / (n1 + n2)

return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
if alternative not in ('two-sided', 'less', 'greater'):
raise ValueError("alternative not recognized\n"
"should be 'two-sided', 'less' or 'greater'")

if alternative == 'two-sided':
return 2 * (1 - norm.cdf(np.abs(z_stat)))

if alternative == 'less':
return norm.cdf(z_stat)

if alternative == 'greater':
return 1 - norm.cdf(z_stat)

class CatTargetDriftAnalyzer(Analyzer):
def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
Expand Down Expand Up @@ -66,10 +89,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
for key, item in zip(current_feature_vc.index, current_feature_vc.values):
current_feature_dict[key] = item

f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]
if len(keys) > 2:
f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]
target_p_value = chisquare(f_exp, f_obs)[1]
else:
ordered_keys = sorted(list(keys))
target_p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[target_column].apply(lambda x : 0 if x == ordered_keys[0] else 1),
current_data[target_column].apply(lambda x : 0 if x == ordered_keys[0] else 1)))

target_p_value = chisquare(f_exp, f_obs)[1]
result['metrics']["target_name"] = target_column
result['metrics']["target_type"] = 'cat'
result['metrics']["target_drift"] = target_p_value
Expand Down Expand Up @@ -97,10 +125,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
for key, item in zip(current_feature_vc.index, current_feature_vc.values):
current_feature_dict[key] = item

f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]
if len(keys) > 2:
f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]
pred_p_value = chisquare(f_exp, f_obs)[1]
else:
ordered_keys = sorted(list(keys))
pred_p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[prediction_column].apply(lambda x : 0 if x == ordered_keys[0] else 1),
current_data[prediction_column].apply(lambda x : 0 if x == ordered_keys[0] else 1)))

pred_p_value = chisquare(f_exp, f_obs)[1]
result['metrics']["prediction_name"] = prediction_column
result['metrics']["prediction_type"] = 'cat'
result['metrics']["prediction_drift"] = pred_p_value
Expand Down
40 changes: 34 additions & 6 deletions evidently/analyzers/data_drift_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,31 @@
from pandas.api.types import is_numeric_dtype
import numpy as np

from scipy.stats import ks_2samp, chisquare
from scipy.stats import ks_2samp, chisquare, norm

def proportions_diff_z_stat_ind(ref, curr):
n1 = len(ref)
n2 = len(curr)

p1 = float(sum(ref)) / n1
p2 = float(sum(curr)) / n2
P = float(p1*n1 + p2*n2) / (n1 + n2)

return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
if alternative not in ('two-sided', 'less', 'greater'):
raise ValueError("alternative not recognized\n"
"should be 'two-sided', 'less' or 'greater'")

if alternative == 'two-sided':
return 2 * (1 - norm.cdf(np.abs(z_stat)))

if alternative == 'less':
return norm.cdf(z_stat)

if alternative == 'greater':
return 1 - norm.cdf(z_stat)


class DataDriftAnalyzer(Analyzer):
Expand Down Expand Up @@ -70,11 +94,15 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
for key, item in zip(current_feature_vc.index, current_feature_vc.values):
current_feature_dict[key] = item

f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]

# CHI2 to be implemented for cases with different categories
p_value = chisquare(f_exp, f_obs)[1]
if len(keys) > 2:
f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
f_obs = [value[1] for value in sorted(current_feature_dict.items())]
# CHI2 to be implemented for cases with different categories
p_value = chisquare(f_exp, f_obs)[1]
else:
ordered_keys = sorted(list(keys))
p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1),
current_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1)))

result['metrics'][feature_name] = dict(
current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],
Expand Down

0 comments on commit 7893fc4

Please sign in to comment.