Skip to content

Commit

Permalink
Added parameters to configure dataset drift calculation and bins customization with column_mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
emeli-dral committed Sep 6, 2021
1 parent 2b3a753 commit b4c5c28
Show file tree
Hide file tree
Showing 5 changed files with 507 additions and 24 deletions.
2 changes: 1 addition & 1 deletion evidently/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# coding: utf-8

version_info = (0, 1, 22, 'dev0')
version_info = (0, 1, 23, 'dev0')
__version__ = ".".join(map(str, version_info))
49 changes: 42 additions & 7 deletions evidently/analyzers/data_drift_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
if alternative == 'greater':
return 1 - norm.cdf(z_stat)

def dataset_drift_evaluation(p_values, confidence, drift_share):
    """Evaluate dataset-level drift from per-feature drift-test p-values.

    A feature counts as drifted when its p-value is below the significance
    level ``1 - confidence``.  The whole dataset is flagged as drifted when
    the share of drifted features reaches ``drift_share``.

    Args:
        p_values: iterable of per-feature p-values (one per tested feature).
        confidence: confidence level, e.g. 0.95 means significance 0.05.
        drift_share: minimal share of drifted features that marks the whole
            dataset as drifted.

    Returns:
        Tuple ``(n_drifted_features, share_drifted_features, dataset_drift)``
        where the last element is a bool.
    """
    p_values = list(p_values)
    # Guard against an empty feature list: the original expression would
    # raise ZeroDivisionError; report "no drift" instead.
    if not p_values:
        return (0, 0.0, False)
    significance = 1.0 - confidence
    n_drifted_features = sum(1 for p in p_values if p < significance)
    share_drifted_features = n_drifted_features / len(p_values)
    dataset_drift = share_drifted_features >= drift_share
    return (n_drifted_features, share_drifted_features, dataset_drift)

class DataDriftAnalyzer(Analyzer):
def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
Expand All @@ -42,6 +47,10 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
target_column = column_mapping.get('target')
prediction_column = column_mapping.get('prediction')
num_feature_names = column_mapping.get('numerical_features')
confidence = column_mapping.get('drift_conf_level')
drift_share = column_mapping.get('drift_features_share')
nbinsx = column_mapping.get('nbinsx')
xbins = column_mapping.get('xbins')
if num_feature_names is None:
num_feature_names = []
else:
Expand All @@ -62,21 +71,35 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co

num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns))
confidence = 0.95
drift_share = 0.5
nbinsx = None
xbins = None

result["utility_columns"] = {'date':date_column, 'id':id_column, 'target':target_column, 'prediction':prediction_column}
result["utility_columns"] = {'date':date_column, 'id':id_column, 'target':target_column, 'prediction':prediction_column,
'drift_conf_level':confidence, 'drift_features_share':drift_share, 'nbinsx':nbinsx, 'xbins':xbins}
result["cat_feature_names"] = cat_feature_names
result["num_feature_names"] = num_feature_names

#calculate result
result['metrics'] = {}

p_values = []

for feature_name in num_feature_names:
p_value=ks_2samp(reference_data[feature_name], current_data[feature_name])[1]
p_values.append(p_value)
if nbinsx:
current_nbinsx = nbinsx.get(feature_name) if nbinsx.get(feature_name) else 10
else:
current_nbinsx = 10
result['metrics'][feature_name] = dict(
current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],
bins=10, density=True)],
bins=current_nbinsx, density=True)],
ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])],
bins=10, density=True)],
bins=current_nbinsx, density=True)],
feature_type='num',
p_value=ks_2samp(reference_data[feature_name], current_data[feature_name])[1]
p_value=p_value
)

for feature_name in cat_feature_names:
Expand All @@ -103,13 +126,25 @@ def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, co
ordered_keys = sorted(list(keys))
p_value = proportions_diff_z_test(proportions_diff_z_stat_ind(reference_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1),
current_data[feature_name].apply(lambda x : 0 if x == ordered_keys[0] else 1)))

p_values.append(p_value)

if nbinsx:
current_nbinsx = nbinsx.get(feature_name) if nbinsx.get(feature_name) else 10
else:
current_nbinsx = 10
result['metrics'][feature_name] = dict(
current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],
bins=10, density=True)],
bins=current_nbinsx, density=True)],
ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])],
bins=10, density=True)],
bins=current_nbinsx, density=True)],
feature_type='cat',
p_value=p_value,
p_value=p_value
)

n_drifted_features, share_drifted_features, dataset_drift = dataset_drift_evaluation(p_values, confidence, drift_share)
result['metrics']['n_features'] = len(num_feature_names) + len(cat_feature_names)
result['metrics']['n_drifted_features'] = n_drifted_features
result['metrics']['share_drifted_features'] = share_drifted_features
result['metrics']['dataset_drift'] = dataset_drift
return result
419 changes: 419 additions & 0 deletions evidently/examples/boston_dataset_drift_with_customized_bins.ipynb

Large diffs are not rendered by default.

59 changes: 44 additions & 15 deletions evidently/widgets/data_drift_table_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ def calculate(self,
ref_small_hist = results['metrics'][feature_name]["ref_small_hist"]
feature_type = results['metrics'][feature_name]["feature_type"]

confidence = results['utility_columns']['drift_conf_level']
p_value = results['metrics'][feature_name]["p_value"]

distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
drifted_fetures_count += 1 if p_value < 0.05 else 0
distr_sim_test = "Detected" if p_value < (1. - confidence) else "Not Detected"
#drifted_fetures_count += 1 if p_value < 0.05 else 0

params_data.append(
{
Expand Down Expand Up @@ -92,8 +93,7 @@ def calculate(self,

p_value = results['metrics'][feature_name]["p_value"]

distr_sim_test = "Detected" if p_value < 0.05 else "Not Detected"
drifted_fetures_count += 1 if p_value < 0.05 else 0
distr_sim_test = "Detected" if p_value < (1. - confidence) else "Not Detected"

params_data.append(
{
Expand Down Expand Up @@ -128,16 +128,40 @@ def calculate(self,

# set additionalGraphs
additional_graphs_data = []
xbins = results['utility_columns']['xbins']
nbinsx = results['utility_columns']['nbinsx']
for feature_name in num_feature_names + cat_feature_names:
# plot distributions
fig = go.Figure()
fig.add_trace(go.Histogram(x=reference_data[feature_name],
marker_color=grey, opacity=0.6, nbinsx=10, name='Reference',
histnorm='probability'))

fig.add_trace(go.Histogram(x=current_data[feature_name],
marker_color=red, opacity=0.6, nbinsx=10, name='Current',
histnorm='probability'))
fig = go.Figure()
if xbins:
current_xbins = results['utility_columns']['xbins'].get(feature_name)
if current_xbins:
current_nbinsx = None
else:
if nbinsx:
current_nbinsx = results['utility_columns']['nbinsx'].get(feature_name)
current_nbinsx = current_nbinsx if current_nbinsx else 10
else:
current_nbinsx = 10
fig.add_trace(go.Histogram(x=reference_data[feature_name],
marker_color=grey, opacity=0.6, xbins=current_xbins, nbinsx=current_nbinsx, name='Reference',
histnorm='probability'))

fig.add_trace(go.Histogram(x=current_data[feature_name],
marker_color=red, opacity=0.6, xbins=current_xbins, nbinsx=current_nbinsx, name='Current',
histnorm='probability'))
else:
current_nbinsx = None
if nbinsx:
current_nbinsx = results['utility_columns']['nbinsx'].get(feature_name)
current_nbinsx = current_nbinsx if current_nbinsx else 10
fig.add_trace(go.Histogram(x=reference_data[feature_name],
marker_color=grey, opacity=0.6, nbinsx=current_nbinsx, name='Reference',
histnorm='probability'))

fig.add_trace(go.Histogram(x=current_data[feature_name],
marker_color=red, opacity=0.6, nbinsx=current_nbinsx, name='Current',
histnorm='probability'))

fig.update_layout(
legend=dict(
Expand Down Expand Up @@ -246,9 +270,14 @@ def calculate(self,
)
)

n_features = len(num_feature_names) + len(cat_feature_names)
drift_share = round(100.*results['metrics']['share_drifted_features'], 1)

title_prefix = 'Drift is detected for ' + str(drift_share) + '% of features (' + str(results['metrics']['n_drifted_features']) + ' out of ' + str(n_features) + '). '
title_suffix = 'Dataset Drift is detected.' if results['metrics']['dataset_drift'] else 'Dataset Drift is NOT detected.'

self.wi = BaseWidgetInfo(
title="Data Drift: drift detected for " + str(drifted_fetures_count) +
" out of " + str(len(num_feature_names) + len(cat_feature_names)) + " features",
title=title_prefix + title_suffix,
type="big_table",
details="",
alertStats=AlertStats(),
Expand All @@ -257,7 +286,7 @@ def calculate(self,
insights=[],
size=2,
params={
"rowsPerPage": min(len(num_feature_names) + len(cat_feature_names), 10),
"rowsPerPage": min(n_features, 10),
"columns": [
{
"title": "Feature",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"numpy",
"statsmodels",
"plotly",
"scipy",
"scipy<=1.6.3",
"pyyaml",
"scikit-learn>=0.22.1",
"requests"
Expand Down

0 comments on commit b4c5c28

Please sign in to comment.