Skip to content

Commit

Permalink
feat: added CHI2 drift to float columns when cardinality < 15 (#121)
Browse files (browse the repository at this point in the history)
* feat: added CHI2 drift to float columns when cardinality < 15

* feat: fixed tests

* feat: removed print
  • Loading branch information
SteZamboni authored Jul 19, 2024
1 parent e601c2d commit 62f3852
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 65 deletions.
53 changes: 44 additions & 9 deletions spark/jobs/metrics/drift_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,53 @@ def calculate_drift(
for column in float_features:
feature_dict_to_append = {
"feature_name": column,
"drift_calc": {
"type": "KS",
},
"drift_calc": {},
}
result_tmp = ks.test(column, column)
feature_dict_to_append["drift_calc"]["value"] = float(
result_tmp["ks_statistic"]
unique_values_ref = (
reference_dataset.reference.select(column)
.distinct()
.rdd.flatMap(lambda x: x)
.collect()
)
feature_dict_to_append["drift_calc"]["has_drift"] = bool(
result_tmp["ks_statistic"] > result_tmp["critical_value"]
unique_values_cur = (
current_dataset.current.select(column)
.distinct()
.rdd.flatMap(lambda x: x)
.collect()
)
drift_result["feature_metrics"].append(feature_dict_to_append)
unique_values_refcur = unique_values_ref + unique_values_cur
lookup = set()
unique_values_tot = [
x
for x in unique_values_refcur
if x is not None and x not in lookup and lookup.add(x) is None
]
if len(unique_values_tot) < 15:
feature_dict_to_append["drift_calc"]["type"] = "CHI2"
if (
reference_dataset.reference_count > 5
and current_dataset.current_count > 5
):
result_tmp = chi2.test(column, column)
feature_dict_to_append["drift_calc"]["value"] = float(
result_tmp["pValue"]
)
feature_dict_to_append["drift_calc"]["has_drift"] = bool(
result_tmp["pValue"] <= 0.05
)
else:
feature_dict_to_append["drift_calc"]["value"] = None
feature_dict_to_append["drift_calc"]["has_drift"] = False
else:
feature_dict_to_append["drift_calc"]["type"] = "KS"
result_tmp = ks.test(column, column)
feature_dict_to_append["drift_calc"]["value"] = float(
result_tmp["ks_statistic"]
)
feature_dict_to_append["drift_calc"]["has_drift"] = bool(
result_tmp["ks_statistic"] > result_tmp["critical_value"]
)
drift_result["feature_metrics"].append(feature_dict_to_append)

int_features = [
int_f.name for int_f in reference_dataset.model.get_int_features()
Expand Down
48 changes: 0 additions & 48 deletions spark/tests/results/drift_calculator_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,6 @@
"has_drift": False,
},
},
{
"feature_name": "num1",
"drift_calc": {"type": "KS", "value": 0.9, "has_drift": True},
},
{
"feature_name": "num2",
"drift_calc": {"type": "KS", "value": 0.3, "has_drift": False},
},
]
}

Expand All @@ -37,14 +29,6 @@
"feature_name": "cat2",
"drift_calc": {"type": "CHI2", "value": None, "has_drift": False},
},
{
"feature_name": "num1",
"drift_calc": {"type": "KS", "value": 0.75, "has_drift": False},
},
{
"feature_name": "num2",
"drift_calc": {"type": "KS", "value": 0.7, "has_drift": False},
},
]
}

Expand All @@ -66,14 +50,6 @@
"has_drift": True,
},
},
{
"feature_name": "num1",
"drift_calc": {"type": "KS", "value": 0.4, "has_drift": False},
},
{
"feature_name": "num2",
"drift_calc": {"type": "KS", "value": 0.3, "has_drift": False},
},
]
}

Expand All @@ -95,31 +71,11 @@
"has_drift": False,
},
},
{
"feature_name": "num1",
"drift_calc": {
"type": "KS",
"value": 0.9230769231,
"has_drift": True,
},
},
{
"feature_name": "num2",
"drift_calc": {
"type": "KS",
"value": 0.5384615385,
"has_drift": False,
},
},
]
}

test_drift_bike_res = {
"feature_metrics": [
{
"feature_name": "weathersit",
"drift_calc": {"type": "KS", "value": 0.6219091927, "has_drift": True},
},
{
"feature_name": "temp",
"drift_calc": {"type": "KS", "value": 0.5259741552, "has_drift": True},
Expand Down Expand Up @@ -241,10 +197,6 @@
"feature_name": "primary_camera_front",
"drift_calc": {"type": "KS", "value": 0.3139650146, "has_drift": True},
},
{
"feature_name": "extended_upto",
"drift_calc": {"type": "KS", "value": 0.5237507289, "has_drift": True},
},
{
"feature_name": "price",
"drift_calc": {"type": "PSI", "value": 0.0, "has_drift": False},
Expand Down
Loading

0 comments on commit 62f3852

Please sign in to comment.