From 5b484e476d5c92ba785d82935b9ba02748173342 Mon Sep 17 00:00:00 2001
From: Stefano Zamboni <39366866+SteZamboni@users.noreply.github.com>
Date: Mon, 1 Jul 2024 16:29:48 +0200
Subject: [PATCH] fix: updated ks test with new phi value, fixed reference and current columns in class, fixed chi2 has_drift (#58)

---
 spark/jobs/utils/chi2.py           | 46 +++++++++++++++----------
 spark/jobs/utils/current_binary.py |  8 ++---
 spark/jobs/utils/ks.py             | 55 ++++++++++++++----------------
 spark/tests/binary_drift_test.py   | 12 +++----
 4 files changed, 63 insertions(+), 58 deletions(-)

diff --git a/spark/jobs/utils/chi2.py b/spark/jobs/utils/chi2.py
index 8620365f..082f9ed8 100644
--- a/spark/jobs/utils/chi2.py
+++ b/spark/jobs/utils/chi2.py
@@ -104,7 +104,9 @@ def __concatenate_columns(self) -> pyspark.sql.DataFrame:

         return concatenated_data

-    def __numeric_casting(self, concatenated_data) -> pyspark.sql.DataFrame:
+    def __numeric_casting(
+        self, concatenated_data, reference_column, current_column
+    ) -> pyspark.sql.DataFrame:
         """
         Performs numeric casting on the concatenated data.

@@ -118,18 +120,20 @@ def __numeric_casting(self, concatenated_data) -> pyspark.sql.DataFrame:
             StringIndexer(inputCol=column, outputCol=column + "_index").fit(
                 concatenated_data
             )
-            for column in [self.reference_column, self.current_column]
+            for column in [reference_column, current_column]
         ]
         pipeline = Pipeline(stages=indexers)
         return (
             pipeline.fit(concatenated_data)
             .transform(concatenated_data)
-            .drop(self.reference_column, self.current_column)
-            .withColumnRenamed(self.reference_column + "_index", self.reference_column)
-            .withColumnRenamed(self.current_column + "_index", self.current_column)
+            .drop(reference_column, current_column)
+            .withColumnRenamed(reference_column + "_index", reference_column)
+            .withColumnRenamed(current_column + "_index", current_column)
         )

-    def __current_column_to_vector(self, data) -> pyspark.sql.DataFrame:
+    def __current_column_to_vector(
+        self, data, reference_column, current_column
+    ) -> pyspark.sql.DataFrame:
         """
         Converts the current column data to a vector using VectorAssembler.

@@ -140,13 +144,15 @@ def __current_column_to_vector(self, data) -> pyspark.sql.DataFrame:
         - pyspark.sql.DataFrame: The DataFrame with the current column data converted to a vector.
         """
         vector_assembler = VectorAssembler(
-            inputCols=[self.current_column], outputCol=f"{self.current_column}_vector"
+            inputCols=[current_column], outputCol=f"{current_column}_vector"
         )
         return vector_assembler.transform(data).select(
-            self.reference_column, f"{self.current_column}_vector"
+            reference_column, f"{current_column}_vector"
         )

-    def __prepare_data_for_test(self) -> pyspark.sql.DataFrame:
+    def __prepare_data_for_test(
+        self, reference_column, current_column
+    ) -> pyspark.sql.DataFrame:
         """
         Prepares the data for the chi-square test by concatenating columns,
         performing numeric casting, and converting the current column data to a vector.
@@ -156,12 +162,16 @@ def __prepare_data_for_test(self) -> pyspark.sql.DataFrame:
         """
         concatenated_data = self.__concatenate_columns()
         numeric_concatenated_data = self.__numeric_casting(
-            concatenated_data=concatenated_data
+            concatenated_data=concatenated_data,
+            reference_column=reference_column,
+            current_column=current_column,
         )
-        vector_data = self.__current_column_to_vector(data=numeric_concatenated_data)
-        return vector_data.select(
-            self.reference_column, f"{self.current_column}_vector"
+        vector_data = self.__current_column_to_vector(
+            data=numeric_concatenated_data,
+            reference_column=reference_column,
+            current_column=current_column,
         )
+        return vector_data.select(reference_column, f"{current_column}_vector")

     def test(self, reference_column, current_column) -> Dict:
         """
@@ -186,14 +196,14 @@ def test(self, reference_column, current_column) -> Dict:
             .drop(*[current_column])
             .na.drop()
         )
-        self.reference_column = f"{reference_column}_reference"
-        self.current_column = f"{current_column}_current"
+        reference_column = f"{reference_column}_reference"
+        current_column = f"{current_column}_current"
         self.reference_size = self.reference.count()
         self.current_size = self.current.count()
         result = ChiSquareTest.test(
-            self.__prepare_data_for_test(),
-            f"{self.current_column}_vector",
-            self.reference_column,
+            self.__prepare_data_for_test(reference_column, current_column),
+            f"{current_column}_vector",
+            reference_column,
             True,
         )

diff --git a/spark/jobs/utils/current_binary.py b/spark/jobs/utils/current_binary.py
index 366c9d3f..20939c45 100644
--- a/spark/jobs/utils/current_binary.py
+++ b/spark/jobs/utils/current_binary.py
@@ -388,7 +388,7 @@ def calculate_drift(self):
                 result_tmp["pValue"]
             )
             feature_dict_to_append["drift_calc"]["has_drift"] = bool(
-                result_tmp["pValue"] >= 0.05
+                result_tmp["pValue"] <= 0.05
             )
         else:
             feature_dict_to_append["drift_calc"]["value"] = None
@@ -402,7 +402,7 @@ def calculate_drift(self):
             reference_data=self.reference,
             current_data=self.current,
             alpha=0.05,
-            beta=0.000001,
+            phi=0.004,
         )

         for column in numerical_features:
@@ -414,10 +414,10 @@ def calculate_drift(self):
             }
             result_tmp = ks.test(column, column)
             feature_dict_to_append["drift_calc"]["value"] = float(
-                result_tmp["ks_Statistic"]
+                result_tmp["ks_statistic"]
             )
             feature_dict_to_append["drift_calc"]["has_drift"] = bool(
-                result_tmp["ks_Statistic"] > result_tmp["critical_value"]
+                result_tmp["ks_statistic"] > result_tmp["critical_value"]
             )

             drift_result["feature_metrics"].append(feature_dict_to_append)
diff --git a/spark/jobs/utils/ks.py b/spark/jobs/utils/ks.py
index e6b2cc80..4cfadf80 100644
--- a/spark/jobs/utils/ks.py
+++ b/spark/jobs/utils/ks.py
@@ -9,7 +9,7 @@ class KolmogorovSmirnovTest:
     It is designed to compare two sample distributions and determine if they differ significantly.
     """

-    def __init__(self, reference_data, current_data, alpha, beta) -> None:
+    def __init__(self, reference_data, current_data, alpha, phi) -> None:
         """
         Initializes the KolmogorovSmirnovTest with the provided data and parameters.

@@ -17,11 +17,12 @@ def __init__(self, reference_data, current_data, alpha, beta) -> None:
         - reference_data (DataFrame): The reference data as a Spark DataFrame.
         - current_data (DataFrame): The current data as a Spark DataFrame.
         - alpha (float): The significance level for the hypothesis test.
+        - phi (float): ϕ defines the precision of the KS test statistic.
""" self.reference_data = reference_data self.current_data = current_data self.alpha = alpha - self.beta = beta + self.phi = phi self.reference_size = self.reference_data.count() self.current_size = self.current_data.count() @@ -38,35 +39,23 @@ def __num_probs(n, delta: float, epsilon: float) -> int: """Calculate number of probability points for approx quantiles; at most this is the number of data points. Returns: - - int_ the number of probability points + - int: the number of probability points """ a = 1 / (delta - epsilon) + 1 return min(ceil(a), n) - def __critical_value_2(self, _alpha): - """Compute the critical value for the KS test at a given alpha level.""" + def __critical_value(self, significance_level) -> float: + """Compute the critical value for the KS test at a given alpha level. + Returns: + - float: the critical value for a given alpha + """ - return np.sqrt(-0.5 * np.log(_alpha / 2)) * np.sqrt( + return np.sqrt(-0.5 * np.log(significance_level / 2)) * np.sqrt( (self.reference_size + self.current_size) / (self.reference_size * self.current_size) ) - def __compute_phi(self, alpha, beta): - """Compute the value of phi for the given parameters.""" - - D_crit = self.__critical_value_2(self.alpha) - D_crit_upper = self.__critical_value_2(alpha + beta) - D_crit_lower = self.__critical_value_2(alpha - beta) - - # Compute the absolute differences - delta_upper = abs(D_crit_upper - D_crit) - delta_lower = abs(D_crit_lower - D_crit) - - # Phi is the minimum of these differences - phi = min(delta_upper, delta_lower) - return phi - def test(self, reference_column, current_column) -> dict: """Approximates two-sample KS distance with precision phi between columns of Spark DataFrames. @@ -76,28 +65,34 @@ def test(self, reference_column, current_column) -> dict: - current_column (str): The column name in the current data. 
""" - phi = self.__compute_phi(alpha=self.alpha, beta=self.beta) - delta = phi / 2 - eps45x = self.__eps45(delta, self.reference_size) - eps45y = self.__eps45(delta, self.current_size) - ax = self.__num_probs(self.reference_size, delta, eps45x) - ay = self.__num_probs(self.current_size, delta, eps45y) + delta = self.phi / 2 + + eps45x = self.__eps45(delta=delta, n=self.reference_size) + eps45y = self.__eps45(delta=delta, n=self.current_size) + + ax = self.__num_probs(n=self.reference_size, delta=delta, epsilon=eps45x) + ay = self.__num_probs(n=self.current_size, delta=delta, epsilon=eps45y) + pxi = linspace(1 / self.reference_size, 1, ax) pyj = linspace(1 / self.current_size, 1, ay) + xi = self.reference_data.approxQuantile(reference_column, list(pxi), eps45x) yj = self.current_data.approxQuantile(current_column, list(pyj), eps45y) + f_xi = pxi f_yi = interp(xi, yj, pyj) + f_yj = pyj f_xj = interp(yj, xi, pxi) + d_i = max(abs(f_xi - f_yi)) d_j = max(abs(f_xj - f_yj)) d_ks = max(d_i, d_j) - critical_value = self.__critical_value_2(_alpha=self.alpha) + + critical_value = self.__critical_value(significance_level=self.alpha) return { "critical_value": critical_value, - "ks_Statistic": round(d_ks, 10), + "ks_statistic": round(d_ks, 10), "alpha": self.alpha, - "phi": phi, } diff --git a/spark/tests/binary_drift_test.py b/spark/tests/binary_drift_test.py index 42f94ec4..2d1f6818 100644 --- a/spark/tests/binary_drift_test.py +++ b/spark/tests/binary_drift_test.py @@ -120,7 +120,7 @@ def test_drift(spark_fixture, drift_dataset): "drift_calc": { "type": "CHI2", "value": 0.0004993992273872871, - "has_drift": False, + "has_drift": True, }, }, { @@ -128,7 +128,7 @@ def test_drift(spark_fixture, drift_dataset): "drift_calc": { "type": "CHI2", "value": 0.49015296041582523, - "has_drift": True, + "has_drift": False, }, }, { @@ -284,7 +284,7 @@ def test_drift_boolean(spark_fixture, drift_dataset_bool): "drift_calc": { "type": "CHI2", "value": 0.0012340980408668267, - "has_drift": False, + "has_drift": True, }, }, { @@ -292,7 +292,7 @@ def test_drift_boolean(spark_fixture, drift_dataset_bool): "drift_calc": { "type": "CHI2", "value": 0.002699796063260207, - "has_drift": False, + "has_drift": True, }, }, { @@ -370,7 +370,7 @@ def test_drift_bigger_file(spark_fixture, drift_dataset_bigger_file): "drift_calc": { "type": "CHI2", "value": 0.26994857272252293, - "has_drift": True, + "has_drift": False, }, }, { @@ -378,7 +378,7 @@ def test_drift_bigger_file(spark_fixture, drift_dataset_bigger_file): "drift_calc": { "type": "CHI2", "value": 0.3894236957350261, - "has_drift": True, + "has_drift": False, }, }, {