scikit-learn-contrib · adriencrtr · Apr 25, 2024 · Apr 25, 2024
diff --git a/docs/audit.rst b/docs/audit.rst
@@ -0,0 +1,3 @@
+
+Audit
+===============
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,5 +1,13 @@
 .. include:: ../README.rst
 
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: AUDIT
+
+   audit
+   examples/tutorials/plot_tuto_mcar_test
+
 .. toctree::
    :maxdepth: 2
    :hidden:

diff --git a/examples/tutorials/plot_tuto_mcar_test.py b/examples/tutorials/plot_tuto_mcar_test.py
@@ -0,0 +1,149 @@
+"""
+============================================
+Tutorial for testing the MCAR case
+============================================
+
+In this tutorial, we show how to use the MCAR test class and its methods.
+
+Keep in mind that, at this moment, the MCAR tests only handle tabular data.
+"""
+# %%
+# First import some libraries
+from matplotlib import pyplot as plt
+import random
+
+import numpy as np
+import pandas as pd
+
+from qolmat.audit.holes_characterization import MCARTest
+
+# %%
+# 1. The Little's test
+# ---------------------------------------------------------------
+# How to use the Little's test ?
+# ==============================
+# When we deal with missing data in our dataset it's interesting to know the nature of these holes.
+# There exist three types of holes : MCAR, MAR and MNAR.
+# (see the: `Rubin's missing mechanism classification
+# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_)
+#
+# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the
+# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_.
+# Keep in mind that the Little's test is designed to test the homogeneity of means between the
+# missing patterns and won't be efficient to detect the heterogeneity of covariance between missing
+# patterns.
+#
+# This notebook shows how the Little's test performs and its limitations.
+
+np.random.seed(11)
+
+mcartest = MCARTest(method="little")
+
+# %%
+# Case 1 : Normal iid feature with MCAR holes
+# ===========================================
+
+# `matrix` keeps the complete sample so that the hidden points can still be plotted; the holes
+# are only applied to the DataFrame handed to the test.
+matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+is_hole = np.zeros(matrix.shape, dtype=bool)
+is_hole.ravel()[np.random.choice(matrix.size, size=20, replace=False)] = True
+df_1 = pd.DataFrame(matrix).mask(is_hole)
+
+# A point is drawn as "masked" as soon as one of its two coordinates has been removed.
+is_row_masked = is_hole.any(axis=1)
+plt_1 = plt.scatter(matrix[~is_row_masked, 0], matrix[~is_row_masked, 1])
+plt_2 = plt.scatter(matrix[is_row_masked, 0], matrix[is_row_masked, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 1 : MCAR missingness mechanism")
+# In this case the holes are drawn over both features.
+plt.xlabel("x values (with missing ones)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_1)
+# %%
+# The p-value is quite high, therefore we don't reject H_0.
+# We can then suppose that our missingness mechanism is MCAR.
+
+# %%
+# Case 2 : Normal iid feature with MAR holes
+# ==========================================
+np.random.seed(11)
+
+# The second feature is hidden whenever the first feature, which stays fully observed, is large.
+# `matrix` keeps the complete sample so that the hidden points can still be plotted.
+matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+is_row_masked = matrix[:, 0] > 1.96
+is_hole = np.zeros(matrix.shape, dtype=bool)
+is_hole[:, 1] = is_row_masked
+df_2 = pd.DataFrame(matrix).mask(is_hole)
+
+plt_1 = plt.scatter(matrix[~is_row_masked, 0], matrix[~is_row_masked, 1])
+plt_2 = plt.scatter(matrix[is_row_masked, 0], matrix[is_row_masked, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 2 : MAR missingness mechanism")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_2)
+# %%
+# The p-value is lower than the classic threshold (5%).
+# H_0 is then rejected: the missingness mechanism is not MCAR (here it is MAR by construction).
+
+# %%
+# Case 3 : Normal iid feature with MAR holes
+# ==========================================
+# This specific case is designed to emphasize the limits of the Little's test. In this case, we
+# generate holes when the absolute value of the first feature is high. This missingness mechanism
+# is clearly MAR but the means between missing patterns are not statistically different.
+
+np.random.seed(11)
+
+# The second feature is hidden on both tails of the first feature. `matrix` keeps the complete
+# sample so that the hidden points can still be plotted.
+matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+is_row_masked = np.abs(matrix[:, 0]) >= 1.95
+is_hole = np.zeros(matrix.shape, dtype=bool)
+is_hole[:, 1] = is_row_masked
+df_3 = pd.DataFrame(matrix).mask(is_hole)
+
+plt_1 = plt.scatter(matrix[~is_row_masked, 0], matrix[~is_row_masked, 1])
+plt_2 = plt.scatter(matrix[is_row_masked, 0], matrix[is_row_masked, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_3)
+# %%
+# The p-value is higher than the classic threshold (5%).
+# H_0 is not rejected whereas the missingness mechanism is clearly MAR.
diff --git a/qolmat/audit/holes_characterization.py b/qolmat/audit/holes_characterization.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+from typing import Literal, Optional, TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2
+
+from qolmat.imputations.imputers import ImputerEM
+
+if TYPE_CHECKING:
+    from qolmat.imputations.imputers import _Imputer
+
+
+class MCARTest:
+    """
+    This class implements the statistical tests to test the MCAR case.
+
+    Parameters
+    ----------
+    method : Literal["little"]
+        The name of the statistical test. This should be handled by qolmat.
+    imputer : Optional[_Imputer], optional
+        If the selected test needs an imputer, you can provide the Imputer you want. Otherwise,
+        a default imputer will be used.
+
+    Raises
+    ------
+    ValueError
+        If `method` is not a test handled by qolmat.
+    """
+
+    def __init__(self, method: Literal["little"], imputer: Optional[_Imputer] = None):
+        if method not in ["little"]:
+            raise ValueError(f"`method` must be handled by qolmat, provided value is '{method}'")
+
+        self.method = method
+        self.imputer = imputer
+
+    def test(self, df: pd.DataFrame) -> float:
+        """
+        Apply the selected MCAR test to `df`.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Your input data with missing values.
+
+        Returns
+        -------
+        float
+            The p-value of the test.
+
+        Raises
+        ------
+        ValueError
+            If `self.method` has been set to a value that is not handled.
+        """
+        if self.method == "little":
+            return self.little_mcar_test(df)
+        # Only reachable if `method` was modified after construction; fail loudly instead of
+        # silently returning None.
+        raise ValueError(f"`method` must be handled by qolmat, provided value is '{self.method}'")
+
+    def little_mcar_test(self, df: pd.DataFrame) -> float:
+        """
+        This method implements the Little's test. Use this test to test the homogeneity of means
+        between all your missing patterns.
+        The null hypothesis is "The missing data mechanism is MCAR".
+        Be aware that this test won't detect the heterogeneity of covariance.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Your input data with missing values.
+
+        Returns
+        -------
+        float
+            The p-value of the test.
+        """
+        # Explicit None check: do not rely on the truthiness of a user-provided imputer.
+        imputer = ImputerEM() if self.imputer is None else self.imputer
+        fitted_imputer = imputer._fit_element(df)
+
+        # Mean and covariance estimates come from the fitted imputer; the covariance is rescaled
+        # by n / (n - 1).
+        n_rows, n_cols = df.shape
+        ml_means = fitted_imputer.means
+        ml_cov = n_rows / (n_rows - 1) * fitted_imputer.cov
+
+        # The degrees of freedom are the number of observed variables summed over all the
+        # patterns, minus the number of variables.
+        # NOTE(review): with a single missing pattern this is 0 and the p-value is presumably
+        # nan -- confirm whether that case should be handled explicitly.
+        d0 = 0.0
+        degree_f = -n_cols
+
+        # Iterate over the patterns (True = observed, False = missing).
+        df_nan = df.notna()
+        for tup_pattern, df_nan_pattern in df_nan.groupby(df_nan.columns.tolist()):
+            # Depending on the pandas version, grouping on a single column may yield a bare
+            # scalar key instead of a 1-tuple.
+            if not isinstance(tup_pattern, tuple):
+                tup_pattern = (tup_pattern,)
+            is_observed = np.array(tup_pattern, dtype=bool)
+
+            n_rows_pattern = len(df_nan_pattern)
+            obs_mean = df.loc[df_nan_pattern.index, is_observed].mean().to_numpy()
+
+            # Restrict the estimates to the variables observed in this pattern.
+            diff_means = obs_mean - ml_means[is_observed]
+            inv_sigma_pattern = np.linalg.inv(ml_cov[np.ix_(is_observed, is_observed)])
+
+            d0 += n_rows_pattern * (diff_means @ inv_sigma_pattern @ diff_means)
+            degree_f += int(is_observed.sum())
+
+        # Survival function rather than `1 - cdf`: same quantity, more accurate in the tail.
+        return float(chi2.sf(d0, degree_f))
diff --git a/tests/audit/test_holes_characterization.py b/tests/audit/test_holes_characterization.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from qolmat.audit.holes_characterization import MCARTest
+from qolmat.imputations.imputers import ImputerEM
+
+
+@pytest.fixture
+def mcar_df() -> pd.DataFrame:
+    """Two independent standard-normal features with 20 cells removed uniformly at random."""
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    # Flat positions of the cells to hide, drawn without replacement over the whole array.
+    flat_positions = generator.choice(values.size, size=20, replace=False)
+    values.ravel()[flat_positions] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.fixture
+def mar_hm_df() -> pd.DataFrame:
+    """Two independent standard-normal features; the second one is removed on the rows where
+    the first one is greater than 1.96."""
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    first_is_large = values[:, 0] > 1.96
+    values[first_is_large, 1] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.fixture
+def mcar_hc_df() -> pd.DataFrame:
+    """Two independent standard-normal features; the second one is removed on the rows where
+    the absolute value of the first one is at least 1.95."""
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    first_is_extreme = np.abs(values[:, 0]) >= 1.95
+    values[first_is_extreme, 1] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.mark.parametrize(
+    "df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mcar_hc_df", True)]
+)
+def test_little_mcar_test(df_input: str, expected: bool, request):
+    """Check on each fixture whether the Little's test p-value is above the 5% threshold.
+
+    `df_input` is the *name* of a fixture; the DataFrame itself is resolved through
+    `request.getfixturevalue`. `expected` is True when H_0 (MCAR) should not be rejected.
+    """
+    mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
+    result = mcar_test_little.test(request.getfixturevalue(df_input))
+    assert expected == (result > 0.05)