📝 Add the tuto of the Little's MCAR test. And modify the test file re…

…garding the random generator seed.
scikit-learn-contrib · Apr 25, 2024 · 23aa9b4 · 23aa9b4
1 parent d70563d
commit 23aa9b4
Show file tree

Hide file tree

Showing 4 changed files with 189 additions and 20 deletions.
diff --git a/docs/audit.rst b/docs/audit.rst
@@ -0,0 +1,3 @@
+
+Audit
+===============
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,5 +1,13 @@
 .. include:: ../README.rst
 
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+   :caption: AUDIT
+
+   audit
+   examples/tutorials/plot_tuto_mcar_test
+
 .. toctree::
    :maxdepth: 2
    :hidden:

diff --git a/examples/tutorials/plot_tuto_mcar_test.py b/examples/tutorials/plot_tuto_mcar_test.py
@@ -0,0 +1,149 @@
+"""
+============================================
+Tutorial for testing the MCAR case
+============================================
+
+In this tutorial, we show how to use the MCAR test class and its methods.
+
+Keep in mind that, at the moment, the MCAR tests only handle tabular data.
+"""
+# %%
+# First import some libraries
+from matplotlib import pyplot as plt
+import random
+
+import numpy as np
+import pandas as pd
+
+from qolmat.audit.holes_characterization import MCARTest
+
+# %%
+# 1. The Little's test
+# ---------------------------------------------------------------
+# How to use the Little's test ?
+# ==============================
+# When we deal with missing data in a dataset, it is useful to know the nature of these holes.
+# There are three types of holes: MCAR, MAR and MNAR
+# (see `Rubin's missing mechanism classification
+# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_).
+#
+# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the
+# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_.
+# Keep in mind that the Little's test is designed to test the homogeneity of means between the
+# missing patterns and is not effective at detecting heterogeneity of covariance between missing
+# patterns.
+#
+# This notebook shows how the Little's test performs and its limitations.
+
+# Seed NumPy's global random generator so that the data generated below are reproducible.
+np.random.seed(11)
+
+# Test object configured with the "little" method; it is reused for every case below.
+mcartest = MCARTest(method="little")
+
+# %%
+# Case 1 : Normal iid feature with MCAR holes
+# ===========================================
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+# Work on a copy so that the complete values remain available for plotting the masked points.
+matrix = matrix_full.copy()
+matrix.ravel()[np.random.choice(matrix.size, size=20, replace=False)] = np.nan
+# Rows holding at least one hole; their coordinates are read from the complete data.
+rows_masked = np.isnan(matrix).any(axis=1)
+matrix_masked = matrix_full[rows_masked]
+df_1 = pd.DataFrame(matrix)
+
+plt_1 = plt.scatter(matrix[~rows_masked, 0], matrix[~rows_masked, 1])
+plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 1 : MCAR missingness mechanism")
+plt.xlabel("x values (with missing ones)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_1)
+# %%
+# The p-value is quite high, therefore we don't reject H_0.
+# We can then suppose that our missingness mechanism is MCAR.
+
+# %%
+# Case 2 : Normal iid feature with MAR holes
+# ==========================================
+np.random.seed(11)
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+# Work on a copy so that the complete values remain available for plotting the masked points.
+matrix = matrix_full.copy()
+# The second feature is hidden wherever the first feature exceeds 1.96.
+rows_masked = matrix_full[:, 0] > 1.96
+matrix[rows_masked, 1] = np.nan
+matrix_masked = matrix_full[rows_masked]
+df_2 = pd.DataFrame(matrix)
+
+plt_1 = plt.scatter(matrix[~rows_masked, 0], matrix[~rows_masked, 1])
+plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 2 : MAR missingness mechanism")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_2)
+# %%
+# The p-value is lower than the classic threshold (5%).
+# H_0 is then rejected: the missingness mechanism is not MCAR (here it is MAR by construction).
+
+# %%
+# Case 3 : Normal iid feature with MAR holes
+# ==========================================
+# This specific case is designed to emphasize the limits of the Little's test. In this case, we
+# generate holes when the absolute value of the first feature is high. This missingness mechanism
+# is clearly MAR, but the means of the missing patterns are not statistically different.
+
+np.random.seed(11)
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+# Work on a copy so that the complete values remain available for plotting the masked points.
+matrix = matrix_full.copy()
+# The second feature is hidden wherever the first feature is large in absolute value.
+rows_masked = np.abs(matrix_full[:, 0]) >= 1.95
+matrix[rows_masked, 1] = np.nan
+matrix_masked = matrix_full[rows_masked]
+df_3 = pd.DataFrame(matrix)
+
+plt_1 = plt.scatter(matrix[~rows_masked, 0], matrix[~rows_masked, 1])
+plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_3)
+# %%
+# The p-value is higher than the classic threshold (5%).
+# H_0 is not rejected, even though the missingness mechanism is clearly MAR.
diff --git a/tests/audit/test_holes_characterization.py b/tests/audit/test_holes_characterization.py
@@ -6,25 +6,34 @@
 from qolmat.imputations.imputers import ImputerEM
 
 
-np.random.seed(11)
-matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
-matrix_1, matrix_2, matrix_3 = map(np.copy, [matrix] * 3)
-
-# Case 1 : MCAR case detected by Little
-matrix_1.ravel()[np.random.choice(matrix_1.size, size=20, replace=False)] = np.nan
-df_1 = pd.DataFrame(matrix_1)
-
-# Case 2 : MAR case detected by Little
-matrix_2[np.argwhere(matrix_2[:, 0] > 1.96), 1] = np.nan
-df_2 = pd.DataFrame(matrix_2)
-
-# Case 3 : MAR case undetected by Little
-matrix_3[np.argwhere(abs(matrix_3[:, 0]) >= 1.95), 1] = np.nan
-df_3 = pd.DataFrame(matrix_3)
-
-
-@pytest.mark.parametrize("df_input, expected", [(df_1, True), (df_2, False), (df_3, True)])
-def test_little_mcar_test(df_input: pd.DataFrame, expected: bool):
+@pytest.fixture
+def mcar_df() -> pd.DataFrame:
+    """Return a 100x2 standard normal sample with 20 entries set to NaN at random positions."""
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    # Positions are drawn without replacement among the flattened entries of the sample.
+    hole_positions = generator.choice(values.size, size=20, replace=False)
+    values.ravel()[hole_positions] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.fixture
+def mar_hm_df() -> pd.DataFrame:
+    """Return a 100x2 standard normal sample whose column 1 is NaN where column 0 > 1.96."""
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    # The holes in the second column depend only on the observed first column.
+    rows_with_holes = values[:, 0] > 1.96
+    values[rows_with_holes, 1] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.fixture
+def mcar_hc_df() -> pd.DataFrame:
+    """Return a 100x2 standard normal sample whose column 1 is NaN where |column 0| >= 1.95.
+
+    Despite the fixture name, the holes depend on the values of the first column.
+    """
+    generator = np.random.default_rng(42)
+    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    # Both tails of the first column trigger a hole in the second column.
+    rows_with_holes = np.abs(values[:, 0]) >= 1.95
+    values[rows_with_holes, 1] = np.nan
+    return pd.DataFrame(data=values)
+
+
+@pytest.mark.parametrize(
+    "df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mcar_hc_df", True)]
+)
+def test_little_mcar_test(df_input: str, expected: bool, request):
+    """Check whether the value returned by the Little's test is above 0.05, as expected.
+
+    ``df_input`` is the name of a fixture, resolved through ``request.getfixturevalue``;
+    ``expected`` is the expected outcome of ``result > 0.05``.
+    """
     mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
-    result = mcar_test_little.test(df_input)
+    result = mcar_test_little.test(request.getfixturevalue(df_input))
     assert expected == (result > 0.05)