diff --git a/docs/audit.rst b/docs/audit.rst new file mode 100644 index 00000000..8cdc10ec --- /dev/null +++ b/docs/audit.rst @@ -0,0 +1,3 @@ + +Audit +=============== diff --git a/docs/index.rst b/docs/index.rst index 5bfc64a5..abba8b9c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,13 @@ .. include:: ../README.rst +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: AUDIT + + audit + examples/tutorials/plot_tuto_mcar_test + .. toctree:: :maxdepth: 2 :hidden: diff --git a/examples/tutorials/plot_tuto_mcar_test.py b/examples/tutorials/plot_tuto_mcar_test.py new file mode 100644 index 00000000..2f405e16 --- /dev/null +++ b/examples/tutorials/plot_tuto_mcar_test.py @@ -0,0 +1,149 @@ +""" +============================================ +Tutorial for testing the MCAR case +============================================ + +In this tutorial, we show how to use the MCAR test class and its methods. + +Keep in mind that, at the moment, the MCAR tests only handle tabular data. +""" +# %% +# First import some libraries +from matplotlib import pyplot as plt +import random + +import numpy as np +import pandas as pd + +from qolmat.audit.holes_characterization import MCARTest + +# %% +# 1. The Little's test +# --------------------------------------------------------------- +# How to use the Little's test ? +# ============================== +# When we deal with missing data in our dataset it's interesting to know the nature of these holes. +# There exist three types of holes : MCAR, MAR and MNAR. +# (see `Rubin's missing mechanism classification +# <https://doi.org/10.1093/biomet/63.3.581>`_) +# +# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the +# `Little's test <https://doi.org/10.1080/01621459.1988.10478722>`_. +# Keep in mind that the Little's test is designed to test the homogeneity of means between the +# missing patterns and cannot reliably detect heterogeneity of covariance between missing +# patterns. +# +# This notebook shows how the Little's test performs and illustrates its limitations. 
+
+np.random.seed(11)
+
+mcartest = MCARTest(method="little")
+
+# %%
+# Case 1 : Normal iid feature with MCAR holes
+# ===========================================
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+mask = np.zeros(matrix_full.shape, dtype=bool)  # True where a value is hidden
+mask.flat[np.random.choice(mask.size, size=20, replace=False)] = True
+df_1 = pd.DataFrame(np.where(mask, np.nan, matrix_full))
+
+plt_1 = plt.scatter(*matrix_full[~mask.any(axis=1)].T)  # fully observed rows
+plt_2 = plt.scatter(*matrix_full[mask.any(axis=1)].T)  # true values of rows with a hole
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 1 : MCAR missingness mechanism")
+plt.xlabel("x values (with missing ones)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_1)
+# %%
+# The p-value is quite high, therefore we don't reject H_0.
+# We can then suppose that our missingness mechanism is MCAR.
+
+# %%
+# Case 2 : Normal iid feature with MAR holes
+# ==========================================
+np.random.seed(11)
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+# MAR mechanism: the second feature is hidden wherever the first one exceeds 1.96.
+rows_masked = matrix_full[:, 0] > 1.96
+df_2 = pd.DataFrame(matrix_full.copy())
+df_2.loc[rows_masked, 1] = np.nan
+
+plt_1 = plt.scatter(*matrix_full[~rows_masked].T)  # fully observed rows
+plt_2 = plt.scatter(*matrix_full[rows_masked].T)  # true values of rows with a hole
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 2 : MAR missingness mechanism")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_2)
+# %%
+# The p-value is lower than the classic threshold (5%).
+# H_0 is then rejected and we can suppose that our missingness mechanism is MAR.
+
+# %%
+# Case 3 : Normal iid feature MAR holes
+# =====================================
+# This specific case is designed to emphasize the limits of the Little's test. Here, we generate
+# holes when the absolute value of the first feature is high. This missingness mechanism is
+# clearly MAR but the means of the missing patterns are not statistically different.
+
+np.random.seed(11)
+
+matrix_full = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+rows_masked = np.abs(matrix_full[:, 0]) >= 1.95  # rows whose second feature is hidden
+df_3 = pd.DataFrame(matrix_full.copy())
+df_3.loc[rows_masked, 1] = np.nan
+
+plt_1 = plt.scatter(*matrix_full[~rows_masked].T)  # fully observed rows
+plt_2 = plt.scatter(*matrix_full[rows_masked].T)  # true values of rows with a hole
+
+plt.legend(
+    (plt_1, plt_2),
+    ("observed_values", "masked_values"),
+    scatterpoints=1,
+    loc="lower left",
+    ncol=1,
+    fontsize=8,
+)
+
+plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
+plt.xlabel("x values (all observed)")
+plt.ylabel("y values (with missing ones)")
+
+plt.show()
+
+# %%
+
+mcartest.test(df_3)
+# %%
+# The p-value is higher than the classic threshold (5%).
+# H_0 is not rejected whereas the missingness mechanism is clearly MAR.
diff --git a/tests/audit/test_holes_characterization.py b/tests/audit/test_holes_characterization.py
index a5ae979d..74aff3f7 100644
--- a/tests/audit/test_holes_characterization.py
+++ b/tests/audit/test_holes_characterization.py
@@ -6,25 +6,39 @@
 from qolmat.imputations.imputers import ImputerEM
 
 
-np.random.seed(11)
-matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
-matrix_1, matrix_2, matrix_3 = map(np.copy, [matrix] * 3)
-
-# Case 1 : MCAR case detected by Little
-matrix_1.ravel()[np.random.choice(matrix_1.size, size=20, replace=False)] = np.nan
-df_1 = pd.DataFrame(matrix_1)
-
-# Case 2 : MAR case detected by Little
-matrix_2[np.argwhere(matrix_2[:, 0] > 1.96), 1] = np.nan
-df_2 = pd.DataFrame(matrix_2)
-
-# Case 3 : MAR case undetected by Little
-matrix_3[np.argwhere(abs(matrix_3[:, 0]) >= 1.95), 1] = np.nan
-df_3 = pd.DataFrame(matrix_3)
-
-
-@pytest.mark.parametrize("df_input, expected", [(df_1, True), (df_2, False), (df_3, True)])
-def test_little_mcar_test(df_input: pd.DataFrame, expected: bool):
+@pytest.fixture
+def mcar_df() -> pd.DataFrame:
+    """Return a two-feature Gaussian sample with 20 uniformly drawn holes (MCAR)."""
+    rng = np.random.default_rng(42)
+    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    matrix.ravel()[rng.choice(matrix.size, size=20, replace=False)] = np.nan
+    return pd.DataFrame(data=matrix)
+
+
+@pytest.fixture
+def mar_hm_df() -> pd.DataFrame:
+    """Return a Gaussian sample whose second feature is hidden when the first is > 1.96."""
+    rng = np.random.default_rng(42)
+    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    matrix[matrix[:, 0] > 1.96, 1] = np.nan
+    return pd.DataFrame(data=matrix)
+
+
+@pytest.fixture
+def mar_hc_df() -> pd.DataFrame:
+    """Return a Gaussian sample whose second feature is hidden when |first| >= 1.95."""
+    rng = np.random.default_rng(42)
+    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
+    matrix[np.abs(matrix[:, 0]) >= 1.95, 1] = np.nan
+    return pd.DataFrame(data=matrix)
+
+
+@pytest.mark.parametrize(
+    "df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mar_hc_df", True)]
+)
+def test_little_mcar_test(df_input: str, expected: bool, request):
+    """Check that the Little's test p-value exceeds 5% exactly when ``expected`` is True."""
     mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
-    result = mcar_test_little.test(df_input)
+    # ``df_input`` is a fixture name: parametrize cannot inject fixtures, so resolve it here.
+    result = mcar_test_little.test(request.getfixturevalue(df_input))
     assert expected == (result > 0.05)