Skip to content

Commit

Permalink
📝 Add the tuto of the Little's MCAR test. And modify the test file re…
Browse files Browse the repository at this point in the history
…garding the random generator seed.
  • Loading branch information
adriencrtr committed Apr 25, 2024
1 parent d70563d commit 23aa9b4
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 20 deletions.
3 changes: 3 additions & 0 deletions docs/audit.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

Audit
===============
8 changes: 8 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
.. include:: ../README.rst

.. toctree::
:maxdepth: 2
:hidden:
:caption: AUDIT

audit
examples/tutorials/plot_tuto_mcar_test

.. toctree::
:maxdepth: 2
:hidden:
Expand Down
149 changes: 149 additions & 0 deletions examples/tutorials/plot_tuto_mcar_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""
============================================
Tutorial for testing the MCAR case
============================================
In this tutorial, we show how to use the MCAR test class and its methods.
Keep in mind that, at the moment, the MCAR tests only handle tabular data.
"""
# %%
# First import some libraries
from matplotlib import pyplot as plt
import random

import numpy as np
import pandas as pd

from qolmat.audit.holes_characterization import MCARTest

# %%
# 1. The Little's test
# ---------------------------------------------------------------
# How to use the Little's test ?
# ==============================
# When we deal with missing data in our dataset it's interesting to know the nature of these holes.
# There exist three types of holes : MCAR, MAR and MNAR.
# (see `Rubin's missing mechanism classification
# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_)
#
# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the
# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_.
# Keep in mind that the Little's test is designed to test the homogeneity of means between the
# missing patterns and won't be efficient to detect the heterogeneity of covariance between missing
# patterns.
#
# This notebook shows how the Little's test performs and its limitations.

# Seed NumPy's global random generator so that the samples drawn below are reproducible.
np.random.seed(11)

# A single tester configured with the "little" method is reused for every case of the tutorial.
mcartest = MCARTest(method="little")

# %%
# Case 1 : Normal iid feature with MCAR holes
# ===========================================

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
# Keep the complete sample: once a value is replaced by NaN it can no longer be drawn.
matrix_full = matrix.copy()
# 20 cells are hidden uniformly at random, whatever their column (MCAR).
# ``flat`` always writes through to ``matrix``, whereas ``ravel`` may return a copy.
matrix.flat[np.random.choice(matrix.size, size=20, replace=False)] = np.nan
# A point counts as masked as soon as one of its two coordinates is missing.
rows_with_holes = np.isnan(matrix).any(axis=1)
matrix_observed = matrix_full[~rows_with_holes]
matrix_masked = matrix_full[rows_with_holes]
df_1 = pd.DataFrame(matrix)

plt_1 = plt.scatter(matrix_observed[:, 0], matrix_observed[:, 1])
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 1 : MCAR missingness mechanism")
# In this case the holes can fall in either column.
plt.xlabel("x values (with missing ones)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

# Run the Little's test on the first sample; the text below reads the displayed value as a p-value.
mcartest.test(df_1)
# %%
# The p-value is quite high, therefore we don't reject H_0.
# We can then suppose that our missingness mechanism is MCAR.

# %%
# Case 2 : Normal iid feature with MAR holes
# ==========================================
# Re-seed so that this case starts from the same Gaussian sample as the other cases.
np.random.seed(11)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
# Keep the complete sample: once a value is replaced by NaN it can no longer be drawn.
matrix_full = matrix.copy()
# The second feature is hidden whenever the first one is large: the holes depend on an
# observed variable (MAR) and only the upper tail of x is affected.
is_masked = matrix_full[:, 0] > 1.96
matrix[is_masked, 1] = np.nan
matrix_observed = matrix_full[~is_masked]
matrix_masked = matrix_full[is_masked]
df_2 = pd.DataFrame(matrix)

plt_1 = plt.scatter(matrix_observed[:, 0], matrix_observed[:, 1])
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 2 : MAR missingness mechanism")
plt.xlabel("x values (all observed)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

# Run the Little's test on the second sample; the text below reads the displayed value as a p-value.
mcartest.test(df_2)
# %%
# The p-value is lower than the classic threshold (5%).
# H_0 is then rejected: the missingness mechanism is not MCAR (here it is MAR by construction).

# %%
# Case 3 : Normal iid feature with MAR holes
# ==========================================
# This specific case is designed to emphasize the limits of the Little's test. In this case, we
# generate holes when the absolute value of the first feature is high. This missingness mechanism
# is clearly MAR but the means of the missing patterns are not statistically different.

# Re-seed so that this case starts from the same Gaussian sample as the other cases.
np.random.seed(11)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
# Keep the complete sample: once a value is replaced by NaN it can no longer be drawn.
matrix_full = matrix.copy()
# The second feature is hidden in BOTH tails of the first one: the holes still depend on an
# observed variable (MAR), but the masking rule is symmetric around zero.
is_masked = np.abs(matrix_full[:, 0]) >= 1.95
matrix[is_masked, 1] = np.nan
matrix_observed = matrix_full[~is_masked]
matrix_masked = matrix_full[is_masked]
df_3 = pd.DataFrame(matrix)

plt_1 = plt.scatter(matrix_observed[:, 0], matrix_observed[:, 1])
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
plt.xlabel("x values (all observed)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

# Run the Little's test on the third sample; the text below reads the displayed value as a p-value.
mcartest.test(df_3)
# %%
# The p-value is higher than the classic threshold (5%).
# H_0 is not rejected whereas the missingness mechanism is clearly MAR.
49 changes: 29 additions & 20 deletions tests/audit/test_holes_characterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,34 @@
from qolmat.imputations.imputers import ImputerEM


np.random.seed(11)
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
matrix_1, matrix_2, matrix_3 = map(np.copy, [matrix] * 3)

# Case 1 : MCAR case detected by Little
matrix_1.ravel()[np.random.choice(matrix_1.size, size=20, replace=False)] = np.nan
df_1 = pd.DataFrame(matrix_1)

# Case 2 : MAR case detected by Little
matrix_2[np.argwhere(matrix_2[:, 0] > 1.96), 1] = np.nan
df_2 = pd.DataFrame(matrix_2)

# Case 3 : MAR case undetected by Little
matrix_3[np.argwhere(abs(matrix_3[:, 0]) >= 1.95), 1] = np.nan
df_3 = pd.DataFrame(matrix_3)


@pytest.mark.parametrize("df_input, expected", [(df_1, True), (df_2, False), (df_3, True)])
def test_little_mcar_test(df_input: pd.DataFrame, expected: bool):
@pytest.fixture
def mcar_df() -> pd.DataFrame:
    """Return a 100 x 2 standard Gaussian sample with 20 cells hidden at random.

    The generator is seeded locally, so every call yields the same frame and the
    global NumPy random state is left untouched.

    Returns:
        A DataFrame with two columns in which 20 cells, drawn without replacement
        among all 200 cells, are set to NaN.
    """
    rng = np.random.default_rng(42)
    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    hidden_cells = rng.choice(matrix.size, size=20, replace=False)
    # ``flat`` always writes through to ``matrix``; ``ravel`` is only a view for
    # contiguous arrays and would otherwise silently modify a copy.
    matrix.flat[hidden_cells] = np.nan
    return pd.DataFrame(data=matrix)


@pytest.fixture
def mar_hm_df() -> pd.DataFrame:
    """Return a 100 x 2 standard Gaussian sample with one-sided MAR holes.

    The second column is hidden on every row whose first column exceeds 1.96, so
    the holes depend on an observed variable and only affect the upper tail.
    NOTE(review): "hm" presumably stands for heterogeneity of means - confirm.

    Returns:
        A DataFrame whose column 0 is fully observed and whose column 1 is NaN
        wherever column 0 is greater than 1.96.
    """
    rng = np.random.default_rng(42)
    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    # A boolean row mask is the direct way to select the rows to hide.
    in_upper_tail = matrix[:, 0] > 1.96
    matrix[in_upper_tail, 1] = np.nan
    return pd.DataFrame(data=matrix)


@pytest.fixture
def mcar_hc_df() -> pd.DataFrame:
    """Return a 100 x 2 standard Gaussian sample with two-sided MAR holes.

    The second column is hidden on every row whose first column is at least 1.95
    in absolute value. The holes therefore depend on an observed variable (the
    mechanism is MAR despite the "mcar" prefix of the fixture name), but the
    masking rule is symmetric around zero.
    NOTE(review): "hc" presumably stands for heterogeneity of covariance - confirm.

    Returns:
        A DataFrame whose column 0 is fully observed and whose column 1 is NaN
        wherever the absolute value of column 0 is greater than or equal to 1.95.
    """
    rng = np.random.default_rng(42)
    matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    # A boolean row mask is the direct way to select the rows to hide.
    in_either_tail = np.abs(matrix[:, 0]) >= 1.95
    matrix[in_either_tail, 1] = np.nan
    return pd.DataFrame(data=matrix)


@pytest.mark.parametrize(
"df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mcar_hc_df", True)]
)
def test_little_mcar_test(
    df_input: str, expected: bool, request: "pytest.FixtureRequest"
) -> None:
    """Check the decision of the Little's test on each synthetic sample.

    Args:
        df_input: Name of the fixture that builds the DataFrame under test. It is
            a string because fixtures cannot be passed directly to ``parametrize``.
        expected: True when the test is expected NOT to reject H_0 at the 5% level.
        request: pytest's built-in fixture, used to resolve ``df_input`` by name.
    """
    mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
    # Resolve the fixture name into the actual DataFrame before running the test.
    p_value = mcar_test_little.test(request.getfixturevalue(df_input))
    assert expected == (p_value > 0.05)

0 comments on commit 23aa9b4

Please sign in to comment.