Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mcar test implementation #142

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/audit.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

Audit
===============
8 changes: 8 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
.. include:: ../README.rst

.. toctree::
:maxdepth: 2
:hidden:
:caption: AUDIT

audit
examples/tutorials/plot_tuto_mcar_test

.. toctree::
:maxdepth: 2
:hidden:
Expand Down
149 changes: 149 additions & 0 deletions examples/tutorials/plot_tuto_mcar_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""
============================================
Tutorial for testing the MCAR case
============================================

In this tutorial, we show how to use the mcar test classe and it methods

Keep in my mind that, at this moment, the mcar tests are only handle tabular data.
"""
# %%
# First import some libraries
from matplotlib import pyplot as plt
import random

import numpy as np
import pandas as pd

from qolmat.audit.holes_characterization import MCARTest

# %%
# 1. The Little's test
# ---------------------------------------------------------------
# How to use the Little's test ?
# ==============================
# When we deal with missing data in our dataset it's interesting to know the nature of these holes.
# There exist three types of holes : MCAR, MAR and MNAR.
# (see the: `Rubin's missing mechanism classification
# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_)
#
# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the
# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_.
# Keep in mind that the Little's test is designed to test the homogeneity of means between the
# missing patterns and won't be efficient to detect the heterogeneity of covariance between missing
# patterns.
#
# This notebook shows how the Little's test performs and its limitations.

np.random.seed(11)

mcartest = MCARTest(method="little")

# %%
# Case 1 : Normal iid feature with MCAR holes
# ===========================================

# Keep the complete sample: once a cell is replaced by NaN its value is lost, and we still
# want to draw the rows that received holes at their true position.
matrix_complete = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
matrix = matrix_complete.copy()
# 20 cells are removed uniformly at random, independently of the values they contain.
matrix.ravel()[np.random.choice(matrix.size, size=20, replace=False)] = np.nan
df_1 = pd.DataFrame(matrix)

# A row is "masked" as soon as one of its two coordinates is missing.
has_hole = np.isnan(matrix).any(axis=1)
plt_1 = plt.scatter(matrix_complete[~has_hole, 0], matrix_complete[~has_hole, 1])
plt_2 = plt.scatter(matrix_complete[has_hole, 0], matrix_complete[has_hole, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 1 : MCAR missingness mechanism")
# In this case the holes are drawn in both columns.
plt.xlabel("x values (with missing ones)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

mcartest.test(df_1)
# %%
# The p-value is quite high, therefore we don't reject H_0.
# We can then suppose that our missingness mechanism is MCAR.

# %%
# Case 2 : Normal iid feature with MAR holes
# ==========================================
np.random.seed(11)

# Keep the complete sample: once a cell is replaced by NaN its value is lost, and we still
# want to draw the rows that received holes at their true position.
matrix_complete = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
matrix = matrix_complete.copy()
# The second feature is hidden whenever the first one is larger than 1.96: the missingness
# depends only on an observed variable.
has_hole = matrix_complete[:, 0] > 1.96
matrix[has_hole, 1] = np.nan
df_2 = pd.DataFrame(matrix)

plt_1 = plt.scatter(matrix_complete[~has_hole, 0], matrix_complete[~has_hole, 1])
plt_2 = plt.scatter(matrix_complete[has_hole, 0], matrix_complete[has_hole, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 2 : MAR missingness mechanism")
plt.xlabel("x values (all observed)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

mcartest.test(df_2)
# %%
# The p-value is lower than the classic threshold (5%).
# H_0 is then rejected: the missingness mechanism is not MCAR (here it is MAR by construction).

# %%
# Case 3 : Normal iid feature MAR holes
# =====================================
# This specific case is designed to emphasize the limits of the Little's test. Here, we generate
# holes when the absolute value of the first feature is high. This missingness mechanism is
# clearly MAR but the means between missing patterns are not statistically different.

np.random.seed(11)

# Keep the complete sample: once a cell is replaced by NaN its value is lost, and we still
# want to draw the rows that received holes at their true position.
matrix_complete = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
matrix = matrix_complete.copy()
# The second feature is hidden on both tails of the first one (|x| >= 1.95).
has_hole = np.abs(matrix_complete[:, 0]) >= 1.95
matrix[has_hole, 1] = np.nan
df_3 = pd.DataFrame(matrix)

plt_1 = plt.scatter(matrix_complete[~has_hole, 0], matrix_complete[~has_hole, 1])
plt_2 = plt.scatter(matrix_complete[has_hole, 0], matrix_complete[has_hole, 1])

plt.legend(
    (plt_1, plt_2),
    ("observed_values", "masked_values"),
    scatterpoints=1,
    loc="lower left",
    ncol=1,
    fontsize=8,
)

plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
plt.xlabel("x values (all observed)")
plt.ylabel("y values (with missing ones)")

plt.show()

# %%

mcartest.test(df_3)
# %%
# The p-value is higher than the classic threshold (5%).
# H_0 is not rejected whereas the missingness mechanism is clearly MAR.
79 changes: 79 additions & 0 deletions qolmat/audit/holes_characterization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from __future__ import annotations
from typing import Literal, Optional, TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy.stats import chi2

from qolmat.imputations.imputers import ImputerEM

if TYPE_CHECKING:
from qolmat.imputations.imputers import _Imputer


class MCARTest:
    """
    This class implements the statistical tests to test the MCAR case.

    Parameters
    ----------
    method : Literal["little"]
        The name of the statistical test. This should be handled by qolmat.
    imputer : Optional[_Imputer], optional
        If the selected test needs an imputer, you can provide the imputer you want. Otherwise,
        a default imputer (``ImputerEM()``) will be used.

    Raises
    ------
    ValueError
        If ``method`` is not one of the tests handled by this class.
    """

    # Names of the tests this class knows how to dispatch to.
    _SUPPORTED_METHODS = ("little",)

    def __init__(self, method: Literal["little"], imputer: "Optional[_Imputer]" = None):
        if method not in self._SUPPORTED_METHODS:
            raise ValueError(f"`method` must be handled by qolmat, provided value is '{method}'")

        self.method = method
        self.imputer = imputer

    def test(self, df: pd.DataFrame) -> float:
        """
        Run the test selected at construction time on `df`.

        Parameters
        ----------
        df : pd.DataFrame
            Your input data with missing values.

        Returns
        -------
        float
            The p-value of the test.

        Raises
        ------
        ValueError
            If ``self.method`` was changed to an unsupported value after construction.
        """
        if self.method == "little":
            return self.little_mcar_test(df)
        # Only reachable if the attribute was mutated after __init__; fail loudly instead of
        # silently returning None where a p-value is expected.
        raise ValueError(f"`method` must be handled by qolmat, provided value is '{self.method}'")

    def little_mcar_test(self, df: pd.DataFrame) -> float:
        """
        This method implements the Little's test. Use this test to test the homogeneity of means
        between all your missing patterns.
        The null hypothesis is "The missing data mechanism is MCAR".
        Be aware that this test won't detect the heterogeneity of covariance.

        For every missing pattern (set of rows sharing the same observed columns), the observed
        mean of the pattern is compared with the means estimated by the fitted imputer,
        restricted to the observed columns. The statistic is the sum over patterns of
        ``n_pattern * diff^T . inv(cov_pattern) . diff`` and is compared to a chi-squared
        distribution whose degrees of freedom are the total number of observed columns over all
        patterns minus the number of columns of `df`.

        Parameters
        ----------
        df : pd.DataFrame
            Your input data with missing values.

        Returns
        -------
        float
            The p-value of the test.
        """
        # An explicit None check keeps a user-provided imputer even if it evaluates to False.
        imputer = self.imputer if self.imputer is not None else ImputerEM()
        fitted_imputer = imputer._fit_element(df)

        n_rows, n_cols = df.shape
        ml_means = np.asarray(fitted_imputer.means)
        # Rescale the estimated covariance by n / (n - 1).
        ml_cov = n_rows / (n_rows - 1) * np.asarray(fitted_imputer.cov)

        statistic = 0.0
        degree_f = -n_cols

        # Iterate over the missing patterns: rows are grouped by their observed/missing layout.
        df_observed = df.notna()
        for pattern, df_pattern_rows in df_observed.groupby(df_observed.columns.tolist()):
            # The group key can be a scalar when there is a single column; normalise it to a
            # 1-D boolean mask over the columns.
            observed_cols = np.atleast_1d(np.asarray(pattern, dtype=bool))
            n_observed = int(observed_cols.sum())
            if n_observed == 0:
                # Fully missing rows carry no information about the means.
                continue

            n_rows_pattern = len(df_pattern_rows)
            obs_mean = df.loc[df_pattern_rows.index, observed_cols].mean().to_numpy()
            diff_means = obs_mean - ml_means[observed_cols]
            cov_pattern = ml_cov[np.ix_(observed_cols, observed_cols)]

            # Solve the linear system rather than forming the explicit inverse.
            statistic += n_rows_pattern * (diff_means @ np.linalg.solve(cov_pattern, diff_means))
            degree_f += n_observed

        # The survival function keeps precision for small p-values, unlike 1 - cdf.
        return float(chi2.sf(statistic, degree_f))
39 changes: 39 additions & 0 deletions tests/audit/test_holes_characterization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
import pytest

from qolmat.audit.holes_characterization import MCARTest
from qolmat.imputations.imputers import ImputerEM


@pytest.fixture
def mcar_df() -> pd.DataFrame:
    """Two-column standard normal sample (100 rows) with 20 cells removed at random."""
    generator = np.random.default_rng(42)
    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    # Positions are drawn in the flattened array, so holes can land in either column.
    hole_positions = generator.choice(values.size, size=20, replace=False)
    flat_values = values.ravel()
    flat_values[hole_positions] = np.nan
    return pd.DataFrame(data=values)


@pytest.fixture
def mar_hm_df() -> pd.DataFrame:
    """Two-column standard normal sample (100 rows); column 1 is removed where column 0 > 1.96."""
    generator = np.random.default_rng(42)
    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    # Holes depend only on the (always observed) first column.
    first_is_large = values[:, 0] > 1.96
    values[first_is_large, 1] = np.nan
    return pd.DataFrame(data=values)


@pytest.fixture
def mcar_hc_df() -> pd.DataFrame:
    """Two-column standard normal sample (100 rows); column 1 is removed where |column 0| >= 1.95."""
    generator = np.random.default_rng(42)
    values = generator.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
    # Holes are placed on both tails of the first column.
    first_is_extreme = abs(values[:, 0]) >= 1.95
    values[first_is_extreme, 1] = np.nan
    return pd.DataFrame(data=values)


@pytest.mark.parametrize(
    "df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mcar_hc_df", True)]
)
def test_little_mcar_test(df_input: str, expected: bool, request) -> None:
    """
    Check whether the Little's test keeps or rejects H_0 at the 5% level on each dataset.

    Parameters
    ----------
    df_input : str
        Name of the fixture providing the dataframe; it is resolved through `request`.
    expected : bool
        True when the p-value is expected to be above 5% (H_0 not rejected).
    request
        pytest's built-in `request` fixture, used to look the dataframe fixture up by name.
    """
    significance_level = 0.05
    mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
    result = mcar_test_little.test(request.getfixturevalue(df_input))
    assert expected == (result > significance_level), f"{df_input}: p-value={result}"
Loading