Skip to content

Commit

Permalink
Random state management and tests added to hole_generator
Browse files Browse the repository at this point in the history
  • Loading branch information
Julien Roussel authored and Julien Roussel committed Jun 12, 2024
1 parent 67d44cf commit 93ddbbe
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 39 deletions.
67 changes: 34 additions & 33 deletions examples/tutorials/plot_tuto_mcar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
============================================
Tutorial for testing the MCAR case
Tutorial for Testing the MCAR Case
============================================
In this tutorial, we show how to test the MCAR case using Little's test.
Expand All @@ -19,65 +19,66 @@

plt.rcParams.update({"font.size": 12})

rng = np.random.RandomState(42)

# %%
# 1. Little's test
# ---------------------------------------------------------------
# First, we need to introduce the concept of missing pattern. A missing pattern, also called
# pattern, is the structure of observed and missing values in a data set. For example, for a
# dataset with 2 columns, the possible patterns are : (0, 0), (1, 0), (0, 1), (1, 1). The value 1
# (0) indicates that the value in the column is missing (observed).
#
# The null hypothesis, H0, is : "The means of observations within each pattern are similar.".
# Against the alternative hypothesis, H1 : "The means of the observed variables can vary across the
# patterns."
# First, we need to introduce the concept of a missing pattern. A missing pattern, also called a
# pattern, is the structure of observed and missing values in a dataset. For example, in a
# dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1). The value 1
# (0) indicates that the column value is missing (observed).
#
# If H0 is not rejected , we can assume that the missing data mechanism is MCAR. On the contrary,
# if H0 is rejected, we can assume that the missing data mechanism is MAR.
# The null hypothesis, H0, is: "The means of observations within each pattern are similar."
#
# We choose to use the classic threshold, equal to 5%. If the test p_value is below this threshold,
# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
# we reject the null hypothesis.
#
# This notebook shows how Little's test performs and what its limitations are.

mcartest = LittleTest()
test_mcar = LittleTest(random_state=rng)

# %%
# Case 1 : Normal iid features with MCAR holes
# Case 1: Normal iid features with MCAR holes
# ============================================

np.random.seed(42)
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"])

hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2)
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"])

hole_gen = UniformHoleGenerator(
n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2
)
df_mask = hole_gen.generate_mask(df)
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")
has_nan = df_mask.any(axis=1)

plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values")
plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values")
df_observed = df.loc[~has_nan]
df_hidden = df.loc[has_nan]

plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")

plt.legend(
loc="lower left",
fontsize=8,
)
plt.title("Case 1 : MCAR missingness mechanism")
plt.xlabel("Column 1")
plt.ylabel("Column 2")
plt.title("Case 1: MCAR missingness mechanism")
plt.show()

# %%

mcartest.test(df.mask(df_mask))
result = test_mcar.test(df.mask(df_mask))
print(f"Test p-value: {result:.2%}")
# %%
# The p-value is quite high, so we do not reject H0.
# We can therefore assume that the missingness mechanism is MCAR.

# %%
# Case 2 : Normal iid features with MAR holes
# Case 2: Normal iid features with MAR holes
# ===========================================
np.random.seed(42)
quantile_95 = norm.ppf(0.975)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
Expand All @@ -99,18 +100,18 @@
loc="lower left",
fontsize=8,
)
plt.title("Case 2 : MAR missingness mechanism")
plt.title("Case 2: MAR missingness mechanism")
plt.show()

# %%

mcartest.test(df.mask(df_mask))
test_mcar.test(df.mask(df_mask))
# %%
# The p-value is lower than the classic threshold (5%).
# H0 is therefore rejected, and we can assume that the missingness mechanism is MAR.

# %%
# Case 3 : Normal iid features with MAR holes
# Case 3: Normal iid features with MAR holes
# ===========================================
# This specific case is designed to highlight the limits of Little's test. Here, we generate
# holes when the absolute value of the first feature is high. This missingness mechanism is clearly
Expand All @@ -137,12 +138,12 @@
loc="lower left",
fontsize=8,
)
plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
plt.title("Case 3: MAR missingness mechanism undetected by the Little's test")
plt.show()

# %%

mcartest.test(df.mask(df_mask))
test_mcar.test(df.mask(df_mask))
# %%
# The p-value is higher than the classic threshold (5%).
# H0 is not rejected, even though the missingness mechanism is clearly MAR.
Expand Down
4 changes: 4 additions & 0 deletions qolmat/analysis/holes_characterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,12 @@ def test(self, df: pd.DataFrame) -> float:
float
The p-value of the test.
"""
print("test")
print(self.random_state.randint(100))
imputer = self.imputer or ImputerEM(random_state=self.random_state)
imputer = imputer._fit_element(df)
print(df[df.notna()].mean().mean())
print("means:", imputer.means)

d0 = 0
n_rows, n_cols = df.shape
Expand Down
9 changes: 4 additions & 5 deletions qolmat/benchmark/missing_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,17 +185,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
Initial dataframe with a missing pattern to be imitated.
"""

self.rng = sku.check_random_state(self.random_state)
self.random_state = sku.check_random_state(self.random_state)
df_mask = pd.DataFrame(False, index=X.index, columns=X.columns)
n_masked_col = math.ceil(self.ratio_masked * len(X))

for column in self.subset:
indices = np.where(X[column].notna())[0]
indices = resample(
indices = self.random_state.choice(
indices,
replace=False,
n_samples=n_masked_col,
stratify=None,
size=n_masked_col,
)
df_mask[column].iloc[indices] = True

Expand Down Expand Up @@ -699,7 +698,7 @@ def split(self, X: pd.DataFrame) -> List[pd.DataFrame]:
list_masks = []

for _ in range(self.n_splits):
shuffled_group_sizes = group_sizes.sample(frac=1)
shuffled_group_sizes = group_sizes.sample(frac=1, random_state=self.random_state)

ratio_masks = shuffled_group_sizes.cumsum() / len(X)
ratio_masks = ratio_masks.reset_index(name="ratio")
Expand Down
23 changes: 22 additions & 1 deletion tests/benchmark/test_missing_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
@pytest.mark.parametrize(
"df, generator",
[
(df_incomplet, list_generators["geo"]),
(df_incomplet, list_generators["unif"]),
(df_incomplet, list_generators["geo"]),
(df_incomplet, list_generators["multi"]),
(df_incomplet_group, list_generators["group"]),
],
Expand All @@ -48,6 +48,27 @@ def test_SamplerHoleGenerator_split(df: pd.DataFrame, generator: mp._HoleGenerat
np.testing.assert_allclose(col2_holes, expected_col2_holes, atol=1)


@pytest.mark.parametrize(
    "df, generator",
    [
        *((df_incomplet, list_generators[name]) for name in ("unif", "geo", "multi")),
        (df_incomplet_group, list_generators["group"]),
    ],
)
def test_SamplerHoleGenerator_reproducible(df: pd.DataFrame, generator: mp._HoleGenerator) -> None:
    """Check that reseeding a hole generator reproduces its first mask.

    The generator is seeded with 42, then 43, then 42 again, and the first
    element returned by ``split`` is kept each time. The two masks drawn with
    seed 42 must be identical, while the mask drawn with seed 43 must differ
    from them in at least one cell.
    """

    def first_mask(seed: int) -> pd.DataFrame:
        # Assign the seed right before splitting, so each draw starts from
        # the integer seed given here.
        generator.random_state = seed
        return generator.split(df)[0]

    # The order of the draws matters: 42, 43, then 42 again.
    mask_ref, mask_other, mask_replay = [first_mask(seed) for seed in (42, 43, 42)]

    np.testing.assert_array_equal(mask_ref, mask_replay)
    assert (mask_ref != mask_other).any().any()


@pytest.mark.parametrize(
"df, generator",
[
Expand Down

0 comments on commit 93ddbbe

Please sign in to comment.