Random state management and tests added to hole_generator

scikit-learn-contrib · Jun 12, 2024 · 93ddbbe · 93ddbbe
1 parent 67d44cf
commit 93ddbbe
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 39 deletions.
diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py
@@ -1,6 +1,6 @@
 """
 ============================================
-Tutorial for testing the MCAR case
+Tutorial for Testing the MCAR Case
 ============================================
 
 In this tutorial, we show how to test the MCAR case using the Little's test.
@@ -19,65 +19,66 @@
 
 plt.rcParams.update({"font.size": 12})
 
+rng = np.random.RandomState(42)
+
 # %%
 # 1. The Little's test
 # ---------------------------------------------------------------
-# First, we need to introduce the concept of missing pattern. A missing pattern, also called
-# pattern, is the structure of observed and missing values in a data set. For example, for a
-# dataset with 2 columns, the possible patterns are : (0, 0), (1, 0), (0, 1), (1, 1). The value 1
-# (0) indicates that the value in the column is missing (observed).
-#
-# The null hypothesis, H0, is : "The means of observations within each pattern are similar.".
-# Against the alternative hypothesis, H1 : "The means of the observed variables can vary across the
-# patterns."
+# First, we need to introduce the concept of a missing pattern. A missing pattern, or simply
+# pattern, is the structure of observed and missing values in a dataset. For example, in a
+# dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1), where 1
+# means that the column value is missing and 0 that it is observed.
 #
-# If H0 is not rejected , we can assume that the missing data mechanism is MCAR. On the contrary,
-# if H0 is rejected, we can assume that the missing data mechanism is MAR.
+# The null hypothesis, H0, is: "The means of observations within each pattern are similar."
 #
-# We choose to use the classic threshold, equal to 5%. If the test p_value is below this threshold,
+# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
 # we reject the null hypothesis.
 #
 # This notebook shows how the Little's test performs and its limitations.
 
-mcartest = LittleTest()
+test_mcar = LittleTest(random_state=rng)
 
 # %%
-# Case 1 : Normal iid features with MCAR holes
+# Case 1: Normal iid features with MCAR holes
 # ============================================
 
-np.random.seed(42)
-matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
-df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"])
 
-hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2)
+matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
+df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"])
+
+hole_gen = UniformHoleGenerator(
+    n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2
+)
 df_mask = hole_gen.generate_mask(df)
-df_unmasked = ~df_mask
-df_unmasked["Column_1"] = False
 
-df_observed = df.mask(df_mask).dropna()
-df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")
+has_nan = df_mask.any(axis=1)
 
-plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values")
-plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values")
+df_observed = df.loc[~has_nan]
+df_hidden = df.loc[has_nan]
+
+plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
+plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
 
 plt.legend(
     loc="lower left",
     fontsize=8,
 )
-plt.title("Case 1 : MCAR missingness mechanism")
+plt.xlabel("Column 1")
+plt.ylabel("Column 2")
+plt.title("Case 1: MCAR missingness mechanism")
 plt.show()
 
 # %%
 
-mcartest.test(df.mask(df_mask))
+result = test_mcar.test(df.mask(df_mask))
+print(f"Test p-value: {result:.2%}")
 # %%
 # The p-value is quite high, therefore we don't reject H0.
 # We can then suppose that our missingness mechanism is MCAR.
 
 # %%
-# Case 2 : Normal iid features with MAR holes
+# Case 2: Normal iid features with MAR holes
 # ===========================================
-np.random.seed(42)
 quantile_95 = norm.ppf(0.975)
 
 matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
@@ -99,18 +100,18 @@
     loc="lower left",
     fontsize=8,
 )
-plt.title("Case 2 : MAR missingness mechanism")
+plt.title("Case 2: MAR missingness mechanism")
 plt.show()
 
 # %%
 
-mcartest.test(df.mask(df_mask))
+test_mcar.test(df.mask(df_mask))
 # %%
 # The p-value is lower than the classic threshold (5%).
 # H0 is then rejected and we can suppose that our missingness mechanism is MAR.
 
 # %%
-# Case 3 : Normal iid features with MAR holes
+# Case 3: Normal iid features with MAR holes
 # ===========================================
 # The specific case is designed to emphasize the Little's test limits. In the case, we generate
 # holes when the absolute value of the first feature is high. This missingness mechanism is clearly
@@ -137,12 +138,12 @@
     loc="lower left",
     fontsize=8,
 )
-plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
+plt.title("Case 3: MAR missingness mechanism undetected by the Little's test")
 plt.show()
 
 # %%
 
-mcartest.test(df.mask(df_mask))
+test_mcar.test(df.mask(df_mask))
 # %%
 # The p-value is higher than the classic threshold (5%).
 # H0 is not rejected whereas the missingness mechanism is clearly MAR.

diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py
@@ -66,8 +66,12 @@ def test(self, df: pd.DataFrame) -> float:
         float
             The p-value of the test.
         """
+        print("test")
+        print(self.random_state.randint(100))
         imputer = self.imputer or ImputerEM(random_state=self.random_state)
         imputer = imputer._fit_element(df)
+        print(df[df.notna()].mean().mean())
+        print("means:", imputer.means)
 
         d0 = 0
         n_rows, n_cols = df.shape

diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py
@@ -185,17 +185,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
             Initial dataframe with a missing pattern to be imitated.
         """
 
-        self.rng = sku.check_random_state(self.random_state)
+        self.random_state = sku.check_random_state(self.random_state)
         df_mask = pd.DataFrame(False, index=X.index, columns=X.columns)
         n_masked_col = math.ceil(self.ratio_masked * len(X))
 
         for column in self.subset:
             indices = np.where(X[column].notna())[0]
-            indices = resample(
+            indices = self.random_state.choice(
                 indices,
                 replace=False,
-                n_samples=n_masked_col,
-                stratify=None,
+                size=n_masked_col,
             )
             df_mask[column].iloc[indices] = True
 
@@ -699,7 +698,7 @@ def split(self, X: pd.DataFrame) -> List[pd.DataFrame]:
         list_masks = []
 
         for _ in range(self.n_splits):
-            shuffled_group_sizes = group_sizes.sample(frac=1)
+            shuffled_group_sizes = group_sizes.sample(frac=1, random_state=self.random_state)
 
             ratio_masks = shuffled_group_sizes.cumsum() / len(X)
             ratio_masks = ratio_masks.reset_index(name="ratio")

diff --git a/tests/benchmark/test_missing_patterns.py b/tests/benchmark/test_missing_patterns.py
@@ -32,8 +32,8 @@
 @pytest.mark.parametrize(
     "df, generator",
     [
-        (df_incomplet, list_generators["geo"]),
         (df_incomplet, list_generators["unif"]),
+        (df_incomplet, list_generators["geo"]),
         (df_incomplet, list_generators["multi"]),
         (df_incomplet_group, list_generators["group"]),
     ],
@@ -48,6 +48,27 @@ def test_SamplerHoleGenerator_split(df: pd.DataFrame, generator: mp._HoleGenerat
     np.testing.assert_allclose(col2_holes, expected_col2_holes, atol=1)
 
 
+@pytest.mark.parametrize(
+    "df, generator",
+    [
+        (df_incomplet, list_generators["unif"]),
+        (df_incomplet, list_generators["geo"]),
+        (df_incomplet, list_generators["multi"]),
+        (df_incomplet_group, list_generators["group"]),
+    ],
+)
+def test_SamplerHoleGenerator_reproducible(df: pd.DataFrame, generator: mp._HoleGenerator) -> None:
+    generator.random_state = 42
+    mask1 = generator.split(df)[0]
+    generator.random_state = 43
+    mask2 = generator.split(df)[0]
+    generator.random_state = 42
+    mask3 = generator.split(df)[0]
+
+    np.testing.assert_array_equal(mask1, mask3)
+    assert (mask1 != mask2).any().any()
+
+
 @pytest.mark.parametrize(
     "df, generator",
     [