✨ PKLM first draft of code with a little tutorial and the beginning of the simulation study.

1 parent beb6c2a · commit 784fbd9
Showing 3 changed files with 400 additions and 0 deletions.
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# coding: utf-8

# ### The goal of this notebook is to reproduce the results of the paper.
#
# There is still quite a bit of work left.

# In[18]:


import numpy as np
import pandas as pd
from scipy.stats import multivariate_t

from qolmat.audit.pklm import PKLMtest


# In[4]:


# Set the parameters
n = 1000  # Number of samples
p = 5  # Number of dimensions

# Mean vector of zeros
mean = np.zeros(p)

# Identity covariance matrix
cov = np.eye(p)

# Generate the multivariate normal distribution
data = np.random.multivariate_normal(mean, cov, n)

# Create a DataFrame
df = pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])

# Display the first few rows of the DataFrame
df.head()


# In[10]:


def generate_df_case_1(n: int, p: int) -> pd.DataFrame:
    mean = np.zeros(p)
    cov = np.eye(p)
    data = np.random.multivariate_normal(mean, cov, n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[16]:


def generate_df_case_2(n: int, p: int) -> pd.DataFrame:
    cov = np.full((p, p), 0.7)
    np.fill_diagonal(cov, 1)
    mean = np.zeros(p)
    data = np.random.multivariate_normal(mean, cov, n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[27]:


def generate_df_case_3(n: int, p: int, df: int = 4) -> pd.DataFrame:
    # Multivariate Student-t with identity shape matrix and df degrees of freedom
    mean = np.zeros(p)
    cov = np.eye(p)
    rv = multivariate_t(loc=mean, shape=cov, df=df)
    data = rv.rvs(size=n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[30]:


def generate_df_case_4(n: int, p: int, df: int = 4) -> pd.DataFrame:
    # Correlated multivariate Student-t (0.7 off-diagonal shape matrix)
    cov = np.full((p, p), 0.7)
    np.fill_diagonal(cov, 1)
    mean = np.zeros(p)
    rv = multivariate_t(loc=mean, shape=cov, df=df)
    data = rv.rvs(size=n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[32]:


def generate_df_case_5(n: int, p: int) -> pd.DataFrame:
    return pd.DataFrame(np.random.rand(n, p), columns=[f"Feature_{i+1}" for i in range(p)])


# ### Generate the MCAR holes

# In[7]:


def gen_mcar_holes(df: pd.DataFrame, r: float) -> pd.DataFrame:
    # Mask each entry independently with probability 1 - r**(1/n_col),
    # so that a row stays fully observed with probability r. Modifies df in place.
    _, n_col = df.shape
    mask = np.random.rand(*df.shape) < (1 - r ** (1 / n_col))
    df[mask] = np.nan
    return df


# In[12]:


df_1 = generate_df_case_1(n=200, p=4)
df_1_mcar = gen_mcar_holes(df_1, 0.65)
PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)


# In[15]:


PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)


# This really took a long time (about 10 minutes).

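# A rough way to time this (not in the original notebook), assuming the same call as above:

import time

start = time.perf_counter()
PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)
print(f"Elapsed: {time.perf_counter() - start:.1f} s")
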
# ### Generate the MAR holes

# In[38]:


n = 200
p = 4
M = pd.DataFrame(np.zeros((n, p)), columns=[f"Feature_{i+1}" for i in range(p)])


def generate_m(df: pd.DataFrame, r: float) -> pd.DataFrame:
    # Indicator matrix of missingness: an entry of 1 means "missing".
    # The first column is kept fully observed. Modifies df in place.
    _, n_col = df.shape
    mask = np.random.rand(*df.shape) < (1 - r ** (1 / (n_col - 1)))
    df[mask] = 1
    df.iloc[:, 0] = 0
    return df


generate_m(M, 0.65)

# In[39]:


my_df = generate_df_case_1(200, 4)


# In[40]:


my_df.head(1)


# In[42]:


my_df["Feature_1"].apply(lambda x: x > my_df["Feature_1"].mean()).sort_values()

@@ -0,0 +1,126 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from scipy.stats import norm

from qolmat.audit.pklm import PKLMtest
from qolmat.benchmark.missing_patterns import UniformHoleGenerator


# ### DataFrame definition

# In[5]:


nb_rows = 40
nb_col = 5

# Create the DataFrame with random data
data = {f"Colonne_{i}": np.random.randint(0, 100, nb_rows).astype(float) for i in range(nb_col)}

df = pd.DataFrame(data)

# Introduce missing values
nb_valeurs_manquantes = int(0.1 * df.size)
indices_valeurs_manquantes = np.random.choice(df.size, nb_valeurs_manquantes, replace=False)
df.values.flat[indices_valeurs_manquantes] = np.nan


# __Note__: We observe that the computation of U_hat is sometimes not possible for a
# given projection.

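# One plausible reason (an assumption, not verified against the PKLM implementation) is that a
# random column projection can leave a missingness pattern with very few or no rows, as the
# quick check below illustrates:

for _ in range(5):
    cols_proj = list(df.sample(n=2, axis=1).columns)  # a random 2-column projection
    pattern_counts = df[cols_proj].isna().value_counts()  # rows per missingness pattern
    print(cols_proj, dict(pattern_counts))
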
# In[5]:


PKLMtest(df, 10, 10)


# ### Tests on typical matrices (those used in Little's test)

# ### The MCAR real case

# In[2]:


np.random.seed(42)
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"])

hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2)
df_mask = hole_gen.generate_mask(df)
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[4]:


PKLMtest(df.mask(df_mask), 10, 10)


# __Result__: Very consistent, the holes are indeed MCAR.

# ### The MAR case: heterogeneity in means

# In[11]:


np.random.seed(42)
quantile_95 = norm.ppf(0.975)  # 97.5% quantile of the standard normal (two-sided 5% level)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
df_nan = df.copy()
df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan

df_mask = df_nan.isna()
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[12]:


PKLMtest(df.mask(df_mask), 20, 20)


# __Result__: Consistent, the holes are MAR.

# ### The MAR case: heterogeneity in covariance

# In[13]:


np.random.seed(42)
quantile_95 = norm.ppf(0.975)  # 97.5% quantile of the standard normal (two-sided 5% level)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
df_nan = df.copy()
df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan

df_mask = df_nan.isna()
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[14]:


PKLMtest(df.mask(df_mask), 20, 20)


# __Result__: Consistent, the holes are MAR.
#
# Why is the test p-value exactly equal to the previous one?
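
# One way to probe this (an assumption: both cells reset np.random.seed(42) before generating the
# same data, so PKLMtest may end up drawing identical projections and permutations): rerun the
# covariance case with a different seed and check whether the two p-values still coincide.

np.random.seed(123)
PKLMtest(df.mask(df_mask), 20, 20)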