✨ PKLM first draft of code with a little tutorial and the beginning of the simulation study.
adriencrtr committed May 27, 2024
1 parent beb6c2a commit 784fbd9
Showing 3 changed files with 400 additions and 0 deletions.
153 changes: 153 additions & 0 deletions examples/pklm/PKLM_Simulations.py
@@ -0,0 +1,153 @@
#!/usr/bin/env python
# coding: utf-8

# ### The goal of this notebook is to reproduce the results of the paper.
#
# There is still quite a bit of work left.

# In[18]:


import numpy as np
import pandas as pd
from scipy.stats import multivariate_t

from qolmat.audit.pklm import PKLMtest


# In[4]:


# Set the parameters
n = 1000 # Number of samples
p = 5 # Number of dimensions

# Mean vector of zeros
mean = np.zeros(p)

# Identity covariance matrix
cov = np.eye(p)

# Generate the multivariate normal distribution
data = np.random.multivariate_normal(mean, cov, n)

# Create a DataFrame
df = pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])

# Display the first few rows of the DataFrame
df.head()


# In[10]:


def generate_df_case_1(n: int, p: int) -> pd.DataFrame:
mean = np.zeros(p)
cov = np.eye(p)
data = np.random.multivariate_normal(mean, cov, n)
return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[16]:


def generate_df_case_2(n: int, p: int) -> pd.DataFrame:
cov = np.full((p, p), 0.7)
np.fill_diagonal(cov, 1)
mean = np.zeros(p)
data = np.random.multivariate_normal(mean, cov, n)
return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[27]:


def generate_df_case_3(n: int, p: int, df: int = 4) -> pd.DataFrame:
    # Multivariate Student-t with identity shape matrix and df degrees of freedom.
    mean = np.zeros(p)
    cov = np.eye(p)
    rv = multivariate_t(loc=mean, shape=cov, df=df)
    data = rv.rvs(size=n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[30]:


def generate_df_case_4(n: int, p: int, df: int = 4) -> pd.DataFrame:
    # Equicorrelated (0.7 off-diagonal) multivariate Student-t with df degrees of freedom.
    cov = np.full((p, p), 0.7)
    np.fill_diagonal(cov, 1)
    mean = np.zeros(p)
    rv = multivariate_t(loc=mean, shape=cov, df=df)
    data = rv.rvs(size=n)
    return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)])


# In[32]:


def generate_df_case_5(n: int, p: int) -> pd.DataFrame:
return pd.DataFrame(np.random.rand(n, p), columns=[f"Feature_{i+1}" for i in range(p)])


# ### Generate the MCAR holes

# In[7]:


def gen_mcar_holes(df: pd.DataFrame, r: float) -> pd.DataFrame:
    # Delete each cell independently with probability 1 - r ** (1 / n_col), so that a
    # row stays fully observed with probability r. Note: df is modified in place.
    _, n_col = df.shape
    mask = np.random.rand(*df.shape) < (1 - r ** (1 / n_col))
    df[mask] = np.nan
    return df
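

# A quick empirical check (a sketch, not part of the original notebook): with the
# per-cell deletion probability above, the share of fully observed rows should be
# close to r.

# In[ ]:


df_check = generate_df_case_1(n=5000, p=4)
df_check_mcar = gen_mcar_holes(df_check, 0.65)
df_check_mcar.dropna().shape[0] / df_check_mcar.shape[0]  # expected to be close to 0.65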


# In[12]:


df_1 = generate_df_case_1(n=200, p=4)
df_1_mcar = gen_mcar_holes(df_1, 0.65)
PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)


# In[15]:


PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)


# This really took a long time (about 10 minutes).
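
# To measure the runtime instead of estimating it, a minimal timing sketch (same
# PKLMtest call as above):

# In[ ]:


import time

start = time.perf_counter()
PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30)
print(f"PKLMtest took {time.perf_counter() - start:.1f} s")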

# ### Generate the MAR holes

# In[38]:
n = 200
p = 4
M = pd.DataFrame(np.zeros((n, p)), columns=[f"Feature_{i+1}" for i in range(p)])


def generate_m(df: pd.DataFrame, r: float) -> pd.DataFrame:
    # Build a missingness indicator matrix: 1 flags a cell to be deleted, 0 a cell kept.
    # The first column is always fully observed, so only the remaining n_col - 1 columns
    # can receive holes, hence the exponent 1 / (n_col - 1); a row then stays complete
    # with probability r.
    _, n_col = df.shape
    mask = np.random.rand(*df.shape) < (1 - r ** (1 / (n_col - 1)))
    df[mask] = 1
    df.iloc[:, 0] = 0
    return df


generate_m(M, 0.65)

# In[39]:


my_df = generate_df_case_1(200, 4)


# In[40]:


my_df.head(1)


# In[42]:


my_df["Feature_1"].apply(lambda x: x > my_df["Feature_1"].mean()).sort_values()
126 changes: 126 additions & 0 deletions examples/pklm/PKLM_tutoriel.py
@@ -0,0 +1,126 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

from qolmat.audit.pklm import PKLMtest
from qolmat.benchmark.missing_patterns import UniformHoleGenerator

# ### DataFrame definition

# In[5]:


nb_rows = 40
nb_col = 5

# Create the DataFrame with random data
data = {f"Colonne_{i}": np.random.randint(0, 100, nb_rows).astype(float) for i in range(nb_col)}

df = pd.DataFrame(data)

# Introduce missing values
nb_valeurs_manquantes = int(0.1 * df.size)
indices_valeurs_manquantes = np.random.choice(df.size, nb_valeurs_manquantes, replace=False)
df.values.flat[indices_valeurs_manquantes] = np.nan
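
# A quick check (sketch): about 10% of the cells should now be missing.

# In[ ]:


df.isna().mean()  # per-column share of missing values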


# __Note__: We observe that the computation of U_hat is sometimes not possible for a
# given projection.

# In[5]:


PKLMtest(df, 10, 10)


# ### Tests on typical matrices (used in Little's test)

# ### The MCAR real case

# In[2]:


np.random.seed(42)
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"])

hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2)
df_mask = hole_gen.generate_mask(df)
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[4]:


PKLMtest(df.mask(df_mask), 10, 10)


# __Result__: Very consistent; the holes are indeed MCAR.

# ### The MAR case : Heterogeneity in means

# In[11]:


np.random.seed(42)
quantile_95 = norm.ppf(0.975)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
df_nan = df.copy()
df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan

df_mask = df_nan.isna()
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[12]:


PKLMtest(df.mask(df_mask), 20, 20)


# __Result__: Consistent; the holes are MAR.

# ### The MAR case : Heterogeneity in covariance

# In[13]:


np.random.seed(42)
quantile_95 = norm.ppf(0.975)

matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
df_nan = df.copy()
df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan

df_mask = df_nan.isna()
df_unmasked = ~df_mask
df_unmasked["Column_1"] = False

df_observed = df.mask(df_mask).dropna()
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")


# In[14]:


PKLMtest(df.mask(df_mask), 20, 20)


# __Result__: Consistent; the holes are MAR.
#
# Why is the p-value of this test exactly equal to the previous one?
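
# One way to investigate (a sketch; it assumes PKLMtest draws its projections and
# permutations from NumPy's global random state, which is an assumption about the
# implementation): rerun the test without resetting the seed to 42 and compare.

# In[ ]:


# Hypothetical check: if the p-value changes once the seed differs, the equality
# above likely comes from the shared np.random.seed(42) calls.
np.random.seed(123)
PKLMtest(df.mask(df_mask), 20, 20)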
