diff --git a/examples/pklm/PKLM_Simulations.py b/examples/pklm/PKLM_Simulations.py new file mode 100644 index 0000000..8c57341 --- /dev/null +++ b/examples/pklm/PKLM_Simulations.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ### L'objectif de ce notebook est de récréer les résultats du papiers. +# +# Il reste encore pas mal de travail. + +# In[18]: + + +import numpy as np +import pandas as pd +from scipy.stats import multivariate_t + +from qolmat.audit.pklm import PKLMtest + + +# In[4]: + + +# Set the parameters +n = 1000 # Number of samples +p = 5 # Number of dimensions + +# Mean vector of zeros +mean = np.zeros(p) + +# Identity covariance matrix +cov = np.eye(p) + +# Generate the multivariate normal distribution +data = np.random.multivariate_normal(mean, cov, n) + +# Create a DataFrame +df = pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)]) + +# Display the first few rows of the DataFrame +df.head() + + +# In[10]: + + +def generate_df_case_1(n: int, p: int) -> pd.DataFrame: + mean = np.zeros(p) + cov = np.eye(p) + data = np.random.multivariate_normal(mean, cov, n) + return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)]) + + +# In[16]: + + +def generate_df_case_2(n: int, p: int) -> pd.DataFrame: + cov = np.full((p, p), 0.7) + np.fill_diagonal(cov, 1) + mean = np.zeros(p) + data = np.random.multivariate_normal(mean, cov, n) + return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)]) + + +# In[27]: + + +def generate_df_case_3(n: int, p: int, df: int = 4) -> pd.DataFrame: + mean = np.zeros(4) + cov = np.eye(4) + rv = multivariate_t(loc=mean, shape=cov, df=4) + data = rv.rvs(size=n) + return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)]) + + +# In[30]: + + +def generate_df_case_4(n: int, p: int, df: int = 4) -> pd.DataFrame: + cov = np.full((p, p), 0.7) + np.fill_diagonal(cov, 1) + mean = np.zeros(p) + rv = multivariate_t(loc=mean, shape=cov, df=4) + data = rv.rvs(size=n) + return pd.DataFrame(data, columns=[f"Feature_{i+1}" for i in range(p)]) + + +# In[32]: + + +def generate_df_case_5(n: int, p: int) -> pd.DataFrame: + return pd.DataFrame(np.random.rand(n, p), columns=[f"Feature_{i+1}" for i in range(p)]) + + +# ### Generate the MCAR holes + +# In[7]: + + +def gen_mcar_holes(df: pd.DataFrame, r: float) -> pd.DataFrame: + _, n_col = df.shape + mask = np.random.rand(*df.shape) < (1 - r ** (1 / n_col)) + df[mask] = np.nan + return df + + +# In[12]: + + +df_1 = generate_df_case_1(n=200, p=4) +df_1_mcar = gen_mcar_holes(df_1, 0.65) +PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30) + + +# In[15]: + + +PKLMtest(df_1_mcar, nb_projections=100, nb_permutations=30) + + +# Ça a vraiment pris beaucoup de temps. (environ 10 minutes). + +# ### Generate the MAR holes + +# In[38]: +n = 200 +p = 4 +M = pd.DataFrame(np.zeros((n, p)), columns=[f"Feature_{i+1}" for i in range(p)]) + + +def generate_m(df: pd.DataFrame, r: float) -> pd.DataFrame: + _, n_col = df.shape + mask = np.random.rand(*df.shape) < (1 - r ** (1 / (n_col - 1))) + df[mask] = 1 + df.iloc[:, 0] = 0 + return df + + +generate_m(M, 0.65) + +# In[39]: + + +my_df = generate_df_case_1(200, 4) + + +# In[40]: + + +my_df.head(1) + + +# In[42]: + + +my_df["Feature_1"].apply(lambda x: x > my_df["Feature_1"].mean()).sort_values() diff --git a/examples/pklm/PKLM_tutoriel.py b/examples/pklm/PKLM_tutoriel.py new file mode 100644 index 0000000..4315b7d --- /dev/null +++ b/examples/pklm/PKLM_tutoriel.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: +import pandas as pd +import numpy as np +from scipy.stats import norm + +from qolmat.audit.pklm import PKLMtest +from qolmat.benchmark.missing_patterns import UniformHoleGenerator + +# ### DataFrame definition + +# In[5]: + + +nb_rows = 40 +nb_col = 5 + +# Création du DataFrame avec des données aléatoires +data = {f"Colonne_{i}": np.random.randint(0, 100, nb_rows).astype(float) for i in range(nb_col)} + +df = pd.DataFrame(data) + +# Introduction de valeurs manquantes +nb_valeurs_manquantes = int(0.1 * df.size) +indices_valeurs_manquantes = np.random.choice(df.size, nb_valeurs_manquantes, replace=False) +df.values.flat[indices_valeurs_manquantes] = np.nan + + +# __Remarque__ : On remarque que parfois le calcul du U_hat n'est pas possible pour une +# certaine projection. + +# In[5]: + + +PKLMtest(df, 10, 10) + + +# ### Tests sur des matrices typiques (utilisées dans le test de Little) + +# ### The MCAR real case + +# In[2]: + + +np.random.seed(42) +matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) +df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"]) + +hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2) +df_mask = hole_gen.generate_mask(df) +df_unmasked = ~df_mask +df_unmasked["Column_1"] = False + +df_observed = df.mask(df_mask).dropna() +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") + + +# In[4]: + + +PKLMtest(df.mask(df_mask), 10, 10) + + +# __Résultat__ : Très cohérent les trous sont bien MCAR. + +# ### The MAR case : Heterogeneity in means + +# In[11]: + + +np.random.seed(42) +quantile_95 = norm.ppf(0.975) + +matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) +df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) +df_nan = df.copy() +df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan + +df_mask = df_nan.isna() +df_unmasked = ~df_mask +df_unmasked["Column_1"] = False + +df_observed = df.mask(df_mask).dropna() +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") + + +# In[12]: + + +PKLMtest(df.mask(df_mask), 20, 20) + + +# __Résultat__ : Cohérent les trous sont MAR. + +# ### The MAR case : Heterogeneity in covariance + +# In[13]: + + +np.random.seed(42) +quantile_95 = norm.ppf(0.975) + +matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) +df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) +df_nan = df.copy() +df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan + +df_mask = df_nan.isna() +df_unmasked = ~df_mask +df_unmasked["Column_1"] = False + +df_observed = df.mask(df_mask).dropna() +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") + + +# In[14]: + + +PKLMtest(df.mask(df_mask), 20, 20) + + +# __Résultat__ : Cohérent les trous sont MAR. +# +# Pourquoi la p-value du test est-elle éxacetement égale à la précédente diff --git a/qolmat/analysis/pklm.py b/qolmat/analysis/pklm.py new file mode 100644 index 0000000..4654f62 --- /dev/null +++ b/qolmat/analysis/pklm.py @@ -0,0 +1,121 @@ +# Non industrial implementation of the PKLM test +import random +from typing import Tuple + + +import pandas as pd +import numpy as np + + +from sklearn.ensemble import RandomForestClassifier + + +def draw_A(df: pd.DataFrame) -> pd.DataFrame: + _, n_col = df.shape + if n_col == 2: + return df.sample(n=1, axis=1) + return df.sample(np.random.randint(1, n_col - 1), axis=1) + + +def build_M_proj(df_A: pd.DataFrame, M: pd.DataFrame) -> pd.DataFrame: + indexes = df_A.dropna().index + return M.loc[indexes, :].drop(columns=df_A.columns.tolist()) + + +def check_binary_class(M: pd.DataFrame) -> bool: + S = M.nunique() + return S.sum() > S.shape[0] + + +def draw_B(M: pd.DataFrame) -> str: + S = M.nunique() + S = S[S > 1] + list_col = S.index.tolist() + return random.choice(list_col) + + +def draw_A_B(df: pd.DataFrame, M: pd.DataFrame) -> Tuple[list, list]: + temp_res = False + while not temp_res: + df_A = draw_A(df) + M_proj = build_M_proj(df_A, M) + temp_res = check_binary_class(M_proj) + col_B = draw_B(M_proj) + return df_A.columns.tolist(), [col_B] + + +def check_B(df: pd.DataFrame, M: pd.DataFrame, col_features: list, col_name: str) -> bool: + M_proj = build_M_proj(df[col_features], M) + return pd.unique(M_proj[col_name]).shape[0] > 1 + + +def create_binary_classif_df( + df: pd.DataFrame, + M: pd.DataFrame, + col_feature: list, + col_target: list, +) -> pd.DataFrame: + df = df[col_feature].dropna() + M_proj = build_M_proj(df, M)[col_target] + M_proj = M_proj.rename(columns={col_target[0]: "target"}) + return pd.concat([df, M_proj], axis=1).reset_index(drop=True) + + +def get_oob_probabilities(df: pd.DataFrame): + X, y = df.loc[:, df.columns != "target"], df["target"] + clf = RandomForestClassifier( + n_estimators=200, + max_features=None, + min_samples_split=10, + bootstrap=True, + oob_score=True, + random_state=42, + ) + clf.fit(X, y) + return clf.oob_decision_function_ + + +def U_hat(df: pd.DataFrame, M: pd.DataFrame, A: list, B: list) -> float: + df_class = create_binary_classif_df(df, M, A, B) + oob_probs = get_oob_probabilities(df_class) + + def my_func(x: float, eps: float = 1e-5) -> float: + return np.log((x + eps) / (1 - x + eps)) + + vfunc = np.vectorize(my_func) + + element_0 = df_class[df_class["target"] == 0].index + element_1 = df_class[df_class["target"] == 1].index + + return 1 / element_1.shape[0] * np.sum(vfunc(oob_probs[element_1, 1])) - 1 / element_0.shape[ + 0 + ] * np.sum(vfunc(oob_probs[element_0, 0])) + + +def PKLMtest( + df: pd.DataFrame, + nb_projections: int, + nb_permutations: int, +) -> float: + M = 1 * df.isnull() + list_perm_M = [M.sample(frac=1, axis=0).reset_index(drop=True) for _ in range(nb_permutations)] + dict_res_sigma = {i: 0.0 for i in range(nb_permutations)} + dict_count_sigma = {i: 0.0 for i in range(nb_permutations)} + U = 0.0 + for _ in range(nb_projections): + cols_feature, col_target = draw_A_B(df, M) + res = U_hat(df, M, cols_feature, col_target) + U += res + for idx, M_perm in enumerate(list_perm_M): + if check_B(df, M_perm, cols_feature, col_target[0]): + dict_res_sigma[idx] += U_hat(df, M_perm, cols_feature, col_target) + dict_count_sigma[idx] += 1 + + U = U / nb_projections + dict_res_sigma = {k: v / dict_count_sigma[k] for k, v in dict_res_sigma.items()} + + p_value = 1 + for ind_perm, u_sigma in dict_res_sigma.items(): + if u_sigma >= U: + p_value += 1 + return p_value / (nb_permutations + 1)