Commit eccb46d

history updated

Julien Roussel authored and committed Mar 7, 2024
1 parent 7fae887 commit eccb46d
Showing 18 changed files with 726 additions and 332 deletions.
11 changes: 11 additions & 0 deletions HISTORY.rst
@@ -2,6 +2,17 @@
History
=======

0.1.3 (2024-03-07)
------------------

* RPCA algorithms now start with a normalizing scaler
* The EM algorithms now include a gradient projection step, making them more robust to collinearity
* The EM algorithm based on the Gaussian model is now initialized using a robust estimation of the covariance matrix
* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling bias
* The likelihood maximization in the EM algorithm has been sped up using the conjugate gradient method
* The ImputeRegressor class now handles NaNs by `row` by default
* The metric `frechet` was not correctly called and has been patched

0.1.2 (2024-02-28)
------------------

2 changes: 1 addition & 1 deletion docs/imputers.rst
@@ -41,7 +41,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement
The class :class:`RPCANoisy` implements a recommended improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k` and :math:`H_k`. Defining :math:`\Vert \mathbf{MH_k} \Vert_p` as either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following

.. math::
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
See the :class:`~qolmat.imputations.imputers.ImputerRpcaNoisy` class for implementation details.
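
For illustration, a minimal usage sketch of this imputer (the parameter values below are arbitrary placeholders, not tuned recommendations):

.. code-block:: python

    import numpy as np
    import pandas as pd

    from qolmat.imputations.imputers import ImputerRpcaNoisy

    # Toy dataset with missing entries
    df = pd.DataFrame({
        "a": [1.0, np.nan, 3.0, 4.0, 5.0],
        "b": [2.0, 6.0, np.nan, 8.0, 10.0],
    })

    # tau weights the nuclear norm of M, lam the L1 norm of A (cf. the objective above)
    imputer = ImputerRpcaNoisy(tau=2, lam=0.05, max_iterations=500)
    df_imputed = imputer.fit_transform(df)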
153 changes: 89 additions & 64 deletions examples/benchmark.md
@@ -8,9 +8,9 @@ jupyter:
format_version: '1.3'
jupytext_version: 1.14.4
kernelspec:
display_name: env_qolmat
display_name: env_qolmat_dev
language: python
name: env_qolmat
name: env_qolmat_dev
---

**This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -28,6 +28,8 @@ import warnings
%reload_ext autoreload
%autoreload 2

from IPython.display import Image

import pandas as pd
from datetime import datetime
import numpy as np
@@ -82,12 +84,12 @@ n_cols = len(cols_to_impute)
```

```python
fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
fig = plt.figure(figsize=(20 * n_stations, 6 * n_cols))
for i_station, (station, df) in enumerate(df_data.groupby("station")):
df_station = df_data.loc[station]
for i_col, col in enumerate(cols_to_impute):
fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
plt.plot(df_station[col], '.', label=station)
plt.plot(df_station[col], label=station)
# break
plt.ylabel(col)
plt.xticks(rotation=15)
@@ -127,7 +129,7 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")

imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=.01, lam=5, rank=1)
imputer_rpca_opti = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=256)
dict_config_opti["RPCA_opti"] = {
"tau": ho.hp.uniform("tau", low=.5, high=5),
@@ -141,9 +143,9 @@ dict_config_opti["RPCA_opticw"] = {
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
}

imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3, p=1)
imputer_tsmle = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=100, n_iter_ou=15, dt=1e-3, p=1)
imputer_normal_sample = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2)
imputer_var_sample = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
imputer_var_max = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)

imputer_knn = imputers.ImputerKNN(groups=("station",), n_neighbors=10)
imputer_mice = imputers.ImputerMICE(groups=("station",), estimator=LinearRegression(), sample_posterior=False, max_iter=100)
@@ -163,25 +165,25 @@ dict_imputers = {
# "spline": imputer_spline,
# "shuffle": imputer_shuffle,
"residuals": imputer_residuals,
# "OU": imputer_ou,
"TSOU": imputer_tsou,
"TSMLE": imputer_tsmle,
"Normal_sample": imputer_normal_sample,
"VAR_sample": imputer_var_sample,
"VAR_max": imputer_var_max,
"RPCA": imputer_rpca,
# "RPCA_opti": imputer_rpca,
# "RPCA_opticw": imputer_rpca_opti2,
# "locf": imputer_locf,
# "nocb": imputer_nocb,
# "knn": imputer_knn,
"ols": imputer_regressor,
"mice_ols": imputer_mice,
"OLS": imputer_regressor,
"MICE_OLS": imputer_mice,
}
n_imputers = len(dict_imputers)
```

In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.
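
A minimal sketch of this protocol follows; `mask_mcar` is a hypothetical helper, not the Qolmat API (Qolmat delegates this step to its hole generators), and `df_data` and `dict_imputers` come from the cells above:

```python
# Illustrative sketch of the three-step protocol: i) add holes, ii) impute,
# iii) score; repeated n_splits times and averaged.
import numpy as np
import pandas as pd

def mask_mcar(df: pd.DataFrame, ratio: float, rng) -> pd.DataFrame:
    # Boolean mask selecting a random subset of the observed cells
    values = rng.random(df.shape) < ratio
    return pd.DataFrame(values, index=df.index, columns=df.columns) & df.notna()

rng = np.random.default_rng(0)
imputer = dict_imputers["residuals"]
errors = []
for _ in range(3):  # n_splits repetitions
    mask = mask_mcar(df_data, ratio=0.1, rng=rng)            # i) artificial holes
    df_imputed = imputer.fit_transform(df_data.mask(mask))   # ii) imputation
    errors.append((df_imputed - df_data)[mask].abs().mean().mean())  # iii) error
print(np.mean(errors))  # average reconstruction error for this imputer
```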

<p align="center">
<img src="../../docs/images/comparator.png" width=50% height=50%>
<img src="https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/schema_qolmat.png" width=50% height=50%>
</p>


Expand All @@ -190,14 +192,14 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of

Note that these metrics compute reconstruction errors; they tell nothing about the distance between the "true" and "imputed" distributions.
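
As a toy illustration of this distinction (made-up numbers), two imputations can have identical pointwise error yet very different distributional quality:

```python
# Toy illustration (made-up numbers): equal MAE, different distributional fit.
import numpy as np
from scipy.stats import ks_2samp

truth = np.array([0.0, 1.0, 0.0, 1.0])
imp_swap = np.array([1.0, 0.0, 1.0, 0.0])  # values swapped: same marginal distribution
imp_shift = truth + 1.0                    # values shifted: distribution displaced

print(np.abs(imp_swap - truth).mean(), np.abs(imp_shift - truth).mean())  # 1.0 1.0
print(ks_2samp(truth, imp_swap).statistic, ks_2samp(truth, imp_shift).statistic)  # 0.0 0.5
```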

```python
metrics = ["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"]
```python tags=[]
metrics = ["mae", "wmape", "KL_columnwise", "frechet"]
comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes = generator_holes,
metrics=metrics,
max_evals=10,
max_evals=2,
dict_config_opti=dict_config_opti,
)
results = comparison.compare(df_data)
@@ -220,9 +222,9 @@ plt.show()
### **III. Comparison of methods**


We now run each algorithm once on the initial corrupted dataframe and compare their performances through multiple analyses.
We now run each algorithm once on the initial corrupted dataframe and visualize the different imputations.

```python
```python tags=[]
df_plot = df_data[cols_to_impute]
```

@@ -233,12 +235,17 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
```python
station = df_plot.index.get_level_values("station")[0]
df_station = df_plot.loc[station]
# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
```

Let's look at the imputations.
When the data is missing at random, imputation is easier. Missing blocks are more challenging.

```python
dfs_imputed_station["VAR_max"]
```

```python
for col in cols_to_impute:
fig, ax = plt.subplots(figsize=(10, 3))
@@ -270,21 +277,21 @@ i_plot = 1
for i_col, col in enumerate(cols_to_impute):
for name_imputer, df_imp in dfs_imputed_station.items():

fig.add_subplot(n_columns, n_imputers, i_plot)
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
values_orig = df_station[col]

values_imp = df_imp[col].copy()
values_imp[values_orig.notna()] = np.nan
plt.plot(values_imp, marker="o", color=tab10(0), label=name_imputer, alpha=1)
plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
plt.plot(values_orig, color='black', marker="o", label="original")
plt.ylabel(col, fontsize=16)
if i_plot % n_columns == 1:
plt.legend(loc=[1, 0], fontsize=18)
if i_plot % n_imputers == 0:
plt.legend(loc="lower right", fontsize=18)
plt.xticks(rotation=15)
if i_col == 0:
plt.title(name_imputer)
if i_col != n_columns - 1:
plt.xticks([], [])
ax.set_xticklabels([])
loc = plticker.MultipleLocator(base=2*365)
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major')
@@ -297,7 +304,7 @@ plt.show()
## (Optional) Deep Learning Model


In this section, we present an MLP model for data imputation using Keras, which can be installed with "pip install pytorch".
In this section, we present an MLP model for data imputation using PyTorch, which can be installed with `pip install qolmat[pytorch]`.

```python
from qolmat.imputations import imputers_pytorch
@@ -308,17 +315,6 @@ except ModuleNotFoundError:
raise PyTorchExtraNotInstalled
```

For the MLP model, we work on a dataset of weather data with missing values. We add MCAR missing values on the features "TEMP", "PRES" and the other features containing NaNs. The goal is to impute the missing values of the features "TEMP" and "PRES" with a deep learning method. We add features to account for the seasonality of the data set, as well as a feature for the station name.

```python
df = data.get_data("Beijing")
cols_to_impute = ["TEMP", "PRES"]
cols_with_nans = list(df.columns[df.isna().any()])
df_data = data.add_datetime_features(df)
df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
df_data
```

For the example, we use a simple MLP model with 3 layers of neurons.
We then train the model without grouping by station.

@@ -340,49 +336,75 @@ plt.show()
```

```python
# estimator = nn.Sequential(
# nn.Linear(np.sum(df_data.isna().sum()==0), 256),
# nn.ReLU(),
# nn.Linear(256, 128),
# nn.ReLU(),
# nn.Linear(128, 64),
# nn.ReLU(),
# nn.Linear(64, 1)
# )
estimator = imputers_pytorch.build_mlp(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256,128,64])
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=df_data.values.shape[1],latent_dim=4, output_dim=df_data.values.shape[1], list_num_neurons=[4*4, 2*4])
n_variables = len(cols_to_impute)

estimator = imputers_pytorch.build_mlp(input_dim=n_variables-1, list_num_neurons=[256,128,64])
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=n_variables,latent_dim=4, output_dim=n_variables, list_num_neurons=[4*4, 2*4])
```

```python
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), handler_nan = "column", epochs=500)
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), epochs=500)
dict_imputers["Autoencoder"] = imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
dict_imputers["Diffusion"] = imputer_diffusion = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
```

We can re-run the imputation model benchmark as before.
```python
comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes = generator_holes,
metrics=metrics,
max_evals=2,
dict_config_opti=dict_config_opti,
)
```

```python tags=[]
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)

comparison = comparator.Comparator(
dict_imputers,
selected_columns = df_data.columns,
cols_to_impute,
generator_holes = generator_holes,
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
max_evals=10,
metrics=metrics,
max_evals=2,
dict_config_opti=dict_config_opti,
)
results = comparison.compare(df_data)
results.style.highlight_min(color="green", axis=1)
```
```python
n_metrics = len(metrics)
fig = plt.figure(figsize=(24, 4 * n_metrics))
for i, metric in enumerate(metrics):
fig.add_subplot(n_metrics, 1, i + 1)
df = results.loc[metric]
plot.multibar(df, decimals=2)
plt.ylabel(metric)

#plt.savefig("figures/imputations_benchmark_errors.png")
plt.show()
```

```python tags=[]
df_plot = df_data
df_plot = df_data[cols_to_impute]
```

```python
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
```

```python
station = df_plot.index.get_level_values("station")[0]
df_station = df_plot.loc[station]
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
```

```python tags=[]
Let's look at the imputations.
When the data is missing at random, imputation is easier. Missing blocks are more challenging.

```python
for col in cols_to_impute:
fig, ax = plt.subplots(figsize=(10, 3))
values_orig = df_station[col]
@@ -399,39 +421,42 @@ for col in cols_to_impute:
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major', labelsize=17)
plt.show()

```

```python
n_columns = len(df_plot.columns)
# plot.plot_imputations(df_station, dfs_imputed_station)

n_columns = len(cols_to_impute)
n_imputers = len(dict_imputers)

fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
i_plot = 1
for i_col, col in enumerate(df_plot):
for i_col, col in enumerate(cols_to_impute):
for name_imputer, df_imp in dfs_imputed_station.items():

fig.add_subplot(n_columns, n_imputers, i_plot)
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
values_orig = df_station[col]

plt.plot(values_orig, ".", color='black', label="original")

values_imp = df_imp[col].copy()
values_imp[values_orig.notna()] = np.nan
plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
plt.plot(values_orig, color='black', marker="o", label="original")
plt.ylabel(col, fontsize=16)
if i_plot % n_columns == 1:
plt.legend(loc=[1, 0], fontsize=18)
if i_plot % n_imputers == 0:
plt.legend(loc="lower right", fontsize=18)
plt.xticks(rotation=15)
if i_col == 0:
plt.title(name_imputer)
if i_col != n_columns - 1:
plt.xticks([], [])
ax.set_xticklabels([])
loc = plticker.MultipleLocator(base=2*365)
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major')
i_plot += 1
plt.savefig("figures/imputations_benchmark.png")

plt.show()

```

## Covariance
2 changes: 1 addition & 1 deletion qolmat/benchmark/metrics.py
@@ -1011,7 +1011,7 @@ def get_metric(name: str) -> Callable:
"ks_test": kolmogorov_smirnov_test,
"correlation_diff": mean_difference_correlation_matrix_numerical_features,
"energy": sum_energy_distances,
"frechet": frechet_distance,
"frechet": frechet_distance_pattern,
"dist_corr_pattern": partial(
pattern_based_weighted_mean_metric,
metric=distance_anticorr,
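
With this fix, looking up the metric by name returns the pattern-based implementation. A minimal sketch, assuming qolmat is installed and the `(df1, df2, df_mask)` signature shared by the metrics in this module:

```python
# Minimal sketch: fetch the patched metric by name and evaluate it on toy data.
# Assumes the (df1, df2, df_mask) signature used across qolmat.benchmark.metrics.
import pandas as pd
from qolmat.benchmark.metrics import get_metric

df_true = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
df_imp = pd.DataFrame({"a": [1.1, 2.0, 2.9], "b": [4.0, 5.2, 6.0]})
df_mask = pd.DataFrame(True, index=df_true.index, columns=df_true.columns)

fn = get_metric("frechet")  # now resolves to frechet_distance_pattern
print(fn(df_true, df_imp, df_mask))
```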
