Commit eccb46d

history updated

Julien Roussel authored and committed Mar 7, 2024
1 parent 7fae887 commit eccb46d
Showing 18 changed files with 726 additions and 332 deletions.
11 changes: 11 additions & 0 deletions HISTORY.rst
@@ -2,6 +2,17 @@
History
=======

0.1.3 (2024-03-07)
------------------

* RPCA algorithms now start with a normalizing scaler
* The EM algorithms now include a gradient projection step, making them more robust to collinearity
* The EM algorithm based on the Gaussian model is now initialized using a robust estimation of the covariance matrix
* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling bias
* The likelihood maximization in the EM algorithm has been sped up using the conjugate gradient method
* The ImputeRegressor class now handles NaNs by `row` by default
* The metric `frechet` was not correctly called and has been patched

0.1.2 (2024-02-28)
------------------

2 changes: 1 addition & 1 deletion docs/imputers.rst
@@ -41,7 +41,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement
The class :class:`RPCANoisy` implements a recommended improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k` and :math:`H_k`. Defining :math:`\Vert \mathbf{MH_k} \Vert_p` as either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following

.. math::
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
\text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p
with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
See the :class:`~qolmat.imputations.imputers.ImputerRpcaNoisy` class for implementation details.
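
For illustration, a minimal usage sketch of this imputer (the parameter values below are arbitrary placeholders, not tuned recommendations):

.. code-block:: python

    import numpy as np
    import pandas as pd

    from qolmat.imputations.imputers import ImputerRpcaNoisy

    # Toy dataset with missing entries
    df = pd.DataFrame({
        "a": [1.0, np.nan, 3.0, 4.0, 5.0],
        "b": [2.0, 6.0, np.nan, 8.0, 10.0],
    })

    # tau weights the nuclear norm of M, lam the L1 norm of A (cf. the objective above)
    imputer = ImputerRpcaNoisy(tau=2, lam=0.05, max_iterations=500)
    df_imputed = imputer.fit_transform(df)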
153 changes: 89 additions & 64 deletions examples/benchmark.md
@@ -8,9 +8,9 @@ jupyter:
format_version: '1.3'
jupytext_version: 1.14.4
kernelspec:
display_name: env_qolmat
display_name: env_qolmat_dev
language: python
name: env_qolmat
name: env_qolmat_dev
---

**This notebook aims to present the Qolmat repo through an example of a multivariate time series.
@@ -28,6 +28,8 @@ import warnings
%reload_ext autoreload
%autoreload 2

from IPython.display import Image

import pandas as pd
from datetime import datetime
import numpy as np
@@ -82,12 +84,12 @@ n_cols = len(cols_to_impute)
```

```python
fig = plt.figure(figsize=(10 * n_stations, 3 * n_cols))
fig = plt.figure(figsize=(20 * n_stations, 6 * n_cols))
for i_station, (station, df) in enumerate(df_data.groupby("station")):
df_station = df_data.loc[station]
for i_col, col in enumerate(cols_to_impute):
fig.add_subplot(n_cols, n_stations, i_col * n_stations + i_station + 1)
plt.plot(df_station[col], '.', label=station)
plt.plot(df_station[col], label=station)
# break
plt.ylabel(col)
plt.xticks(rotation=15)
@@ -127,7 +129,7 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")

imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
imputer_rpca = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=500, tau=.01, lam=5, rank=1)
imputer_rpca_opti = imputers.ImputerRpcaNoisy(groups=("station",), columnwise=False, max_iterations=256)
dict_config_opti["RPCA_opti"] = {
"tau": ho.hp.uniform("tau", low=.5, high=5),
@@ -141,9 +143,9 @@ dict_config_opti["RPCA_opticw"] = {
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
}

imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3, p=1)
imputer_tsmle = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=100, n_iter_ou=15, dt=1e-3, p=1)
imputer_normal_sample = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2)
imputer_var_sample = imputers.ImputerEM(groups=("station",), model="VAR", method="sample", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)
imputer_var_max = imputers.ImputerEM(groups=("station",), model="VAR", method="mle", max_iter_em=8, n_iter_ou=128, dt=4e-2, p=1)

imputer_knn = imputers.ImputerKNN(groups=("station",), n_neighbors=10)
imputer_mice = imputers.ImputerMICE(groups=("station",), estimator=LinearRegression(), sample_posterior=False, max_iter=100)
@@ -163,25 +165,25 @@ dict_imputers = {
# "spline": imputer_spline,
# "shuffle": imputer_shuffle,
"residuals": imputer_residuals,
# "OU": imputer_ou,
"TSOU": imputer_tsou,
"TSMLE": imputer_tsmle,
"Normal_sample": imputer_normal_sample,
"VAR_sample": imputer_var_sample,
"VAR_max": imputer_var_max,
"RPCA": imputer_rpca,
# "RPCA_opti": imputer_rpca,
# "RPCA_opticw": imputer_rpca_opti2,
# "locf": imputer_locf,
# "nocb": imputer_nocb,
# "knn": imputer_knn,
"ols": imputer_regressor,
"mice_ols": imputer_mice,
"OLS": imputer_regressor,
"MICE_OLS": imputer_mice,
}
n_imputers = len(dict_imputers)
```

In order to compare the methods, we $i)$ artificially create missing data (for missing data mechanisms, see the docs); $ii)$ then impute it using the different methods chosen and $iii)$ calculate the reconstruction error. These three steps are repeated a number of times equal to `n_splits`. For each method, we calculate the average error and compare the final errors.
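
A minimal sketch of this protocol follows; `mask_mcar` is a hypothetical helper, not the Qolmat API (Qolmat delegates this step to its hole generators), and `df_data` and `dict_imputers` come from the cells above:

```python
# Illustrative sketch of the three-step protocol: i) add holes, ii) impute,
# iii) score; repeated n_splits times and averaged.
import numpy as np
import pandas as pd

def mask_mcar(df: pd.DataFrame, ratio: float, rng) -> pd.DataFrame:
    # Boolean mask selecting a random subset of the observed cells
    values = rng.random(df.shape) < ratio
    return pd.DataFrame(values, index=df.index, columns=df.columns) & df.notna()

rng = np.random.default_rng(0)
imputer = dict_imputers["residuals"]
errors = []
for _ in range(3):  # n_splits repetitions
    mask = mask_mcar(df_data, ratio=0.1, rng=rng)            # i) artificial holes
    df_imputed = imputer.fit_transform(df_data.mask(mask))   # ii) imputation
    errors.append((df_imputed - df_data)[mask].abs().mean().mean())  # iii) error
print(np.mean(errors))  # average reconstruction error for this imputer
```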

<p align="center">
<img src="../../docs/images/comparator.png" width=50% height=50%>
<img src="https://raw.githubusercontent.com/Quantmetry/qolmat/main/docs/images/schema_qolmat.png" width=50% height=50%>
</p>


Expand All @@ -190,14 +192,14 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of

Note that these metrics compute reconstruction errors; they tell nothing about the distance between the "true" and "imputed" distributions.
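
As a toy illustration of this distinction (made-up numbers), two imputations can have identical pointwise error yet very different distributional quality:

```python
# Toy illustration (made-up numbers): equal MAE, different distributional fit.
import numpy as np
from scipy.stats import ks_2samp

truth = np.array([0.0, 1.0, 0.0, 1.0])
imp_swap = np.array([1.0, 0.0, 1.0, 0.0])  # values swapped: same marginal distribution
imp_shift = truth + 1.0                    # values shifted: distribution displaced

print(np.abs(imp_swap - truth).mean(), np.abs(imp_shift - truth).mean())  # 1.0 1.0
print(ks_2samp(truth, imp_swap).statistic, ks_2samp(truth, imp_shift).statistic)  # 0.0 0.5
```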

```python
metrics = ["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"]
```python tags=[]
metrics = ["mae", "wmape", "KL_columnwise", "frechet"]
comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes = generator_holes,
metrics=metrics,
max_evals=10,
max_evals=2,
dict_config_opti=dict_config_opti,
)
results = comparison.compare(df_data)
@@ -220,9 +222,9 @@ plt.show()
### **III. Comparison of methods**


We now run each algorithm once on the initial corrupted dataframe and compare their performances through multiple analyses.
We now run each algorithm once on the initial corrupted dataframe and visualize the different imputations.

```python
```python tags=[]
df_plot = df_data[cols_to_impute]
```

@@ -233,12 +235,17 @@ dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.i
```python
station = df_plot.index.get_level_values("station")[0]
df_station = df_plot.loc[station]
# dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
```

Let's look at the imputations.
When the data is missing at random, imputation is easier. Missing blocks are more challenging.

```python
dfs_imputed_station["VAR_max"]
```

```python
for col in cols_to_impute:
fig, ax = plt.subplots(figsize=(10, 3))
@@ -270,21 +277,21 @@ i_plot = 1
for i_col, col in enumerate(cols_to_impute):
for name_imputer, df_imp in dfs_imputed_station.items():

fig.add_subplot(n_columns, n_imputers, i_plot)
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
values_orig = df_station[col]

values_imp = df_imp[col].copy()
values_imp[values_orig.notna()] = np.nan
plt.plot(values_imp, marker="o", color=tab10(0), label=name_imputer, alpha=1)
plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
plt.plot(values_orig, color='black', marker="o", label="original")
plt.ylabel(col, fontsize=16)
if i_plot % n_columns == 1:
plt.legend(loc=[1, 0], fontsize=18)
if i_plot % n_imputers == 0:
plt.legend(loc="lower right", fontsize=18)
plt.xticks(rotation=15)
if i_col == 0:
plt.title(name_imputer)
if i_col != n_columns - 1:
plt.xticks([], [])
ax.set_xticklabels([])
loc = plticker.MultipleLocator(base=2*365)
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major')
@@ -297,7 +304,7 @@ plt.show()
## (Optional) Deep Learning Model


In this section, we present an MLP model for data imputation using Keras, which can be installed with "pip install pytorch".
In this section, we present an MLP model for data imputation using PyTorch, which can be installed with `pip install qolmat[pytorch]`.

```python
from qolmat.imputations import imputers_pytorch
@@ -308,17 +315,6 @@ except ModuleNotFoundError:
raise PyTorchExtraNotInstalled
```

For the MLP model, we work on a dataset of weather data with missing values. We add MCAR missing values on the features "TEMP", "PRES" and the other features containing NaNs. The goal is to impute the missing values of the features "TEMP" and "PRES" with a deep learning method. We add features to account for the seasonality of the data set, as well as a feature for the station name.

```python
df = data.get_data("Beijing")
cols_to_impute = ["TEMP", "PRES"]
cols_with_nans = list(df.columns[df.isna().any()])
df_data = data.add_datetime_features(df)
df_data[cols_with_nans + cols_to_impute] = data.add_holes(pd.DataFrame(df_data[cols_with_nans + cols_to_impute]), ratio_masked=.1, mean_size=120)
df_data
```

For the example, we use a simple MLP model with 3 layers of neurons.
We then train the model without grouping by station.

@@ -340,49 +336,75 @@ plt.show()
```

```python
# estimator = nn.Sequential(
# nn.Linear(np.sum(df_data.isna().sum()==0), 256),
# nn.ReLU(),
# nn.Linear(256, 128),
# nn.ReLU(),
# nn.Linear(128, 64),
# nn.ReLU(),
# nn.Linear(64, 1)
# )
estimator = imputers_pytorch.build_mlp(input_dim=np.sum(df_data.isna().sum()==0), list_num_neurons=[256,128,64])
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=df_data.values.shape[1],latent_dim=4, output_dim=df_data.values.shape[1], list_num_neurons=[4*4, 2*4])
n_variables = len(cols_to_impute)

estimator = imputers_pytorch.build_mlp(input_dim=n_variables-1, list_num_neurons=[256,128,64])
encoder, decoder = imputers_pytorch.build_autoencoder(input_dim=n_variables,latent_dim=4, output_dim=n_variables, list_num_neurons=[4*4, 2*4])
```

```python
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), handler_nan = "column", epochs=500)
dict_imputers["MLP"] = imputer_mlp = imputers_pytorch.ImputerRegressorPyTorch(estimator=estimator, groups=('station',), epochs=500)
dict_imputers["Autoencoder"] = imputer_autoencoder = imputers_pytorch.ImputerAutoencoder(encoder, decoder, max_iterations=100, epochs=100)
dict_imputers["Diffusion"] = imputer_diffusion = imputers_pytorch.ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=100, batch_size=100)
```

We can re-run the imputation model benchmark as before.
```python
comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes = generator_holes,
metrics=metrics,
max_evals=2,
dict_config_opti=dict_config_opti,
)
```

```python tags=[]
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=3, groups=('station',), subset=cols_to_impute, ratio_masked=ratio_masked)

comparison = comparator.Comparator(
dict_imputers,
selected_columns = df_data.columns,
cols_to_impute,
generator_holes = generator_holes,
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
max_evals=10,
metrics=metrics,
max_evals=2,
dict_config_opti=dict_config_opti,
)
results = comparison.compare(df_data)
results.style.highlight_min(color="green", axis=1)
```
```python
n_metrics = len(metrics)
fig = plt.figure(figsize=(24, 4 * n_metrics))
for i, metric in enumerate(metrics):
fig.add_subplot(n_metrics, 1, i + 1)
df = results.loc[metric]
plot.multibar(df, decimals=2)
plt.ylabel(metric)

#plt.savefig("figures/imputations_benchmark_errors.png")
plt.show()
```

```python tags=[]
df_plot = df_data
df_plot = df_data[cols_to_impute]
```

```python
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
```

```python
station = df_plot.index.get_level_values("station")[0]
df_station = df_plot.loc[station]
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
```

```python tags=[]
Let's look at the imputations.
When the data is missing at random, imputation is easier. Missing blocks are more challenging.

```python
for col in cols_to_impute:
fig, ax = plt.subplots(figsize=(10, 3))
values_orig = df_station[col]
@@ -399,39 +421,42 @@ for col in cols_to_impute:
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major', labelsize=17)
plt.show()

```

```python
n_columns = len(df_plot.columns)
# plot.plot_imputations(df_station, dfs_imputed_station)

n_columns = len(cols_to_impute)
n_imputers = len(dict_imputers)

fig = plt.figure(figsize=(8 * n_imputers, 6 * n_columns))
fig = plt.figure(figsize=(12 * n_imputers, 4 * n_columns))
i_plot = 1
for i_col, col in enumerate(df_plot):
for i_col, col in enumerate(cols_to_impute):
for name_imputer, df_imp in dfs_imputed_station.items():

fig.add_subplot(n_columns, n_imputers, i_plot)
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
values_orig = df_station[col]

plt.plot(values_orig, ".", color='black', label="original")

values_imp = df_imp[col].copy()
values_imp[values_orig.notna()] = np.nan
plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
plt.plot(values_imp, marker="o", color=tab10(0), label="imputation", alpha=1)
plt.plot(values_orig, color='black', marker="o", label="original")
plt.ylabel(col, fontsize=16)
if i_plot % n_columns == 1:
plt.legend(loc=[1, 0], fontsize=18)
if i_plot % n_imputers == 0:
plt.legend(loc="lower right", fontsize=18)
plt.xticks(rotation=15)
if i_col == 0:
plt.title(name_imputer)
if i_col != n_columns - 1:
plt.xticks([], [])
ax.set_xticklabels([])
loc = plticker.MultipleLocator(base=2*365)
ax.xaxis.set_major_locator(loc)
ax.tick_params(axis='both', which='major')
i_plot += 1
plt.savefig("figures/imputations_benchmark.png")

plt.show()

```

## Covariance
2 changes: 1 addition & 1 deletion qolmat/benchmark/metrics.py
@@ -1011,7 +1011,7 @@ def get_metric(name: str) -> Callable:
"ks_test": kolmogorov_smirnov_test,
"correlation_diff": mean_difference_correlation_matrix_numerical_features,
"energy": sum_energy_distances,
"frechet": frechet_distance,
"frechet": frechet_distance_pattern,
"dist_corr_pattern": partial(
pattern_based_weighted_mean_metric,
metric=distance_anticorr,
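
With this fix, looking up the metric by name returns the pattern-based implementation. A minimal sketch, assuming qolmat is installed and the `(df1, df2, df_mask)` signature shared by the metrics in this module:

```python
# Minimal sketch: fetch the patched metric by name and evaluate it on toy data.
# Assumes the (df1, df2, df_mask) signature used across qolmat.benchmark.metrics.
import pandas as pd
from qolmat.benchmark.metrics import get_metric

df_true = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
df_imp = pd.DataFrame({"a": [1.1, 2.0, 2.9], "b": [4.0, 5.2, 6.0]})
df_mask = pd.DataFrame(True, index=df_true.index, columns=df_true.columns)

fn = get_metric("frechet")  # now resolves to frechet_distance_pattern
print(fn(df_true, df_imp, df_mask))
```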
