From c88ffeb9a74cbacdaeda645954607e69b1f4e021 Mon Sep 17 00:00:00 2001
From: Oscar Esteban
Date: Thu, 5 May 2022 14:02:32 +0200
Subject: [PATCH] enh: finishing auto-qc module

---
 docs/_toc.yml                            |   1 +
 docs/auto-qc/classification.md           | 278 +++++++++++++++++++++++
 docs/auto-qc/dimensionality_reduction.md |  22 +-
 docs/auto-qc/iqms_first_contact.md       |  10 +-
 4 files changed, 303 insertions(+), 8 deletions(-)
 create mode 100644 docs/auto-qc/classification.md

diff --git a/docs/_toc.yml b/docs/_toc.yml
index e74863e..9fe4301 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -23,3 +23,4 @@ parts:
   - file: auto-qc/iqms_first_contact
   - file: auto-qc/manual_ratings
   - file: auto-qc/dimensionality_reduction
+  - file: auto-qc/classification
diff --git a/docs/auto-qc/classification.md b/docs/auto-qc/classification.md
new file mode 100644
index 0000000..5ebc9f1
--- /dev/null
+++ b/docs/auto-qc/classification.md
@@ -0,0 +1,278 @@
---
jupytext:
  formats: md:myst
  notebook_metadata_filter: all,-language_info
  split_at_heading: true
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
  jupytext_version: 1.11.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

```{code-cell} python
:tags: [remove-cell]
import numpy as np
import pandas as pd

# Some configurations to "beautify" plots
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams["font.family"] = "Libre Franklin"
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["savefig.facecolor"] = "white"
```

# Making predictions about quality

First, let's load the ABIDE dataset, and apply the site-wise normalization.

```{code-cell} python
from mriqc_learn.datasets import load_dataset
from mriqc_learn.models.preprocess import SiteRobustScaler

(train_x, train_y), _ = load_dataset(split_strategy="none")
train_x = train_x.drop(columns=[
    "size_x",
    "size_y",
    "size_z",
    "spacing_x",
    "spacing_y",
    "spacing_z",
])
numeric_columns = train_x.columns.tolist()
train_x["site"] = train_y.site

# Harmonize between sites
scaled_x = SiteRobustScaler(unit_variance=True).fit_transform(train_x)
```

This time, we are going to *massage* the target labels, adapting them to a binary classification problem.
Since we have three classes, we are going to merge the "*doubtful*" label into the "*discard*" label:

```{code-cell} python
train_y = train_y[["rater_3"]].values.squeeze().astype(int)
print(f"Discard={100 * (train_y == -1).sum() / len(train_y):.2f}%.")
print(f"Doubtful={100 * (train_y == 0).sum() / len(train_y):.2f}%.")
print(f"Accept={100 * (train_y == 1).sum() / len(train_y):.2f}%.")

# Binarize: "discard" (-1) and "doubtful" (0) become 1, "accept" (1) becomes 0
train_y = (train_y < 1).astype(int)
print(100 * train_y.sum() / len(train_y))  # Let's double-check we now have ~15% discard
```

Because *doubtful* cases amounted to only \~1.5% of the dataset, the merged dataset remains quite imbalanced (\~85% *accept* vs. \~15% *discard*).

## Setting up a classifier

Let's bundle the dimensionality reduction and a random-forests classifier together within a *scikit-learn* `Pipeline`.
The first entry of our pipeline will drop the `site` column of our data table.
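Following the dimensionality reduction section, where the first four principal components explained about 99% of the variance in the IQMs, we keep `n_components=4` in the PCA step below.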
```{code-cell} python
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RFC
from mriqc_learn.models.preprocess import DropColumns

model = Pipeline([
    ("drop_site", DropColumns(drop=["site"])),
    ("pca", PCA(n_components=4)),
    (
        "rfc",
        RFC(
            bootstrap=True,
            class_weight=None,
            criterion="gini",
            max_depth=10,
            max_features="sqrt",
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=10,
            min_samples_split=10,
            min_weight_fraction_leaf=0.0,
            n_estimators=400,
            oob_score=True,
        ),
    ),
])
```

## Evaluating performance with cross-validation

We are now going to obtain a cross-validated estimate of performance.
By leaving one whole site out for validation at a time, we can get a sense of how this classifier may perform on unseen, new data.

```{code-cell} python
from mriqc_learn.model_selection import split
from sklearn.model_selection import cross_val_score

# Define a splitting strategy: hold one site out per fold
outer_cv = split.LeavePSitesOut(1, robust=True)

cv_score = cross_val_score(
    model,
    X=scaled_x,
    y=train_y,
    cv=outer_cv,
    scoring="roc_auc",
)
print(f"AUC (cross-validated, {len(cv_score)} folds): {cv_score.mean():.2f} ± {cv_score.std():.2f}")
```

## Evaluating performance on a left-out dataset

We start by loading an additional dataset containing the IQMs extracted from [OpenNeuro's *ds000030*](https://openneuro.org/datasets/ds000030/).
We then drop some columns we won't need, and discard the few datapoints affected by an aliasing ghost that is idiosyncratic to this dataset, because our simple models will not correctly identify that artifact.

```{code-cell} python
(test_x, test_y), _ = load_dataset(
    "ds030",
    split_strategy="none",
)
test_x = test_x.drop(columns=[
    "size_x",
    "size_y",
    "size_z",
    "spacing_x",
    "spacing_y",
    "spacing_z",
])
test_x["site"] = test_y.site

# Discard datapoints exhibiting the aliasing ghost
has_ghost = test_y.has_ghost.values.astype(bool)
test_y = test_y[~has_ghost]

# Harmonize between sites
scaled_test_x = SiteRobustScaler(unit_variance=True).fit_transform(
    test_x[~has_ghost]
)
```

Prepare the targets in the same way:

```{code-cell} python
test_y = test_y[["rater_1"]].values.squeeze().astype(int)
print(f"Discard={100 * (test_y == -1).sum() / len(test_y):.2f}%.")
print(f"Doubtful={100 * (test_y == 0).sum() / len(test_y):.2f}%.")
print(f"Accept={100 * (test_y == 1).sum() / len(test_y):.2f}%.")

# Binarize as before: merge "doubtful" into "discard"
test_y = (test_y < 1).astype(int)
print(100 * test_y.sum() / len(test_y))
```

Now we train our classifier on all available training data (the site-normalized table, consistent with the cross-validation above):

```{code-cell} python
classifier = model.fit(X=scaled_x, y=train_y)
```

And we can make predictions on the new data:

```{code-cell} python
predicted_y = classifier.predict(scaled_test_x)
```

And calculate the performance score (AUC):

```{code-cell} python
from sklearn.metrics import roc_auc_score as auc

print(f"AUC on DS030 is {auc(test_y, predicted_y):.2f}.")
```

Okay, that is embarrassing.
We have a model with modest estimated performance (AUC=0.63, on average), but it totally failed when applied to a left-out dataset (AUC=0.5).
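Before looking at per-class statistics, we can count exactly how the predictions fell. The following cell is a small addition to the original notebook; it only reuses the `test_y` and `predicted_y` arrays computed above:

```{code-cell} python
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels (0 = accept, 1 = discard)
print(confusion_matrix(test_y, predicted_y, labels=[0, 1]))
```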
Another useful way of understanding the predictions is the per-class summary given by *scikit-learn*'s classification report:

```{code-cell} python
from sklearn.metrics import classification_report

print(classification_report(test_y, predicted_y, zero_division=0))
```

As we can see, all test samples were predicted as having *good* quality.

## Changing the dimensionality reduction strategy

Let's design a new model using an alternative dimensionality reduction method.
In this case, we will use locally linear embedding (LLE):

```{code-cell} python
from sklearn.manifold import LocallyLinearEmbedding as LLE

model_2 = Pipeline([
    ("drop_site", DropColumns(drop=["site"])),
    ("lle", LLE(n_components=3)),
    (
        "rfc",
        RFC(
            bootstrap=True,
            class_weight=None,
            criterion="gini",
            max_depth=10,
            max_features="sqrt",
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=10,
            min_samples_split=10,
            min_weight_fraction_leaf=0.0,
            n_estimators=400,
            oob_score=True,
        ),
    ),
])

cv_score_2 = cross_val_score(
    model_2,
    X=scaled_x,
    y=train_y,
    cv=outer_cv,
    scoring="roc_auc",
)
print(f"AUC (cross-validated, {len(cv_score_2)} folds): {cv_score_2.mean():.2f} ± {cv_score_2.std():.2f}")
```

Now, the cross-validated performance is even worse.
Predictions on the left-out dataset are just the same: all samples are predicted as *good*:

```{code-cell} python
pred_y_2 = model_2.fit(X=scaled_x, y=train_y).predict(scaled_test_x)
print(f"AUC on DS030 is {auc(test_y, pred_y_2):.2f}.")
print(classification_report(test_y, pred_y_2, zero_division=0))
```

## Using the MRIQC classifier

Finally, let's check the classifier design that MRIQC uses internally.
It can be instantiated with `init_pipeline()`:

```{code-cell} python
from mriqc_learn.models.production import init_pipeline

# Reload the datasets, as MRIQC's model will want to see the removed columns
(train_x, sites), _ = load_dataset(split_strategy="none")
train_x["site"] = sites.site
(test_x, sites), _ = load_dataset("ds030", split_strategy="none")
test_x["site"] = sites.site

cv_score_3 = cross_val_score(
    init_pipeline(),
    X=train_x,
    y=train_y,
    cv=outer_cv,
    scoring="roc_auc",
    n_jobs=16,
)
print(f"AUC (cross-validated, {len(cv_score_3)} folds): {cv_score_3.mean():.2f} ± {cv_score_3.std():.2f}")

pred_y_3 = init_pipeline().fit(X=train_x, y=train_y).predict(test_x[~has_ghost])
print(f"AUC on DS030 is {auc(test_y, pred_y_3):.2f}.")
print(classification_report(test_y, pred_y_3, zero_division=0))
```
diff --git a/docs/auto-qc/dimensionality_reduction.md b/docs/auto-qc/dimensionality_reduction.md
index 926b1ef..62b43b4 100644
--- a/docs/auto-qc/dimensionality_reduction.md
+++ b/docs/auto-qc/dimensionality_reduction.md
@@ -33,12 +33,18 @@ plt.rcParams["savefig.facecolor"] = "white"

 First, let's load the ABIDE dataset, and apply the site-wise normalization.
 ```{code-cell} python
-from mriqc_learn.viz import metrics
-from mriqc_learn.models.preprocess import SiteRobustScaler
 from mriqc_learn.datasets import load_dataset
+from mriqc_learn.models.preprocess import SiteRobustScaler

 (train_x, train_y), _ = load_dataset(split_strategy="none")
-train_x.drop(columns=["size_x", "size_y", "size_z", "spacing_x", "spacing_y", "spacing_z"])
+train_x = train_x.drop(columns=[
+    "size_x",
+    "size_y",
+    "size_z",
+    "spacing_x",
+    "spacing_y",
+    "spacing_z",
+])
 numeric_columns = train_x.columns.tolist()
 train_x["site"] = train_y.site
@@ -51,6 +57,8 @@ Highly correlated (either positive or negatively correlated) are not adding much
 We therefore expect to see high (negative and positive) correlations between different metrics:

 ```{code-cell} python
+from mriqc_learn.viz import metrics
+
 metrics.plot_corrmat(scaled_x[numeric_columns].corr(), figsize=(12, 12));
 ```
@@ -76,13 +84,15 @@ To make an educated guess of a sufficient number of components for properly repres
 ```{code-cell} python
 fig = plt.figure(figsize=(15,6))
-ax = plt.plot(np.cumsum(pca_model.explained_variance_ratio_ * 100), "-x")
+plt.plot(np.cumsum(pca_model.explained_variance_ratio_ * 100), "-x")
 plt.ylabel("Cumulative variance explained [%]")
 plt.xlabel("Number of components")
 xticks = np.arange(0, pca_model.explained_variance_ratio_.size, dtype=int)
 plt.xticks(xticks)
 plt.gca().set_xticklabels(xticks + 1)
-plt.show()
+yticks = np.linspace(92.0, 100.0, num=5)
+plt.yticks(yticks)
+plt.gca().set_yticklabels([f"{v:.2f}" for v in yticks]);
 ```

 We can see that the first four components account for 99% of the variance, which is pretty high.
@@ -124,4 +134,4 @@ metrics.plot_corrmat(components.corr(), figsize=(12, 12));
 ```

 ## Thanks
-We thank Céline Provins for the [original notebook](https://github.com/nipreps/mriqc-learn/blob/cd1a57aea2a1dd7f18882f35fc29f07aba00915a/docs/notebooks/Dimensionality%20Reduction%20on%20IQMs.ipynb) on which this section is based.
+We thank Céline Provins for the [original notebook](https://github.com/nipreps/mriqc-learn/blob/5132ffd69af1e56427bda39a9e44de13988d57aa/docs/notebooks/Dimensionality%20Reduction%20on%20IQMs.ipynb) on which this section is based.
diff --git a/docs/auto-qc/iqms_first_contact.md b/docs/auto-qc/iqms_first_contact.md
index 36d6e63..54b7682 100644
--- a/docs/auto-qc/iqms_first_contact.md
+++ b/docs/auto-qc/iqms_first_contact.md
@@ -39,11 +39,17 @@ from mriqc_learn.datasets import load_dataset
 (train_x, train_y), _ = load_dataset(split_strategy="none")

 # 3. Remove non-informative IQMs (image size and spacing)
-train_x = train_x.drop(columns = ['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z'], inplace = False)
+train_x = train_x.drop(columns=[
+    "size_x",
+    "size_y",
+    "size_z",
+    "spacing_x",
+    "spacing_y",
+    "spacing_z",
+])
 # 4. Keep a list of numeric columns before adding the site as a column
 numeric_columns = train_x.columns.tolist()
-
 ```

 With the argument `split_strategy="none"` we are indicating that we want to obtain the full dataset, without partitioning it in any way.
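As a quick first contact with the table itself, the following cell (a small addition to the patch above, assuming only the `train_x` table and `numeric_columns` list defined in the previous cell) checks the table's dimensions and skims summary statistics for a few IQMs:

```{code-cell} python
# Number of images (rows) and IQMs (columns) in the training table
print(train_x.shape)

# Summary statistics for the first five IQM columns
train_x[numeric_columns[:5]].describe()
```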