From cbeb7e7a5ed954f4c375ab5fa2877a44256448c6 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Thu, 16 Nov 2023 21:20:32 +0100
Subject: [PATCH] 0.70.0

---
 CHANGELOG.md             |  4 +++
 nkululeko/constants.py   |  2 +-
 nkululeko/experiment.py  | 74 +++++++++++++---------------
 nkululeko/modelrunner.py | 48 ++++++++++++++++++++++++++
 nkululeko/util.py        | 32 ++++++-----------
 5 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a9c2ccc..ec793822 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.70.0
+--------------
+* added imb_learn balancing of training set
+
 Version 0.69.0
 --------------
 * added CNN model and melspec extractor
diff --git a/nkululeko/constants.py b/nkululeko/constants.py
index 877002cf..6f8d6a33 100644
--- a/nkululeko/constants.py
+++ b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.69.0"
+VERSION="0.70.0"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
index a3b26c62..660f7c62 100644
--- a/nkululeko/experiment.py
+++ b/nkululeko/experiment.py
@@ -62,7 +62,7 @@ def store_report(self):
         if eval(self.util.config_val("REPORT", "show", "False")):
             self.report.print()
         if self.util.config_val("REPORT", "latex", False):
-            self.report.export_latex()
+            self.report.export_latex()
 
     def get_name(self):
         return self.util.get_exp_name()
@@ -102,9 +102,7 @@ def load_datasets(self):
         if labels:
             labels = ast.literal_eval(labels)
         else:
-            labels = list(
-                next(iter(self.datasets.values())).df[self.target].unique()
-            )
+            labels = list(next(iter(self.datasets.values())).df[self.target].unique())
         # print labels via debug
         self.util.debug(f"Target labels (user defined): {labels}")
         glob_conf.set_labels(labels)
@@ -248,8 +246,7 @@ def fill_train_and_tests(self):
             test_cats = self.df_test[self.target].unique()
         else:
             # if there is no target, copy a dummy label
-            self.df_test = self._add_random_target(
-                self.df_test).astype('str')
+            self.df_test = self._add_random_target(self.df_test).astype("str")
         train_cats = self.df_train[self.target].unique()
         # print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
         # print(f"train_cats with target {self.target}: {train_cats}")
@@ -280,19 +277,17 @@ def fill_train_and_tests(self):
 
         target_factor = self.util.config_val("DATA", "target_divide_by", False)
         if target_factor:
-            self.df_test[self.target] = self.df_test[self.target] / float(
-                target_factor
-            )
+            self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
             self.df_train[self.target] = self.df_train[self.target] / float(
                 target_factor
             )
             if not self.util.exp_is_classification():
-                self.df_test["class_label"] = self.df_test[
-                    "class_label"
-                ] / float(target_factor)
-                self.df_train["class_label"] = self.df_train[
-                    "class_label"
-                ] / float(target_factor)
+                self.df_test["class_label"] = self.df_test["class_label"] / float(
+                    target_factor
+                )
+                self.df_train["class_label"] = self.df_train["class_label"] / float(
+                    target_factor
+                )
 
     def _add_random_target(self, df):
         labels = glob_conf.labels
@@ -305,18 +300,14 @@ def plot_distribution(self, df_labels):
         """Plot the distribution of samples and speaker per target class and biological sex"""
         plot = Plots()
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all"
-        )
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         plot.plot_distributions(df_labels)
         if self.got_speaker:
             plot.plot_distributions_speaker(df_labels)
 
     def extract_test_feats(self):
         self.feats_test = pd.DataFrame()
-        feats_name = "_".join(
-            ast.literal_eval(glob_conf.config["DATA"]["tests"])
-        )
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
         feats_types = self.util.config_val_list("FEATS", "type", ["os"])
         self.feature_extractor = FeatureExtractor(
             self.df_test, feats_types, feats_name, "test"
         )
@@ -333,9 +324,7 @@ def extract_feats(self):
 
         """
         df_train, df_test = self.df_train, self.df_test
-        feats_name = "_".join(
-            ast.literal_eval(glob_conf.config["DATA"]["databases"])
-        )
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
         self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
         feats_types = self.util.config_val_list("FEATS", "type", ["os"])
         self.feature_extractor = FeatureExtractor(
@@ -364,9 +353,7 @@ def extract_feats(self):
                 f"test feats ({self.feats_test.shape[0]}) != test labels"
                 f" ({self.df_test.shape[0]})"
             )
-            self.df_test = self.df_test[
-                self.df_test.index.isin(self.feats_test.index)
-            ]
+            self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
             self.util.warn(f"mew test labels shape: {self.df_test.shape[0]}")
 
         self._check_scale()
@@ -457,8 +444,7 @@ def autopredict(self):
             predictor = ValencePredictor(df)
             df = predictor.predict(sample_selection)
         elif target == "dominance":
-            from nkululeko.autopredict.ap_dominance import \
-                DominancePredictor
+            from nkululeko.autopredict.ap_dominance import DominancePredictor
 
             predictor = DominancePredictor(df)
             df = predictor.predict(sample_selection)
@@ -472,9 +458,7 @@ def random_splice(self):
         """
         from nkululeko.augmenting.randomsplicer import Randomsplicer
 
-        sample_selection = self.util.config_val(
-            "DATA", "random_splice", "train"
-        )
+        sample_selection = self.util.config_val("DATA", "random_splice", "train")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -498,9 +482,7 @@ def analyse_features(self, needs_feats):
         plot_feats = eval(
             self.util.config_val("EXPL", "feature_distributions", "False")
         )
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all"
-        )
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         # get the data labels
         if sample_selection == "all":
             df_labels = pd.concat([self.df_train, self.df_test])
@@ -523,8 +505,9 @@ def analyse_features(self, needs_feats):
         # check if data should be shown with the spotlight data visualizer
         spotlight = eval(self.util.config_val("EXPL", "spotlight", "False"))
         if spotlight:
-            self.util.debug('opening spotlight tab in web browser')
+            self.util.debug("opening spotlight tab in web browser")
             from renumics import spotlight
+
             spotlight.show(df_labels.reset_index())
 
         if not needs_feats:
@@ -543,9 +526,7 @@ def analyse_features(self, needs_feats):
         )
 
         if plot_feats:
-            feat_analyser = FeatureAnalyser(
-                sample_selection, df_labels, df_feats
-            )
+            feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
             feat_analyser.analyse()
 
         # check if a scatterplot should be done
@@ -555,9 +536,7 @@ def analyse_features(self, needs_feats):
             if self.util.exp_is_classification():
                 plots = Plots()
                 for scatter in scatters:
-                    plots.scatter_plot(
-                        df_feats, df_labels["class_label"], scatter
-                    )
+                    plots.scatter_plot(df_feats, df_labels["class_label"], scatter)
             else:
                 self.util.debug("can't do scatterplot if not classification")
 
@@ -636,8 +615,7 @@ def run(self):
     def plot_confmat_per_speaker(self, function):
         if self.loso or self.logo or self.xfoldx:
             self.util.debug(
-                "plot combined speaker predictions not possible for cross"
-                " validation"
+                "plot combined speaker predictions not possible for cross" " validation"
             )
             return
         best = self._get_best_report(self.reports)
@@ -647,9 +625,7 @@ def plot_confmat_per_speaker(self, function):
         preds = best.preds
         speakers = self.df_test.speaker.values
         print(f"{len(truths)} {len(preds)} {len(speakers) }")
-        df = pd.DataFrame(
-            data={"truth": truths, "pred": preds, "speaker": speakers}
-        )
+        df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
         plot_name = "result_combined_per_speaker"
         self.util.debug(
             f"plotting speaker combination ({function}) confusion matrix to"
@@ -695,6 +671,4 @@ def save(self, filename):
             pickle.dump(self.__dict__, f)
             f.close()
         except (AttributeError, TypeError, RuntimeError) as error:
-            self.util.warn(
-                f"Save experiment: Can't pickle local object: {error}"
-            )
+            self.util.warn(f"Save experiment: Can't pickle local object: {error}")
diff --git a/nkululeko/modelrunner.py b/nkululeko/modelrunner.py
index 1a31ed25..b6119185 100644
--- a/nkululeko/modelrunner.py
+++ b/nkululeko/modelrunner.py
@@ -1,7 +1,10 @@
 # modelrunner.py
 
+import pandas as pd
+
 from nkululeko.util import Util
 from nkululeko import glob_conf
+import nkululeko.glob_conf as glob_conf
 
 
 class Modelrunner:
@@ -68,6 +71,8 @@ def do_epochs(self):
         return reports
 
     def _select_model(self, model_type):
+        self._check_balancing()
+
         if model_type == "svm":
             from nkululeko.models.model_svm import SVM_model
 
@@ -154,3 +159,46 @@ def _select_model(self, model_type):
                 " classifier"
             )
         return self.model
+
+    def _check_balancing(self):
+        balancing = self.util.config_val("FEATS", "balancing", False)
+        if balancing:
+            orig_size = self.feats_train.shape[0]
+            self.util.debug(f"balancing the training features with: {balancing}")
+            if balancing == "ros":
+                from imblearn.over_sampling import RandomOverSampler
+
+                sampler = RandomOverSampler()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            elif balancing == "smote":
+                from imblearn.over_sampling import SMOTE
+
+                sampler = SMOTE()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            elif balancing == "adasyn":
+                from imblearn.over_sampling import ADASYN
+
+                sampler = ADASYN()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            else:
+                self.util.error(
+                    f"unknown balancing algorithm: {balancing} (should be [ros|smote|adasyn])"
+                )
+
+            self.feats_train = X_res
+            self.df_train = pd.DataFrame({self.target: y_res}, index=X_res.index)
+            self.util.debug(
+                f"balanced with: {balancing}, new size: {X_res.shape[0]} (was {orig_size})"
+            )
+            le = glob_conf.label_encoder
+            res = y_res.value_counts()
+            resd = {}
+            for i, e in enumerate(le.inverse_transform(res.index.values)):
+                resd[e] = res.values[i]
+            self.util.debug(f"{resd})")
diff --git a/nkululeko/util.py b/nkululeko/util.py
index 8bbf10e7..a4c783a2 100644
--- a/nkululeko/util.py
+++ b/nkululeko/util.py
@@ -74,22 +74,17 @@ def config_val_data(self, dataset, key, default):
         if self.got_data_roots:
             try:
                 if len(key) > 0:
-                    return self.data_roots["DATA"][
-                        dataset + "." + key
-                    ].strip("'\"")
+                    return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
                 else:
                     return self.data_roots["DATA"][dataset].strip("'\"")
             except KeyError:
                 if not default in self.stopvals:
                     self.debug(
-                        f"value for {key} not found, using default:"
-                        f" {default}"
+                        f"value for {key} not found, using default:" f" {default}"
                     )
                 return default
         if not default in self.stopvals:
-            self.debug(
-                f"value for {key} not found, using default: {default}"
-            )
+            self.debug(f"value for {key} not found, using default: {default}")
         return default
 
     def get_save_name(self):
@@ -123,17 +118,15 @@ def make_segmented_index(self, df):
         if len(df) == 0:
             return df
         if not isinstance(df.index, pd.MultiIndex):
-            df.index = audformat.utils.to_segmented_index(
-                df.index, allow_nat=False
-            )
+            df.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
         return df
 
     def _get_value_descript(self, section, name):
         if self.config_val(section, name, False):
             val = self.config_val(section, name, False)
-            val = str(val).strip('.')
-            return f'_{name}-{str(val)}'
-        return ''
+            val = str(val).strip(".")
+            return f"_{name}-{str(val)}"
+        return ""
 
     def get_data_name(self):
         """
@@ -151,7 +144,7 @@ def get_exp_name(self, only_train=False, only_data=False):
         return_string = f"{ds}"
         if not only_data:
             mt = self.get_model_description()
-            return_string = return_string+'_'+mt
+            return_string = return_string + "_" + mt
         return return_string.replace("__", "_")
 
     def get_model_description(self):
@@ -178,6 +171,7 @@ def get_model_description(self):
             ["MODEL", "logo"],
             ["MODEL", "learning_rate"],
             ["MODEL", "k_fold_cross"],
+            ["FEATS", "balancing"],
         ]
         for option in options:
             return_string += self._get_value_descript(option[0], option[1])
@@ -224,9 +218,7 @@ def config_val(self, section, key, default):
             return self.config[section][key]
         except KeyError:
             if not default in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}"
-                )
+                self.debug(f"value for {key} not found, using default: {default}")
             return default
 
     def config_val_list(self, section, key, default):
@@ -234,9 +226,7 @@ def config_val_list(self, section, key, default):
             return ast.literal_eval(self.config[section][key])
         except KeyError:
             if not default in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}"
-                )
+                self.debug(f"value for {key} not found, using default: {default}")
             return default

    def continuous_to_categorical(self, array):
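
Usage note (illustration, not part of the commit): below is a minimal sketch of what the new training-set balancing introduced by this patch does. The "[FEATS] balancing" key and the values ros, smote and adasyn are taken from the diff above; the toy feature values, column names and class labels are invented for the example, and the config excerpt in the comments is a hypothetical INI fragment.

    # Hypothetical excerpt from an nkululeko experiment INI file:
    #
    #   [FEATS]
    #   type = ['os']
    #   balancing = ros    ; one of ros | smote | adasyn
    #
    # The sketch mimics what Modelrunner._check_balancing() does for the "ros"
    # case: oversample the training features with imbalanced-learn so that all
    # classes end up with the same number of samples.
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler

    # toy, imbalanced training set: 12 samples, 9 labelled "neutral", 3 "angry"
    feats_train = pd.DataFrame({"f1": range(12), "f2": range(12, 24)})
    y_train = pd.Series(["neutral"] * 9 + ["angry"] * 3, name="emotion")

    sampler = RandomOverSampler()
    X_res, y_res = sampler.fit_resample(feats_train, y_train)

    # after resampling both classes have 9 samples (minority rows are duplicated)
    print(pd.Series(y_res).value_counts())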