From cbeb7e7a5ed954f4c375ab5fa2877a44256448c6 Mon Sep 17 00:00:00 2001
From: FBurkhardt
Date: Thu, 16 Nov 2023 21:20:32 +0100
Subject: [PATCH] 0.70.0

---
 CHANGELOG.md             |  4 +++
 nkululeko/constants.py   |  2 +-
 nkululeko/experiment.py  | 74 +++++++++++++---------------
 nkululeko/modelrunner.py | 48 ++++++++++++++++++++++++++
 nkululeko/util.py        | 32 ++++++-----------
 5 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a9c2ccc..ec793822 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.70.0
+--------------
+* added imb_learn balancing of training set
+
 Version 0.69.0
 --------------
 * added CNN model and melspec extractor
diff --git a/nkululeko/constants.py b/nkululeko/constants.py
index 877002cf..6f8d6a33 100644
--- a/nkululeko/constants.py
+++ b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.69.0"
+VERSION="0.70.0"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
index a3b26c62..660f7c62 100644
--- a/nkululeko/experiment.py
+++ b/nkululeko/experiment.py
@@ -62,7 +62,7 @@ def store_report(self):
         if eval(self.util.config_val("REPORT", "show", "False")):
             self.report.print()
         if self.util.config_val("REPORT", "latex", False):
-            self.report.export_latex()
+            self.report.export_latex()
 
     def get_name(self):
         return self.util.get_exp_name()
@@ -102,9 +102,7 @@ def load_datasets(self):
         if labels:
             labels = ast.literal_eval(labels)
         else:
-            labels = list(
-                next(iter(self.datasets.values())).df[self.target].unique()
-            )
+            labels = list(next(iter(self.datasets.values())).df[self.target].unique())
         # print labels via debug
         self.util.debug(f"Target labels (user defined): {labels}")
         glob_conf.set_labels(labels)
@@ -248,8 +246,7 @@ def fill_train_and_tests(self):
             test_cats = self.df_test[self.target].unique()
         else:
             # if there is no target, copy a dummy label
-            self.df_test = self._add_random_target(
-                self.df_test).astype('str')
+            self.df_test = self._add_random_target(self.df_test).astype("str")
         train_cats = self.df_train[self.target].unique()
         # print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
         # print(f"train_cats with target {self.target}: {train_cats}")
@@ -280,19 +277,17 @@ def fill_train_and_tests(self):
 
         target_factor = self.util.config_val("DATA", "target_divide_by", False)
         if target_factor:
-            self.df_test[self.target] = self.df_test[self.target] / float(
-                target_factor
-            )
+            self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
             self.df_train[self.target] = self.df_train[self.target] / float(
                 target_factor
             )
             if not self.util.exp_is_classification():
-                self.df_test["class_label"] = self.df_test[
-                    "class_label"
-                ] / float(target_factor)
-                self.df_train["class_label"] = self.df_train[
-                    "class_label"
-                ] / float(target_factor)
+                self.df_test["class_label"] = self.df_test["class_label"] / float(
+                    target_factor
+                )
+                self.df_train["class_label"] = self.df_train["class_label"] / float(
+                    target_factor
+                )
 
     def _add_random_target(self, df):
         labels = glob_conf.labels
@@ -305,18 +300,14 @@ def plot_distribution(self, df_labels):
         """Plot the distribution of samples and speaker per target class and biological sex"""
         plot = Plots()
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all"
-        )
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         plot.plot_distributions(df_labels)
         if self.got_speaker:
             plot.plot_distributions_speaker(df_labels)
 
     def extract_test_feats(self):
         self.feats_test = pd.DataFrame()
-        feats_name = "_".join(
-            ast.literal_eval(glob_conf.config["DATA"]["tests"])
-        )
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
         feats_types = self.util.config_val_list("FEATS", "type", ["os"])
         self.feature_extractor = FeatureExtractor(
             self.df_test, feats_types, feats_name, "test"
         )
@@ -333,9 +324,7 @@ def extract_feats(self):
 
         """
         df_train, df_test = self.df_train, self.df_test
-        feats_name = "_".join(
-            ast.literal_eval(glob_conf.config["DATA"]["databases"])
-        )
+        feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
         self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
         feats_types = self.util.config_val_list("FEATS", "type", ["os"])
         self.feature_extractor = FeatureExtractor(
@@ -364,9 +353,7 @@ def extract_feats(self):
                 f"test feats ({self.feats_test.shape[0]}) != test labels"
                 f" ({self.df_test.shape[0]})"
             )
-            self.df_test = self.df_test[
-                self.df_test.index.isin(self.feats_test.index)
-            ]
+            self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
             self.util.warn(f"mew test labels shape: {self.df_test.shape[0]}")
 
         self._check_scale()
@@ -457,8 +444,7 @@ def autopredict(self):
             predictor = ValencePredictor(df)
             df = predictor.predict(sample_selection)
         elif target == "dominance":
-            from nkululeko.autopredict.ap_dominance import \
-                DominancePredictor
+            from nkululeko.autopredict.ap_dominance import DominancePredictor
 
             predictor = DominancePredictor(df)
             df = predictor.predict(sample_selection)
@@ -472,9 +458,7 @@ def random_splice(self):
         """
         from nkululeko.augmenting.randomsplicer import Randomsplicer
 
-        sample_selection = self.util.config_val(
-            "DATA", "random_splice", "train"
-        )
+        sample_selection = self.util.config_val("DATA", "random_splice", "train")
         if sample_selection == "all":
             df = pd.concat([self.df_train, self.df_test])
         elif sample_selection == "train":
@@ -498,9 +482,7 @@ def analyse_features(self, needs_feats):
         plot_feats = eval(
             self.util.config_val("EXPL", "feature_distributions", "False")
         )
-        sample_selection = self.util.config_val(
-            "EXPL", "sample_selection", "all"
-        )
+        sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
         # get the data labels
         if sample_selection == "all":
             df_labels = pd.concat([self.df_train, self.df_test])
@@ -523,8 +505,9 @@ def analyse_features(self, needs_feats):
         # check if data should be shown with the spotlight data visualizer
         spotlight = eval(self.util.config_val("EXPL", "spotlight", "False"))
         if spotlight:
-            self.util.debug('opening spotlight tab in web browser')
+            self.util.debug("opening spotlight tab in web browser")
             from renumics import spotlight
+
             spotlight.show(df_labels.reset_index())
 
         if not needs_feats:
@@ -543,9 +526,7 @@ def analyse_features(self, needs_feats):
         )
 
         if plot_feats:
-            feat_analyser = FeatureAnalyser(
-                sample_selection, df_labels, df_feats
-            )
+            feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
             feat_analyser.analyse()
 
         # check if a scatterplot should be done
@@ -555,9 +536,7 @@ def analyse_features(self, needs_feats):
             if self.util.exp_is_classification():
                 plots = Plots()
                 for scatter in scatters:
-                    plots.scatter_plot(
-                        df_feats, df_labels["class_label"], scatter
-                    )
+                    plots.scatter_plot(df_feats, df_labels["class_label"], scatter)
             else:
                 self.util.debug("can't do scatterplot if not classification")
 
@@ -636,8 +615,7 @@ def run(self):
     def plot_confmat_per_speaker(self, function):
         if self.loso or self.logo or self.xfoldx:
             self.util.debug(
-                "plot combined speaker predictions not possible for cross"
-                " validation"
+                "plot combined speaker predictions not possible for cross" " validation"
             )
             return
         best = self._get_best_report(self.reports)
@@ -647,9 +625,7 @@ def plot_confmat_per_speaker(self, function):
         preds = best.preds
         speakers = self.df_test.speaker.values
         print(f"{len(truths)} {len(preds)} {len(speakers) }")
-        df = pd.DataFrame(
-            data={"truth": truths, "pred": preds, "speaker": speakers}
-        )
+        df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
         plot_name = "result_combined_per_speaker"
         self.util.debug(
             f"plotting speaker combination ({function}) confusion matrix to"
@@ -695,6 +671,4 @@ def save(self, filename):
             pickle.dump(self.__dict__, f)
             f.close()
         except (AttributeError, TypeError, RuntimeError) as error:
-            self.util.warn(
-                f"Save experiment: Can't pickle local object: {error}"
-            )
+            self.util.warn(f"Save experiment: Can't pickle local object: {error}")
diff --git a/nkululeko/modelrunner.py b/nkululeko/modelrunner.py
index 1a31ed25..b6119185 100644
--- a/nkululeko/modelrunner.py
+++ b/nkululeko/modelrunner.py
@@ -1,7 +1,10 @@
 # modelrunner.py
 
+import pandas as pd
+
 from nkululeko.util import Util
 from nkululeko import glob_conf
+import nkululeko.glob_conf as glob_conf
 
 
 class Modelrunner:
@@ -68,6 +71,8 @@ def do_epochs(self):
         return reports
 
     def _select_model(self, model_type):
+        self._check_balancing()
+
         if model_type == "svm":
             from nkululeko.models.model_svm import SVM_model
 
@@ -154,3 +159,46 @@ def _select_model(self, model_type):
                 " classifier"
             )
         return self.model
+
+    def _check_balancing(self):
+        balancing = self.util.config_val("FEATS", "balancing", False)
+        if balancing:
+            orig_size = self.feats_train.shape[0]
+            self.util.debug(f"balancing the training features with: {balancing}")
+            if balancing == "ros":
+                from imblearn.over_sampling import RandomOverSampler
+
+                sampler = RandomOverSampler()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            elif balancing == "smote":
+                from imblearn.over_sampling import SMOTE
+
+                sampler = SMOTE()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            elif balancing == "adasyn":
+                from imblearn.over_sampling import ADASYN
+
+                sampler = ADASYN()
+                X_res, y_res = sampler.fit_resample(
+                    self.feats_train, self.df_train[self.target]
+                )
+            else:
+                self.util.error(
+                    f"unknown balancing algorithm: {balancing} (should be [ros|smote|adasyn])"
+                )
+
+            self.feats_train = X_res
+            self.df_train = pd.DataFrame({self.target: y_res}, index=X_res.index)
+            self.util.debug(
+                f"balanced with: {balancing}, new size: {X_res.shape[0]} (was {orig_size})"
+            )
+            le = glob_conf.label_encoder
+            res = y_res.value_counts()
+            resd = {}
+            for i, e in enumerate(le.inverse_transform(res.index.values)):
+                resd[e] = res.values[i]
+            self.util.debug(f"{resd})")
diff --git a/nkululeko/util.py b/nkululeko/util.py
index 8bbf10e7..a4c783a2 100644
--- a/nkululeko/util.py
+++ b/nkululeko/util.py
@@ -74,22 +74,17 @@ def config_val_data(self, dataset, key, default):
         if self.got_data_roots:
             try:
                 if len(key) > 0:
-                    return self.data_roots["DATA"][
-                        dataset + "." + key
-                    ].strip("'\"")
+                    return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
                 else:
                     return self.data_roots["DATA"][dataset].strip("'\"")
             except KeyError:
                 if not default in self.stopvals:
                     self.debug(
-                        f"value for {key} not found, using default:"
-                        f" {default}"
+                        f"value for {key} not found, using default:" f" {default}"
                     )
                 return default
         if not default in self.stopvals:
-            self.debug(
-                f"value for {key} not found, using default: {default}"
-            )
+            self.debug(f"value for {key} not found, using default: {default}")
         return default
 
     def get_save_name(self):
@@ -123,17 +118,15 @@ def make_segmented_index(self, df):
         if len(df) == 0:
             return df
         if not isinstance(df.index, pd.MultiIndex):
-            df.index = audformat.utils.to_segmented_index(
-                df.index, allow_nat=False
-            )
+            df.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
         return df
 
     def _get_value_descript(self, section, name):
         if self.config_val(section, name, False):
             val = self.config_val(section, name, False)
-            val = str(val).strip('.')
-            return f'_{name}-{str(val)}'
-        return ''
+            val = str(val).strip(".")
+            return f"_{name}-{str(val)}"
+        return ""
 
     def get_data_name(self):
         """
@@ -151,7 +144,7 @@ def get_exp_name(self, only_train=False, only_data=False):
         return_string = f"{ds}"
         if not only_data:
             mt = self.get_model_description()
-            return_string = return_string+'_'+mt
+            return_string = return_string + "_" + mt
         return return_string.replace("__", "_")
 
     def get_model_description(self):
@@ -178,6 +171,7 @@ def get_model_description(self):
             ["MODEL", "logo"],
             ["MODEL", "learning_rate"],
             ["MODEL", "k_fold_cross"],
+            ["FEATS", "balancing"],
         ]
         for option in options:
             return_string += self._get_value_descript(option[0], option[1])
@@ -224,9 +218,7 @@ def config_val(self, section, key, default):
             return self.config[section][key]
         except KeyError:
             if not default in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}"
-                )
+                self.debug(f"value for {key} not found, using default: {default}")
             return default
 
     def config_val_list(self, section, key, default):
@@ -234,9 +226,7 @@ def config_val_list(self, section, key, default):
             return ast.literal_eval(self.config[section][key])
         except KeyError:
             if not default in self.stopvals:
-                self.debug(
-                    f"value for {key} not found, using default: {default}"
-                )
+                self.debug(f"value for {key} not found, using default: {default}")
             return default

    def continuous_to_categorical(self, array):
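
Usage note (illustration, not part of the commit): below is a minimal sketch of what the new training-set balancing introduced by this patch does. The "[FEATS] balancing" key and the values ros, smote and adasyn are taken from the diff above; the toy feature values, column names and class labels are invented for the example, and the config excerpt in the comments is a hypothetical INI fragment.

    # Hypothetical excerpt from an nkululeko experiment INI file:
    #
    #   [FEATS]
    #   type = ['os']
    #   balancing = ros    ; one of ros | smote | adasyn
    #
    # The sketch mimics what Modelrunner._check_balancing() does for the "ros"
    # case: oversample the training features with imbalanced-learn so that all
    # classes end up with the same number of samples.
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler

    # toy, imbalanced training set: 12 samples, 9 labelled "neutral", 3 "angry"
    feats_train = pd.DataFrame({"f1": range(12), "f2": range(12, 24)})
    y_train = pd.Series(["neutral"] * 9 + ["angry"] * 3, name="emotion")

    sampler = RandomOverSampler()
    X_res, y_res = sampler.fit_resample(feats_train, y_train)

    # after resampling both classes have 9 samples (minority rows are duplicated)
    print(pd.Series(y_res).value_counts())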