0.70.0
FBurkhardt committed Nov 16, 2023
1 parent 5a1fdb4 commit cbeb7e7
Showing 5 changed files with 88 additions and 72 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.70.0
--------------
* added imblearn balancing of training set

Version 0.69.0
--------------
* added CNN model and melspec extractor
2 changes: 1 addition & 1 deletion nkululeko/constants.py
@@ -1,2 +1,2 @@
VERSION="0.69.0"
VERSION="0.70.0"
SAMPLING_RATE = 16000
74 changes: 24 additions & 50 deletions nkululeko/experiment.py
@@ -62,7 +62,7 @@ def store_report(self):
if eval(self.util.config_val("REPORT", "show", "False")):
self.report.print()
if self.util.config_val("REPORT", "latex", False):
self.report.export_latex()
self.report.export_latex()

def get_name(self):
return self.util.get_exp_name()
@@ -102,9 +102,7 @@ def load_datasets(self):
if labels:
labels = ast.literal_eval(labels)
else:
labels = list(
next(iter(self.datasets.values())).df[self.target].unique()
)
labels = list(next(iter(self.datasets.values())).df[self.target].unique())
# print labels via debug
self.util.debug(f"Target labels (user defined): {labels}")
glob_conf.set_labels(labels)
@@ -248,8 +246,7 @@ def fill_train_and_tests(self):
test_cats = self.df_test[self.target].unique()
else:
# if there is no target, copy a dummy label
self.df_test = self._add_random_target(
self.df_test).astype('str')
self.df_test = self._add_random_target(self.df_test).astype("str")
train_cats = self.df_train[self.target].unique()
# print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
# print(f"train_cats with target {self.target}: {train_cats}")
@@ -280,19 +277,17 @@ def fill_train_and_tests(self):

target_factor = self.util.config_val("DATA", "target_divide_by", False)
if target_factor:
self.df_test[self.target] = self.df_test[self.target] / float(
target_factor
)
self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
self.df_train[self.target] = self.df_train[self.target] / float(
target_factor
)
if not self.util.exp_is_classification():
self.df_test["class_label"] = self.df_test[
"class_label"
] / float(target_factor)
self.df_train["class_label"] = self.df_train[
"class_label"
] / float(target_factor)
self.df_test["class_label"] = self.df_test["class_label"] / float(
target_factor
)
self.df_train["class_label"] = self.df_train["class_label"] / float(
target_factor
)

def _add_random_target(self, df):
labels = glob_conf.labels
@@ -305,18 +300,14 @@ def _add_random_target(self, df):
def plot_distribution(self, df_labels):
"""Plot the distribution of samples and speaker per target class and biological sex"""
plot = Plots()
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all"
)
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
plot.plot_distributions(df_labels)
if self.got_speaker:
plot.plot_distributions_speaker(df_labels)

def extract_test_feats(self):
self.feats_test = pd.DataFrame()
feats_name = "_".join(
ast.literal_eval(glob_conf.config["DATA"]["tests"])
)
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
feats_types = self.util.config_val_list("FEATS", "type", ["os"])
self.feature_extractor = FeatureExtractor(
self.df_test, feats_types, feats_name, "test"
@@ -333,9 +324,7 @@ def extract_feats(self):
"""
df_train, df_test = self.df_train, self.df_test
feats_name = "_".join(
ast.literal_eval(glob_conf.config["DATA"]["databases"])
)
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
feats_types = self.util.config_val_list("FEATS", "type", ["os"])
self.feature_extractor = FeatureExtractor(
Expand Down Expand Up @@ -364,9 +353,7 @@ def extract_feats(self):
f"test feats ({self.feats_test.shape[0]}) != test labels"
f" ({self.df_test.shape[0]})"
)
self.df_test = self.df_test[
self.df_test.index.isin(self.feats_test.index)
]
self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
self.util.warn(f"mew test labels shape: {self.df_test.shape[0]}")

self._check_scale()
@@ -457,8 +444,7 @@ def autopredict(self):
predictor = ValencePredictor(df)
df = predictor.predict(sample_selection)
elif target == "dominance":
from nkululeko.autopredict.ap_dominance import \
DominancePredictor
from nkululeko.autopredict.ap_dominance import DominancePredictor

predictor = DominancePredictor(df)
df = predictor.predict(sample_selection)
@@ -472,9 +458,7 @@ def random_splice(self):
"""
from nkululeko.augmenting.randomsplicer import Randomsplicer

sample_selection = self.util.config_val(
"DATA", "random_splice", "train"
)
sample_selection = self.util.config_val("DATA", "random_splice", "train")
if sample_selection == "all":
df = pd.concat([self.df_train, self.df_test])
elif sample_selection == "train":
@@ -498,9 +482,7 @@ def analyse_features(self, needs_feats):
plot_feats = eval(
self.util.config_val("EXPL", "feature_distributions", "False")
)
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all"
)
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
# get the data labels
if sample_selection == "all":
df_labels = pd.concat([self.df_train, self.df_test])
@@ -523,8 +505,9 @@ def analyse_features(self, needs_feats):
# check if data should be shown with the spotlight data visualizer
spotlight = eval(self.util.config_val("EXPL", "spotlight", "False"))
if spotlight:
self.util.debug('opening spotlight tab in web browser')
self.util.debug("opening spotlight tab in web browser")
from renumics import spotlight

spotlight.show(df_labels.reset_index())

if not needs_feats:
@@ -543,9 +526,7 @@ def analyse_features(self, needs_feats):
)

if plot_feats:
feat_analyser = FeatureAnalyser(
sample_selection, df_labels, df_feats
)
feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats)
feat_analyser.analyse()

# check if a scatterplot should be done
@@ -555,9 +536,7 @@ def analyse_features(self, needs_feats):
if self.util.exp_is_classification():
plots = Plots()
for scatter in scatters:
plots.scatter_plot(
df_feats, df_labels["class_label"], scatter
)
plots.scatter_plot(df_feats, df_labels["class_label"], scatter)
else:
self.util.debug("can't do scatterplot if not classification")

@@ -636,8 +615,7 @@ def run(self):
def plot_confmat_per_speaker(self, function):
if self.loso or self.logo or self.xfoldx:
self.util.debug(
"plot combined speaker predictions not possible for cross"
" validation"
"plot combined speaker predictions not possible for cross" " validation"
)
return
best = self._get_best_report(self.reports)
@@ -647,9 +625,7 @@ def plot_confmat_per_speaker(self, function):
preds = best.preds
speakers = self.df_test.speaker.values
print(f"{len(truths)} {len(preds)} {len(speakers) }")
df = pd.DataFrame(
data={"truth": truths, "pred": preds, "speaker": speakers}
)
df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
plot_name = "result_combined_per_speaker"
self.util.debug(
f"plotting speaker combination ({function}) confusion matrix to"
@@ -695,6 +671,4 @@ def save(self, filename):
pickle.dump(self.__dict__, f)
f.close()
except (AttributeError, TypeError, RuntimeError) as error:
self.util.warn(
f"Save experiment: Can't pickle local object: {error}"
)
self.util.warn(f"Save experiment: Can't pickle local object: {error}")
48 changes: 48 additions & 0 deletions nkululeko/modelrunner.py
@@ -1,7 +1,10 @@
# modelrunner.py

import pandas as pd

from nkululeko.util import Util
from nkululeko import glob_conf
import nkululeko.glob_conf as glob_conf


class Modelrunner:
@@ -68,6 +71,8 @@ def do_epochs(self):
return reports

def _select_model(self, model_type):
self._check_balancing()

if model_type == "svm":
from nkululeko.models.model_svm import SVM_model

@@ -154,3 +159,46 @@ def _select_model(self, model_type):
" classifier"
)
return self.model

def _check_balancing(self):
balancing = self.util.config_val("FEATS", "balancing", False)
if balancing:
orig_size = self.feats_train.shape[0]
self.util.debug(f"balancing the training features with: {balancing}")
if balancing == "ros":
from imblearn.over_sampling import RandomOverSampler

sampler = RandomOverSampler()
X_res, y_res = sampler.fit_resample(
self.feats_train, self.df_train[self.target]
)
elif balancing == "smote":
from imblearn.over_sampling import SMOTE

sampler = SMOTE()
X_res, y_res = sampler.fit_resample(
self.feats_train, self.df_train[self.target]
)
elif balancing == "adasyn":
from imblearn.over_sampling import ADASYN

sampler = ADASYN()
X_res, y_res = sampler.fit_resample(
self.feats_train, self.df_train[self.target]
)
else:
self.util.error(
f"unknown balancing algorithm: {balancing} (should be [ros|smote|adasyn])"
)

self.feats_train = X_res
self.df_train = pd.DataFrame({self.target: y_res}, index=X_res.index)
self.util.debug(
f"balanced with: {balancing}, new size: {X_res.shape[0]} (was {orig_size})"
)
le = glob_conf.label_encoder
res = y_res.value_counts()
resd = {}
for i, e in enumerate(le.inverse_transform(res.index.values)):
resd[e] = res.values[i]
self.util.debug(f"{resd})")
32 changes: 11 additions & 21 deletions nkululeko/util.py
@@ -74,22 +74,17 @@ def config_val_data(self, dataset, key, default):
if self.got_data_roots:
try:
if len(key) > 0:
return self.data_roots["DATA"][
dataset + "." + key
].strip("'\"")
return self.data_roots["DATA"][dataset + "." + key].strip("'\"")
else:
return self.data_roots["DATA"][dataset].strip("'\"")
except KeyError:
if not default in self.stopvals:
self.debug(
f"value for {key} not found, using default:"
f" {default}"
f"value for {key} not found, using default:" f" {default}"
)
return default
if not default in self.stopvals:
self.debug(
f"value for {key} not found, using default: {default}"
)
self.debug(f"value for {key} not found, using default: {default}")
return default

def get_save_name(self):
@@ -123,17 +118,15 @@ def make_segmented_index(self, df):
if len(df) == 0:
return df
if not isinstance(df.index, pd.MultiIndex):
df.index = audformat.utils.to_segmented_index(
df.index, allow_nat=False
)
df.index = audformat.utils.to_segmented_index(df.index, allow_nat=False)
return df

def _get_value_descript(self, section, name):
if self.config_val(section, name, False):
val = self.config_val(section, name, False)
val = str(val).strip('.')
return f'_{name}-{str(val)}'
return ''
val = str(val).strip(".")
return f"_{name}-{str(val)}"
return ""

def get_data_name(self):
"""
@@ -151,7 +144,7 @@ def get_exp_name(self, only_train=False, only_data=False):
return_string = f"{ds}"
if not only_data:
mt = self.get_model_description()
return_string = return_string+'_'+mt
return_string = return_string + "_" + mt
return return_string.replace("__", "_")

def get_model_description(self):
@@ -178,6 +171,7 @@ def get_model_description(self):
["MODEL", "logo"],
["MODEL", "learning_rate"],
["MODEL", "k_fold_cross"],
["FEATS", "balancing"],
]
for option in options:
return_string += self._get_value_descript(option[0], option[1])
@@ -224,19 +218,15 @@ def config_val(self, section, key, default):
return self.config[section][key]
except KeyError:
if not default in self.stopvals:
self.debug(
f"value for {key} not found, using default: {default}"
)
self.debug(f"value for {key} not found, using default: {default}")
return default

def config_val_list(self, section, key, default):
try:
return ast.literal_eval(self.config[section][key])
except KeyError:
if not default in self.stopvals:
self.debug(
f"value for {key} not found, using default: {default}"
)
self.debug(f"value for {key} not found, using default: {default}")
return default

def continuous_to_categorical(self, array):
