From 21c40c3e2ae939e595a70dc6159da7c60da795e4 Mon Sep 17 00:00:00 2001 From: FBurkhardt Date: Thu, 12 Dec 2024 19:40:25 +0100 Subject: [PATCH] 0.93.8 --- CHANGELOG.md | 5 +++ ini_file.md | 6 ++- nkululeko/constants.py | 2 +- nkululeko/feat_extract/feats_import.py | 9 +++- nkululeko/feat_extract/feats_trill.py | 9 ++-- nkululeko/plots.py | 60 ++++++++++++++++---------- nkululeko/reporting/reporter.py | 40 ++++++++--------- requirements.txt | 5 ++- setup.cfg | 4 +- 9 files changed, 84 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 372ee85..2b6d269 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +Version 0.93.8 +-------------- +* fixed bugs in plot +* added import_files_append=False + Version 0.93.7 -------------- * added a safety to remove nan values after mapping diff --git a/ini_file.md b/ini_file.md index a845a41..3d394ab 100644 --- a/ini_file.md +++ b/ini_file.md @@ -183,6 +183,8 @@ * **import**: [already computed features](http://blog.syntheticspeech.de/2022/10/18/how-to-import-features-from-outside-the-nkululeko-software/) * **import_file** = pathes to files with features in CSV format * import_file = ['path1/file1.csv', 'path2/file1.csv2'] + * **import_files_append** = set this to False if you want the files to be concatenated column-wise, else it's done row-wise + * import_files_append = True * **mld**: [mid-level-descriptors](http://www.essv.de/paper.php?id=447) * **mld.model** = *path to the mld sources folder* * **min_syls** = *minimum number of syllables* @@ -197,8 +199,8 @@ * **fft_hop_dur** = 10 *(msec hop duration)* * **fft_nbands** = 64 *(number of frequency bands)* * **ast**: [audio spectrogram transformer](https://arxiv.org/abs/2104.01778) features from MIT - * **trill**: [TRILL embeddings](https://ai.googleblog.com/2020/06/improving-speech-representations-and.html) from Google - * **trill.model** = *path to the TRILL model folder, optional* + * **wav2vec variants**: [wav2vec2 embeddings](https://huggingface.co/facebook/wav2vec2-large-robust-ft-swbd-300h) from facebook * "wav2vec2-large-robust-ft-swbd-300h" * **wav2vec.model** = *path to the wav2vec2 model folder* diff --git a/nkululeko/constants.py b/nkululeko/constants.py index 71ae542..9708b6a 100644 --- a/nkululeko/constants.py +++ b/nkululeko/constants.py @@ -1,2 +1,2 @@ -VERSION="0.93.7" +VERSION="0.93.8" SAMPLING_RATE = 16000 diff --git a/nkululeko/feat_extract/feats_import.py b/nkululeko/feat_extract/feats_import.py index 5e939da..44ad37d 100644 --- a/nkululeko/feat_extract/feats_import.py +++ b/nkululeko/feat_extract/feats_import.py @@ -18,6 +18,10 @@ def __init__(self, name, data_df, feats_type): def extract(self): """Import the features.""" self.util.debug(f"importing features for {self.name}") + # import_files_append: set this to True if the multiple tables should be combined row-wise, else they are combined column-wise + import_files_append = eval( + self.util.config_val("FEATS", "import_files_append", "True") + ) try: feat_import_files = self.util.config_val("FEATS", "import_file", False) feat_import_files = ast.literal_eval(feat_import_files) @@ -38,7 +42,10 @@ def extract(self): df = audformat.utils.read_csv(feat_import_file) df = self.util.make_segmented_index(df) df = df[df.index.isin(self.data_df.index)] - feat_df = pd.concat([feat_df, df]) + if import_files_append: + feat_df = pd.concat([feat_df, df], axis=0) + else: + feat_df = pd.concat([feat_df, df], axis=1) if feat_df.shape[0] == 0: self.util.error(f"Imported features for data set {self.name} not found!") # and assign to be the "official" feature set diff --git a/nkululeko/feat_extract/feats_trill.py b/nkululeko/feat_extract/feats_trill.py index f6aafec..dd8733e 100644 --- a/nkululeko/feat_extract/feats_trill.py +++ b/nkululeko/feat_extract/feats_trill.py @@ -3,15 +3,16 @@ import audiofile as af import pandas as pd -import tensorflow as tf -import tensorflow_hub as hub + +# import tensorflow as tf +# import tensorflow_hub as hub from tqdm import tqdm import nkululeko.glob_conf as glob_conf from nkululeko.feat_extract.featureset import Featureset # Import TF 2.X and make sure we're running eager. -assert tf.executing_eagerly() +# assert tf.executing_eagerly() class TRILLset(Featureset): @@ -39,7 +40,7 @@ def __init__(self, name, data_df, feats_type): "trill.model", "https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3", ) - self.model = hub.load(model_path) + # self.model = hub.load(model_path) self.feats_type = feats_type def extract(self): diff --git a/nkululeko/plots.py b/nkululeko/plots.py index b6bde6f..7e7521f 100644 --- a/nkululeko/plots.py +++ b/nkululeko/plots.py @@ -242,84 +242,100 @@ def _check_binning(self, att, df): def _plot2cont_cat(self, df, cont1, cont2, cat, ylab): """Plot relation of two continuous distributions with one categorical.""" + plot_df = df[[cont1, cont2, cat]].copy() if cont2 == "class_label": - df.rename(columns={cont2: self.target}) + plot_df = plot_df.rename(columns={cont2: self.target}) cont2 = self.target if cont1 == "class_label": - df.rename(columns={cont1: self.target}) + plot_df = plot_df.rename(columns={cont1: self.target}) cont1 = self.target if cat == "class_label": - df.rename(columns={cat: self.target}) + plot_df = plot_df.rename(columns={cat: self.target}) cat = self.target - pearson = stats.pearsonr(df[cont1], df[cont2]) + pearson = stats.pearsonr(plot_df[cont1], plot_df[cont2]) # trunc to three digits pearson = int(pearson[0] * 1000) / 1000 pearson_string = f"PCC: {pearson}" ccc_string = "" if self.with_ccc: - ccc_val = ccc(df[cont1], df[cont2]) + ccc_val = ccc(plot_df[cont1], plot_df[cont2]) ccc_val = int(ccc_val * 1000) / 1000 ccc_string = f"CCC: {ccc_val}" - ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat) - caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}" + ax = sns.lmplot(data=plot_df, x=cont1, y=cont2, hue=cat) + caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}" ax.figure.suptitle(caption) return ax, caption def _plot2cont(self, df, col1, col2, ylab): """Plot relation of two continuous distributions.""" + plot_df = df[[col1, col2]].copy() # rename "class_label" to the original target if col2 == "class_label": - df.rename(columns={col2: self.target}) + plot_df = plot_df.rename(columns={col2: self.target}) col2 = self.target if col1 == "class_label": - df.rename(columns={col1: self.target}) + plot_df = plot_df.rename(columns={col1: self.target}) col1 = self.target - pearson = stats.pearsonr(df[col1], df[col2]) + pearson = stats.pearsonr(plot_df[col1], plot_df[col2]) # trunc to three digits pearson = int(pearson[0] * 1000) / 1000 pearson_string = f"PCC: {pearson}" ccc_string = "" if self.with_ccc: - ccc_val = ccc(df[col1], df[col2]) + ccc_val = ccc(plot_df[col1], plot_df[col2]) ccc_val = int(ccc_val * 1000) / 1000 ccc_string = f"CCC: {ccc_val}" - ax = sns.lmplot(data=df, x=col1, y=col2) - caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}" + ax = sns.lmplot(data=plot_df, x=col1, y=col2) + caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}" ax.figure.suptitle(caption) return ax, caption def plotcatcont(self, df, cat_col, cont_col, xlab, ylab): """Plot relation of categorical distribution with continuous.""" # rename "class_label" to the original target + plot_df = df[[cat_col, cont_col]].copy() if cat_col == "class_label": - df.rename(columns={cat_col: self.target}) + plot_df = plot_df.rename(columns={cat_col: self.target}) cat_col = self.target dist_type = self.util.config_val("EXPL", "dist_type", "kde") - cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col) + cats, cat_str, es = su.get_effect_size(plot_df, cat_col, cont_col) model_type = self.util.get_model_type() if dist_type == "hist" and model_type != "tree": - ax = sns.histplot(df, x=cont_col, hue=cat_col, kde=True) - caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + ax = sns.histplot(plot_df, x=cont_col, hue=cat_col, kde=True) + caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}" ax.set_title(caption) ax.set_xlabel(f"{cont_col}") ax.set_ylabel(f"number of {ylab}") else: ax = sns.displot( - df, x=cont_col, hue=cat_col, kind="kde", fill=True, warn_singular=False + plot_df, + x=cont_col, + hue=cat_col, + kind="kde", + fill=True, + warn_singular=False, ) ax.set(xlabel=f"{cont_col}") - caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}" + caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}" ax.figure.suptitle(caption) return ax, caption def _plot2cat(self, df, col1, col2, xlab, ylab): """Plot relation of 2 categorical distributions.""" - crosstab = pd.crosstab(index=df[col1], columns=df[col2]) + plot_df = df[[col1, col2]].copy() + # rename "class_label" to the original target + if col2 == "class_label": + plot_df = plot_df.rename(columns={col2: self.target}) + col2 = self.target + if col1 == "class_label": + plot_df = plot_df.rename(columns={col1: self.target}) + col1 = self.target + crosstab = pd.crosstab(index=plot_df[col1], columns=plot_df[col2]) res_pval = stats.chi2_contingency(crosstab) res_pval = int(res_pval[1] * 1000) / 1000 - caption = f"{ylab} {df.shape[0]}. P-val chi2: {res_pval}" + caption = f"{ylab} {plot_df.shape[0]}. P-val chi2: {res_pval}" ax = ( - df.groupby(col1, observed=False)[col2] + plot_df.groupby(col1, observed=False)[col2] .value_counts() .unstack() .plot(kind="bar", stacked=True, title=caption, rot=0) diff --git a/nkululeko/reporting/reporter.py b/nkululeko/reporting/reporter.py index ad45c78..d81a7bb 100644 --- a/nkululeko/reporting/reporter.py +++ b/nkululeko/reporting/reporter.py @@ -3,32 +3,28 @@ import json import math +# import os +from confidence_intervals import evaluate_with_conf_int import matplotlib.pyplot as plt import numpy as np +from scipy.special import softmax +from scipy.stats import entropy +from scipy.stats import pearsonr +from sklearn.metrics import ConfusionMatrixDisplay +from sklearn.metrics import RocCurveDisplay +from sklearn.metrics import auc +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix +from sklearn.metrics import r2_score +from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_curve # from torch import is_tensor -from audmetric import ( - accuracy, - concordance_cc, - mean_absolute_error, - mean_squared_error, - unweighted_average_recall, -) - -# import os -from confidence_intervals import evaluate_with_conf_int -from scipy.special import softmax -from scipy.stats import entropy, pearsonr -from sklearn.metrics import ( - ConfusionMatrixDisplay, - RocCurveDisplay, - auc, - classification_report, - confusion_matrix, - r2_score, - roc_auc_score, - roc_curve, -) +from audmetric import accuracy +from audmetric import concordance_cc +from audmetric import mean_absolute_error +from audmetric import mean_squared_error +from audmetric import unweighted_average_recall import nkululeko.glob_conf as glob_conf from nkululeko.plots import Plots diff --git a/requirements.txt b/requirements.txt index c19d145..1a84ee1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,10 +23,11 @@ pylatex scikit_learn scipy seaborn +shap sounddevice splitutils -tensorflow -tensorflow_hub +# tensorflow +# tensorflow_hub torch torchaudio torchvision diff --git a/setup.cfg b/setup.cfg index 8338cf8..9f2c377 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,8 +36,8 @@ install_requires = scipy seaborn sounddevice - tensorflow - tensorflow_hub + # tensorflow + # tensorflow_hub torch torchvision transformers