0.93.8

felixbur · Dec 12, 2024 · 21c40c3 · 21c40c3
1 parent 8b50484
commit 21c40c3
Show file tree

Hide file tree

Showing 9 changed files with 84 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+Version 0.93.8
+--------------
+* fixed bugs in plot
+* added import_files_append=False
+
 Version 0.93.7
 --------------
 * added a safety to remove nan values after mapping

diff --git a/ini_file.md b/ini_file.md
@@ -183,6 +183,8 @@
     * **import**: [already computed features](http://blog.syntheticspeech.de/2022/10/18/how-to-import-features-from-outside-the-nkululeko-software/)
       * **import_file** = paths to files with features in CSV format
         * import_file = ['path1/file1.csv', 'path2/file1.csv2']  
+      * **import_files_append** = set this to False to concatenate the imported files column-wise; by default (True) they are appended row-wise
+        * import_files_append = True  
     * **mld**: [mid-level-descriptors](http://www.essv.de/paper.php?id=447)
       * **mld.model** = *path to the mld sources folder*
       * **min_syls** = *minimum number of syllables*
@@ -197,8 +199,8 @@
       * **fft_hop_dur** = 10 *(msec hop duration)*
       * **fft_nbands** = 64 *(number of frequency bands)*
     * **ast**: [audio spectrogram transformer](https://arxiv.org/abs/2104.01778) features from MIT
-    * **trill**: [TRILL embeddings](https://ai.googleblog.com/2020/06/improving-speech-representations-and.html) from Google
-      * **trill.model** = *path to the TRILL model folder, optional*
+    <!-- * **trill**: [TRILL embeddings](https://ai.googleblog.com/2020/06/improving-speech-representations-and.html) from Google
+      * **trill.model** = *path to the TRILL model folder, optional* -->
     * **wav2vec variants**: [wav2vec2 embeddings](https://huggingface.co/facebook/wav2vec2-large-robust-ft-swbd-300h) from facebook
       * "wav2vec2-large-robust-ft-swbd-300h"
       * **wav2vec.model** = *path to the wav2vec2 model folder*

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.93.7"
+VERSION="0.93.8"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/feat_extract/feats_import.py b/nkululeko/feat_extract/feats_import.py
@@ -18,6 +18,10 @@ def __init__(self, name, data_df, feats_type):
     def extract(self):
         """Import the features."""
         self.util.debug(f"importing features for {self.name}")
+        # import_files_append: set this to True if the multiple tables should be combined row-wise, else they are combined column-wise
+        import_files_append = eval(
+            self.util.config_val("FEATS", "import_files_append", "True")
+        )
         try:
             feat_import_files = self.util.config_val("FEATS", "import_file", False)
             feat_import_files = ast.literal_eval(feat_import_files)
@@ -38,7 +42,10 @@ def extract(self):
             df = audformat.utils.read_csv(feat_import_file)
             df = self.util.make_segmented_index(df)
             df = df[df.index.isin(self.data_df.index)]
-            feat_df = pd.concat([feat_df, df])
+            if import_files_append:
+                feat_df = pd.concat([feat_df, df], axis=0)
+            else:
+                feat_df = pd.concat([feat_df, df], axis=1)
         if feat_df.shape[0] == 0:
             self.util.error(f"Imported features for data set {self.name} not found!")
         # and assign to be the "official" feature set

diff --git a/nkululeko/feat_extract/feats_trill.py b/nkululeko/feat_extract/feats_trill.py
@@ -3,15 +3,16 @@
 
 import audiofile as af
 import pandas as pd
-import tensorflow as tf
-import tensorflow_hub as hub
+
+# import tensorflow as tf
+# import tensorflow_hub as hub
 from tqdm import tqdm
 
 import nkululeko.glob_conf as glob_conf
 from nkululeko.feat_extract.featureset import Featureset
 
 # Import TF 2.X and make sure we're running eager.
-assert tf.executing_eagerly()
+# assert tf.executing_eagerly()
 
 
 class TRILLset(Featureset):
@@ -39,7 +40,7 @@ def __init__(self, name, data_df, feats_type):
             "trill.model",
             "https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3",
         )
-        self.model = hub.load(model_path)
+        # self.model = hub.load(model_path)
         self.feats_type = feats_type
 
     def extract(self):

diff --git a/nkululeko/plots.py b/nkululeko/plots.py
@@ -242,84 +242,100 @@ def _check_binning(self, att, df):
 
     def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
         """Plot relation of two continuous distributions with one categorical."""
+        plot_df = df[[cont1, cont2, cat]].copy()
         if cont2 == "class_label":
-            df.rename(columns={cont2: self.target})
+            plot_df = plot_df.rename(columns={cont2: self.target})
             cont2 = self.target
         if cont1 == "class_label":
-            df.rename(columns={cont1: self.target})
+            plot_df = plot_df.rename(columns={cont1: self.target})
             cont1 = self.target
         if cat == "class_label":
-            df.rename(columns={cat: self.target})
+            plot_df = plot_df.rename(columns={cat: self.target})
             cat = self.target
-        pearson = stats.pearsonr(df[cont1], df[cont2])
+        pearson = stats.pearsonr(plot_df[cont1], plot_df[cont2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ccc_string = ""
         if self.with_ccc:
-            ccc_val = ccc(df[cont1], df[cont2])
+            ccc_val = ccc(plot_df[cont1], plot_df[cont2])
             ccc_val = int(ccc_val * 1000) / 1000
             ccc_string = f"CCC: {ccc_val}"
-        ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
-        caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}"
+        ax = sns.lmplot(data=plot_df, x=cont1, y=cont2, hue=cat)
+        caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}"
         ax.figure.suptitle(caption)
         return ax, caption
 
     def _plot2cont(self, df, col1, col2, ylab):
         """Plot relation of two continuous distributions."""
+        plot_df = df[[col1, col2]].copy()
         # rename "class_label" to the original target
         if col2 == "class_label":
-            df.rename(columns={col2: self.target})
+            plot_df = plot_df.rename(columns={col2: self.target})
             col2 = self.target
         if col1 == "class_label":
-            df.rename(columns={col1: self.target})
+            plot_df = plot_df.rename(columns={col1: self.target})
             col1 = self.target
-        pearson = stats.pearsonr(df[col1], df[col2])
+        pearson = stats.pearsonr(plot_df[col1], plot_df[col2])
         # trunc to three digits
         pearson = int(pearson[0] * 1000) / 1000
         pearson_string = f"PCC: {pearson}"
         ccc_string = ""
         if self.with_ccc:
-            ccc_val = ccc(df[col1], df[col2])
+            ccc_val = ccc(plot_df[col1], plot_df[col2])
             ccc_val = int(ccc_val * 1000) / 1000
             ccc_string = f"CCC: {ccc_val}"
-        ax = sns.lmplot(data=df, x=col1, y=col2)
-        caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}"
+        ax = sns.lmplot(data=plot_df, x=col1, y=col2)
+        caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}"
         ax.figure.suptitle(caption)
         return ax, caption
 
     def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
         """Plot relation of categorical distribution with continuous."""
         # rename "class_label" to the original target
+        plot_df = df[[cat_col, cont_col]].copy()
         if cat_col == "class_label":
-            df.rename(columns={cat_col: self.target})
+            plot_df = plot_df.rename(columns={cat_col: self.target})
             cat_col = self.target
         dist_type = self.util.config_val("EXPL", "dist_type", "kde")
-        cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
+        cats, cat_str, es = su.get_effect_size(plot_df, cat_col, cont_col)
         model_type = self.util.get_model_type()
         if dist_type == "hist" and model_type != "tree":
-            ax = sns.histplot(df, x=cont_col, hue=cat_col, kde=True)
-            caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+            ax = sns.histplot(plot_df, x=cont_col, hue=cat_col, kde=True)
+            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
             ax.set_title(caption)
             ax.set_xlabel(f"{cont_col}")
             ax.set_ylabel(f"number of {ylab}")
         else:
             ax = sns.displot(
-                df, x=cont_col, hue=cat_col, kind="kde", fill=True, warn_singular=False
+                plot_df,
+                x=cont_col,
+                hue=cat_col,
+                kind="kde",
+                fill=True,
+                warn_singular=False,
             )
             ax.set(xlabel=f"{cont_col}")
-            caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
             ax.figure.suptitle(caption)
         return ax, caption
 
     def _plot2cat(self, df, col1, col2, xlab, ylab):
         """Plot relation of 2 categorical distributions."""
-        crosstab = pd.crosstab(index=df[col1], columns=df[col2])
+        plot_df = df[[col1, col2]].copy()
+        # rename "class_label" to the original target
+        if col2 == "class_label":
+            plot_df = plot_df.rename(columns={col2: self.target})
+            col2 = self.target
+        if col1 == "class_label":
+            plot_df = plot_df.rename(columns={col1: self.target})
+            col1 = self.target
+        crosstab = pd.crosstab(index=plot_df[col1], columns=plot_df[col2])
         res_pval = stats.chi2_contingency(crosstab)
         res_pval = int(res_pval[1] * 1000) / 1000
-        caption = f"{ylab} {df.shape[0]}. P-val chi2: {res_pval}"
+        caption = f"{ylab} {plot_df.shape[0]}. P-val chi2: {res_pval}"
         ax = (
-            df.groupby(col1, observed=False)[col2]
+            plot_df.groupby(col1, observed=False)[col2]
             .value_counts()
             .unstack()
             .plot(kind="bar", stacked=True, title=caption, rot=0)

diff --git a/nkululeko/reporting/reporter.py b/nkululeko/reporting/reporter.py
@@ -3,32 +3,28 @@
 import json
 import math
 
+# import os
+from confidence_intervals import evaluate_with_conf_int
 import matplotlib.pyplot as plt
 import numpy as np
+from scipy.special import softmax
+from scipy.stats import entropy
+from scipy.stats import pearsonr
+from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.metrics import RocCurveDisplay
+from sklearn.metrics import auc
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import r2_score
+from sklearn.metrics import roc_auc_score
+from sklearn.metrics import roc_curve
 
 # from torch import is_tensor
-from audmetric import (
-    accuracy,
-    concordance_cc,
-    mean_absolute_error,
-    mean_squared_error,
-    unweighted_average_recall,
-)
-
-# import os
-from confidence_intervals import evaluate_with_conf_int
-from scipy.special import softmax
-from scipy.stats import entropy, pearsonr
-from sklearn.metrics import (
-    ConfusionMatrixDisplay,
-    RocCurveDisplay,
-    auc,
-    classification_report,
-    confusion_matrix,
-    r2_score,
-    roc_auc_score,
-    roc_curve,
-)
+from audmetric import accuracy
+from audmetric import concordance_cc
+from audmetric import mean_absolute_error
+from audmetric import mean_squared_error
+from audmetric import unweighted_average_recall
 
 import nkululeko.glob_conf as glob_conf
 from nkululeko.plots import Plots

diff --git a/requirements.txt b/requirements.txt
@@ -23,10 +23,11 @@ pylatex
 scikit_learn
 scipy
 seaborn
+shap
 sounddevice
 splitutils
-tensorflow
-tensorflow_hub
+# tensorflow
+# tensorflow_hub
 torch
 torchaudio
 torchvision

diff --git a/setup.cfg b/setup.cfg
@@ -36,8 +36,8 @@ install_requires =
     scipy
     seaborn
     sounddevice
-    tensorflow
-    tensorflow_hub
+    # tensorflow
+    # tensorflow_hub
     torch
     torchvision
     transformers