Skip to content

Commit

Permalink
0.93.8
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Dec 12, 2024
1 parent 8b50484 commit 21c40c3
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 56 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

Version 0.93.8
--------------
* fixed bugs in plot
* added import_files_append=False

Version 0.93.7
--------------
* added a safety to remove nan values after mapping
Expand Down
6 changes: 4 additions & 2 deletions ini_file.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@
* **import**: [already computed features](http://blog.syntheticspeech.de/2022/10/18/how-to-import-features-from-outside-the-nkululeko-software/)
* **import_file** = paths to files with features in CSV format
* import_file = ['path1/file1.csv', 'path2/file2.csv']
* **import_files_append** = set this to False to concatenate the imported files column-wise; by default (True) they are concatenated row-wise
* import_files_append = True
* **mld**: [mid-level-descriptors](http://www.essv.de/paper.php?id=447)
* **mld.model** = *path to the mld sources folder*
* **min_syls** = *minimum number of syllables*
Expand All @@ -197,8 +199,8 @@
* **fft_hop_dur** = 10 *(msec hop duration)*
* **fft_nbands** = 64 *(number of frequency bands)*
* **ast**: [audio spectrogram transformer](https://arxiv.org/abs/2104.01778) features from MIT
* **trill**: [TRILL embeddings](https://ai.googleblog.com/2020/06/improving-speech-representations-and.html) from Google
* **trill.model** = *path to the TRILL model folder, optional*
<!-- * **trill**: [TRILL embeddings](https://ai.googleblog.com/2020/06/improving-speech-representations-and.html) from Google
* **trill.model** = *path to the TRILL model folder, optional* -->
* **wav2vec variants**: [wav2vec2 embeddings](https://huggingface.co/facebook/wav2vec2-large-robust-ft-swbd-300h) from facebook
* "wav2vec2-large-robust-ft-swbd-300h"
* **wav2vec.model** = *path to the wav2vec2 model folder*
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.93.7"
VERSION="0.93.8"
SAMPLING_RATE = 16000
9 changes: 8 additions & 1 deletion nkululeko/feat_extract/feats_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def __init__(self, name, data_df, feats_type):
def extract(self):
"""Import the features."""
self.util.debug(f"importing features for {self.name}")
# import_files_append: set this to True if the multiple tables should be combined row-wise, else they are combined column-wise
import_files_append = eval(
self.util.config_val("FEATS", "import_files_append", "True")
)
try:
feat_import_files = self.util.config_val("FEATS", "import_file", False)
feat_import_files = ast.literal_eval(feat_import_files)
Expand All @@ -38,7 +42,10 @@ def extract(self):
df = audformat.utils.read_csv(feat_import_file)
df = self.util.make_segmented_index(df)
df = df[df.index.isin(self.data_df.index)]
feat_df = pd.concat([feat_df, df])
if import_files_append:
feat_df = pd.concat([feat_df, df], axis=0)
else:
feat_df = pd.concat([feat_df, df], axis=1)
if feat_df.shape[0] == 0:
self.util.error(f"Imported features for data set {self.name} not found!")
# and assign to be the "official" feature set
Expand Down
9 changes: 5 additions & 4 deletions nkululeko/feat_extract/feats_trill.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@

import audiofile as af
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# import tensorflow as tf
# import tensorflow_hub as hub
from tqdm import tqdm

import nkululeko.glob_conf as glob_conf
from nkululeko.feat_extract.featureset import Featureset

# Import TF 2.X and make sure we're running eager.
assert tf.executing_eagerly()
# assert tf.executing_eagerly()


class TRILLset(Featureset):
Expand Down Expand Up @@ -39,7 +40,7 @@ def __init__(self, name, data_df, feats_type):
"trill.model",
"https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3",
)
self.model = hub.load(model_path)
# self.model = hub.load(model_path)
self.feats_type = feats_type

def extract(self):
Expand Down
60 changes: 38 additions & 22 deletions nkululeko/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,84 +242,100 @@ def _check_binning(self, att, df):

def _plot2cont_cat(self, df, cont1, cont2, cat, ylab):
"""Plot relation of two continuous distributions with one categorical."""
plot_df = df[[cont1, cont2, cat]].copy()
if cont2 == "class_label":
df.rename(columns={cont2: self.target})
plot_df = plot_df.rename(columns={cont2: self.target})
cont2 = self.target
if cont1 == "class_label":
df.rename(columns={cont1: self.target})
plot_df = plot_df.rename(columns={cont1: self.target})
cont1 = self.target
if cat == "class_label":
df.rename(columns={cat: self.target})
plot_df = plot_df.rename(columns={cat: self.target})
cat = self.target
pearson = stats.pearsonr(df[cont1], df[cont2])
pearson = stats.pearsonr(plot_df[cont1], plot_df[cont2])
# trunc to three digits
pearson = int(pearson[0] * 1000) / 1000
pearson_string = f"PCC: {pearson}"
ccc_string = ""
if self.with_ccc:
ccc_val = ccc(df[cont1], df[cont2])
ccc_val = ccc(plot_df[cont1], plot_df[cont2])
ccc_val = int(ccc_val * 1000) / 1000
ccc_string = f"CCC: {ccc_val}"
ax = sns.lmplot(data=df, x=cont1, y=cont2, hue=cat)
caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}"
ax = sns.lmplot(data=plot_df, x=cont1, y=cont2, hue=cat)
caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}"
ax.figure.suptitle(caption)
return ax, caption

def _plot2cont(self, df, col1, col2, ylab):
"""Plot relation of two continuous distributions."""
plot_df = df[[col1, col2]].copy()
# rename "class_label" to the original target
if col2 == "class_label":
df.rename(columns={col2: self.target})
plot_df = plot_df.rename(columns={col2: self.target})
col2 = self.target
if col1 == "class_label":
df.rename(columns={col1: self.target})
plot_df = plot_df.rename(columns={col1: self.target})
col1 = self.target
pearson = stats.pearsonr(df[col1], df[col2])
pearson = stats.pearsonr(plot_df[col1], plot_df[col2])
# trunc to three digits
pearson = int(pearson[0] * 1000) / 1000
pearson_string = f"PCC: {pearson}"
ccc_string = ""
if self.with_ccc:
ccc_val = ccc(df[col1], df[col2])
ccc_val = ccc(plot_df[col1], plot_df[col2])
ccc_val = int(ccc_val * 1000) / 1000
ccc_string = f"CCC: {ccc_val}"
ax = sns.lmplot(data=df, x=col1, y=col2)
caption = f"{ylab} {df.shape[0]}. {pearson_string} {ccc_string}"
ax = sns.lmplot(data=plot_df, x=col1, y=col2)
caption = f"{ylab} {plot_df.shape[0]}. {pearson_string} {ccc_string}"
ax.figure.suptitle(caption)
return ax, caption

def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
"""Plot relation of categorical distribution with continuous."""
# rename "class_label" to the original target
plot_df = df[[cat_col, cont_col]].copy()
if cat_col == "class_label":
df.rename(columns={cat_col: self.target})
plot_df = plot_df.rename(columns={cat_col: self.target})
cat_col = self.target
dist_type = self.util.config_val("EXPL", "dist_type", "kde")
cats, cat_str, es = su.get_effect_size(df, cat_col, cont_col)
cats, cat_str, es = su.get_effect_size(plot_df, cat_col, cont_col)
model_type = self.util.get_model_type()
if dist_type == "hist" and model_type != "tree":
ax = sns.histplot(df, x=cont_col, hue=cat_col, kde=True)
caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
ax = sns.histplot(plot_df, x=cont_col, hue=cat_col, kde=True)
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
ax.set_title(caption)
ax.set_xlabel(f"{cont_col}")
ax.set_ylabel(f"number of {ylab}")
else:
ax = sns.displot(
df, x=cont_col, hue=cat_col, kind="kde", fill=True, warn_singular=False
plot_df,
x=cont_col,
hue=cat_col,
kind="kde",
fill=True,
warn_singular=False,
)
ax.set(xlabel=f"{cont_col}")
caption = f"{ylab} {df.shape[0]}. {cat_str} ({cats}):" f" {es}"
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
ax.figure.suptitle(caption)
return ax, caption

def _plot2cat(self, df, col1, col2, xlab, ylab):
"""Plot relation of 2 categorical distributions."""
crosstab = pd.crosstab(index=df[col1], columns=df[col2])
plot_df = df[[col1, col2]].copy()
# rename "class_label" to the original target
if col2 == "class_label":
plot_df = plot_df.rename(columns={col2: self.target})
col2 = self.target
if col1 == "class_label":
plot_df = plot_df.rename(columns={col1: self.target})
col1 = self.target
crosstab = pd.crosstab(index=plot_df[col1], columns=plot_df[col2])
res_pval = stats.chi2_contingency(crosstab)
res_pval = int(res_pval[1] * 1000) / 1000
caption = f"{ylab} {df.shape[0]}. P-val chi2: {res_pval}"
caption = f"{ylab} {plot_df.shape[0]}. P-val chi2: {res_pval}"
ax = (
df.groupby(col1, observed=False)[col2]
plot_df.groupby(col1, observed=False)[col2]
.value_counts()
.unstack()
.plot(kind="bar", stacked=True, title=caption, rot=0)
Expand Down
40 changes: 18 additions & 22 deletions nkululeko/reporting/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,28 @@
import json
import math

# import os
from confidence_intervals import evaluate_with_conf_int
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from scipy.stats import entropy
from scipy.stats import pearsonr
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# from torch import is_tensor
from audmetric import (
accuracy,
concordance_cc,
mean_absolute_error,
mean_squared_error,
unweighted_average_recall,
)

# import os
from confidence_intervals import evaluate_with_conf_int
from scipy.special import softmax
from scipy.stats import entropy, pearsonr
from sklearn.metrics import (
ConfusionMatrixDisplay,
RocCurveDisplay,
auc,
classification_report,
confusion_matrix,
r2_score,
roc_auc_score,
roc_curve,
)
from audmetric import accuracy
from audmetric import concordance_cc
from audmetric import mean_absolute_error
from audmetric import mean_squared_error
from audmetric import unweighted_average_recall

import nkululeko.glob_conf as glob_conf
from nkululeko.plots import Plots
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ pylatex
scikit_learn
scipy
seaborn
shap
sounddevice
splitutils
tensorflow
tensorflow_hub
# tensorflow
# tensorflow_hub
torch
torchaudio
torchvision
Expand Down
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ install_requires =
scipy
seaborn
sounddevice
tensorflow
tensorflow_hub
# tensorflow
# tensorflow_hub
torch
torchvision
transformers
Expand Down

0 comments on commit 21c40c3

Please sign in to comment.