Skip to content

Commit

Permalink
added dataset filters
Browse files Browse the repository at this point in the history
  • Loading branch information
felixbur committed Feb 22, 2022
1 parent e8a211f commit e5e15f0
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 41 deletions.
5 changes: 4 additions & 1 deletion ini_file.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,16 @@
* **reuse**: reuse the splits after a *speaker_split* run to save time with feature extraction.
* **train**: use the entire database for training
* **test**: use the entire database for evaluation

* **db_name.target_tables**: tables that contain the target / speaker / sex labels
* emodb.target_tables = ['emotion']
* **db_name.files_tables**: tables that contain the audio file names
* emodb.files_tables = ['files']
* **db_name.limit**: maximum number of samples per table (for testing with very large data mainly)
* emodb.limit = 20
* **db_name.required**: keep only the samples for which a specific feature is labeled (for example, keep only the samples that have a gender label)
* emodb.required = gender
* **db_name.max_samples_per_speaker**: maximum number of samples per speaker (for leveling data where some speakers have a large number of samples)
* emodb.max_samples_per_speaker = 20
* **target**: the task name, e.g. *age* or *emotion*
* target = emotion
* **labels**: for classification experiments: the names of the categories (is also used for regression when binning the values)
Expand Down
93 changes: 70 additions & 23 deletions src/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ def __init__(self, name):
def load(self):
"""Load the dataframe with files, speakers and task labels"""
self.util.debug(f'loading {self.name}')
store = self.util.get_path('store')
store_file = f'{store}{self.name}.pkl'
if os.path.isfile(store_file):
self.util.debug(f'reusing previously stored file {store_file}')
self.df = pd.read_pickle(store_file)
got_target = self.target in self.df
got_gender = 'gender' in self.df
got_speaker = 'speaker' in self.df
self.is_labeled = got_target
self.util.debug(f'Loaded database {self.name} with {self.df.shape[0]} '\
f'samples: got targets: {got_target}, got speakers: {got_speaker}, '\
f'got sexes: {got_gender}')
return
data_roots = self.util.config_val('DATA', 'root_folders', False)
if data_roots:
# if there is a global data rootfolder file, read from there
Expand All @@ -41,6 +54,7 @@ def load(self):
else:
# else there should be one in the experiment ini
root = glob_conf.config['DATA'][self.name]
self.util.debug(f'loading from {root}')
db = audformat.Database.load(root)
# map the audio file paths
db.map_files(lambda x: os.path.join(root, x))
Expand All @@ -49,13 +63,13 @@ def load(self):
df_files_tables = ast.literal_eval(df_files)
# The label for the target column
self.col_label = self.util.config_val('DATA', f'{self.name}.label', self.target)
df, got_target, got_speaker, got_gender = self.get_df_for_lists(db, df_files_tables)
df, got_target, got_speaker, got_gender = self._get_df_for_lists(db, df_files_tables)
if False in {got_target, got_speaker, got_gender}:
try :
# There might be a separate table with the targets, e.g. emotion or age
df_targets = self.util.config_val('DATA', f'{self.name}.target_tables', f'[\'{self.target}\']')
df_target_tables = ast.literal_eval(df_targets)
df_target, got_target2, got_speaker2, got_gender2 = self.get_df_for_lists(db, df_target_tables)
df_target, got_target2, got_speaker2, got_gender2 = self._get_df_for_lists(db, df_target_tables)
got_target = got_target2 or got_target
got_speaker = got_speaker2 or got_speaker
got_gender = got_gender2 or got_gender
Expand Down Expand Up @@ -89,15 +103,34 @@ def load(self):
self.plot.describe_df(self.name, df, self.target, f'{self.name}_distplot.png')
self.is_labeled = got_target
self.df.is_labeled = self.is_labeled
# Perform some filtering if desired
required = self.util.config_val('DATA', f'{self.name}.required', False)
if required:
pre = self.df.shape[0]
self.df = self.df[self.df[required].notna()]
post = self.df.shape[0]
self.util.debug(f'kept {post} samples with {required} (from {pre}, filtered {pre-post})')
samples_per_speaker = self.util.config_val('DATA', f'{self.name}.max_samples_per_speaker', False)
if samples_per_speaker:
pre = self.df.shape[0]
self.df = self._limit_speakers(self.df, int(samples_per_speaker))
post = self.df.shape[0]
self.util.debug(f'kept {post} samples with {samples_per_speaker} per speaker (from {pre}, filtered {pre-post})')
if self.limit:
pre = self.df.shape[0]
self.df = self.df.head(self.limit)
post = self.df.shape[0]
self.util.debug(f'lmited to {post} samples (from {pre}, filtered {pre-post})')

# store the dataframe
self.df.to_pickle(store_file)


def get_df_for_lists(self, db, df_files):
def _get_df_for_lists(self, db, df_files):
got_target, got_speaker, got_gender = False, False, False
df = pd.DataFrame()
for table in df_files:
if self.limit:
source_df = db.tables[table].df.iloc[:self.limit]
else:
source_df = db.tables[table].df
source_df = db.tables[table].df
# create a dataframe with the index (the filenames)
df_local = pd.DataFrame(index=source_df.index)
# try to get the targets from this dataframe
Expand All @@ -121,34 +154,47 @@ def get_df_for_lists(self, db, df_files):
pass
try:
# also it might be possible that the sex is part of the speaker description
if self.limit:
df_local['gender'] = db[table]['speaker'].get(map='gender').iloc[:self.limit]
else:
df_local['gender'] = db[table]['speaker'].get(map='gender')
df_local['gender'] = db[table]['speaker'].get(map='gender')

got_gender = True
except (ValueError, audformat.errors.BadKeyError) as e:
pass
try:
# same for the target, e.g. "age"
if self.limit:
df_local[self.target] = db[table]['speaker'].get(map=self.target).iloc[:self.limit]
else:
df_local[self.target] = db[table]['speaker'].get(map=self.target)
df_local[self.target] = db[table]['speaker'].get(map=self.target)
got_target = True
except (ValueError, audformat.core.errors.BadKeyError) as e:
pass
#pd.concat([df, df_local], axis=0, join='outer')
df = df.append(df_local)
return df, got_target, got_speaker, got_gender

def _limit_speakers(self, df, max=20):
    """Limit the number of samples per speaker.

    For every distinct value in the ``speaker`` column, at most ``max`` rows
    are kept. Speakers with more rows are randomly down-sampled with
    ``DataFrame.sample``; speakers with ``max`` rows or fewer are kept
    unchanged. Rows stay grouped by speaker, in order of first appearance.

    Args:
        df: dataframe with a ``speaker`` column.
        max: maximum number of rows to keep per speaker. (The name shadows
            the builtin but is kept so keyword callers keep working.)

    Returns:
        A dataframe with the same columns as ``df``, holding the kept rows.

    Example:
        df = self._limit_speakers(df, 20)
    """
    speakers = df['speaker']
    kept = []
    for speaker in speakers.unique():
        # rows whose speaker is missing never match eq() and are dropped
        speaker_df = df[speakers.eq(speaker)]
        if speaker_df.shape[0] > max:
            speaker_df = speaker_df.sample(max)
        kept.append(speaker_df)
    if not kept:
        # pd.concat raises on an empty list; return an empty frame with the same columns
        return df.iloc[:0]
    # a single concat replaces repeated DataFrame.append (removed in pandas 2.0)
    return pd.concat(kept)


def split(self):
"""Split the datbase into train and development set"""
store = self.util.get_path('store')
storage_test = f'{store}{self.name}_testdf.pkl'
storage_train = f'{store}{self.name}_traindf.pkl'
split_strategy = self.util.config_val('DATA', self.name+'.split_strategy', 'database')
# 'database' (default), 'speaker_split', 'specified', 'reuse'
if os.path.isfile(storage_test) and os.path.isfile(storage_train) and split_strategy != 'speaker_split':
self.util.debug(f'splits: reusing previously stored files {storage_test} and {storage_train}')
self.df_test = pd.read_pickle(storage_test)
self.df_train = pd.read_pickle(storage_train)
return

if split_strategy == 'database':
# use the splits from the database
testdf = self.db.tables[self.target+'.test'].df
Expand Down Expand Up @@ -196,13 +242,13 @@ def split(self):
self.df_train = pd.read_pickle(storage_train)

if self.df_test.shape[0]>0:
self.df_test = self.finish_up(self.df_test, 'test', storage_test)
self.df_test = self.finish_up(self.df_test, storage_test)
if self.df_train.shape[0]>0:
self.df_train = self.finish_up(self.df_train, 'train', storage_train)
self.df_train = self.finish_up(self.df_train, storage_train)

def finish_up(self, df, name, storage):
# Bin target values if they are continous but a classification experiment should be done
# self.check_continous_classification(df)
def finish_up(self, df, storage):
# Bin target values if they are continuous but a classification experiment should be done
# self.check_continuous_classification(df)
# remember the splits for future use
df.is_labeled = self.is_labeled
self.df_test.is_labeled = self.is_labeled
Expand Down Expand Up @@ -236,7 +282,7 @@ def prepare_labels(self):

def map_labels(self, df):
if df.shape[0]==0 or not self.util.exp_is_classification() \
or self.check_continuous_classification() :
or self.check_continuous_classification():
return df
"""Rename the labels and remove the ones that are not needed."""
target = glob_conf.config['DATA']['target']
Expand All @@ -251,7 +297,8 @@ def map_labels(self, df):
try :
labels = ast.literal_eval(glob_conf.config['DATA']['labels'])
df = df[df[target].isin(labels)]
# self.util.debug(f'Categories: {df[target].unique()}')
# remember in case they get encoded later
df['class_label'] = df[target]
except KeyError:
pass
return df
Expand Down
19 changes: 5 additions & 14 deletions src/experiment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from cgi import test
import numpy
import random
from dataset import Dataset
Expand All @@ -9,7 +8,6 @@
from test_predictor import Test_predictor
from util import Util
import glob_conf
from plots import Plots
from demo_predictor import Demo_predictor
import ast # To convert strings to objects
import pandas as pd
Expand Down Expand Up @@ -148,12 +146,12 @@ def plot_distribution(self):
from plots import Plots
plot = Plots()
if self.util.exp_is_classification():
self.df_train['labels'] = self.label_encoder.inverse_transform(self.df_train[self.target])
if self.df_test.is_labeled:
self.df_test['labels'] = self.label_encoder.inverse_transform(self.df_test[self.target])
# self.df_train['labels'] = self.label_encoder.inverse_transform(self.df_train[self.target])
# if self.df_test.is_labeled:
# self.df_test['labels'] = self.label_encoder.inverse_transform(self.df_test[self.target])
if self.df_test.shape[0] > 0:
plot.describe_df('dev_set', self.df_test, 'labels', f'test_distplot.png')
plot.describe_df('train_set', self.df_train, 'labels', f'train_distplot.png')
plot.describe_df('dev_set', self.df_test, self.target, f'test_distplot.png')
plot.describe_df('train_set', self.df_train, self.target, f'train_distplot.png')
else:
if self.df_test.shape[0] > 0:
plot.describe_df('dev_set', self.df_test, self.target, f'test_distplot.png')
Expand Down Expand Up @@ -301,13 +299,6 @@ def run(self):
def print_best_model(self):
self.runmgr.print_best_result_runs()

def __collect_reports(self):
self.results, self.losses, self.train_results = [], [], []
for r in self.reports:
self.results.append(r.get_result().test)
self.losses.append(r.get_result().loss)
self.train_results.append(r.get_result().train)

def demo(self):
model = self.runmgr.get_best_model()
feature_extractor = self.feats_train
Expand Down
9 changes: 7 additions & 2 deletions src/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@ def describe_df(self, name, df, target, filename):
fig_dir = self.util.get_path('fig_dir')+'../' # one up because of the runs
sampl_num = df.shape[0]
spkr_num = df.speaker.nunique()
female_smpl_num = df[df.gender=='female'].shape[0]
male_smpl_num = df[df.gender=='male'].shape[0]
sex_col = 'gender'
if target == 'gender':
sex_col = 'class_label'
if self.util.exp_is_classification():
target = 'class_label'
female_smpl_num = df[df[sex_col]=='female'].shape[0]
male_smpl_num = df[df[sex_col]=='male'].shape[0]
self.util.debug(f'{name}: # samples: {sampl_num} (f: {female_smpl_num}, m: {male_smpl_num}), # speakers: {spkr_num}')
if df.is_labeled:
fig, axes = plt.subplots(nrows=1, ncols=2)
Expand Down
2 changes: 1 addition & 1 deletion src/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(self, truths, preds, run, epoch):
self.util.debug(self.preds)
self.util.error(f'result is NAN')
else:
self.util.error(f'unknown measure: {measure}')
self.util.error(f'unknown measure: {self.measure}')

# train and loss are being set by the model

Expand Down
24 changes: 24 additions & 0 deletions tests/exp_moz.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[EXP]
root = ./tests/
name = exp_moz
[DATA]
databases = ['moz']
root_folders = data_roots.ini
moz.files_tables = ['de-validated']
#moz.split_strategy = speaker_split
moz.split_strategy = reuse
moz.testsplit = 20
moz.required = gender
moz.max_samples_per_speaker = 20
target = gender
labels = ['male', 'female']
[FEATS]
#type = trill
type = os
scale = standard
[MODEL]
type = svm
save = True
[PLOT]
value_counts = True
tsne = True
41 changes: 41 additions & 0 deletions tests/exp_moz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# exp_moz.py
# Demonstration code to use the ML-experiment framework

import sys
sys.path.append("./src")
import experiment as exp
import configparser
from util import Util
import constants

def main(config_file):
    """Run one complete experiment as described by an ini file.

    The configuration is read from ``config_file`` and handed to a new
    ``Experiment``. The experiment then loads its datasets, fills the train
    and test splits, extracts features, initializes its run manager and
    runs. The shapes of the splits and of the feature frames are reported
    through ``Util.debug`` along the way.

    Args:
        config_file: path of the ini file describing the experiment.
    """
    # one ini file describes exactly one experiment
    parser = configparser.ConfigParser()
    parser.read(config_file)
    util = Util()

    # set up the experiment from the parsed configuration
    expr = exp.Experiment(parser)
    print(f'running {expr.name}, nkululeko version {constants.VERSION}')

    # data: load everything, then divide it into train and test
    expr.load_datasets()
    expr.fill_train_and_tests()
    util.debug(f'train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}')

    # acoustic features for both splits
    expr.extract_feats()
    util.debug(f'train feats shape : {expr.feats_train.df.shape}, test feats shape:{expr.feats_test.df.shape}')

    # the run manager must be initialized before the experiment is run
    expr.init_runmanager()
    expr.run()

    print('DONE')

if __name__ == "__main__":
main('./tests/exp_moz.ini')

0 comments on commit e5e15f0

Please sign in to comment.