-
Notifications
You must be signed in to change notification settings - Fork 1
/
helpers.py
67 lines (55 loc) · 3.32 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from os import sep
from glob import glob
import matplotlib.pyplot as mpl
from pandas import Series, DataFrame
from numpy import argwhere
__author__ = "Andreas Nautsch"
__email__ = "[email protected]"
__coauthor__ = ["Jose Patino", "Natalia Tomashenko", "Junichi Yamagishi", "Paul-Gauthier Noé", "Jean-François Bonastre", "Massimiliano Todisco", "Nicholas Evans"]
__credits__ = ["Niko Brummer", "Daniel Ramos", "Edward de Villiers", "Anthony Larcher"]
__license__ = "LGPLv3"
def read_kaldi_folder_structure(glob_cmd):
"""
Gather score files of multiple experiments in easy tractable pandas database structure of score files
:param glob_cmd: regular expression for searching files
:return: scr_files: pandas.DataFrame object with meta-information: gender, dataset, task, label tailored to the 2020 VoicePrivacy challenge
"""
score_glob = Series(glob(glob_cmd))
score_files = score_glob.str.split(sep, expand=True)
results_identifier = score_files[3].str.split('_', expand=True).iloc[:, 1:].apply(lambda x: '_'.join(filter(None, x)), axis=1)
if not results_identifier.isna().all():
for idx in argwhere(results_identifier.str.len().values > 0).squeeze():
score_files.iloc[idx, 4] = score_files.iloc[idx, 4].replace(results_identifier[idx], 'anon')
score_files_tmp = score_files[4].str.split('-', expand=True)
score_files_tmp2 = score_files[5]
score_files.drop(columns=[4, 5], inplace=True)
for i in range(len(score_files_tmp.columns)):
score_files.insert(loc=4+score_files_tmp.columns[i], column=4+score_files_tmp.columns[i], value=score_files_tmp[i].values)
score_files.insert(loc=len(score_files.columns), column=len(score_files.columns), value=score_files_tmp2.values)
# establish flags
protected_enrol_test = Series(score_glob[score_files.loc[:, 5].str.contains('anon')])
protected_test = Series(score_glob[score_files.loc[:, 6].str.contains('anon')]).drop(index=protected_enrol_test.index)
# original = Series(score_glob).drop(index=protected_enrol_test.index).drop(index=protected_test.index)
flag_gender = score_files.loc[:, 6].str.contains('_f')
flag_libri = score_files.loc[:, 6].str.contains('libri')
flag_vctk_common = score_files.loc[:, 6].str.contains('common')
# extract score paths with boolean meta information
scr_files_ds = DataFrame({'scr': score_glob})
scr_files_ds['gender'] = 'm'
scr_files_ds.loc[flag_gender, 'gender'] = 'f'
scr_files_ds['dataset'] = 'vctk'
scr_files_ds.loc[flag_libri, 'dataset'] = 'libri'
scr_files_ds.loc[flag_vctk_common, 'dataset'] = 'vctk_common'
scr_files_ds['task'] = 'o-o'
scr_files_ds.loc[protected_test.index, 'task'] = 'o-a'
scr_files_ds.loc[protected_enrol_test.index, 'task'] = 'a-a'
team_sys = score_glob.str.split(sep, expand=True).iloc[:, 1:3]
scr_files_ds['label'] = team_sys[1] + '-' + team_sys[2]
return scr_files_ds
# Shrink current axis by 20%, see:
# https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
def place_legend(figure, legend, shrink=0.2):
ax = mpl.figure(figure).get_axes()[0]
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * shrink/2, box.width, box.height * (1 - shrink/2)])
ax.legend(legend, loc='upper center', bbox_to_anchor=(0.5, -0.1-shrink/4), fancybox=True, shadow=True, ncol=2)