"""
This file is a modified version of https://github.com/sigsep/open-unmix-pytorch/blob/master/openunmix/evaluate.py
It evaluates source separation results with the objective measures SDR, SIR, ISR, SAR, PES, and EPS.
"""
import argparse
import functools
import json
import multiprocessing
import os
from pathlib import Path

import musdb
import museval
import numpy as np
import pandas as pd
import resampy
import soundfile as sf
import torch
import tqdm

import data
import model
import silent_frames_evaluation
import testx

def evaluate(references, estimates, output_dir, track_name, sample_rate, win=1.0, hop=1.0, mode='v4'):
    """
    Compute the BSS eval metrics as well as PES and EPS. Follows the design of museval.eval_mus_track.
    :param references: dict of reference sources {target_name: signal}, signal has shape (nb_timesteps, nb_channels)
    :param estimates: dict of user estimates {target_name: signal}, signal has shape (nb_timesteps, nb_channels)
    :param output_dir: path to the output directory used to save evaluation results
    :param track_name: name assigned to the TrackStore object for the evaluated track
    :param sample_rate: sample rate of the test tracks (should match the rate the model was trained on)
    :param win: evaluation window length in seconds, default 1
    :param hop: evaluation window hop length in seconds, default 1
    :param mode: BSSEval version, defaults to `v4`
    :return:
        bss_eval_data: museval.TrackStore object containing the BSS eval scores
        silent_frames_data: pandas DataFrame containing the PES and EPS scores
    """
eval_targets = list(estimates.keys())
estimates_list = []
references_list = []
for target in eval_targets:
estimates_list.append(estimates[target])
references_list.append(references[target])
    # evaluate BSS eval metrics as well as PES and EPS and store them in a TrackStore object
bss_eval_data = museval.TrackStore(win=win, hop=hop, track_name=track_name)
# skip examples with a silent source because BSSeval metrics are not defined in this case
skip = False
for target in eval_targets:
reference_energy = np.sum(references[target]**2)
estimate_energy = np.sum(estimates[target]**2)
if reference_energy == 0 or estimate_energy == 0:
skip = True
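            # placeholder scores: one all -inf frame per metric and target
            # (assumes two targets), matching the shape museval.evaluate returns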
SDR = ISR = SIR = SAR = (np.ones((1,)) * (-np.inf), np.ones((1,)) * (-np.inf))
print("skip {}, {} source is all zero".format(track_name, target))
if not skip:
SDR, ISR, SIR, SAR = museval.evaluate(
references_list,
estimates_list,
win=int(win * sample_rate),
hop=int(hop * sample_rate),
mode=mode,
padding=True
)
    # add evaluation of EPS and PES
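    # (PES: Predicted Energy at Silence, EPS: Energy at Predicted Silence)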
PES, EPS, _, __ = silent_frames_evaluation.eval_silent_frames(
true_source=np.array(references_list),
predicted_source=np.array(estimates_list),
window_size=int(win * sample_rate),
hop_size=int(hop * sample_rate)
)
# iterate over all targets
for i, target in enumerate(eval_targets):
values = {
"SDR": SDR[i].tolist(),
"SIR": SIR[i].tolist(),
"ISR": ISR[i].tolist(),
"SAR": SAR[i].tolist(),
}
bss_eval_data.add_target(
target_name=target,
values=values
)
    silent_frames_data = pd.DataFrame(
        [{'target': target, 'PES': PES[i], 'EPS': EPS[i], 'track': track_name}
         for i, target in enumerate(eval_targets)]
    )
# save evaluation results if output directory is defined
if output_dir:
# validate against the schema
bss_eval_data.validate()
        try:
            os.makedirs(output_dir, exist_ok=True)
            with open(
                os.path.join(output_dir, track_name.replace('/', '_')) + '.json', 'w'
            ) as f:
                f.write(bss_eval_data.json)
        except IOError:
            pass
return bss_eval_data, silent_frames_data
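
# Example (hypothetical call with random 2 s stereo signals at 44.1 kHz):
#   refs = {'vocals': np.random.randn(88200, 2), 'accompaniment': np.random.randn(88200, 2)}
#   ests = {'vocals': np.random.randn(88200, 2), 'accompaniment': np.random.randn(88200, 2)}
#   bss, silent = evaluate(refs, ests, output_dir=None, track_name='demo', sample_rate=44100)
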
def separate_and_evaluate2(
track,
targets,
model_name,
niter,
alpha,
softmask,
output_dir,
eval_dir,
samplerate,
device='cpu',
args=None
):
mix = track['mix']
true_vocals = track['vocals']
true_accompaniment = track['accompaniment']
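    # add a leading batch dimension (batch size 1), presumably as the model expects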
text = track['text'].unsqueeze(dim=0)
track_name = track['name']
mix_numpy = mix.numpy().T
if args.alignment_from:
attention_weights = track['attention_weights'].unsqueeze(dim=0)
inputs = (mix_numpy, text, attention_weights)
else:
inputs = (mix_numpy, text)
estimates = testx.separate(
inputs=inputs,
targets=targets,
model_name=model_name,
niter=niter,
alpha=alpha,
softmask=softmask,
device=device,
args=args,
accompaniment_model=args.accompaniment_model
)
if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        # TODO: a separate script could reassemble whole tracks from the estimated snippets
sf.write(os.path.join(output_dir, track_name.replace('/', '_') + '.wav'), estimates['vocals'], samplerate)
references = {'vocals': true_vocals.numpy().T, 'accompaniment': true_accompaniment.numpy().T}
bss_eval_scores, silent_frames_scores = evaluate(references=references,
estimates=estimates,
output_dir=eval_dir,
track_name=track_name,
sample_rate=samplerate)
return bss_eval_scores, silent_frames_scores
if __name__ == '__main__':
    # evaluation settings
parser = argparse.ArgumentParser(
description='MUSDB18 Evaluation',
add_help=False
)
# changed default to vocals
parser.add_argument(
'--targets',
nargs='+',
default=['vocals'],
type=str,
        help='provide targets to be processed. If none, all available targets will be computed'
)
    parser.add_argument(
        '--tag',
        type=str,
        help='tag of the model/variant to evaluate, assigned at training time'
    )
    parser.add_argument(
        '--test-snr',
        type=int,
        default=None,
        help='SNR if the evaluation mixes should have a custom SNR'
    )
args, _ = parser.parse_known_args()
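    # the evaluation tag defaults to the model tag, suffixed with the test SNR if one is set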
    if args.test_snr is None:
default_eval_tag = args.tag
else:
default_eval_tag = args.tag + "_snr_" + str(args.test_snr)
    parser.add_argument(
        '--eval-tag',
        type=str,
        default=default_eval_tag,
        help='tag for the evaluation output folders'
    )
    # changed: default model path is derived from the tag
    parser.add_argument(
        '--model',
        type=str,
        default='trained_models/{}'.format(args.tag),
        help='path to the base directory of pretrained models'
    )
parser.add_argument(
'--optimal-path-attention',
action='store_true',
default=False,
help='Set to True if alphas should be the optimal path through scores during evaluation'
)
parser.add_argument(
'--estimates-out',
action='store_true',
default=False,
help='Set to True if estimates should be stored'
)
    parser.add_argument(
        '--outdir',
        type=str,
        help='path where the estimated audio is stored'
    )
args, _ = parser.parse_known_args()
    outdir = None
    if args.estimates_out:
        outdir = args.outdir if args.outdir else 'evaluation/{}/estimates'.format(args.eval_tag)
parser.add_argument(
'--evaldir',
type=str,
default='evaluation/{}/eval_results'.format(args.eval_tag),
help='Results path for museval scores'
)
# changed --root to --testset
parser.add_argument(
'--testset',
type=str,
help='Test set name'
)
# removed --subset
    parser.add_argument(
        '--cores',
        type=int,
        default=1,
        help='number of worker processes used for evaluation'
    )
parser.add_argument(
'--no-cuda',
action='store_true',
default=False,
help='disables CUDA inference'
)
parser.add_argument(
'--is-wav',
action='store_true', default=False,
help='flags wav version of the dataset'
)
args, _ = parser.parse_known_args()
if args.testset == 'musdb':
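        # note: hardcoded path to the MUSDB18 test set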
mus = musdb.DB(
root='../Datasets/MUSDB18',
subsets='test',
is_wav=args.is_wav
)
test_tracks = mus.tracks
elif args.testset == 'musdb_lyrics':
try:
# load configuration that was generated at training time
with open(Path(args.model, args.targets[0] + '.json'), 'r') as stream:
config = json.load(stream)
keys = config['args'].keys()
samplerate = config['args']['samplerate']
text_units_default = config['args']['text_units']
space_token_only = config['args']['space_token_only'] if 'space_token_only' in keys else False
alignment_from = config['args']['alignment_from'] if 'alignment_from' in keys else None
fake_alignment = config['args']['fake_alignment'] if 'fake_alignment' in keys else False
        except FileNotFoundError:
            text_units_default = 'characters'
            samplerate = 44100
            fake_alignment = False
            space_token_only = True
            alignment_from = None
            print("No config found, using defaults: text_units=characters, samplerate=44100, space_token_only=True")
# use same data set specifications as during training but allow overwriting on command line
parser.add_argument('--text-units', type=str, default=text_units_default)
parser.add_argument('--alignment-from', type=str, default=alignment_from)
args, _ = parser.parse_known_args()
# here one "test track" is actually one snippet
test_tracks = data.MUSDBLyricsDataTest(samplerate=samplerate, text_units=args.text_units,
n=True, x=True, s=True, d=True, space_token_only=space_token_only,
alignment_from=args.alignment_from, fake_alignment=fake_alignment,
mix_snr=args.test_snr)
args, _ = parser.parse_known_args()
args = testx.inference_args(parser, args)
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
    print('using device:', device)
    if args.cores > 1:
        pool = multiprocessing.Pool(args.cores)
        results = museval.EvalStore()
        silent_frames_results = pd.DataFrame({'target': [], 'PES': [], 'EPS': [], 'track': []})
        scores_list = list(
            pool.imap_unordered(
                func=functools.partial(
                    separate_and_evaluate2,
                    targets=args.targets,
                    model_name=args.model,
                    niter=args.niter,
                    alpha=args.alpha,
                    softmask=args.softmask,
                    output_dir=outdir,
                    eval_dir=args.evaldir,
                    samplerate=samplerate,
                    device=device,
                    args=args
                ),
                iterable=test_tracks,
                chunksize=1
            )
        )
        pool.close()
        pool.join()
        for bss_eval_scores, silent_frames_scores in scores_list:
            results.add_track(bss_eval_scores)
            silent_frames_results = pd.concat([silent_frames_results, silent_frames_scores], ignore_index=True)
else:
print("evaluate model {} with sample rate {}".format(args.model, samplerate))
results = museval.EvalStore()
silent_frames_results = pd.DataFrame({'target': [], 'PES': [], 'EPS': [], 'track': []})
for idx in tqdm.tqdm(range(len(test_tracks))):
track = test_tracks[idx]
bss_eval_scores, silent_frames_scores = separate_and_evaluate2(
track,
targets=args.targets,
model_name=args.model,
niter=args.niter,
alpha=args.alpha,
softmask=args.softmask,
output_dir=outdir,
eval_dir=args.evaldir,
device=device,
samplerate=samplerate,
args=args
)
results.add_track(bss_eval_scores)
            silent_frames_results = pd.concat([silent_frames_results, silent_frames_scores], ignore_index=True)
print(results)
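    # aggregate the scores over all tracks and save them in museval's pandas format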
method = museval.MethodStore()
method.add_evalstore(results, args.model)
method.save('evaluation/{}/bss_eval_results.pandas'.format(args.eval_tag))
# print mean over tracks for PES and EPS
print("mean over evaluation frames, mean over channels, mean over tracks")
    for target in ['vocals', 'accompaniment']:
        print(target + ' ==>', silent_frames_results.loc[silent_frames_results['target'] == target, ['PES', 'EPS']].mean(axis=0, skipna=True))
silent_frames_results.to_json('evaluation/{}/silent_frames_results.json'.format(args.eval_tag), orient='records')
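
# Example invocation (hypothetical tag and test set names):
#   python eval_separation.py --tag my_model --testset musdb_lyrics --cores 1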