-
Notifications
You must be signed in to change notification settings - Fork 0
/
silent_frames_evaluation.py
78 lines (63 loc) · 3.68 KB
/
silent_frames_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import numpy as np
def eval_silent_frames(true_source, predicted_source, window_size: int, hop_size: int, eval_incomplete_last_frame=False,
                       eps_for_silent_target=True):
    """
    Compute framewise PES/EPS silence metrics for a source-separation estimate.

    The signals are split into (possibly overlapping) frames of ``window_size``
    samples taken every ``hop_size`` samples.  For each frame:

    * PES (Predicted Energy at Silence): log energy of the prediction on frames
      where the true source is all-zero.
    * EPS (Energy at Predicted Silence): log energy of the true source on frames
      where the prediction is all-zero.

    :param true_source: true source signal in the time domain, numpy array with shape (T,)
    :param predicted_source: predicted source signal in the time domain, numpy array with shape (T,)
    :param window_size: length (in samples) of the window used for the framewise bss_eval metrics computation
    :param hop_size: hop size (in samples) used for the framewise bss_eval metrics computation
    :param eval_incomplete_last_frame: if True, takes last frame into account even if it is shorter than the window,
        default: False
    :param eps_for_silent_target: if True, returns a value also if target source is silent, set to False for exact same
        behaviour as explained in the paper "Weakly Informed Audio Source Separation", default: True
    :return: pes: numpy array containing PES values for all applicable frames
             eps: numpy array containing EPS values for all applicable frames
             silent_true_source_frames: list of indices of frames with silent target source
             silent_prediction_frames: list of indices of frames with silent predicted source
    """
    # check inputs (shape and matching length)
    assert true_source.ndim == 1, "true source array has too many dimensions, expected shape is (T,)"
    assert predicted_source.ndim == 1, "predicted source array has too many dimensions, expected shape is (T,)"
    assert len(true_source) == len(predicted_source), "true source and predicted source must have same length"

    signal_length = len(true_source)

    # number of frame start positions: one frame at 0, then one every hop_size
    # until the window would no longer fit, plus a possibly-short trailing frame
    number_eval_frames = int(np.ceil((signal_length - window_size) / hop_size)) + 1

    # The last frame is incomplete iff its window would extend past the signal
    # end.  (Testing `signal_length % hop_size` is wrong whenever window_size is
    # not a multiple of hop_size: it can both miss truly-short last frames and
    # falsely flag complete ones.)
    last_frame_incomplete = (number_eval_frames - 1) * hop_size + window_size > signal_length

    # values for each frame will be gathered here
    pes_list = []
    eps_list = []
    # indices of frames with silence will be gathered here
    silent_true_source_frames = []
    silent_prediction_frames = []

    # floor inside log10 so all-zero windows give a finite value (-120 dB)
    eps_floor = 1e-12

    for n in range(number_eval_frames):
        start = n * hop_size
        if n == number_eval_frames - 1 and last_frame_incomplete:
            # short trailing frame: only evaluate when explicitly requested
            if not eval_incomplete_last_frame:
                continue
            prediction_window = predicted_source[start:]
            true_window = true_source[start:]
        else:
            prediction_window = predicted_source[start: start + window_size]
            true_window = true_source[start: start + window_size]

        true_is_silent = not np.any(true_window)
        prediction_is_silent = not np.any(prediction_window)

        # compute Predicted Energy at Silence (PES)
        if true_is_silent:
            pes = 10 * np.log10(np.sum(prediction_window ** 2) + eps_floor)
            pes_list.append(pes)
            silent_true_source_frames.append(n)

        # compute Energy at Predicted Silence (EPS); with eps_for_silent_target
        # False, frames where BOTH signals are silent are excluded (paper behaviour)
        if prediction_is_silent and (eps_for_silent_target or not true_is_silent):
            true_source_energy_at_silent_prediction = 10 * np.log10(np.sum(true_window ** 2) + eps_floor)
            eps_list.append(true_source_energy_at_silent_prediction)
            silent_prediction_frames.append(n)

    pes = np.asarray(pes_list)
    eps = np.asarray(eps_list)
    return pes, eps, silent_true_source_frames, silent_prediction_frames