audio.py
import subprocess
import matplotlib
matplotlib.use('Agg')  # select the non-interactive Agg backend so plotting works headless
import librosa
import librosa.filters
import numpy as np
from scipy import signal
from scipy.io import wavfile


def save_wav(wav, path, sr, norm=False):
    if norm:
        wav = wav / np.abs(wav).max()
    # scale the float waveform to the 16-bit PCM range
    wav *= 32767
    # proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))


def to_mp3(out_path):
    subprocess.check_call(
        f'ffmpeg -threads 1 -loglevel error -i "{out_path}.wav" -vn -ar 44100 -ac 1 -b:a 192k -y -hide_banner "{out_path}.mp3"',
        shell=True, stdin=subprocess.PIPE)
    subprocess.check_call(f'rm -f "{out_path}.wav"', shell=True)


def get_hop_size(hparams):
    hop_size = hparams['hop_size']
    if hop_size is None:
        # derive the hop size from the frame shift given in milliseconds
        assert hparams['frame_shift_ms'] is not None
        hop_size = int(hparams['frame_shift_ms'] / 1000 * hparams['audio_sample_rate'])
    return hop_size


###########################################################################################
def griffin_lim(S, hparams, angles=None):
    # Griffin-Lim: iteratively refine a (random) phase estimate while keeping the magnitude |S| fixed
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) if angles is None else angles
    S_complex = np.abs(S).astype(np.complex128)
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams['griffin_lim_iters']):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y


def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav


def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav


def _stft(y, hparams):
    return librosa.stft(y=y, n_fft=hparams['fft_size'], hop_length=get_hop_size(hparams),
                        win_length=hparams['win_size'], pad_mode='constant')


def _istft(y, hparams):
    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams['win_size'])


def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Compute right padding (final frame) or both-sides padding (first and final frames)."""
    assert pad_sides in (1, 2)
    # return int(fsize // 2)
    pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
    if pad_sides == 1:
        return 0, pad
    else:
        return pad // 2, pad // 2 + pad % 2


# Conversions
_mel_basis = None
_inv_mel_basis = None


def _linear_to_mel(spectrogram, hparams):
    # project a linear-frequency magnitude spectrogram onto the (cached) mel filterbank
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)
    return np.dot(_mel_basis, spectrogram)


def _mel_to_linear(mel_spectrogram, hparams):
    # approximate inverse projection via the pseudo-inverse of the mel filterbank
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))


def _build_mel_basis(hparams):
    assert hparams['fmax'] <= hparams['audio_sample_rate'] // 2
    return librosa.filters.mel(sr=hparams['audio_sample_rate'], n_fft=hparams['fft_size'],
                               n_mels=hparams['audio_num_mel_bins'],
                               fmin=hparams['fmin'], fmax=hparams['fmax'])


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return 10.0 ** (x * 0.05)


def normalize(S, hparams):
    # map dB values in [min_level_db, 0] to [0, 1]
    return (S - hparams['min_level_db']) / -hparams['min_level_db']


def denormalize(D, hparams):
    return (D * -hparams['min_level_db']) + hparams['min_level_db']
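

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: it strings the helpers above
    # into a mel-spectrogram round trip (waveform -> mel -> waveform via Griffin-Lim).
    # The hparams values and file names below are illustrative placeholders, not this
    # repository's actual configuration.
    hparams = {
        'audio_sample_rate': 22050,
        'fft_size': 1024,
        'win_size': 1024,
        'hop_size': 256,
        'frame_shift_ms': None,
        'audio_num_mel_bins': 80,
        'fmin': 55,
        'fmax': 7600,
        'min_level_db': -100,
        'griffin_lim_iters': 60,
    }
    # waveform -> linear magnitude -> mel -> dB -> [0, 1]
    wav, _ = librosa.load('example.wav', sr=hparams['audio_sample_rate'])  # hypothetical input file
    linear = np.abs(_stft(wav, hparams))
    mel = normalize(amp_to_db(_linear_to_mel(linear, hparams)), hparams)
    # [0, 1] -> dB -> linear magnitude -> waveform via Griffin-Lim
    linear_rec = _mel_to_linear(db_to_amp(denormalize(mel, hparams)), hparams)
    save_wav(griffin_lim(linear_rec, hparams), 'reconstructed.wav',
             hparams['audio_sample_rate'], norm=True)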