recorder.py
import os
import queue
import time

import numpy as np
import sounddevice as sd
import torch

from input_chunk_filter import ChunkHPFilter
class Recorder(object):

    def __init__(self, tts_signal=True):
        self.max_duration = 10  # Maximum clip length for get_audio(), in seconds.
        self.rate = 16000  # Sample rate in Hz.
        self.buff_size = int(30 * self.rate)  # Rolling 30-second buffer.
        self.chunk_sz = self.rate // 10  # 100 ms chunks @ 16 kHz.
        self.channels = 1
        self.buff_idx = 0
        self.vad_chunk_idx = 0
        self.recording_voice = False
        # Audio buffer in float32, plus one VAD score per 100 ms chunk.
        self.buff = np.zeros(self.buff_size, dtype=np.float32)
        self.vad_chunks = np.zeros(self.buff_size // self.chunk_sz,
                                   dtype=np.float32)
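        # With rate=16000 these sizes work out to: buff_size = 480000
        # samples (30 s), chunk_sz = 1600 samples (100 ms), and vad_chunks
        # holds 300 entries, one Silero VAD score per 100 ms chunk.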
        self.q = queue.Queue()
        self.hp_filter = ChunkHPFilter(fc=50)  # High-pass filter, 50 Hz cutoff.
        torch.set_num_threads(1)
        # Load the Silero VAD model from a local checkout of the repo.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        self.vad, _ = torch.hub.load(
            repo_or_dir=f"{dir_path}/downloaded/snakers4_silero-vad_master",
            source='local',
            model='silero_vad',
            force_reload=False,
            onnx=True)
        sd.default.channels = self.channels, self.channels
        sd.default.dtype = 'int32', 'int32'
        sd.default.samplerate = self.rate, self.rate
        self.in_stream = sd.InputStream(samplerate=self.rate,
                                        blocksize=self.chunk_sz,
                                        channels=self.channels,
                                        callback=self._audio_callback)
        self.in_stream.start()
        # Signal to tts that audio output can start. This is required for
        # AI in a Box because audio input must be configured before audio
        # output with the current onboard audio drivers. Not required for
        # main().
        if tts_signal:
            with open('/tmp/audio_input_running.bool', 'w') as fp:
                pass
        print(
            f'audio input stream started successfully: {self.in_stream.active}'
        )
    def _audio_callback(self, indata, frames, time_info, status):
        # Runs on the audio thread; hand the raw int32 samples to the queue
        # so the main thread can do the filtering and VAD work.
        self.q.put(np.frombuffer(indata, dtype=np.int32).flatten())
    def preprocess_audio(self, audio):
        # 20 bits of audio are copied into the lowest 16 bits, meaning we
        # should keep 4 extra bits when normalizing.
        x = audio.astype(dtype=np.float32)
        x = self.hp_filter.run(x) / (2**27)
        return x, self.vad(torch.from_numpy(x), 16000).item()
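    # A worked example of the scaling above (an illustrative note, based on
    # the comment in preprocess_audio rather than on hardware docs): an
    # int32 sample of magnitude 2**27 maps to 1.0 after the divide, so the
    # float output lies roughly in [-1.0, 1.0]. Silero VAD then returns a
    # speech probability in [0.0, 1.0] for the 100 ms chunk.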
    def process_audio_chunk(self):
        audio, vad = self.preprocess_audio(self.q.get())
        if self.buff_idx < self.buff_size:
            # Buffer not yet full: append the chunk and its VAD score.
            self.buff[self.buff_idx:self.buff_idx + self.chunk_sz] = audio
            self.vad_chunks[self.vad_chunk_idx] = vad
            self.buff_idx += self.chunk_sz
            self.vad_chunk_idx += 1
        else:
            # Buffer full: shift left by one chunk and append at the end.
            self.buff[:-self.chunk_sz] = self.buff[self.chunk_sz:]
            self.buff[-self.chunk_sz:] = audio
            self.vad_chunks[:-1] = self.vad_chunks[1:]
            self.vad_chunks[-1] = vad
    def record_voice(self):
        if not self.recording_voice:
            self.recording_voice = True
            self.reset()
        for _ in range(self.q.qsize()):
            self.process_audio_chunk()
        preamble_chunks = 3  # Keep 300 ms of extra audio before the voice.
        vad_inds = np.nonzero(self.vad_chunks > 0.9)[0]
        if len(vad_inds) == 0:
            return None
        first_vad_chunk = max(0, vad_inds[0] - preamble_chunks)
        # Scan backwards for a voiced chunk followed by a silence gap of
        # more than 10 chunks (1 s); that gap marks the end of the utterance.
        last_idx = self.vad_chunk_idx
        for i in range(len(vad_inds) - 1, 0, -1):
            if last_idx - vad_inds[i] > 10:
                src_start_idx = first_vad_chunk * self.chunk_sz
                src_end_idx = (vad_inds[i] + 10) * self.chunk_sz
                samples_to_copy = src_end_idx - src_start_idx
                audio_buff = np.zeros(self.buff_size, dtype=np.float32)
                audio_buff[:samples_to_copy] = self.buff[
                    src_start_idx:src_end_idx]
                self.buff_idx = 0
                self.vad_chunk_idx = 0
                self.buff.fill(0.0)
                self.vad_chunks.fill(0.0)
                self.recording_voice = False
                return audio_buff
            last_idx = vad_inds[i]
        # Voice activity is still ongoing; no complete utterance yet.
        return None
    def get_audio(self):
        for _ in range(self.q.qsize()):
            self.process_audio_chunk()
        end_idx = self.vad_chunk_idx
        audio_start_idx = max(
            0, end_idx - self.max_duration * 10)  # Last ten seconds of audio.
        vad_start_idx = max(0, end_idx - 3 * 10)  # Last 3 s of VAD scores.
        last_ten_seconds = self.buff[audio_start_idx * self.chunk_sz:end_idx *
                                     self.chunk_sz]
        # Pad the samples to the 30-second input length whisper natively
        # expects (also correct when less than max_duration is buffered).
        x = np.pad(last_ten_seconds,
                   (0, 30 * self.rate - len(last_ten_seconds)), 'constant')
        chunks_with_vad = np.sum(self.vad_chunks[vad_start_idx:end_idx] > 0.5)
        return x, chunks_with_vad
    def reset(self):
        # Drain pending chunks, then clear all buffer, filter and VAD state.
        for _ in range(self.q.qsize()):
            self.process_audio_chunk()
        self.buff_idx = 0
        self.buff.fill(0.0)
        self.vad_chunk_idx = 0
        self.vad_chunks.fill(0.0)
        self.hp_filter.reset()
        self.vad.reset_states()
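# record_voice() is not exercised by the __main__ block below. This sketch
# (illustrative only; the polling interval is an assumption, not part of the
# original module) shows how a caller could wait for one complete utterance:
#
#     recorder = Recorder(tts_signal=False)
#     recorder.reset()
#     clip = None
#     while clip is None:
#         time.sleep(0.1)  # Let roughly one 100 ms chunk arrive per poll.
#         clip = recorder.record_voice()
#     # `clip` is a 30 s float32 buffer with the utterance at the start.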
if __name__ == '__main__':
    """Example call: `taskset -c 4-7 python3 recorder.py`."""
    from transcriber import Transcriber

    recorder = Recorder(tts_signal=False)
    recorder.reset()
    t = Transcriber()
    while True:
        clip, vad = recorder.get_audio()
        print(f'vad {vad}')
        if vad > 0:
            print("Transcription:\n", t.run(clip), "\n")
        else:
            time.sleep(1)