diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
index 3bcca221..9605931c 100644
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -16,14 +16,14 @@ class VadOptions:
     """VAD options.
 
     Attributes:
-      onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
+      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
         probabilities ABOVE this value are considered as SPEECH. It is better to tune this
         parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
-      offset: Silence threshold for determining the end of speech. If a probability is lower than
-        the offset, it is always considered silence. Values higher than offset are only considered
-        speech if the previous sample was classified as speech; otherwise, they are treated as
-        silence. This parameter helps refine the detection of speech transitions, ensuring smoother
-        segment boundaries.
+      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
+        than neg_threshold, it is always considered silence. Values higher than neg_threshold
+        are only considered speech if the previous sample was classified as speech; otherwise,
+        they are treated as silence. This parameter helps refine the detection of speech
+        transitions, ensuring smoother segment boundaries.
       min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
       max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than
         max_speech_duration_s will be split at the timestamp of the last silence that
@@ -34,8 +34,8 @@ class VadOptions:
       speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
     """
 
-    onset: float = 0.5
-    offset: float = onset - 0.15
+    threshold: float = 0.5
+    neg_threshold: float = threshold - 0.15
     min_speech_duration_ms: int = 0
     max_speech_duration_s: float = float("inf")
     min_silence_duration_ms: int = 2000
@@ -62,7 +62,7 @@ def get_speech_timestamps(
     if vad_options is None:
         vad_options = VadOptions(**kwargs)
 
-    onset = vad_options.onset
+    threshold = vad_options.threshold
     min_speech_duration_ms = vad_options.min_speech_duration_ms
     max_speech_duration_s = vad_options.max_speech_duration_s
     min_silence_duration_ms = vad_options.min_silence_duration_ms
@@ -90,7 +90,7 @@ def get_speech_timestamps(
     triggered = False
     speeches = []
     current_speech = {}
-    offset = vad_options.offset
+    neg_threshold = vad_options.neg_threshold
 
     # to save potential segment end (and tolerate some silence)
     temp_end = 0
@@ -98,12 +98,12 @@ def get_speech_timestamps(
     prev_end = next_start = 0
 
     for i, speech_prob in enumerate(speech_probs):
-        if (speech_prob >= onset) and temp_end:
+        if (speech_prob >= threshold) and temp_end:
             temp_end = 0
             if next_start < prev_end:
                 next_start = window_size_samples * i
 
-        if (speech_prob >= onset) and not triggered:
+        if (speech_prob >= threshold) and not triggered:
             triggered = True
             current_speech["start"] = window_size_samples * i
             continue
@@ -130,7 +130,7 @@ def get_speech_timestamps(
             triggered = False
             continue
 
-        if (speech_prob < offset) and triggered:
+        if (speech_prob < neg_threshold) and triggered:
             if not temp_end:
                 temp_end = window_size_samples * i
             # condition to avoid cutting in very short silence
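
For reviewers, a minimal usage sketch of the renamed options, not part of the patch. It assumes the 16 kHz mono float32 input the VAD operates on; the audio array here is synthetic, and the exact keyword signature of get_speech_timestamps beyond vad_options is taken from the hunks above.

```python
import numpy as np

from faster_whisper.vad import VadOptions, get_speech_timestamps

# Synthetic input for illustration: one second of silence followed by
# one second of low-level noise, 16 kHz mono float32.
sampling_rate = 16000
audio = np.concatenate(
    [np.zeros(sampling_rate), 0.1 * np.random.randn(sampling_rate)]
).astype(np.float32)

# Renamed options: `threshold` is the speech probability above which a chunk
# counts as speech, `neg_threshold` the value below which it is always silence.
# 0.35 mirrors the default of `threshold - 0.15`.
options = VadOptions(threshold=0.5, neg_threshold=0.35)

# Returns a list of {"start": ..., "end": ...} dicts expressed in samples.
speech_chunks = get_speech_timestamps(audio, vad_options=options)
print(speech_chunks)
```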