Skip to content

Commit

Permalink
Add option for running ffmpeg_microphone_live as a background process (
Browse files Browse the repository at this point in the history
…huggingface#32838)

* Add option for running ffmpeg_microphone_live as a background process

* Code quality checks for audio_utils

* Code clean up for audio_utils

* Fixing logic in ffmpeg_microphone calls in audio_utils

* Allowing any arbitrary arguments to be passed to ffmpeg_microphone_live

* Formatting

* Fixing last problems with adding ffmpeg_additional_args

* Fixing default arguments and formatting issues

* Fixing comments for ffmpeg_additional_args

* Adding two short tests for ffmpeg_microphone_live

* Fixing test bug
  • Loading branch information
mikamerath authored and BernardZach committed Dec 6, 2024
1 parent ecdda01 commit a790e3a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 2 deletions.
24 changes: 23 additions & 1 deletion src/transformers/pipelines/audio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def ffmpeg_microphone(
chunk_length_s: float,
format_for_conversion: str = "f32le",
ffmpeg_input_device: Optional[str] = None,
ffmpeg_additional_args: Optional[list[str]] = None,
):
"""
Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
Expand All @@ -70,6 +71,11 @@ def ffmpeg_microphone(
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
for how to specify and list input devices.
ffmpeg_additional_args (`list[str]`, *optional*):
Additional arguments to pass to ffmpeg, such as `-nostdin` for running as a background process. For
example, to pass `-nostdin` to the ffmpeg process, pass in `["-nostdin"]`. A flag that takes its own
arguments is passed as separate list items, e.g. `["flag", "arg1", "arg2"]`.
Returns:
A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
`int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
Expand All @@ -95,6 +101,8 @@ def ffmpeg_microphone(
format_ = "dshow"
input_ = ffmpeg_input_device or _get_microphone_name()

ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args

ffmpeg_command = [
"ffmpeg",
"-f",
Expand All @@ -114,6 +122,9 @@ def ffmpeg_microphone(
"quiet",
"pipe:1",
]

ffmpeg_command.extend(ffmpeg_additional_args)

chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
for item in iterator:
Expand All @@ -127,6 +138,7 @@ def ffmpeg_microphone_live(
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
format_for_conversion: str = "f32le",
ffmpeg_input_device: Optional[str] = None,
ffmpeg_additional_args: Optional[list[str]] = None,
):
"""
Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
Expand All @@ -153,6 +165,11 @@ def ffmpeg_microphone_live(
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
for how to specify and list input devices.
ffmpeg_additional_args (`list[str]`, *optional*):
Additional arguments to pass to ffmpeg, such as `-nostdin` for running as a background process. For
example, to pass `-nostdin` to the ffmpeg process, pass in `["-nostdin"]`. A flag that takes its own
arguments is passed as separate list items, e.g. `["flag", "arg1", "arg2"]`.
Return:
A generator yielding dictionaries of the following form
Expand All @@ -168,8 +185,13 @@ def ffmpeg_microphone_live(
chunk_s = chunk_length_s

microphone = ffmpeg_microphone(
sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device
sampling_rate,
chunk_s,
format_for_conversion=format_for_conversion,
ffmpeg_input_device=ffmpeg_input_device,
ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
)

if format_for_conversion == "s16le":
dtype = np.int16
size_of_sample = 2
Expand Down
10 changes: 9 additions & 1 deletion tests/pipelines/test_pipelines_automatic_speech_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
WhisperForConditionalGeneration,
)
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
from transformers.pipelines.audio_utils import chunk_bytes_iter
from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live
from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter
from transformers.testing_utils import (
compare_pipeline_output_to_hub_spec,
Expand Down Expand Up @@ -1989,3 +1989,11 @@ def test_chunk_bytes_iter_stride_stream(self):
)
with self.assertRaises(StopIteration):
next(iter_)

# Smoke test: build the live microphone stream at 16000 Hz with 2.0 s chunks, leaving
# `ffmpeg_additional_args` at its default, then close it straight away.
# NOTE(review): the returned object is never iterated. If `ffmpeg_microphone_live` is a generator
# function (its docstring says it returns a generator), no ffmpeg process is started here and this
# only checks that the call signature is accepted — confirm that is the intended coverage.
def test_ffmpeg_no_additional_args(self):
mic = ffmpeg_microphone_live(16000, 2.0)
mic.close()

# Smoke test: build the live microphone stream at 16000 Hz with 2.0 s chunks, passing
# `ffmpeg_additional_args=["-nostdin"]`, then close it straight away.
# NOTE(review): the returned object is never iterated, so this presumably verifies only that the
# new keyword argument is accepted, not that `-nostdin` reaches the ffmpeg command line — confirm.
def test_ffmpeg_additional_args(self):
mic = ffmpeg_microphone_live(16000, 2.0, ffmpeg_additional_args=["-nostdin"])
mic.close()

0 comments on commit a790e3a

Please sign in to comment.