diff --git a/README.md b/README.md
index ec59188d..bca6330e 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,6 @@ async def main():
 ## Examples
 
 - [Facelandmark](https://github.com/livekit/python-sdks/tree/main/examples/face_landmark): Use mediapipe to detect face landmarks (eyes, nose ...)
-- [Whisper](https://github.com/livekit/python-sdks/tree/main/examples/whisper): Transcribe an audio track using OpenAI whisper
 - [Basic room](https://github.com/livekit/python-sdks/blob/main/examples/basic_room.py): Connect to a room
 - [Publish hue](https://github.com/livekit/python-sdks/blob/main/examples/publish_hue.py): Publish a rainbow video track
 - [Publish wave](https://github.com/livekit/python-sdks/blob/main/examples/publish_hue.py): Publish a sine wave
diff --git a/examples/whisper/.gitignore b/examples/whisper/.gitignore
deleted file mode 100644
index 39792963..00000000
--- a/examples/whisper/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-whisper.cpp
\ No newline at end of file
diff --git a/examples/whisper/README.md b/examples/whisper/README.md
deleted file mode 100644
index c65c1f22..00000000
--- a/examples/whisper/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-## Whisper example
-
-[Whisper](https://github.com/openai/whisper) is a speech-to-text model from OpenAI. It ordinarily requires 30s of input data for transcription, making it challenging to use in real-time applications. We work around this limitation by padding shorter bursts of speech with silent audio packets.
-
-## How to run the demo
-
-### Step 1:
-Change the URL and TOKEN inside the whisper.py script to use your LiveKit websocket URL and a valid session token
-
-### Step 2:
-Clone [whisper.cpp](https://github.com/ggerganov/whisper.cpp) inside this directory
-
-### Step 3:
-Build a shared library:
-```
-cd whisper.cpp
-gcc -O3 -std=c11 -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c
-g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so
-```
-
-### Step 4:
-Download a model you want to use, for example:
-```
-./models/download-ggml-model.sh tiny.en
-```
-
-### Step 5:
-Rename the shared object library if you're on Windows or macOS:
-1. If Windows, rename `libwhisper.so` to `libwhisper.dll`
-2. If macOS, rename `libwhisper.so` to `libwhisper.dylib`
-
-### Step 6:
-Run the whisper.py script:
-```
-python3 whisper.py
-```
-
-### Step 7:
-Connect another participant to the room and publish a microphone stream. To do this quickly, you can use our [Meet example](https://meet.livekit.io/?tab=custom) or use the [livekit-cli](https://github.com/livekit/livekit-cli):
-```
-livekit-cli load-test --room yourroom --audio-publishers 1
-```
diff --git a/examples/whisper/whisper.py b/examples/whisper/whisper.py
deleted file mode 100644
index 1b883c65..00000000
--- a/examples/whisper/whisper.py
+++ /dev/null
@@ -1,226 +0,0 @@
-import asyncio
-import ctypes
-import logging
-import pathlib
-import platform
-from signal import SIGINT, SIGTERM
-from os import getenv
-
-import numpy as np
-from livekit import api, rtc
-
-platform = platform.system().lower()
-if platform == "windows":
-    lib_file = "whisper.dll"
-elif platform == "darwin":
-    lib_file = "libwhisper.dylib"
-else:
-    lib_file = "libwhisper.so"
-
-whisper_dir = pathlib.Path(__file__).parent.absolute() / "whisper.cpp"
-libname = str(whisper_dir / lib_file)
-fname_model = str(whisper_dir / "models/ggml-tiny.en.bin")
-
-# declare the Whisper C API (Only what we need, keep things simple)
-# also see this issue: https://github.com/ggerganov/whisper.cpp/issues/9
-# structure must match https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h
-
-
-class WhisperSamplingStrategy(ctypes.c_int):
-    WHISPER_SAMPLING_GREEDY = 0
-    WHISPER_SAMPLING_BEAM_SEARCH = 1
-
-
-class WhisperFullParams(ctypes.Structure):
-    _fields_ = [
-        ("strategy", ctypes.c_int),
-        ("n_threads", ctypes.c_int),
-        ("n_max_text_ctx", ctypes.c_int),
-        ("offset_ms", ctypes.c_int),
-        ("duration_ms", ctypes.c_int),
-        ("translate", ctypes.c_bool),
-        ("no_context", ctypes.c_bool),
-        ("single_segment", ctypes.c_bool),
-        ("print_special", ctypes.c_bool),
-        ("print_progress", ctypes.c_bool),
-        ("print_realtime", ctypes.c_bool),
-        ("print_timestamps", ctypes.c_bool),
-        ("token_timestamps", ctypes.c_bool),
-        ("thold_pt", ctypes.c_float),
-        ("thold_ptsum", ctypes.c_float),
-        ("max_len", ctypes.c_int),
-        ("split_on_word", ctypes.c_bool),
-        ("max_tokens", ctypes.c_int),
-        ("speed_up", ctypes.c_bool),
-        ("audio_ctx", ctypes.c_int),
-        ("tdrz_enable", ctypes.c_bool),
-        ("initial_prompt", ctypes.c_char_p),
-        ("prompt_tokens", ctypes.c_void_p),
-        ("prompt_n_tokens", ctypes.c_int),
-        ("language", ctypes.c_char_p),
-        ("detect_language", ctypes.c_bool),
-        ("suppress_blank", ctypes.c_bool),
-        ("suppress_non_speech_tokens", ctypes.c_bool),
-        ("temperature", ctypes.c_float),
-        ("max_initial_ts", ctypes.c_float),
-        ("length_penalty", ctypes.c_float),
-        ("temperature_inc", ctypes.c_float),
-        ("entropy_thold", ctypes.c_float),
-        ("logprob_thold", ctypes.c_float),
-        ("no_speech_thold", ctypes.c_float),
-        ("greedy", ctypes.c_int),
-        ("beam_size", ctypes.c_int),
-        ("patience", ctypes.c_float),
-        ("new_segment_callback", ctypes.c_void_p),
-        ("new_segment_callback_user_data", ctypes.c_void_p),
-        ("progress_callback", ctypes.c_void_p),
-        ("progress_callback_user_data", ctypes.c_void_p),
-        ("encoder_begin_callback", ctypes.c_void_p),
-        ("encoder_begin_callback_user_data", ctypes.c_void_p),
-        ("logits_filter_callback", ctypes.c_void_p),
-        ("logits_filter_callback_user_data", ctypes.c_void_p),
-    ]
-
-
-WHISPER_SAMPLE_RATE = 16000
-SAMPLES_30_SECS = WHISPER_SAMPLE_RATE * 30
-SAMPLES_KEEP = WHISPER_SAMPLE_RATE * 1  # data to keep from the old inference
-SAMPLES_STEP = WHISPER_SAMPLE_RATE * 3  # 3 seconds of new data
-
-whisper = ctypes.CDLL(libname)
-whisper.whisper_init_from_file.argtypes = [ctypes.c_char_p]
-whisper.whisper_init_from_file.restype = ctypes.c_void_p
-whisper.whisper_full_default_params.restype = WhisperFullParams
-whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p
-ctx = whisper.whisper_init_from_file(fname_model.encode("utf-8"))
-
-
-async def main(room: rtc.Room):
-    @room.on("track_published")
-    def on_track_published(
-        publication: rtc.RemoteTrackPublication, participant: rtc.RemoteParticipant
-    ):
-        # Only subscribe to the audio tracks coming from the microphone
-        if (
-            publication.kind == rtc.TrackKind.KIND_AUDIO
-            and publication.source == rtc.TrackSource.SOURCE_MICROPHONE
-        ):
-            logging.info(
-                "track published: %s from participant %s (%s), subscribing...",
-                publication.sid,
-                participant.sid,
-                participant.identity,
-            )
-
-            publication.set_subscribed(True)
-
-    @room.on("track_subscribed")
-    def on_track_subscribed(
-        track: rtc.Track,
-        publication: rtc.RemoteTrackPublication,
-        participant: rtc.RemoteParticipant,
-    ):
-        logging.info("starting listening to: %s", participant.identity)
-        audio_stream = rtc.AudioStream(track)
-        asyncio.create_task(whisper_task(audio_stream))
-
-    url = getenv("LIVEKIT_URL")
-    token = (
-        api.AccessToken()
-        .with_identity("python-bot")
-        .with_name("Python Bot")
-        .with_grants(
-            api.VideoGrants(
-                room_join=True,
-                room="my-room",
-            )
-        )
-        .to_jwt()
-    )
-    # manually manage subscriptions
-    await room.connect(url, token, rtc.RoomOptions(auto_subscribe=False))
-    logging.info("connected to room %s", room.name)
-
-    # check if there are already published audio tracks
-    for participant in room.participants.values():
-        for track in participant.tracks.values():
-            if (
-                track.kind == rtc.TrackKind.KIND_AUDIO
-                and track.source == rtc.TrackSource.SOURCE_MICROPHONE
-            ):
-                track.set_subscribed(True)
-
-
-async def whisper_task(stream: rtc.AudioStream):
-    data_30_secs = np.zeros(SAMPLES_30_SECS, dtype=np.float32)
-    written_samples = 0  # nb. of samples written to data_30_secs for the cur. inference
-
-    async for frame in stream:
-        # whisper requires 16kHz mono, so resample the data
-        # also convert the samples from int16 to float32
-
-        frame = frame.remix_and_resample(WHISPER_SAMPLE_RATE, 1)
-
-        data = np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0
-
-        # write the data inside data_30_secs at written_samples
-        data_start = SAMPLES_KEEP + written_samples
-        data_30_secs[data_start : data_start + len(data)] = data
-        written_samples += len(data)
-
-        if written_samples >= SAMPLES_STEP:
-            params = whisper.whisper_full_default_params(
-                WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY
-            )
-            params.print_realtime = False
-            params.print_progress = False
-
-            ctx_ptr = ctypes.c_void_p(ctx)
-            data_ptr = data_30_secs.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
-            res = whisper.whisper_full(
-                ctx_ptr, params, data_ptr, written_samples + SAMPLES_KEEP
-            )
-
-            if res != 0:
-                logging.error("error while running inference: %s", res)
-                return
-
-            n_segments = whisper.whisper_full_n_segments(ctx_ptr)
-            for i in range(n_segments):
-                t0 = whisper.whisper_full_get_segment_t0(ctx_ptr, i)
-                t1 = whisper.whisper_full_get_segment_t1(ctx_ptr, i)
-                txt = whisper.whisper_full_get_segment_text(ctx_ptr, i)
-
-                logging.info(
-                    f"{t0/1000.0:.3f} - {t1/1000.0:.3f} : {txt.decode('utf-8')}"
-                )
-
-            # write old data to the beginning of the buffer (SAMPLES_KEEP)
-            data_30_secs[:SAMPLES_KEEP] = data_30_secs[
-                data_start + written_samples - SAMPLES_KEEP : data_start
-                + written_samples
-            ]
-            written_samples = 0
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        handlers=[logging.FileHandler("whisper.log"), logging.StreamHandler()],
-    )
-
-    loop = asyncio.get_event_loop()
-    room = rtc.Room(loop=loop)
-
-    async def cleanup():
-        await room.disconnect()
-        loop.stop()
-
-    asyncio.ensure_future(main(room))
-    for signal in [SIGINT, SIGTERM]:
-        loop.add_signal_handler(signal, lambda: asyncio.ensure_future(cleanup()))
-
-    try:
-        loop.run_forever()
-    finally:
-        loop.close()
diff --git a/livekit-rtc/rust-sdks b/livekit-rtc/rust-sdks
deleted file mode 160000
index 4450c6ca..00000000
--- a/livekit-rtc/rust-sdks
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4450c6ca5cf269873db5debf5fc06115490a44ea
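
For readers who still want the technique that `whisper_task` implemented, here is a minimal sketch of its core idea without the ctypes bindings: a zero-filled 30 s buffer at 16 kHz (so short bursts of speech are effectively padded with silence), inference after every ~3 s of new audio, and 1 s of audio carried over between inferences. This is not part of the diff above: it assumes the `openai-whisper` pip package in place of whisper.cpp, and `SlidingTranscriber` and `feed` are hypothetical names introduced for illustration.

```python
# Minimal sketch (assumption: `pip install openai-whisper`); save it under a
# name other than whisper.py so it doesn't shadow the whisper package.
from typing import Optional

import numpy as np
import whisper  # openai-whisper

SAMPLE_RATE = 16000        # Whisper expects 16 kHz mono
WINDOW = SAMPLE_RATE * 30  # Whisper operates on 30 s windows
KEEP = SAMPLE_RATE * 1     # context carried over between inferences
STEP = SAMPLE_RATE * 3     # run inference every 3 s of new audio

model = whisper.load_model("tiny.en")


class SlidingTranscriber:  # hypothetical helper, not a LiveKit/Whisper API
    def __init__(self) -> None:
        # zeroed 30 s buffer: anything not yet written is implicit silence,
        # which is how short bursts of speech get "padded" for Whisper
        self._buf = np.zeros(WINDOW, dtype=np.float32)
        self._written = 0

    def feed(self, samples: np.ndarray) -> Optional[str]:
        """Append 16 kHz mono float32 samples; return text once a step completes."""
        start = KEEP + self._written
        n = min(len(samples), WINDOW - start)  # clamp to the 30 s window
        self._buf[start : start + n] = samples[:n]
        self._written += n
        if self._written < STEP:
            return None
        result = model.transcribe(self._buf, fp16=False, language="en")
        # carry the last second of audio over as context for the next window
        end = KEEP + self._written
        self._buf[:KEEP] = self._buf[end - KEEP : end]
        self._buf[KEEP:] = 0.0
        self._written = 0
        return result["text"]


if __name__ == "__main__":
    t = SlidingTranscriber()
    # 4 x 1 s of silence: the third call completes a 3 s step and transcribes
    for _ in range(4):
        text = t.feed(np.zeros(SAMPLE_RATE, dtype=np.float32))
        if text is not None:
            print(repr(text))
```

Feeding it frames converted the way the deleted script did (`np.frombuffer(frame.data, dtype=np.int16).astype(np.float32) / 32768.0` after `remix_and_resample(16000, 1)`) would reproduce the same rolling transcription of a LiveKit audio track.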