From 8af35cabad47e6f45603492c0110735ff1b47220 Mon Sep 17 00:00:00 2001
From: m-danya
Date: Wed, 1 May 2024 17:24:18 +0300
Subject: [PATCH] Ask OpenAI to process recognized speech to form
 `lyrics_karaoke`

It works, but you can't use it yet unless you're running the backend
locally. Also, refactoring is needed in three places.
---
 .env.sample                                  |  4 +-
 README.md                                    | 10 +++
 accompanist/collection/service.py            |  2 +-
 .../{recognizer.py => timestamper.py}        | 76 +++++++++----------
 accompanist/config.py                        |  1 +
 .../tests/unit_tests/test_timestamper.py     | 16 +++-
 poetry.lock                                  | 38 +++++++++-
 pyproject.toml                               |  1 +
 8 files changed, 105 insertions(+), 43 deletions(-)
 rename accompanist/collection/{recognizer.py => timestamper.py} (62%)

diff --git a/.env.sample b/.env.sample
index 158682e..749f85e 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1,5 +1,6 @@
 DEPLOYMENT_HOST=127.0.0.1
+
 MODE=DEV
 
 DB_HOST=127.0.0.1
 
@@ -23,4 +24,5 @@ DOCKER_FRONTEND_PORT=80
 CELERY_CONCURRENCY=1
 STORAGE_PATH=./storage-volume
 
-GENIUS_CLIENT_ACCESS_TOKEN=...
\ No newline at end of file
+GENIUS_CLIENT_ACCESS_TOKEN=...
+OPENAI_API_KEY=...
diff --git a/README.md b/README.md
index e892b17..aad6ffb 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,10 @@ sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
 ```
 
+If you don't have a GPU, you'll need to comment out
+[these](https://github.com/m-danya/accompanist/blob/2dd6a2b4e5a655a26f32ae536ae35a80ce0dbd9d/compose.yaml#L36C1-L42C36)
+lines in `compose.yaml`.
+
 How to run the system:
 
 ```
@@ -44,6 +48,8 @@ obtain _Client Access Token_. You can enter any app name an
 use any "App Website URL". After you get the token, place it into your `.env`
 file (corresponding variable is `GENIUS_CLIENT_ACCESS_TOKEN`).
 
+You can set `OPENAI_API_KEY` or leave this key unfilled.
+
 Let's continue:
 
 ```
@@ -119,6 +125,9 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
 #### Possible todos
 
 - [All the "TODO"s in the code]
+- Storage management: delete (or even manage) stored mp3s and jpgs
+- Finish timestamper (see #4)
+- Autofetch albums/tracks in components every N seconds at frontend?
 - DragAndDrop albums at the main page
 - Fix admin panel being available only at ${FASTAPI_PORT} but not at ${NGINX_PORT}
 - Tune Genius search query (e.g. remove the parentheses in track title)
@@ -138,6 +147,7 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
 - Run `alembic upgrade head` at launching?
 - Publish docker image(s) to Docker Hub
 - Different gradient backgounds for different songs (random + choosing)
+- Recognize notes?
 - Frontend localization
 - Add Telegram bot wrapper for the backend
 - Add mypy (+ CI)
diff --git a/accompanist/collection/service.py b/accompanist/collection/service.py
index acc8015..579548f 100644
--- a/accompanist/collection/service.py
+++ b/accompanist/collection/service.py
@@ -2,7 +2,7 @@
 
 from accompanist.celery.tasks import process_album_task
 from accompanist.collection.dao import TrackDAO
-from accompanist.collection.recognizer import LyricsTimestamper
+from accompanist.collection.timestamper import LyricsTimestamper
 from accompanist.collection.schema import AlbumInfoFromUser, TrackUpdateRequest
 from accompanist.collection.service_genius import get_lyrics_from_genius
 from accompanist.config import settings
diff --git a/accompanist/collection/recognizer.py b/accompanist/collection/timestamper.py
similarity index 62%
rename from accompanist/collection/recognizer.py
rename to accompanist/collection/timestamper.py
index 26dc780..dff29e5 100644
--- a/accompanist/collection/recognizer.py
+++ b/accompanist/collection/timestamper.py
@@ -1,3 +1,4 @@
+import json
 from pathlib import Path
 from typing import List
 
@@ -8,26 +9,25 @@
     WhisperTokenizer,
     WhisperProcessor,
 )
+from accompanist.config import settings
+from openai import OpenAI
 
 
 class LyricsTimestamper:
     MODEL_TYPE = "small"
 
     def __init__(self) -> None:
-        # self.pipe = pipeline(
-        #     "automatic-speech-recognition",
-        #     model="openai/whisper-small",
-        #     chunk_length_s=30,
-        #     device_map="auto",
-        # )
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = WhisperForConditionalGeneration.from_pretrained(
             "openai/whisper-small",
         ).to(self.device)
         self.processor = WhisperProcessor.from_pretrained("openai/whisper-small")
         self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
+        self.openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)
 
     def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, str]]:
+        # TODO: refactor this long method
+        # TODO: move the call to this method into a Celery task
        # TODO: standardize lyrics preprocessing between frontend and backend
         lyrics_lines = [
             line.strip()
@@ -35,18 +35,9 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: st
             if line.strip() and not line.startswith("[")
         ]
 
-        # audio = whisper.load_audio(path_mp3)
-        # audio = whisper.pad_or_trim(audio)
-
-        # mel = whisper.log_mel_spectrogram(audio).to(self.model.device)
-
-        # _, probs = self.model.detect_language(mel)
-        # options = whisper.DecodingOptions(language='ru')
         sampling_rate = 16_000  # as Whisper requires?
         waveform, _ = librosa.load(vocals_mp3, sr=sampling_rate, mono=True)
 
-        # non_silent_intervals = librosa.effects.split(waveform, top_db=30)
         chunk_length_s = 5
         overlap = chunk_length_s / 2  # 50% overlap
         chunk_samples = int(chunk_length_s * sampling_rate)
@@ -83,34 +74,43 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: st
                 {"text": transcribed_text, "start_ts": start_ts, "end_ts": end_ts}
             )
 
-        # transcribed_lines = []
-        # for start_sample, end_sample in non_silent_intervals:
-        #     interval_audio = waveform[start_sample:end_sample]
-
-        #     prediction = self.pipe(
-        #         audio_dict,
-        #         # as the pipeline does not support `language` kwarg and fails if
-        #         # autodetection finds two different languages in one batch
-        #         batch_size=1,
-        #         return_timestamps=True,
-        #     )["chunks"]
-
-        print(f"{lyrics_lines=}")
-        print(f"{transcribed_lines=}")
-        prompt = """
-        You're a deaf person at a local karaoke bar. Your job is to make timestamps
-        for karaoke songs. For each song you get `lyrics_lines` an array of ground
-        truth lines and`transcribed_lines`, which is an array of speech-to-text
-        model. You need to output a `lyrics_karaoke` list, each object of which is a
-        dict with keys "line" and "end_ts". In other words, you need to find an end
-        of every line in the text.
+        system_prompt = """
+        You're a deaf person at a local karaoke bar. Your job is to make
+        timestamps for karaoke songs. For each song you get `lyrics_lines`, an
+        array of ground-truth lines, and `transcribed_lines`, an array of lines
+        produced by a speech-to-text model. You need to output a `lyrics_karaoke`
+        list, each object of which is a dict with keys "line" and "end_ts". In
+        other words, you need to find the end of every line in the text.
 
         `lyrics_karaoke` array must have exactly the same lines as `lyrics_lines`
         in exactly the same order. Values of `end_ts` must increase from one
         element to another, as they are lines of one song. Do not write any code,
         just output the `lyrics_karaoke` dict in a JSON format.
 
-        Try your best. Here is your input:
+        Try your best.
         """
-        lyrics_karaoke = "ChatGPT4(lyrics_lines, transcribed_lines, prompt)"
+        user_request = f"{lyrics_lines=}\n\n{transcribed_lines=}"
+        print(user_request)
+        completion = self.openai_client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_request,
+                },
+            ],
+        )
+        lyrics_karaoke = completion.choices[0].message.content
+        lyrics_karaoke = json.loads(
+            lyrics_karaoke.strip().replace("```json", "").replace("```", "")
+        )
+        try:
+            lyrics_karaoke = lyrics_karaoke["lyrics_karaoke"]
+        except Exception:
+            pass
+        print(lyrics_karaoke)
         return lyrics_karaoke
diff --git a/accompanist/config.py b/accompanist/config.py
index b706ac0..db27404 100644
--- a/accompanist/config.py
+++ b/accompanist/config.py
@@ -25,6 +25,7 @@ class Settings(BaseSettings):
 
     STORAGE_PATH: Path
     GENIUS_CLIENT_ACCESS_TOKEN: Optional[str]
+    OPENAI_API_KEY: Optional[str]
 
     @property
     def DATABASE_URL(self):
diff --git a/accompanist/tests/unit_tests/test_timestamper.py b/accompanist/tests/unit_tests/test_timestamper.py
index c5bec30..6c383ed 100644
--- a/accompanist/tests/unit_tests/test_timestamper.py
+++ b/accompanist/tests/unit_tests/test_timestamper.py
@@ -1,8 +1,11 @@
 from pathlib import Path
 
-from accompanist.collection.recognizer import LyricsTimestamper
+import pytest
+
+from accompanist.collection.timestamper import LyricsTimestamper
 
 
+@pytest.mark.skip(reason="It requires an OpenAI API key")
 def test_timestamper(test_track_data):
     timestamper = LyricsTimestamper()
     vocals_mp3_path = Path("accompanist/tests/melancholy_vocals.mp3")
@@ -12,4 +15,13 @@ def test_timestamper(test_track_data):
     lyrics_karaoke = timestamper.get_karaoke_lyrics(
         vocals_mp3_path, melancholy_track["lyrics"]
     )
-    # TODO: write asserts
+    assert lyrics_karaoke
+    # TODO: move preprocessing to a model property or somewhere shared
+    lyrics_lines_preprocessed = [
+        line.strip()
+        for line in melancholy_track["lyrics"].split("\n")
+        if line.strip() and not line.startswith("[")
+    ]
+    for original_line, karaoke_line in zip(lyrics_lines_preprocessed, lyrics_karaoke):
+        assert original_line == karaoke_line["line"]
+    # TODO: assert that the `end_ts` sequence is non-decreasing
diff --git a/poetry.lock b/poetry.lock
index 80add9d..316bf92 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -725,6 +725,18 @@ tqdm = "*"
 
 [package.extras]
(>=1.1)", "hydra-core (>=1.1)", "julius (>=0.2.3)", "lameenc (>=1.2)", "museval", "mypy", "openunmix", "pyyaml", "soundfile (>=0.10.3)", "submitit", "torch (>=1.8.1)", "torchaudio (>=0.8)", "tqdm", "treetable"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "dnspython" version = "2.6.1" @@ -1950,6 +1962,30 @@ files = [ antlr4-python3-runtime = ">=4.9.0,<4.10.0" PyYAML = ">=5.1.0" +[[package]] +name = "openai" +version = "1.25.0" +description = "The official Python library for the openai API" +category = "main" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.25.0-py3-none-any.whl", hash = "sha256:d0cfdf6afb31a5dabf3b95966cb31f3c757a0edaf3228715409cb404b9933de0"}, + {file = "openai-1.25.0.tar.gz", hash = "sha256:22c35b26b8281cd2759b1a4c05ac99e2f2b26a9df71f90a0b4ddb75aa27adc81"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "openai-whisper" version = "20231117" @@ -4093,4 +4129,4 @@ requests = ">=2.22" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2a5d1f543b2c6fed9daefcac5e57a62815336f88a340a16a07864b7ebb1b1468" +content-hash = "162c54ca166ad802b5f3d6f74bb2e845832bb1d48d3d7ceb15e404795270e202" diff --git a/pyproject.toml b/pyproject.toml index 08891dc..ef6a7db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ openai-whisper = "^20231117" transformers = "^4.40.1" librosa = "^0.10.1" accelerate = "^0.29.3" +openai = "^1.25.0" [tool.poetry.group.dev.dependencies] ruff = "^0.3.2"