Skip to content

Commit

Permalink
Ask OpenAI to process recognized speech to form lyrics_karaoke
Browse files Browse the repository at this point in the history
It works, but you can't use it yet if you're not running backend
locally. Also, refactoring is needed in three places.
  • Loading branch information
m-danya committed May 1, 2024
1 parent 4039141 commit 8af35ca
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 43 deletions.
4 changes: 3 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
DEPLOYMENT_HOST=127.0.0.1


MODE=DEV

DB_HOST=127.0.0.1
Expand All @@ -23,4 +24,5 @@ DOCKER_FRONTEND_PORT=80
CELERY_CONCURRENCY=1
STORAGE_PATH=./storage-volume

GENIUS_CLIENT_ACCESS_TOKEN=...
GENIUS_CLIENT_ACCESS_TOKEN=...
OPENAI_API_KEY=...
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```

If you don't have a GPU, you'll need to comment
[these](https://github.com/m-danya/accompanist/blob/2dd6a2b4e5a655a26f32ae536ae35a80ce0dbd9d/compose.yaml#L36C1-L42C36)
lines in `compose.yaml`.

How to run the system:

```
Expand All @@ -44,6 +48,8 @@ obtain _Client Access Token_. You can enter any app name and use any "App Website
URL". After you get the token, place it into your `.env` file (corresponding
variable is `GENIUS_CLIENT_ACCESS_TOKEN`).

You can set `OPENAI_API_KEY`, or leave this key unfilled.

Let's continue:

```
Expand Down Expand Up @@ -119,6 +125,9 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
#### Possible todos

- [All the "TODO"s in the code]
- Storage management: delete (or even manage) stored mp3s and jpgs
- Finish timestamper (see #4)
- Autofetch albums/tracks in components every N seconds at frontend?
- DragAndDrop albums at the main page
- Fix admin panel being available only at ${FASTAPI_PORT} but not at ${NGINX_PORT}
- Tune Genius search query (e.g. remove the parentheses in track title)
Expand All @@ -138,6 +147,7 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
- Run `alembic upgrade head` at launching?
- Publish docker image(s) to Docker Hub
- Different gradient backgrounds for different songs (random + choosing)
- Recognize notes?
- Frontend localization
- Add Telegram bot wrapper for the backend
- Add mypy (+ CI)
Expand Down
2 changes: 1 addition & 1 deletion accompanist/collection/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from accompanist.celery.tasks import process_album_task
from accompanist.collection.dao import TrackDAO
from accompanist.collection.recognizer import LyricsTimestamper
from accompanist.collection.timestamper import LyricsTimestamper
from accompanist.collection.schema import AlbumInfoFromUser, TrackUpdateRequest
from accompanist.collection.service_genius import get_lyrics_from_genius
from accompanist.config import settings
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path
from typing import List

Expand All @@ -8,45 +9,35 @@
WhisperTokenizer,
WhisperProcessor,
)
from accompanist.config import settings
from openai import OpenAI


class LyricsTimestamper:
MODEL_TYPE = "small"

def __init__(self) -> None:
    """Set up the Whisper ASR stack and an OpenAI API client.

    Runs the model on CUDA when a GPU is available, otherwise on CPU.
    The OpenAI client is configured from `settings.OPENAI_API_KEY`.
    """
    checkpoint = "openai/whisper-small"
    # Prefer GPU inference when available; transcription on CPU is slow.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to(
        self.device
    )
    self.processor = WhisperProcessor.from_pretrained(checkpoint)
    self.tokenizer = WhisperTokenizer.from_pretrained(checkpoint)
    self.openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)

def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, str]]:
# TODO: refactor this long method
# TODO: move calling this method to celery task
# TODO: standardize lyrics preprocessing between frontend and backend
lyrics_lines = [
line.strip()
for line in lyrics.split("\n")
if line.strip() and not line.startswith("[")
]

# audio = whisper.load_audio(path_mp3)
# audio = whisper.pad_or_trim(audio)

# mel = whisper.log_mel_spectrogram(audio).to(self.model.device)

# _, probs = self.model.detect_language(mel)
# options = whisper.DecodingOptions(language='ru')
sampling_rate = 16_000 # as Whisper requires?
waveform, _ = librosa.load(vocals_mp3, sr=sampling_rate, mono=True)

# non_silent_intervals = librosa.effects.split(waveform, top_db=30)

chunk_length_s = 5
overlap = chunk_length_s / 2 # 50% overlap
chunk_samples = int(chunk_length_s * sampling_rate)
Expand Down Expand Up @@ -83,34 +74,43 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, st
{"text": transcribed_text, "start_ts": start_ts, "end_ts": end_ts}
)

# transcribed_lines = []
# for start_sample, end_sample in non_silent_intervals:
# interval_audio = waveform[start_sample:end_sample]

# prediction = self.pipe(
# audio_dict,
# # as the pipeline does not support `language` kwarg and fails if
# # autodetection finds two different languages in one batch
# batch_size=1,
# return_timestamps=True,
# )["chunks"]

print(f"{lyrics_lines=}")
print(f"{transcribed_lines=}")
prompt = """
You're a deaf person at a local karaoke bar. Your job is to make timestamps
for karaoke songs. For each song you get `lyrics_lines` an array of ground
truth lines and`transcribed_lines`, which is an array of speech-to-text
model. You need to output a `lyrics_karaoke` list, each object of which is a
dict with keys "line" and "end_ts". In other words, you need to find an end
of every line in the text.
system_prompt = """
You're a deaf person at a local karaoke bar. Your job is to make
timestamps for karaoke songs. For each song you get `lyrics_lines` an
array of ground truth lines and`transcribed_lines`, which is an array of
speech-to-text model. You need to output a `lyrics_karaoke` list, each
object of which is a dict with keys "line" and "end_ts". In other words,
you need to find an end of every line in the text.
`lyrics_karaoke` array must have exactly the same lines as `lyrics_lines` in
exactly the same order. Values of `end_ts` must increase from one element to
another, as they are lines of one song.
Do not write any code, just output the `lyrics_karaoke` dict in a JSON format.
Try your best. Here is your input:
Try your best.
"""
lyrics_karaoke = "ChatGPT4(lyrics_lines, transcribed_lines, prompt)"
user_request = f"{lyrics_lines=}\n\n{transcribed_lines=}"
print(user_request)
completion = self.openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": system_prompt,
},
{
"role": "user",
"content": user_request,
},
],
)
lyrics_karaoke = completion.choices[0].message.content
lyrics_karaoke = json.loads(
lyrics_karaoke.strip().replace("```json", "").replace("```", "")
)
try:
lyrics_karaoke = lyrics_karaoke["lyrics_karaoke"]
except Exception:
pass
print(lyrics_karaoke)
return lyrics_karaoke
1 change: 1 addition & 0 deletions accompanist/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class Settings(BaseSettings):
STORAGE_PATH: Path

GENIUS_CLIENT_ACCESS_TOKEN: Optional[str]
OPENAI_API_KEY: Optional[str]

@property
def DATABASE_URL(self):
Expand Down
16 changes: 14 additions & 2 deletions accompanist/tests/unit_tests/test_timestamper.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from pathlib import Path

from accompanist.collection.recognizer import LyricsTimestamper
import pytest

from accompanist.collection.timestamper import LyricsTimestamper


@pytest.mark.skip(reason="It requires an OpenAI API key")
def test_timestamper(test_track_data):
timestamper = LyricsTimestamper()
vocals_mp3_path = Path("accompanist/tests/melancholy_vocals.mp3")
Expand All @@ -12,4 +15,13 @@ def test_timestamper(test_track_data):
lyrics_karaoke = timestamper.get_karaoke_lyrics(
vocals_mp3_path, melancholy_track["lyrics"]
)
# TODO: write asserts
assert lyrics_karaoke
# TODO: move preprocessing to model's property or somewhere else
lyrics_lines_preprocessed = [
line.strip()
for line in melancholy_track["lyrics"].split("\n")
if line.strip() and not line.startswith("[")
]
for original_line, karaoke_line in zip(lyrics_lines_preprocessed, lyrics_karaoke):
assert original_line == karaoke_line["line"]
# TODO: assert non-descending of `end_ts` sequence
38 changes: 37 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ openai-whisper = "^20231117"
transformers = "^4.40.1"
librosa = "^0.10.1"
accelerate = "^0.29.3"
openai = "^1.25.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.3.2"
Expand Down

0 comments on commit 8af35ca

Please sign in to comment.