From 8af35cabad47e6f45603492c0110735ff1b47220 Mon Sep 17 00:00:00 2001
From: m-danya
Date: Wed, 1 May 2024 17:24:18 +0300
Subject: [PATCH] Ask OpenAI to process recognized speech to form
 `lyrics_karaoke`

It works, but you can't use it yet unless you're running the backend
locally. Also, refactoring is needed in three places.
---
 .env.sample                                  |  4 +-
 README.md                                    | 10 +++
 accompanist/collection/service.py            |  2 +-
 .../{recognizer.py => timestamper.py}        | 76 +++++++++----------
 accompanist/config.py                        |  1 +
 .../tests/unit_tests/test_timestamper.py     | 16 +++-
 poetry.lock                                  | 38 +++++++++-
 pyproject.toml                               |  1 +
 8 files changed, 105 insertions(+), 43 deletions(-)
 rename accompanist/collection/{recognizer.py => timestamper.py} (62%)

diff --git a/.env.sample b/.env.sample
index 158682e..749f85e 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1,5 +1,6 @@
 DEPLOYMENT_HOST=127.0.0.1
+
 MODE=DEV
 
 DB_HOST=127.0.0.1
 
@@ -23,4 +24,5 @@ DOCKER_FRONTEND_PORT=80
 CELERY_CONCURRENCY=1
 STORAGE_PATH=./storage-volume
 
-GENIUS_CLIENT_ACCESS_TOKEN=...
\ No newline at end of file
+GENIUS_CLIENT_ACCESS_TOKEN=...
+OPENAI_API_KEY=...
diff --git a/README.md b/README.md
index e892b17..aad6ffb 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,10 @@ sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
 ```
 
+If you don't have a GPU, you'll need to comment out
+[these](https://github.com/m-danya/accompanist/blob/2dd6a2b4e5a655a26f32ae536ae35a80ce0dbd9d/compose.yaml#L36C1-L42C36)
+lines in `compose.yaml`.
+
 How to run the system:
 
 ```
@@ -44,6 +48,8 @@ obtain _Client Access Token_. You can enter any app name an
 use any "App Website URL". After you get the token, place it into your `.env`
 file (corresponding variable is `GENIUS_CLIENT_ACCESS_TOKEN`).
 
+You can set `OPENAI_API_KEY` or leave this key unfilled.
+
 Let's continue:
 
 ```
@@ -119,6 +125,9 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
 #### Possible todos
 
 - [All the "TODO"s in the code]
+- Storage management: delete (or even manage) stored mp3s and jpgs
+- Finish timestamper (see #4)
+- Autofetch albums/tracks in components every N seconds at frontend?
 - DragAndDrop albums at the main page
 - Fix admin panel being available only at ${FASTAPI_PORT} but not at ${NGINX_PORT}
 - Tune Genius search query (e.g. remove the parentheses in track title)
@@ -138,6 +147,7 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
 - Run `alembic upgrade head` at launching?
 - Publish docker image(s) to Docker Hub
 - Different gradient backgounds for different songs (random + choosing)
+- Recognize notes?
 - Frontend localization
 - Add Telegram bot wrapper for the backend
 - Add mypy (+ CI)
diff --git a/accompanist/collection/service.py b/accompanist/collection/service.py
index acc8015..579548f 100644
--- a/accompanist/collection/service.py
+++ b/accompanist/collection/service.py
@@ -2,7 +2,7 @@
 
 from accompanist.celery.tasks import process_album_task
 from accompanist.collection.dao import TrackDAO
-from accompanist.collection.recognizer import LyricsTimestamper
+from accompanist.collection.timestamper import LyricsTimestamper
 from accompanist.collection.schema import AlbumInfoFromUser, TrackUpdateRequest
 from accompanist.collection.service_genius import get_lyrics_from_genius
 from accompanist.config import settings
diff --git a/accompanist/collection/recognizer.py b/accompanist/collection/timestamper.py
similarity index 62%
rename from accompanist/collection/recognizer.py
rename to accompanist/collection/timestamper.py
index 26dc780..dff29e5 100644
--- a/accompanist/collection/recognizer.py
+++ b/accompanist/collection/timestamper.py
@@ -1,3 +1,4 @@
+import json
 from pathlib import Path
 from typing import List
 
@@ -8,26 +9,25 @@
     WhisperTokenizer,
     WhisperProcessor,
 )
+from accompanist.config import settings
+from openai import OpenAI
 
 
 class LyricsTimestamper:
     MODEL_TYPE = "small"
 
     def __init__(self) -> None:
-        # self.pipe = pipeline(
-        #     "automatic-speech-recognition",
-        #     model="openai/whisper-small",
-        #     chunk_length_s=30,
-        #     device_map="auto",
-        # )
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = WhisperForConditionalGeneration.from_pretrained(
             "openai/whisper-small",
         ).to(self.device)
         self.processor = WhisperProcessor.from_pretrained("openai/whisper-small")
         self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
+        self.openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)
 
     def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, str]]:
+        # TODO: refactor this long method
+        # TODO: move the call to this method into a Celery task
        # TODO: standardize lyrics preprocessing between frontend and backend
         lyrics_lines = [
             line.strip()
@@ -35,18 +35,9 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: st
             if line.strip() and not line.startswith("[")
         ]
 
-        # audio = whisper.load_audio(path_mp3)
-        # audio = whisper.pad_or_trim(audio)
-
-        # mel = whisper.log_mel_spectrogram(audio).to(self.model.device)
-
-        # _, probs = self.model.detect_language(mel)
-        # options = whisper.DecodingOptions(language='ru')
         sampling_rate = 16_000  # as Whisper requires?
         waveform, _ = librosa.load(vocals_mp3, sr=sampling_rate, mono=True)
 
-        # non_silent_intervals = librosa.effects.split(waveform, top_db=30)
         chunk_length_s = 5
         overlap = chunk_length_s / 2  # 50% overlap
         chunk_samples = int(chunk_length_s * sampling_rate)
@@ -83,34 +74,43 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: st
                 {"text": transcribed_text, "start_ts": start_ts, "end_ts": end_ts}
             )
 
-        # transcribed_lines = []
-        # for start_sample, end_sample in non_silent_intervals:
-        #     interval_audio = waveform[start_sample:end_sample]
-
-        #     prediction = self.pipe(
-        #         audio_dict,
-        #         # as the pipeline does not support `language` kwarg and fails if
-        #         # autodetection finds two different languages in one batch
-        #         batch_size=1,
-        #         return_timestamps=True,
-        #     )["chunks"]
-
-        print(f"{lyrics_lines=}")
-        print(f"{transcribed_lines=}")
-        prompt = """
-        You're a deaf person at a local karaoke bar. Your job is to make timestamps
-        for karaoke songs. For each song you get `lyrics_lines` an array of ground
-        truth lines and`transcribed_lines`, which is an array of speech-to-text
-        model. You need to output a `lyrics_karaoke` list, each object of which is a
-        dict with keys "line" and "end_ts". In other words, you need to find an end
-        of every line in the text.
+        system_prompt = """
+        You're a deaf person at a local karaoke bar. Your job is to make
+        timestamps for karaoke songs. For each song you get `lyrics_lines`, an
+        array of ground-truth lines, and `transcribed_lines`, an array of lines
+        produced by a speech-to-text model. You need to output a `lyrics_karaoke`
+        list, each object of which is a dict with keys "line" and "end_ts". In
+        other words, you need to find the end of every line in the text.
 
         `lyrics_karaoke` array must have exactly the same lines as `lyrics_lines`
         in exactly the same order. Values of `end_ts` must increase from one
         element to another, as they are lines of one song. Do not write any code,
         just output the `lyrics_karaoke` dict in a JSON format.
 
-        Try your best. Here is your input:
+        Try your best.
         """
-        lyrics_karaoke = "ChatGPT4(lyrics_lines, transcribed_lines, prompt)"
+        user_request = f"{lyrics_lines=}\n\n{transcribed_lines=}"
+        print(user_request)
+        completion = self.openai_client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {
+                    "role": "system",
+                    "content": system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_request,
+                },
+            ],
+        )
+        lyrics_karaoke = completion.choices[0].message.content
+        lyrics_karaoke = json.loads(
+            lyrics_karaoke.strip().replace("```json", "").replace("```", "")
+        )
+        try:
+            lyrics_karaoke = lyrics_karaoke["lyrics_karaoke"]
+        except Exception:
+            pass
+        print(lyrics_karaoke)
         return lyrics_karaoke
diff --git a/accompanist/config.py b/accompanist/config.py
index b706ac0..db27404 100644
--- a/accompanist/config.py
+++ b/accompanist/config.py
@@ -25,6 +25,7 @@ class Settings(BaseSettings):
 
     STORAGE_PATH: Path
     GENIUS_CLIENT_ACCESS_TOKEN: Optional[str]
+    OPENAI_API_KEY: Optional[str]
 
     @property
     def DATABASE_URL(self):
diff --git a/accompanist/tests/unit_tests/test_timestamper.py b/accompanist/tests/unit_tests/test_timestamper.py
index c5bec30..6c383ed 100644
--- a/accompanist/tests/unit_tests/test_timestamper.py
+++ b/accompanist/tests/unit_tests/test_timestamper.py
@@ -1,8 +1,11 @@
 from pathlib import Path
 
-from accompanist.collection.recognizer import LyricsTimestamper
+import pytest
+
+from accompanist.collection.timestamper import LyricsTimestamper
 
 
+@pytest.mark.skip(reason="It requires an OpenAI API key")
 def test_timestamper(test_track_data):
     timestamper = LyricsTimestamper()
     vocals_mp3_path = Path("accompanist/tests/melancholy_vocals.mp3")
@@ -12,4 +15,13 @@ def test_timestamper(test_track_data):
     lyrics_karaoke = timestamper.get_karaoke_lyrics(
         vocals_mp3_path, melancholy_track["lyrics"]
     )
-    # TODO: write asserts
+    assert lyrics_karaoke
+    # TODO: move preprocessing to a model property or somewhere shared
+    lyrics_lines_preprocessed = [
+        line.strip()
+        for line in melancholy_track["lyrics"].split("\n")
+        if line.strip() and not line.startswith("[")
+    ]
+    for original_line, karaoke_line in zip(lyrics_lines_preprocessed, lyrics_karaoke):
+        assert original_line == karaoke_line["line"]
+    # TODO: assert that the `end_ts` sequence is non-decreasing
diff --git a/poetry.lock b/poetry.lock
index 80add9d..316bf92 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -725,6 +725,18 @@ tqdm = "*"
 
 [package.extras]
(>=1.1)", "hydra-core (>=1.1)", "julius (>=0.2.3)", "lameenc (>=1.2)", "museval", "mypy", "openunmix", "pyyaml", "soundfile (>=0.10.3)", "submitit", "torch (>=1.8.1)", "torchaudio (>=0.8)", "tqdm", "treetable"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "dnspython" version = "2.6.1" @@ -1950,6 +1962,30 @@ files = [ antlr4-python3-runtime = ">=4.9.0,<4.10.0" PyYAML = ">=5.1.0" +[[package]] +name = "openai" +version = "1.25.0" +description = "The official Python library for the openai API" +category = "main" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.25.0-py3-none-any.whl", hash = "sha256:d0cfdf6afb31a5dabf3b95966cb31f3c757a0edaf3228715409cb404b9933de0"}, + {file = "openai-1.25.0.tar.gz", hash = "sha256:22c35b26b8281cd2759b1a4c05ac99e2f2b26a9df71f90a0b4ddb75aa27adc81"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "openai-whisper" version = "20231117" @@ -4093,4 +4129,4 @@ requests = ">=2.22" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2a5d1f543b2c6fed9daefcac5e57a62815336f88a340a16a07864b7ebb1b1468" +content-hash = "162c54ca166ad802b5f3d6f74bb2e845832bb1d48d3d7ceb15e404795270e202" diff --git a/pyproject.toml b/pyproject.toml index 08891dc..ef6a7db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ openai-whisper = "^20231117" transformers = "^4.40.1" librosa = "^0.10.1" accelerate = "^0.29.3" +openai = "^1.25.0" [tool.poetry.group.dev.dependencies] ruff = "^0.3.2"