Skip to content

Commit

Permalink
Ask OpenAI to process recognized speech to form lyrics_karaoke
Browse files Browse the repository at this point in the history
It works, but you can't use it yet if you're not running backend
locally. Also, refactoring is needed in three places.
  • Loading branch information
m-danya committed May 1, 2024
1 parent 4039141 commit 8af35ca
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 43 deletions.
4 changes: 3 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
DEPLOYMENT_HOST=127.0.0.1


MODE=DEV

DB_HOST=127.0.0.1
Expand All @@ -23,4 +24,5 @@ DOCKER_FRONTEND_PORT=80
CELERY_CONCURRENCY=1
STORAGE_PATH=./storage-volume

GENIUS_CLIENT_ACCESS_TOKEN=...
GENIUS_CLIENT_ACCESS_TOKEN=...
OPENAI_API_KEY=...
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```

If you don't have a GPU, you'll need to comment
[these](https://github.com/m-danya/accompanist/blob/2dd6a2b4e5a655a26f32ae536ae35a80ce0dbd9d/compose.yaml#L36C1-L42C36)
lines in `compose.yaml`.

How to run the system:

```
Expand All @@ -44,6 +48,8 @@ obtain _Client Access Token_. You can enter any app name and use any "App Website
URL". After you get the token, place it into your `.env` file (corresponding
variable is `GENIUS_CLIENT_ACCESS_TOKEN`).

You can set `OPENAI_API_KEY`, or leave this key unfilled.

Let's continue:

```
Expand Down Expand Up @@ -119,6 +125,9 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
#### Possible todos

- [All the "TODO"s in the code]
- Storage management: delete (or even manage) stored mp3s and jpgs
- Finish timestamper (see #4)
- Autofetch albums/tracks in components every N seconds at frontend?
- DragAndDrop albums at the main page
- Fix admin panel being available only at ${FASTAPI_PORT} but not at ${NGINX_PORT}
- Tune Genius search query (e.g. remove the parentheses in track title)
Expand All @@ -138,6 +147,7 @@ the system at `${DEPLOYMENT_HOST}:${DOCKER_FRONTEND_PORT}`.
- Run `alembic upgrade head` at launching?
- Publish docker image(s) to Docker Hub
- Different gradient backgrounds for different songs (random + choosing)
- Recognize notes?
- Frontend localization
- Add Telegram bot wrapper for the backend
- Add mypy (+ CI)
Expand Down
2 changes: 1 addition & 1 deletion accompanist/collection/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from accompanist.celery.tasks import process_album_task
from accompanist.collection.dao import TrackDAO
from accompanist.collection.recognizer import LyricsTimestamper
from accompanist.collection.timestamper import LyricsTimestamper
from accompanist.collection.schema import AlbumInfoFromUser, TrackUpdateRequest
from accompanist.collection.service_genius import get_lyrics_from_genius
from accompanist.config import settings
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path
from typing import List

Expand All @@ -8,45 +9,35 @@
WhisperTokenizer,
WhisperProcessor,
)
from accompanist.config import settings
from openai import OpenAI


class LyricsTimestamper:
MODEL_TYPE = "small"

def __init__(self) -> None:
    """Set up the Whisper ASR stack and an OpenAI API client.

    Runs the model on CUDA when a GPU is available, otherwise on CPU.
    The OpenAI client is configured from `settings.OPENAI_API_KEY`.
    """
    checkpoint = "openai/whisper-small"
    # Prefer GPU inference when available; transcription on CPU is slow.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to(
        self.device
    )
    self.processor = WhisperProcessor.from_pretrained(checkpoint)
    self.tokenizer = WhisperTokenizer.from_pretrained(checkpoint)
    self.openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)

def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, str]]:
# TODO: refactor this long method
# TODO: move calling this method to celery task
# TODO: standardize lyrics preprocessing between frontend and backend
lyrics_lines = [
line.strip()
for line in lyrics.split("\n")
if line.strip() and not line.startswith("[")
]

# audio = whisper.load_audio(path_mp3)
# audio = whisper.pad_or_trim(audio)

# mel = whisper.log_mel_spectrogram(audio).to(self.model.device)

# _, probs = self.model.detect_language(mel)
# options = whisper.DecodingOptions(language='ru')
sampling_rate = 16_000 # as Whisper requires?
waveform, _ = librosa.load(vocals_mp3, sr=sampling_rate, mono=True)

# non_silent_intervals = librosa.effects.split(waveform, top_db=30)

chunk_length_s = 5
overlap = chunk_length_s / 2 # 50% overlap
chunk_samples = int(chunk_length_s * sampling_rate)
Expand Down Expand Up @@ -83,34 +74,43 @@ def get_karaoke_lyrics(self, vocals_mp3: Path, lyrics: str) -> List[dict[str, st
{"text": transcribed_text, "start_ts": start_ts, "end_ts": end_ts}
)

# transcribed_lines = []
# for start_sample, end_sample in non_silent_intervals:
# interval_audio = waveform[start_sample:end_sample]

# prediction = self.pipe(
# audio_dict,
# # as the pipeline does not support `language` kwarg and fails if
# # autodetection finds two different languages in one batch
# batch_size=1,
# return_timestamps=True,
# )["chunks"]

print(f"{lyrics_lines=}")
print(f"{transcribed_lines=}")
prompt = """
You're a deaf person at a local karaoke bar. Your job is to make timestamps
for karaoke songs. For each song you get `lyrics_lines` an array of ground
truth lines and`transcribed_lines`, which is an array of speech-to-text
model. You need to output a `lyrics_karaoke` list, each object of which is a
dict with keys "line" and "end_ts". In other words, you need to find an end
of every line in the text.
system_prompt = """
You're a deaf person at a local karaoke bar. Your job is to make
timestamps for karaoke songs. For each song you get `lyrics_lines` an
array of ground truth lines and`transcribed_lines`, which is an array of
speech-to-text model. You need to output a `lyrics_karaoke` list, each
object of which is a dict with keys "line" and "end_ts". In other words,
you need to find an end of every line in the text.
`lyrics_karaoke` array must have exactly the same lines as `lyrics_lines` in
exactly the same order. Values of `end_ts` must increase from one element to
another, as they are lines of one song.
Do not write any code, just output the `lyrics_karaoke` dict in a JSON format.
Try your best. Here is your input:
Try your best.
"""
lyrics_karaoke = "ChatGPT4(lyrics_lines, transcribed_lines, prompt)"
user_request = f"{lyrics_lines=}\n\n{transcribed_lines=}"
print(user_request)
completion = self.openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": system_prompt,
},
{
"role": "user",
"content": user_request,
},
],
)
lyrics_karaoke = completion.choices[0].message.content
lyrics_karaoke = json.loads(
lyrics_karaoke.strip().replace("```json", "").replace("```", "")
)
try:
lyrics_karaoke = lyrics_karaoke["lyrics_karaoke"]
except Exception:
pass
print(lyrics_karaoke)
return lyrics_karaoke
1 change: 1 addition & 0 deletions accompanist/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class Settings(BaseSettings):
STORAGE_PATH: Path

GENIUS_CLIENT_ACCESS_TOKEN: Optional[str]
OPENAI_API_KEY: Optional[str]

@property
def DATABASE_URL(self):
Expand Down
16 changes: 14 additions & 2 deletions accompanist/tests/unit_tests/test_timestamper.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from pathlib import Path

from accompanist.collection.recognizer import LyricsTimestamper
import pytest

from accompanist.collection.timestamper import LyricsTimestamper


@pytest.mark.skip(reason="It requires an OpenAI API key")
def test_timestamper(test_track_data):
timestamper = LyricsTimestamper()
vocals_mp3_path = Path("accompanist/tests/melancholy_vocals.mp3")
Expand All @@ -12,4 +15,13 @@ def test_timestamper(test_track_data):
lyrics_karaoke = timestamper.get_karaoke_lyrics(
vocals_mp3_path, melancholy_track["lyrics"]
)
# TODO: write asserts
assert lyrics_karaoke
# TODO: move preprocessing to model's property or somewhere else
lyrics_lines_preprocessed = [
line.strip()
for line in melancholy_track["lyrics"].split("\n")
if line.strip() and not line.startswith("[")
]
for original_line, karaoke_line in zip(lyrics_lines_preprocessed, lyrics_karaoke):
assert original_line == karaoke_line["line"]
# TODO: assert non-descending of `end_ts` sequence
38 changes: 37 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ openai-whisper = "^20231117"
transformers = "^4.40.1"
librosa = "^0.10.1"
accelerate = "^0.29.3"
openai = "^1.25.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.3.2"
Expand Down

0 comments on commit 8af35ca

Please sign in to comment.