追加：ソングでノートごとに ID を持たせ、音素と対応づける (#1460)

* stash * 歌声合成系のe2eテストを追加 * ノートごとにIDを持たせ、音素と対応づける * ID付与したスコアでのテストを追加
VOICEVOX · Aug 24, 2024 · 378b51e · 378b51e
1 parent 3c247dc
commit 378b51e
Show file tree

Hide file tree

Showing 7 changed files with 196 additions and 2 deletions.
diff --git a/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json b/test/e2e/__snapshots__/test_openapi/test_OpenAPIの形が変わっていないことを確認.json
diff --git a/...eline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json b/...eline/__snapshots__/test_sing_frame_audio_query/test_post_sing_frame_audio_query_200.json
diff --git a/...e/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json b/...e/__snapshots__/test_sing_frame_audio_query/test_post_sing_old_frame_audio_query_200.json
diff --git a/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py b/test/e2e/single_api/tts_pipeline/test_sing_frame_audio_query.py
@@ -11,6 +11,24 @@
 def test_post_sing_frame_audio_query_200(
     client: TestClient, snapshot_json: SnapshotAssertion
 ) -> None:
+    score = {
+        "notes": [
+            {"id": "a", "key": None, "frame_length": 10, "lyric": ""},
+            {"id": "b", "key": 30, "frame_length": 3, "lyric": "て"},
+            {"id": "c", "key": 30, "frame_length": 3, "lyric": "す"},
+            {"id": "d", "key": 40, "frame_length": 1, "lyric": "と"},
+            {"id": "e", "key": None, "frame_length": 10, "lyric": ""},
+        ]
+    }
+    response = client.post("/sing_frame_audio_query", params={"speaker": 0}, json=score)
+    assert response.status_code == 200
+    assert snapshot_json == round_floats(response.json(), 2)
+
+
+def test_post_sing_old_frame_audio_query_200(
+    client: TestClient, snapshot_json: SnapshotAssertion
+) -> None:
+    """古いバージョンの楽譜でもエラーなく合成できる"""
     score = {
         "notes": [
             {"key": None, "frame_length": 10, "lyric": ""},

diff --git a/...e/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json b/...e/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_from_score_output[query].json
diff --git a/voicevox_engine/tts_pipeline/model.py b/voicevox_engine/tts_pipeline/model.py
@@ -5,10 +5,13 @@
 """
 
 from enum import Enum
+from typing import NewType
 
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.json_schema import SkipJsonSchema
 
+NoteId = NewType("NoteId", str)
+
 
 class Mora(BaseModel):
     """
@@ -63,6 +66,7 @@ class Note(BaseModel):
     音符ごとの情報
     """
 
+    id: NoteId | None = Field(default=None, description="ID")
     key: int | SkipJsonSchema[None] = Field(default=None, description="音階")
     frame_length: int = Field(description="音符のフレーム長")
     lyric: str = Field(description="音符の歌詞")
@@ -83,6 +87,7 @@ class FramePhoneme(BaseModel):
 
     phoneme: str = Field(description="音素")
     frame_length: int = Field(description="音素のフレーム長")
+    note_id: NoteId | None = Field(default=None, description="音符のID")
 
 
 class FrameAudioQuery(BaseModel):