From d9c94f803a5ca7d0bbcbf19823ecc285daf38c7d Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 23 Nov 2024 22:16:49 +0900 Subject: [PATCH 1/8] =?UTF-8?q?=E3=81=A8=E3=82=8A=E3=81=82=E3=81=88?= =?UTF-8?q?=E3=81=9AengineMock=E3=81=A8=E3=81=9D=E3=81=AE=E3=83=86?= =?UTF-8?q?=E3=82=B9=E3=83=88=E3=81=A0=E3=81=91=E5=AE=9F=E8=A3=85=E3=81=99?= =?UTF-8?q?=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package-lock.json | 52 ++++ package.json | 2 + src/{sing => helpers}/convertToWavFileData.ts | 11 +- src/mock/engineMock/README.md | 39 +++ .../mock/engineMock}/assets/icon_1.png | Bin .../mock/engineMock}/assets/icon_2.png | Bin .../mock/engineMock}/assets/icon_3.png | Bin .../mock/engineMock}/assets/icon_4.png | Bin .../mock/engineMock}/assets/portrait_1.png | Bin .../mock/engineMock}/assets/portrait_2.png | Bin .../mock/engineMock}/assets/portrait_3.png | Bin .../mock/engineMock}/assets/portrait_4.png | Bin src/mock/engineMock/audioQueryMock.ts | 195 ++++++++++++++ src/mock/engineMock/characterResourceMock.ts | 139 ++++++++++ src/mock/engineMock/index.ts | 213 +++++++++++++++ src/mock/engineMock/manifestMock.ts | 34 +++ src/mock/engineMock/phonemeMock.ts | 168 ++++++++++++ src/mock/engineMock/singModelMock.ts | 169 ++++++++++++ src/mock/engineMock/synthesisMock.ts | 254 ++++++++++++++++++ src/mock/engineMock/talkModelMock.ts | 239 ++++++++++++++++ src/store/singing.ts | 2 +- .../__snapshots__/index.spec.ts.snap | 176 ++++++++++++ tests/unit/mock/engineMock/index.spec.ts | 68 +++++ tests/unit/utils.ts | 11 +- 24 files changed, 1768 insertions(+), 4 deletions(-) rename src/{sing => helpers}/convertToWavFileData.ts (87%) create mode 100644 src/mock/engineMock/README.md rename {tests/e2e/browser => src/mock/engineMock}/assets/icon_1.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/icon_2.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/icon_3.png (100%) rename {tests/e2e/browser => 
src/mock/engineMock}/assets/icon_4.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/portrait_1.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/portrait_2.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/portrait_3.png (100%) rename {tests/e2e/browser => src/mock/engineMock}/assets/portrait_4.png (100%) create mode 100644 src/mock/engineMock/audioQueryMock.ts create mode 100644 src/mock/engineMock/characterResourceMock.ts create mode 100644 src/mock/engineMock/index.ts create mode 100644 src/mock/engineMock/manifestMock.ts create mode 100644 src/mock/engineMock/phonemeMock.ts create mode 100644 src/mock/engineMock/singModelMock.ts create mode 100644 src/mock/engineMock/synthesisMock.ts create mode 100644 src/mock/engineMock/talkModelMock.ts create mode 100644 tests/unit/mock/engineMock/__snapshots__/index.spec.ts.snap create mode 100644 tests/unit/mock/engineMock/index.spec.ts diff --git a/package-lock.json b/package-lock.json index 9db0bb586e..caf3005850 100644 --- a/package-lock.json +++ b/package-lock.json @@ -56,6 +56,7 @@ "@types/async-lock": "1.4.0", "@types/encoding-japanese": "1.0.18", "@types/glob": "8.0.0", + "@types/kuromoji": "0.1.3", "@types/markdown-it": "12.2.0", "@types/multistream": "4.1.0", "@types/semver": "7.3.9", @@ -85,6 +86,7 @@ "eslint-plugin-storybook": "0.8.0", "eslint-plugin-vue": "9.26.0", "happy-dom": "15.11.6", + "kuromoji": "0.1.2", "license-checker-rseidelsohn": "4.3.0", "markdownlint-cli": "0.37.0", "node-fetch": "2.7.0", @@ -3326,6 +3328,12 @@ "@types/ms": "*" } }, + "node_modules/@types/doublearray": { + "version": "0.0.32", + "resolved": "https://registry.npmjs.org/@types/doublearray/-/doublearray-0.0.32.tgz", + "integrity": "sha512-HloTru3I3a55runIVqZX1YBQi2L5A4peNQPh33yshzB4ttt1qHCnHPkuhy9Djy/cTx7i5xJvxItKRPCmvnfpGw==", + "dev": true + }, "node_modules/@types/earcut": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/@types/earcut/-/earcut-2.1.4.tgz", @@ 
-3398,6 +3406,15 @@ "@types/node": "*" } }, + "node_modules/@types/kuromoji": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@types/kuromoji/-/kuromoji-0.1.3.tgz", + "integrity": "sha512-u+YwX6eJj6Fmm0F5qunsyA+X8HSiyRNNE5ON3itD3tERax4meq9tv+S7bjTMXkPjqbdBGUmH2maGDCuEvpODwg==", + "dev": true, + "dependencies": { + "@types/doublearray": "*" + } + }, "node_modules/@types/linkify-it": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/@types/linkify-it/-/linkify-it-5.0.0.tgz", @@ -6722,6 +6739,12 @@ "url": "https://dotenvx.com" } }, + "node_modules/doublearray": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz", + "integrity": "sha512-aw55FtZzT6AmiamEj2kvmR6BuFqvYgKZUkfQ7teqVRNqD5UE0rw8IeW/3gieHNKQ5sPuDKlljWEn4bzv5+1bHw==", + "dev": true + }, "node_modules/earcut": { "version": "2.2.4", "resolved": "https://registry.npmjs.org/earcut/-/earcut-2.2.4.tgz", @@ -10041,6 +10064,26 @@ "node": ">=6" } }, + "node_modules/kuromoji": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz", + "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==", + "dev": true, + "dependencies": { + "async": "^2.0.1", + "doublearray": "0.0.2", + "zlibjs": "^0.3.1" + } + }, + "node_modules/kuromoji/node_modules/async": { + "version": "2.6.4", + "resolved": "https://registry.npmjs.org/async/-/async-2.6.4.tgz", + "integrity": "sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==", + "dev": true, + "dependencies": { + "lodash": "^4.17.14" + } + }, "node_modules/lazy-val": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/lazy-val/-/lazy-val-1.0.5.tgz", @@ -16724,6 +16767,15 @@ "node": "*" } }, + "node_modules/zlibjs": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", + "integrity": 
"sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==", + "dev": true, + "engines": { + "node": "*" + } + }, "node_modules/zod": { "version": "3.22.4", "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz", diff --git a/package.json b/package.json index bf3c8d9476..32b003e17b 100644 --- a/package.json +++ b/package.json @@ -93,6 +93,7 @@ "@types/async-lock": "1.4.0", "@types/encoding-japanese": "1.0.18", "@types/glob": "8.0.0", + "@types/kuromoji": "0.1.3", "@types/markdown-it": "12.2.0", "@types/multistream": "4.1.0", "@types/semver": "7.3.9", @@ -122,6 +123,7 @@ "eslint-plugin-storybook": "0.8.0", "eslint-plugin-vue": "9.26.0", "happy-dom": "15.11.6", + "kuromoji": "0.1.2", "license-checker-rseidelsohn": "4.3.0", "markdownlint-cli": "0.37.0", "node-fetch": "2.7.0", diff --git a/src/sing/convertToWavFileData.ts b/src/helpers/convertToWavFileData.ts similarity index 87% rename from src/sing/convertToWavFileData.ts rename to src/helpers/convertToWavFileData.ts index 1ddfb637e4..8c2b8ef6e0 100644 --- a/src/sing/convertToWavFileData.ts +++ b/src/helpers/convertToWavFileData.ts @@ -1,4 +1,13 @@ -export const convertToWavFileData = (audioBuffer: AudioBuffer) => { +export const convertToWavFileData = ( + audioBuffer: + | AudioBuffer + | { + sampleRate: number; + length: number; + numberOfChannels: number; + getChannelData(channel: number): Float32Array; + }, +) => { const bytesPerSample = 4; // Float32 const formatCode = 3; // WAVE_FORMAT_IEEE_FLOAT diff --git a/src/mock/engineMock/README.md b/src/mock/engineMock/README.md new file mode 100644 index 0000000000..71a39a2a78 --- /dev/null +++ b/src/mock/engineMock/README.md @@ -0,0 +1,39 @@ +# エンジンモックのドキュメント + +## 概要 + +通信を介さずに音声合成できるエンジンのモックです。 + +同じ入力には同じ出力を返し、別の入力には別の出力を返すようになっています。 +また出力を見たときにUIや処理の実装の異常に気付けるように、ある程度直感に合う出力を返すよう努力されています。 + +例:音量を下げると音声が小さくなる、音程と周波数が一致する、など。 + +モックの実装は気軽に破壊的変更しても問題ありません。 + +## ビルド戦略 + +ブラウザ版でも使えるようにすべく、ソフトウェアにも組み込める形で実装されています。 
+ビルド時のモックエンジンの取り扱いポリシーはこんな感じです。 + +- 重い処理が一切実行されないようにする + - 辞書の初期化、画像の読み込みなど +- なるべく重いファイルはビルドに含まれないようにする + - 形態素解析の辞書ファイルやダミー画像など + +## ファイル構成 + +- `talkModelMock.ts` + - トーク用の音声クエリを作るまでの処理周り +- `singModelMock.ts` + - ソング用の音声クエリを作るまでの処理周り +- `audioQueryMock.ts` + - 音声クエリ周り +- `synthesisMock.ts` + - 音声波形の合成周り +- `characterResourceMock.ts` + - キャラ名や画像などのリソース周り +- `phonemeMock.ts` + - 音素周り +- `manifestMock.ts` + - エンジンのマニフェスト周り diff --git a/tests/e2e/browser/assets/icon_1.png b/src/mock/engineMock/assets/icon_1.png similarity index 100% rename from tests/e2e/browser/assets/icon_1.png rename to src/mock/engineMock/assets/icon_1.png diff --git a/tests/e2e/browser/assets/icon_2.png b/src/mock/engineMock/assets/icon_2.png similarity index 100% rename from tests/e2e/browser/assets/icon_2.png rename to src/mock/engineMock/assets/icon_2.png diff --git a/tests/e2e/browser/assets/icon_3.png b/src/mock/engineMock/assets/icon_3.png similarity index 100% rename from tests/e2e/browser/assets/icon_3.png rename to src/mock/engineMock/assets/icon_3.png diff --git a/tests/e2e/browser/assets/icon_4.png b/src/mock/engineMock/assets/icon_4.png similarity index 100% rename from tests/e2e/browser/assets/icon_4.png rename to src/mock/engineMock/assets/icon_4.png diff --git a/tests/e2e/browser/assets/portrait_1.png b/src/mock/engineMock/assets/portrait_1.png similarity index 100% rename from tests/e2e/browser/assets/portrait_1.png rename to src/mock/engineMock/assets/portrait_1.png diff --git a/tests/e2e/browser/assets/portrait_2.png b/src/mock/engineMock/assets/portrait_2.png similarity index 100% rename from tests/e2e/browser/assets/portrait_2.png rename to src/mock/engineMock/assets/portrait_2.png diff --git a/tests/e2e/browser/assets/portrait_3.png b/src/mock/engineMock/assets/portrait_3.png similarity index 100% rename from tests/e2e/browser/assets/portrait_3.png rename to src/mock/engineMock/assets/portrait_3.png diff --git a/tests/e2e/browser/assets/portrait_4.png 
b/src/mock/engineMock/assets/portrait_4.png similarity index 100% rename from tests/e2e/browser/assets/portrait_4.png rename to src/mock/engineMock/assets/portrait_4.png diff --git a/src/mock/engineMock/audioQueryMock.ts b/src/mock/engineMock/audioQueryMock.ts new file mode 100644 index 0000000000..8a4c4dda33 --- /dev/null +++ b/src/mock/engineMock/audioQueryMock.ts @@ -0,0 +1,195 @@ +/** + * Mocks for AudioQuery and FrameAudioQuery. + * Almost the same as the processing in the VOICEVOX ENGINE repository. + */ + +import { AccentPhrase, AudioQuery, FrameAudioQuery, Mora } from "@/openapi"; + +function generateSilenceMora(length: number): Mora { + return { + text: " ", + vowel: "sil", + vowelLength: length, + pitch: 0.0, + }; +} + +function toFlattenMoras(accentPhrases: AccentPhrase[]): Mora[] { + let moras: Mora[] = []; + accentPhrases.forEach((accentPhrase) => { + moras = moras.concat(accentPhrase.moras); + if (accentPhrase.pauseMora) { + moras.push(accentPhrase.pauseMora); + } + }); + return moras; +} + +function toFlattenPhonemes(moras: Mora[]): string[] { + const phonemes: string[] = []; + for (const mora of moras) { + if (mora.consonant) { + phonemes.push(mora.consonant); + } + phonemes.push(mora.vowel); + } + return phonemes; +} + +/** Add one silent mora before and one after the moras, with lengths taken from prePhonemeLength and postPhonemeLength. */ +function applyPrePostSilence(moras: Mora[], query: AudioQuery): Mora[] { + const preSilenceMoras = [generateSilenceMora(query.prePhonemeLength)]; + const postSilenceMoras = [generateSilenceMora(query.postPhonemeLength)]; + return preSilenceMoras.concat(moras).concat(postSilenceMoras); +} + +/** Overwrite the length of every pau mora with query.pauseLength when it is set (mutates the moras in place). */ +function applyPauseLength(moras: Mora[], query: AudioQuery): Mora[] { + if (query.pauseLength != undefined) { + for (const mora of moras) { + if (mora.vowel == "pau") { + mora.vowelLength = query.pauseLength; + } + } + } + return moras; +} + +/** Multiply the length of every pau mora by query.pauseLengthScale when it is set (mutates the moras in place). */ +function applyPauseLengthScale(moras: Mora[], query: AudioQuery): Mora[] { + if (query.pauseLengthScale != undefined) { + for (const mora of moras) { + if (mora.vowel == "pau") { +
mora.vowelLength *= query.pauseLengthScale; + } + } + } + return moras; +} + +/** Apply the speed scale: divide the vowel and consonant lengths by query.speedScale (mutates the moras in place). */ +function applySpeedScale(moras: Mora[], query: AudioQuery): Mora[] { + for (const mora of moras) { + mora.vowelLength /= query.speedScale; + if (mora.consonantLength) { + mora.consonantLength /= query.speedScale; + } + } + return moras; +} + +/** Apply the pitch scale: multiply each mora pitch by 2 to the power of query.pitchScale (mutates the moras in place). */ +function applyPitchScale(moras: Mora[], query: AudioQuery): Mora[] { + for (const mora of moras) { + mora.pitch *= 2 ** query.pitchScale; + } + return moras; +} + +/** Apply the intonation scale: scale the distance of each voiced mora pitch (pitch above 0) from the mean voiced pitch by query.intonationScale (mutates the moras in place). */ +function applyIntonationScale(moras: Mora[], query: AudioQuery): Mora[] { + const voiced = moras.filter((mora) => mora.pitch > 0); + if (voiced.length == 0) { + return moras; + } + + const meanF0 = + voiced.reduce((sum, mora) => sum + mora.pitch, 0) / voiced.length; + for (const mora of voiced) { + mora.pitch = (mora.pitch - meanF0) * query.intonationScale + meanF0; + } + return moras; +} + +/** Append a higher-pitched mora to the end of each interrogative accent phrase whose last mora is voiced (pushes onto accentPhrase.moras in place). */ +function applyInterrogativeUpspeak(accentPhrases: Array) { + accentPhrases.forEach((accentPhrase) => { + const moras = accentPhrase.moras; + if ( + moras.length > 0 && + accentPhrase.isInterrogative && + moras[moras.length - 1].pitch > 0 + ) { + const lastMora = moras[moras.length - 1]; + const upspeakMora: Mora = { + text: "ー", + vowel: lastMora.vowel, + vowelLength: 0.15, + pitch: lastMora.pitch + 0.3, + }; + accentPhrase.moras.push(upspeakMora); + } + }); +} + +function secondToFrame(second: number): number { + const FRAME_RATE = 24000 / 256; + return Math.round(second * FRAME_RATE); +} + +/** Count the number of frames of each phoneme and of each mora. */ +function countFramePerUnit(moras: Mora[]): { + framePerPhoneme: number[]; + framePerMora: number[]; +} { + const framePerPhoneme: number[] = []; + const framePerMora: number[] = []; + + for (const mora of moras) { + const vowelFrames = secondToFrame(mora.vowelLength); + const consonantFrames = mora.consonantLength + ?
secondToFrame(mora.consonantLength) + : 0; + const moraFrames = vowelFrames + consonantFrames; + + if (mora.consonant) { + framePerPhoneme.push(consonantFrames); + } + framePerPhoneme.push(vowelFrames); + framePerMora.push(moraFrames); + } + + return { framePerPhoneme, framePerMora }; +} + +/** Roughly convert an AudioQuery into a FrameAudioQuery. NOTE(review): this works directly on audioQuery.accentPhrases, so mora lengths and pitches of the passed query are modified and an upspeak mora may be appended; confirm that callers pass a copy. */ +export function audioQueryToFrameAudioQueryMock( + audioQuery: AudioQuery, + { enableInterrogativeUpspeak }: { enableInterrogativeUpspeak: boolean }, +): FrameAudioQuery { + const accentPhrases = audioQuery.accentPhrases; + + if (enableInterrogativeUpspeak) { + applyInterrogativeUpspeak(accentPhrases); + } + + let moras = toFlattenMoras(accentPhrases); + moras = applyPrePostSilence(moras, audioQuery); + moras = applyPauseLength(moras, audioQuery); + moras = applyPauseLengthScale(moras, audioQuery); + moras = applySpeedScale(moras, audioQuery); + moras = applyPitchScale(moras, audioQuery); + moras = applyIntonationScale(moras, audioQuery); + + const { framePerPhoneme, framePerMora } = countFramePerUnit(moras); + + const f0 = moras.flatMap((mora, i) => + Array(framePerMora[i]).fill( + mora.pitch == 0 ?
0 : Math.exp(mora.pitch), + ), + ); + const volume = Array(f0.length).fill(audioQuery.volumeScale); + const phonemes = toFlattenPhonemes(moras).map((phoneme, i) => ({ + phoneme, + frameLength: framePerPhoneme[i], + })); + + return { + f0, + volume, + phonemes, + volumeScale: audioQuery.volumeScale, + outputSamplingRate: audioQuery.outputSamplingRate, + outputStereo: audioQuery.outputStereo, + }; +} diff --git a/src/mock/engineMock/characterResourceMock.ts b/src/mock/engineMock/characterResourceMock.ts new file mode 100644 index 0000000000..400fcd67de --- /dev/null +++ b/src/mock/engineMock/characterResourceMock.ts @@ -0,0 +1,139 @@ +/** + * Mock that builds character information. + * Loosely aligned with the mock implementation in the VOICEVOX ENGINE repository. + */ + +import { Speaker, SpeakerInfo } from "@/openapi"; + +/** Get the portrait URL for the character at characterIndex (an index into the globbed portrait images). */ +async function getPortraitUrl(characterIndex: number) { + const portraits = Object.values( + import.meta.glob<{ default: string }>("./assets/portrait_*.png"), + ); + return (await portraits[characterIndex]()).default; +} + +/** Get the icon URL for the character at characterIndex (an index into the globbed icon images). */ +async function getIconUrl(characterIndex: number) { + const icons = Object.values( + import.meta.glob<{ default: string }>("./assets/icon_*.png"), + ); + return (await icons[characterIndex]()).default; +} + +const baseCharactersMock = [ + // Two talk styles and two humming styles + { + name: "dummy1", + styles: [ + { name: "style0", id: 0 }, + { name: "style1", id: 2 }, + { name: "style2", id: 4, type: "frame_decode" }, + { name: "style3", id: 6, type: "frame_decode" }, + ], + speakerUuid: "7ffcb7ce-00ec-4bdc-82cd-45a8889e43ff", + version: "mock", + }, + // Two talk styles, one humming style and one song style + { + name: "dummy2", + styles: [ + { name: "style0", id: 1 }, + { name: "style1", id: 3 }, + { name: "style2", id: 5, type: "frame_decode" }, + { name: "style3", id: 7, type: "sing" }, + ], + speakerUuid: "388f246b-8c41-4ac1-8e2d-5d79f3ff56d9", + version: "mock", + }, + // One talk style + { + name: "dummy3", + styles: [{ name: "style0", id: 8, type: "talk" }], + speakerUuid: "35b2c544-660e-401e-b503-0e14c635303a",
+ version: "mock", + }, + // One song style + { + name: "dummy4", + styles: [{ name: "style0", id: 9, type: "sing" }], + speakerUuid: "b1a81618-b27b-40d2-b0ea-27a9ad408c4b", + version: "mock", + }, +] satisfies Speaker[]; + +/** Mock that returns all characters. */ +export function getCharactersMock(): Speaker[] { + return baseCharactersMock; +} + +/** Mock that returns the characters that can talk. */ +export function getSpeakersMock(): Speaker[] { + return ( + baseCharactersMock + // Keep only talk styles (type is undefined or talk) + .map((character) => ({ + ...character, + styles: character.styles.filter( + (style) => style.type == undefined || style.type == "talk", + ), + })) + // Drop characters that have no styles left + .filter((character) => character.styles.length > 0) + ); +} + +/** Mock that returns the characters that can sing. */ +export function getSingersMock(): Speaker[] { + return ( + baseCharactersMock + // Keep only song styles (type is frame_decode or sing) + .map((character) => ({ + ...character, + styles: character.styles.filter( + (style) => style.type == "frame_decode" || style.type == "sing", + ), + })) + // Drop characters that have no styles left + .filter((character) => character.styles.length > 0) + ); +} + +/** Mock that returns the additional info of a character. Throws an Error when no character has the given speakerUuid. */ +export async function getCharacterInfoMock( + speakerUuid: string, +): Promise { + // NOTE: the index is needed to look up the image URLs + const characterIndex = baseCharactersMock.findIndex( + (speaker) => speaker.speakerUuid === speakerUuid, + ); + if (characterIndex === -1) { + throw new Error(`Character not found: ${speakerUuid}`); + } + + const styleIds = baseCharactersMock[characterIndex].styles.map( + (style) => style.id, + ); + + return { + policy: `Dummy policy for ${speakerUuid}`, + portrait: await getPortraitUrl(characterIndex), + styleInfos: await Promise.all( + styleIds.map(async (id) => ({ + id, + icon: await getIconUrl(characterIndex), + voiceSamples: [], + })), + ), + }; +} + +/** + * Mock that returns the additional info of a character that can talk. + * Ideally this should be filtered down to talk styles only. + */ +export async function getSpeakerInfoMock( + speakerUuid: string, +): Promise { + return getCharacterInfoMock(speakerUuid); +} diff --git a/src/mock/engineMock/index.ts
b/src/mock/engineMock/index.ts new file mode 100644 index 0000000000..c3318aa2f4 --- /dev/null +++ b/src/mock/engineMock/index.ts @@ -0,0 +1,213 @@ +import { audioQueryToFrameAudioQueryMock } from "./audioQueryMock"; +import { getEngineManifestMock } from "./manifestMock"; +import { + getSingersMock, + getSpeakerInfoMock, + getSpeakersMock, +} from "./characterResourceMock"; +import { synthesisFrameAudioQueryMock } from "./synthesisMock"; +import { + replaceLengthMock, + replacePitchMock, + textToActtentPhrasesMock, +} from "./talkModelMock"; +import { + notesAndFramePhonemesAndPitchToVolumeMock, + notesAndFramePhonemesToPitchMock, + notesToFramePhonemesMock, +} from "./singModelMock"; + +import { cloneWithUnwrapProxy } from "@/helpers/cloneWithUnwrapProxy"; +import { + AccentPhrase, + AccentPhrasesAccentPhrasesPostRequest, + AudioQuery, + AudioQueryAudioQueryPostRequest, + DefaultApiInterface, + EngineManifest, + FrameAudioQuery, + FrameSynthesisFrameSynthesisPostRequest, + MoraDataMoraDataPostRequest, + SingerInfoSingerInfoGetRequest, + SingFrameAudioQuerySingFrameAudioQueryPostRequest, + SingFrameVolumeSingFrameVolumePostRequest, + Speaker, + SpeakerInfo, + SpeakerInfoSpeakerInfoGetRequest, + SupportedDevicesInfo, + SynthesisSynthesisPostRequest, + UserDictWord, +} from "@/openapi"; + +/** + * Mock of the function set of the engine OpenAPI. + * Some functions are not implemented; the partial object is cast to DefaultApiInterface on return. + */ +export function createOpenAPIEngineMock(): DefaultApiInterface { + const mockApi: Partial = { + async versionVersionGet(): Promise { + return "mock"; + }, + + async engineManifestEngineManifestGet(): Promise { + return getEngineManifestMock(); + }, + + async supportedDevicesSupportedDevicesGet(): Promise { + return { cpu: true, cuda: false, dml: false }; + }, + + async isInitializedSpeakerIsInitializedSpeakerGet(): Promise { + return true; + }, + + async initializeSpeakerInitializeSpeakerPost(): Promise { + return; + }, + + async speakersSpeakersGet(): Promise { + return getSpeakersMock(); + }, + + async
speakerInfoSpeakerInfoGet( + payload: SpeakerInfoSpeakerInfoGetRequest, + ): Promise { + return getSpeakerInfoMock(payload.speakerUuid); + }, + + async singersSingersGet(): Promise { + return getSingersMock(); + }, + + async singerInfoSingerInfoGet( + paload: SingerInfoSingerInfoGetRequest, + ): Promise { + return getSpeakerInfoMock(paload.speakerUuid); + }, + + async audioQueryAudioQueryPost( + payload: AudioQueryAudioQueryPostRequest, + ): Promise { + const accentPhrases = await textToActtentPhrasesMock( + payload.text, + payload.speaker, + ); + + return { + accentPhrases, + speedScale: 1.0, + pitchScale: 0, + intonationScale: 1.0, + volumeScale: 1.0, + prePhonemeLength: 0.1, + postPhonemeLength: 0.1, + outputSamplingRate: getEngineManifestMock().defaultSamplingRate, + outputStereo: false, + }; + }, + + async accentPhrasesAccentPhrasesPost( + payload: AccentPhrasesAccentPhrasesPostRequest, + ): Promise { + if (payload.isKana == true) + throw new Error("AquesTalk風記法は未対応です"); + + const accentPhrases = await textToActtentPhrasesMock( + payload.text, + payload.speaker, + ); + return accentPhrases; + }, + + async moraDataMoraDataPost( + payload: MoraDataMoraDataPostRequest, + ): Promise { + const accentPhrase = cloneWithUnwrapProxy(payload.accentPhrase); + replaceLengthMock(accentPhrase, payload.speaker); + replacePitchMock(accentPhrase, payload.speaker); + return accentPhrase; + }, + + async synthesisSynthesisPost( + payload: SynthesisSynthesisPostRequest, + ): Promise { + const frameAudioQuery = audioQueryToFrameAudioQueryMock( + payload.audioQuery, + { + enableInterrogativeUpspeak: + payload.enableInterrogativeUpspeak ??
false, + }, + ); + const buffer = synthesisFrameAudioQueryMock( + frameAudioQuery, + payload.speaker, + ); + return new Blob([buffer], { type: "audio/wav" }); + }, + + async singFrameAudioQuerySingFrameAudioQueryPost( + payload: SingFrameAudioQuerySingFrameAudioQueryPostRequest, + ): Promise { + const { score, speaker: styleId } = cloneWithUnwrapProxy(payload); + + const phonemes = notesToFramePhonemesMock(score.notes, styleId); + const f0 = notesAndFramePhonemesToPitchMock( + score.notes, + phonemes, + styleId, + ); + const volume = notesAndFramePhonemesAndPitchToVolumeMock( + score.notes, + phonemes, + f0, + styleId, + ); + + return { + f0, + volume, + phonemes, + volumeScale: 1.0, + outputSamplingRate: getEngineManifestMock().defaultSamplingRate, + outputStereo: false, + }; + }, + + async singFrameVolumeSingFrameVolumePost( + payload: SingFrameVolumeSingFrameVolumePostRequest, + ): Promise> { + const { + speaker: styleId, + bodySingFrameVolumeSingFrameVolumePost: { score, frameAudioQuery }, + } = cloneWithUnwrapProxy(payload); + + const volume = notesAndFramePhonemesAndPitchToVolumeMock( + score.notes, + frameAudioQuery.phonemes, + frameAudioQuery.f0, + styleId, + ); + return volume; + }, + + async frameSynthesisFrameSynthesisPost( + payload: FrameSynthesisFrameSynthesisPostRequest, + ): Promise { + const { speaker: styleId, frameAudioQuery } = + cloneWithUnwrapProxy(payload); + const buffer = synthesisFrameAudioQueryMock(frameAudioQuery, styleId); + return new Blob([buffer], { type: "audio/wav" }); + }, + + // Dictionary-related + + async getUserDictWordsUserDictGet(): Promise<{ + [key: string]: UserDictWord; + }> { + // Return an empty dictionary + return {}; + }, + }; + + return mockApi as DefaultApiInterface; +} diff --git a/src/mock/engineMock/manifestMock.ts b/src/mock/engineMock/manifestMock.ts new file mode 100644 index 0000000000..7d645e6855 --- /dev/null +++ b/src/mock/engineMock/manifestMock.ts @@ -0,0 +1,34 @@ +/** + * Mock of the engine manifest. + */ + +/** Mock that returns the engine manifest. */ +export function
getEngineManifestMock() { + return { + manifestVersion: "0.13.1", + name: "DUMMY Engine", + brandName: "DUMMY", + uuid: "c7b58856-bd56-4aa1-afb7-b8415f824b06", + url: "not_found", + icon: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAA1JREFUGFdjWHpl7X8AB24DJsTeKbEAAAAASUVORK5CYII=", // 1px image + defaultSamplingRate: 24000, + frameRate: 93.75, + termsOfService: "not_found", + updateInfos: [], + dependencyLicenses: [], + supportedVvlibManifestVersion: undefined, + supportedFeatures: { + adjustMoraPitch: true, + adjustPhonemeLength: true, + adjustSpeedScale: true, + adjustPitchScale: true, + adjustIntonationScale: true, + adjustVolumeScale: true, + interrogativeUpspeak: false, + synthesisMorphing: false, + sing: true, + manageLibrary: false, + returnResourceUrl: true, + }, + }; +} diff --git a/src/mock/engineMock/phonemeMock.ts b/src/mock/engineMock/phonemeMock.ts new file mode 100644 index 0000000000..d128b1648e --- /dev/null +++ b/src/mock/engineMock/phonemeMock.ts @@ -0,0 +1,168 @@ +/** Lookup table that converts a katakana mora into its phoneme pair; the first element is undefined for entries listed without a leading phoneme. */ +export const moraToPhonemes: { + [key: string]: [string | undefined, string]; +} = { + ヴォ: ["v", "o"], + ヴェ: ["v", "e"], + ヴィ: ["v", "i"], + ヴァ: ["v", "a"], + ヴ: ["v", "u"], + ン: [undefined, "N"], + ワ: ["w", "a"], + ロ: ["r", "o"], + レ: ["r", "e"], + ル: ["r", "u"], + リョ: ["ry", "o"], + リュ: ["ry", "u"], + リャ: ["ry", "a"], + リェ: ["ry", "e"], + リ: ["r", "i"], + ラ: ["r", "a"], + ヨ: ["y", "o"], + ユ: ["y", "u"], + ヤ: ["y", "a"], + モ: ["m", "o"], + メ: ["m", "e"], + ム: ["m", "u"], + ミョ: ["my", "o"], + ミュ: ["my", "u"], + ミャ: ["my", "a"], + ミェ: ["my", "e"], + ミ: ["m", "i"], + マ: ["m", "a"], + ポ: ["p", "o"], + ボ: ["b", "o"], + ホ: ["h", "o"], + ペ: ["p", "e"], + ベ: ["b", "e"], + ヘ: ["h", "e"], + プ: ["p", "u"], + ブ: ["b", "u"], + フォ: ["f", "o"], + フェ: ["f", "e"], + フィ: ["f", "i"], + ファ: ["f", "a"], + フ: ["f", "u"], + ピョ: ["py", "o"], + ピュ: ["py", "u"], + ピャ: ["py", "a"], + ピェ: ["py", "e"], + ピ: ["p", "i"], + ビョ: ["by", "o"], + ビュ: ["by", "u"], + ビャ: ["by", "a"],
+ ビェ: ["by", "e"], + ビ: ["b", "i"], + ヒョ: ["hy", "o"], + ヒュ: ["hy", "u"], + ヒャ: ["hy", "a"], + ヒェ: ["hy", "e"], + ヒ: ["h", "i"], + パ: ["p", "a"], + バ: ["b", "a"], + ハ: ["h", "a"], + ノ: ["n", "o"], + ネ: ["n", "e"], + ヌ: ["n", "u"], + ニョ: ["ny", "o"], + ニュ: ["ny", "u"], + ニャ: ["ny", "a"], + ニェ: ["ny", "e"], + ニ: ["n", "i"], + ナ: ["n", "a"], + ドゥ: ["d", "u"], + ド: ["d", "o"], + トゥ: ["t", "u"], + ト: ["t", "o"], + デョ: ["dy", "o"], + デュ: ["dy", "u"], + デャ: ["dy", "a"], + デェ: ["dy", "e"], + ディ: ["d", "i"], + デ: ["d", "e"], + テョ: ["ty", "o"], + テュ: ["ty", "u"], + テャ: ["ty", "a"], + ティ: ["t", "i"], + テ: ["t", "e"], + ツォ: ["ts", "o"], + ツェ: ["ts", "e"], + ツィ: ["ts", "i"], + ツァ: ["ts", "a"], + ツ: ["ts", "u"], + ッ: [undefined, "cl"], + チョ: ["ch", "o"], + チュ: ["ch", "u"], + チャ: ["ch", "a"], + チェ: ["ch", "e"], + チ: ["ch", "i"], + ダ: ["d", "a"], + タ: ["t", "a"], + ゾ: ["z", "o"], + ソ: ["s", "o"], + ゼ: ["z", "e"], + セ: ["s", "e"], + ズィ: ["z", "i"], + ズ: ["z", "u"], + スィ: ["s", "i"], + ス: ["s", "u"], + ジョ: ["j", "o"], + ジュ: ["j", "u"], + ジャ: ["j", "a"], + ジェ: ["j", "e"], + ジ: ["j", "i"], + ショ: ["sh", "o"], + シュ: ["sh", "u"], + シャ: ["sh", "a"], + シェ: ["sh", "e"], + シ: ["sh", "i"], + ザ: ["z", "a"], + サ: ["s", "a"], + ゴ: ["g", "o"], + コ: ["k", "o"], + ゲ: ["g", "e"], + ケ: ["k", "e"], + グヮ: ["gw", "a"], + グ: ["g", "u"], + クヮ: ["kw", "a"], + ク: ["k", "u"], + ギョ: ["gy", "o"], + ギュ: ["gy", "u"], + ギャ: ["gy", "a"], + ギェ: ["gy", "e"], + ギ: ["g", "i"], + キョ: ["ky", "o"], + キュ: ["ky", "u"], + キャ: ["ky", "a"], + キェ: ["ky", "e"], + キ: ["k", "i"], + ガ: ["g", "a"], + カ: ["k", "a"], + オ: [undefined, "o"], + エ: [undefined, "e"], + ウォ: ["w", "o"], + ウェ: ["w", "e"], + ウィ: ["w", "i"], + ウ: [undefined, "u"], + イェ: ["y", "e"], + イ: [undefined, "i"], + ア: [undefined, "a"], + ヴョ: ["by", "o"], + ヴュ: ["by", "u"], + ヴャ: ["by", "a"], + ヲ: [undefined, "o"], + ヱ: [undefined, "e"], + ヰ: [undefined, "i"], + ヮ: ["w", "a"], + ョ: ["y", "o"], + ュ: ["y", "u"], + ヅ: ["z", "u"], + ヂ: ["j", "i"], + ヶ: ["k", "e"], + ャ: 
["y", "a"], + ォ: [undefined, "o"], + ェ: [undefined, "e"], + ゥ: [undefined, "u"], + ィ: [undefined, "i"], + ァ: [undefined, "a"], +}; diff --git a/src/mock/engineMock/singModelMock.ts b/src/mock/engineMock/singModelMock.ts new file mode 100644 index 0000000000..f1c583fe03 --- /dev/null +++ b/src/mock/engineMock/singModelMock.ts @@ -0,0 +1,169 @@ +/** + * Mock that builds song-related structures. + * The values are arbitrary, but they are deterministic so that they can be used in tests, and they follow some regularity so that UI bugs can be noticed. + */ + +import { moraToPhonemes } from "./phonemeMock"; +import { convertHiraToKana } from "@/domain/japanese"; +import { Note, FramePhoneme } from "@/openapi"; + +function noteNumberToFrequency(noteNumber: number) { + return 440 * Math.pow(2, (noteNumber - 69) / 12); +} + +/** Convert an alphabet string into an arbitrary number from 0 to 1: the sum of its char codes modulo 256, divided by 256. NOTE(review): reduce is called without an initial value, so an empty string throws. */ +function alphabetsToNumber(text: string): number { + const codes = text.split("").map((c) => c.charCodeAt(0)); + const sum = codes.reduce((a, b) => a + b); + return (sum % 256) / 256; +} + +/** Pick an arbitrary length between 0.01 and 0.25 for the phoneme. */ +function phonemeToLengthMock(phoneme: string): number { + return alphabetsToNumber(phoneme) * 0.24 + 0.01; +} + +/** Pick an arbitrary pitch that deviates from the note frequency by -30 to 30 cents. */ +function phonemeAndKeyToPitchMock(phoneme: string, key: number): number { + const base = noteNumberToFrequency(key); + const shift = (-30 + 60 * alphabetsToNumber(phoneme)) / 1200; + return base * Math.pow(2, shift); +} + +/** Pick an arbitrary volume of roughly 0.8 to 1.0. NOTE(review): the formula starts from 0.75, so pitches between the min and max note frequencies give about 0.75 to 1.0. */ +function phonemeAndPitchToVolumeMock(phoneme: string, pitch: number): number { + const minPitch = noteNumberToFrequency(1); + const maxPitch = noteNumberToFrequency(128); + const normalized = (pitch - minPitch) / (maxPitch - minPitch); + return 0.75 + normalized * 0.2 + alphabetsToNumber(phoneme) * 0.05; +} + +/** + * Build phonemes and arbitrary phoneme lengths from notes. Throws an Error when a lyric has no entry in moraToPhonemes. + * The vowel start is aligned with the note start, and the consonant eats into the previous phoneme. NOTE(review): a first note with a consonant has no previous phoneme to shorten; presumably callers always start with a rest, to be confirmed. + */ +export function notesToFramePhonemesMock( + notes: Note[], + styleId: number, +): FramePhoneme[] { + const framePhonemes: FramePhoneme[] = []; + for (const note of notes) { + const noteId = note.id; + + // A rest keeps the full note length + if
(note.key == undefined && note.lyric == "") { + framePhonemes.push({ + noteId, + phoneme: "pau", + frameLength: note.frameLength, + }); + continue; + } + + const phonemes = moraToPhonemes[convertHiraToKana(note.lyric)]; + if (phonemes == undefined) + throw new Error(`音素に変換できません: ${note.lyric}`); + + const [consonant, vowel] = phonemes; + + if (consonant != undefined) { + // The consonant gets an arbitrary length + let consonantLength = phonemeToLengthMock(consonant); + + // Shift the value so that different singers do not produce the same result + consonantLength += styleId * 0.03; + + // If the consonant would be longer than the previous phoneme, use half of the previous phoneme length instead + const beforeFramePhoneme = framePhonemes[framePhonemes.length - 1]; + if (beforeFramePhoneme.frameLength < consonantLength) { + consonantLength = beforeFramePhoneme.frameLength / 2; + } + + // Round to an integer of at least 1 + consonantLength = Math.max(Math.round(consonantLength), 1); + + // The consonant eats into the previous phoneme + beforeFramePhoneme.frameLength -= consonantLength; + framePhonemes.push({ + noteId, + phoneme: consonant, + frameLength: consonantLength, + }); + } + + // The vowel gets the note length + const vowelLength = note.frameLength; + framePhonemes.push({ noteId, phoneme: vowel, frameLength: vowelLength }); + } + + return framePhonemes; +} + +/** Build an arbitrary per-frame pitch array from notes and frame phonemes. Throws an Error when a phoneme other than pau has no note with a matching id. */ +export function notesAndFramePhonemesToPitchMock( + notes: Note[], + framePhonemes: FramePhoneme[], + styleId: number, +): number[] { + // styleId=6000 can arrive because of special handling for the product-version engine, so take it modulo 6000 + styleId %= 6000; + + return framePhonemes.flatMap((phoneme, i) => { + let pitch; + + // A rest has pitch 0 + if (phoneme.phoneme == "pau") { + pitch = 0; + } else { + // Find the note with the same ID + const note = notes + .filter((note) => note.id != undefined) + .find((note) => note.id == phoneme.noteId); + if (note == undefined) + throw new Error( + `ノートが見つかりません: index=${i} phoneme=${phoneme.phoneme}`, + ); + + if (note.key != undefined) { + pitch = phonemeAndKeyToPitchMock(phoneme.phoneme, note.key); + + // Shift the value so that different singers do not produce the same result + pitch *= 1 + styleId * 0.03; + } else { + pitch = 0; + } + } + + return Array(phoneme.frameLength).fill(pitch);
+ }); +} + +/** + * Build an arbitrary per-frame volume array from notes, frame phonemes and pitch. + * The higher the pitch, the louder the volume. + * NOTE: the notes are ignored for now. + */ +export function notesAndFramePhonemesAndPitchToVolumeMock( + notes: Note[], + framePhonemes: FramePhoneme[], + f0: number[], + styleId: number, +): number[] { + const phonemePerFrame = framePhonemes.flatMap((phoneme) => + Array(phoneme.frameLength).fill(phoneme.phoneme), + ); + + return Array(f0.length) + .fill(-1) + .map((_, i) => { + const phoneme = phonemePerFrame[i]; + const pitch = f0[i]; + + let volume = phonemeAndPitchToVolumeMock(phoneme, pitch); + + // Shift the value so that different singers do not produce the same result + volume *= 1 - styleId * 0.03; + + return volume; + }); +} diff --git a/src/mock/engineMock/synthesisMock.ts b/src/mock/engineMock/synthesisMock.ts new file mode 100644 index 0000000000..9042fc764b --- /dev/null +++ b/src/mock/engineMock/synthesisMock.ts @@ -0,0 +1,254 @@ +/** + * Mock that synthesizes audio. + * Pitch and volume are synthesized into a plausible sound. + * Each phoneme is turned into an arbitrary, distinct electronic sound. + */ + +import { FrameAudioQuery } from "@/openapi"; +import { convertToWavFileData } from "@/helpers/convertToWavFileData"; +import { applyGaussianFilter } from "@/sing/utility"; + +/** Pseudo-random number generator returning values from 0 to 1; the returned function advances the captured seed on every call. */ +function Random(seed: number = 0) { + // Linear congruential generator + const a = 1664525; + const c = 1013904223; + const m = 2 ** 31; + + return () => { + seed = (a * seed + c) % m; + return seed / m; + }; +} + +/** Kinds of waveform. */ +const waveTypes = ["sine", "square", "noise", "silence"] as const; +type WaveType = (typeof waveTypes)[number]; + +/** Generate a waveform such as a sine wave, frame by frame from f0 and volume. */ +function generateWave( + f0: Array, + volume: Array, + frameRate: number, + sampleRate: number, + type: WaveType, +) { + const duration = f0.length / frameRate; + const samplesPerOriginal = sampleRate / frameRate; + const wave = new Float32Array(sampleRate * duration); + + const seed = + Math.round(f0.concat(volume).reduce((acc, v) => acc + v, 0)) % 2 ** 31; // Seed value that rarely collides + const random = Random(seed); + let phase = 0; + for (let frameIndex = 0; frameIndex < f0.length; frameIndex++) { + const freq = f0[frameIndex]; +
const vol = volume[frameIndex]; + const omega = (2 * Math.PI * freq) / sampleRate; + + for (let i = 0; i < samplesPerOriginal; i++) { + const sampleIndex = frameIndex * samplesPerOriginal + i; + switch (type) { + case "sine": + wave[sampleIndex] = Math.sin(phase); + break; + case "square": + wave[sampleIndex] = (phase / Math.PI) % 2 < 1 ? 1 : -1; + break; + case "noise": + wave[sampleIndex] = random() * 2 - 1; + break; + case "silence": + wave[sampleIndex] = 0; + break; + } + wave[sampleIndex] *= vol; + + phase += omega; + if (phase > 2 * Math.PI) { + phase -= 2 * Math.PI; + } + } + } + + return wave; +} + +/** + * Features of each phoneme. Keys: 有声母音 = voiced vowels, 無声母音 = unvoiced vowels, 無音 = silence, 有声子音 = voiced consonants, 無声子音 = unvoiced consonants. + * FIXME: if possible, make the sounds closer to the real ones so that debugging is easier. + */ +const phonemeFeatures = { + 有声母音: ["a", "i", "u", "e", "o", "N"], + 無声母音: ["A", "I", "U", "E", "O"], + 無音: ["sil", "pau", "cl"], + 有声子音: [ + "b", + "by", + "d", + "dy", + "g", + "gw", + "gy", + "j", + "m", + "my", + "n", + "ny", + "r", + "ry", + "v", + "w", + "y", + "z", + ], + 無声子音: [ + "ch", + "f", + "h", + "hy", + "k", + "kw", + "ky", + "p", + "py", + "s", + "sh", + "t", + "ts", + "ty", + ], +}; + +/** Decide an arbitrary waveform mixing ratio for the phoneme. Throws an Error for a phoneme that is in none of the phonemeFeatures groups. */ +function getWaveRate(phoneme: string): { [key in WaveType]: number } { + const waveRate: { [key in WaveType]: number } = { + sine: 0, + square: 0, + noise: 0, + silence: 0, + }; + + // Silence phonemes are almost silent + if (phonemeFeatures.無音.includes(phoneme)) { + const index = phonemeFeatures.無音.indexOf(phoneme); + waveRate.noise = ((index + 1) % 30) / 30; + return waveRate; + } + + // Voiced vowels have no noise + if (phonemeFeatures.有声母音.includes(phoneme)) { + const rate = + phonemeFeatures.有声母音.indexOf(phoneme) / + (phonemeFeatures.有声母音.length - 1); + waveRate.sine = 1 - rate; + waveRate.square = rate; + return waveRate; + } + + // Unvoiced vowels have more noise + if (phonemeFeatures.無声母音.includes(phoneme)) { + const rate = + phonemeFeatures.無声母音.indexOf(phoneme) / + (phonemeFeatures.無声母音.length - 1); + waveRate.sine = (1 - rate) * 0.1; + waveRate.square = rate * 0.1; + waveRate.noise = 0.3; + return waveRate; + } + + //
有声子音ならノイズ少なめ + if (phonemeFeatures.有声子音.includes(phoneme)) { + const rate = + phonemeFeatures.有声子音.indexOf(phoneme) / + (phonemeFeatures.有声子音.length - 1); + waveRate.sine = (1 - rate) * 0.7; + waveRate.square = rate * 0.7; + waveRate.noise = 0.2; + return waveRate; + } + + // 無声子音ならノイズ多めで音量小さい + if (phonemeFeatures.無声子音.includes(phoneme)) { + const rate = + phonemeFeatures.無声子音.indexOf(phoneme) / + (phonemeFeatures.無声子音.length - 1); + waveRate.sine = (1 - rate) * 0.1; + waveRate.square = rate * 0.1; + waveRate.noise = 0.1; + return waveRate; + } + + throw new Error(`未対応の音素: ${phoneme}`); +} + +/** + * FrameAudioQueryから適当に音声合成する。 + * いろんな波形を作り、音素ごとに波形の配合率を変える。 + */ +export function synthesisFrameAudioQueryMock( + frameAudioQuery: FrameAudioQuery, + styleId: number, +): Uint8Array { + const sampleRate = frameAudioQuery.outputSamplingRate; + const samplePerFrame = 256; + const frameRate = sampleRate / samplePerFrame; + + const _generateWave = (type: WaveType) => + generateWave( + frameAudioQuery.f0, + frameAudioQuery.volume, + frameRate, + sampleRate, + type, + ); + const waves: { [key in WaveType]: Float32Array } = { + sine: _generateWave("sine"), + square: _generateWave("square"), + noise: _generateWave("noise"), + silence: _generateWave("silence"), + }; + + // フレームごとの音声波形の配分率 + const waveRatesPerFrame = frameAudioQuery.phonemes.flatMap((phoneme) => { + const waveRate = getWaveRate(phoneme.phoneme); + return Array<{ [key in WaveType]: number }>(phoneme.frameLength).fill( + waveRate, + ); + }); + + // サンプルごとの配分率 + // 耳が痛くならないように10msほどの移動平均を取る + const calcWaveRate = (type: WaveType) => { + const waveRate = waveRatesPerFrame.flatMap((o) => + Array(samplePerFrame).fill(o[type]), + ); + applyGaussianFilter(waveRate, (sampleRate * 0.01) / 3); + return waveRate; + }; + const waveRates = Object.fromEntries( + waveTypes.map((type) => [type, calcWaveRate(type)]), + ) as { [key in WaveType]: number[] }; + + // 波形を合成。 + // 念の為に-1~1に丸め、音量を1/10にする。 + // 話者ごとに同じにならないように適当に値をずらす + 
const wave = new Float32Array(frameAudioQuery.f0.length * samplePerFrame); + for (let i = 0; i < wave.length; i++) { + let sample = waveTypes.reduce((acc, type) => { + return acc + waves[type][i] * waveRates[type][i]; + }, 0); + sample += (styleId % 977) / 977 / 20; // 977は適当な素数 + wave[i] = Math.min(Math.max(sample, -1), 1) / 10; + } + + // Blobに変換 + const numberOfChannels = frameAudioQuery.outputStereo ? 2 : 1; + const buffer = convertToWavFileData({ + sampleRate, + length: wave.length, + numberOfChannels, + getChannelData: () => wave, + }); + return buffer; +} diff --git a/src/mock/engineMock/talkModelMock.ts b/src/mock/engineMock/talkModelMock.ts new file mode 100644 index 0000000000..8d952c6f84 --- /dev/null +++ b/src/mock/engineMock/talkModelMock.ts @@ -0,0 +1,239 @@ +/** + * ソング系の構造体を作るモック。 + * 値は適当だが、テストで使えるよう決定論的に決まるようにしたり、UIのバグに気づけるようある程度規則を持たせている。 + */ + +import kuromoji, { IpadicFeatures, Tokenizer } from "kuromoji"; +import { moraToPhonemes } from "./phonemeMock"; +import { moraPattern } from "@/domain/japanese"; +import { AccentPhrase, Mora } from "@/openapi"; +import packageJson from "@/../package.json"; + +let _tokenizer: Tokenizer | undefined; + +/** kuromoji用の辞書のパスを取得する */ +function getDicPath() { + // ブラウザのときはCDNから辞書を取得し、Nodeのときはローカルから取得する + + const pathForBrowser = `https://cdn.jsdelivr.net/npm/kuromoji@${packageJson.devDependencies.kuromoji}/dict`; + const pathForNode = "node_modules/kuromoji/dict"; + + // window.documentがなければNode + if (typeof window == "undefined" || typeof window.document == "undefined") { + return pathForNode; + } + + // happy-domのときはNode + if (typeof (window as { happyDOM?: unknown }).happyDOM != "undefined") { + return pathForNode; + } + + // それ以外はブラウザ + return pathForBrowser; +} + +/** テキストをトークン列に変換するトークナイザーを取得する */ +async function createOrGetTokenizer() { + if (_tokenizer != undefined) { + return _tokenizer; + } + + return new Promise>((resolve, reject) => { + kuromoji + .builder({ dicPath: getDicPath() }) + .build((err: 
Error, tokenizer: Tokenizer) => { + if (err) { + reject(err); + } else { + _tokenizer = tokenizer; + resolve(tokenizer); + } + }); + }); +} + +/** アルファベット文字列を適当な0~1の適当な数値に変換する */ +function alphabetsToNumber(text: string): number { + const codes = text.split("").map((c) => c.charCodeAt(0)); + const sum = codes.reduce((a, b) => a + b); + return (sum % 256) / 256; +} + +/** 0.01~0.25になるように適当な長さを決める */ +function phonemeToLengthMock(phoneme: string): number { + return alphabetsToNumber(phoneme) * 0.24 + 0.01; +} + +/** 3~5になるように適当なピッチを決める */ +function phonemeToPitchMock(phoneme: string): number { + return (1 - alphabetsToNumber(phoneme)) * 2 + 3; +} + +/** カタカナテキストをモーラに変換する */ +function textToMoraMock(text: string): Mora { + const phonemes = moraToPhonemes[text]; + if (phonemes == undefined) throw new Error(`モーラに変換できません: ${text}`); + + return { + text, + consonant: phonemes[0], + consonantLength: phonemes[0] == undefined ? undefined : 0, + vowel: phonemes[1], + vowelLength: 0, + pitch: 0, + }; +} + +/** + * カタカナテキストを適当なアクセント句に変換する。 + * アクセント位置は適当に決める。 + */ +function textToAccentPhraseMock(text: string): AccentPhrase { + const moras: Mora[] = [...text.matchAll(moraPattern)].map((m) => + textToMoraMock(m[0]), + ); + const alphabets = moras.map((m) => (m.consonant ?? "") + m.vowel).join(""); + const accent = + 1 + Math.round(alphabetsToNumber(alphabets) * (moras.length - 1)); + return { moras, accent }; +} + +/** + * アクセント句内のモーラの長さを適当に代入する。 + * 最後のモーラだけ長くする。 + */ +export function replaceLengthMock( + accentPhrases: AccentPhrase[], + styleId: number, +) { + for (const accentPhrase of accentPhrases) { + for (let i = 0; i < accentPhrase.moras.length; i++) { + const mora = accentPhrase.moras[i]; + + // 最後のモーラだけ長く + const offset = i == accentPhrase.moras.length - 1 ? 
0.05 : 0; + + if (mora.consonant != undefined) + mora.consonantLength = + (phonemeToLengthMock(mora.consonant) + offset) / 5; + mora.vowelLength = phonemeToLengthMock(mora.vowel) + offset; + } + } + + // 別のアクセント句や話者で同じにならないように適当に値をずらす + for (let i = 0; i < accentPhrases.length; i++) { + const diff = i * 0.01 + styleId * 0.03; + const accentPhrase = accentPhrases[i]; + for (const mora of accentPhrase.moras) { + if (mora.consonantLength != undefined) mora.consonantLength += diff; + mora.vowelLength += diff; + } + if (accentPhrase.pauseMora != undefined) { + accentPhrase.pauseMora.vowelLength += diff; + } + } +} + +/** + * アクセント句内のモーラのピッチを適当に代入する。 + * アクセント位置のモーラだけ高くする。 + */ +export function replacePitchMock( + accentPhrases: AccentPhrase[], + styleId: number, +) { + for (const accentPhrase of accentPhrases) { + for (let i = 0; i < accentPhrase.moras.length; i++) { + const mora = accentPhrase.moras[i]; + + // 無声化している場合はピッチを0にする + if (mora.vowel == "U") { + mora.pitch = 0; + continue; + } + + // アクセント位置のモーラだけ高く + const offset = i == accentPhrase.accent ? 0.3 : 0; + + const phoneme = (mora.consonant ?? 
"") + mora.vowel[1]; + mora.pitch = phonemeToPitchMock(phoneme) + offset; + } + } + + // 別のアクセント句や話者で同じにならないように適当に値をずらす + for (let i = 0; i < accentPhrases.length; i++) { + const diff = i * 0.01 + styleId * 0.03; + const accentPhrase = accentPhrases[i]; + for (const mora of accentPhrase.moras) { + if (mora.pitch > 0) mora.pitch += diff; + } + } +} + +/** + * テキストを適当なアクセント句に分割する。 + * 助詞ごとに区切る。記号ごとに無音を入れる。 + * 無音で終わるアクセント句の最後のモーラが「す」「つ」の場合は無声化する。 + */ +export async function textToActtentPhrasesMock(text: string, styleId: number) { + const accentPhrases: AccentPhrase[] = []; + + // トークンに分割 + const tokenizer = await createOrGetTokenizer(); + const tokens = tokenizer.tokenize(text); + + let textPhrase = ""; + for (const token of tokens) { + // 記号の場合は無音を入れて区切る + if (token.pos == "記号") { + if (textPhrase.length == 0) continue; + + const accentPhrase = textToAccentPhraseMock(textPhrase); + accentPhrase.pauseMora = { + text: "、", + vowel: "pau", + vowelLength: 1 - 1 / (accentPhrases.length + 1), + pitch: 0, + }; + accentPhrases.push(accentPhrase); + textPhrase = ""; + continue; + } + + // 記号以外は連結 + if (token.reading == undefined) + throw new Error(`発音がないトークン: ${token.surface_form}`); + textPhrase += token.reading; + + // 助詞の場合は区切る + if (token.pos == "助詞") { + accentPhrases.push(textToAccentPhraseMock(textPhrase)); + textPhrase = ""; + } + } + if (textPhrase != "") { + accentPhrases.push(textToAccentPhraseMock(textPhrase)); + } + + // 最後のアクセント句の無音をなくす + if (accentPhrases.length > 0) { + const lastPhrase = accentPhrases[accentPhrases.length - 1]; + lastPhrase.pauseMora = undefined; + } + + // 無音のあるアクセント句を無声化 + for (const phrase of accentPhrases) { + if (phrase.pauseMora == undefined) continue; + const lastMora = phrase.moras[phrase.moras.length - 1]; + if (lastMora.text == "ス" || lastMora.text == "ツ") { + lastMora.vowel = "U"; + lastMora.pitch = 0; + } + } + + // 長さとピッチを代入 + replaceLengthMock(accentPhrases, styleId); + replacePitchMock(accentPhrases, styleId); + + return 
accentPhrases; +} diff --git a/src/store/singing.ts b/src/store/singing.ts index 611fe2ce22..c8536a16d5 100644 --- a/src/store/singing.ts +++ b/src/store/singing.ts @@ -103,7 +103,7 @@ import { getOrThrow } from "@/helpers/mapHelper"; import { cloneWithUnwrapProxy } from "@/helpers/cloneWithUnwrapProxy"; import { ufProjectToVoicevox } from "@/sing/utaformatixProject/toVoicevox"; import { uuid4 } from "@/helpers/random"; -import { convertToWavFileData } from "@/sing/convertToWavFileData"; +import { convertToWavFileData } from "@/helpers/convertToWavFileData"; import { generateWriteErrorMessage } from "@/helpers/fileHelper"; import path from "@/helpers/path"; diff --git a/tests/unit/mock/engineMock/__snapshots__/index.spec.ts.snap b/tests/unit/mock/engineMock/__snapshots__/index.spec.ts.snap new file mode 100644 index 0000000000..2905c56cb3 --- /dev/null +++ b/tests/unit/mock/engineMock/__snapshots__/index.spec.ts.snap @@ -0,0 +1,176 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`createOpenAPIEngineMock > audioQueryAudioQueryPost 1`] = ` +{ + "accentPhrases": [ + { + "accent": 5, + "moras": [ + { + "consonant": "k", + "consonantLength": 0.0220625, + "pitch": 4.7734375, + "text": "コ", + "vowel": "o", + "vowelLength": 0.1140625, + }, + { + "consonant": undefined, + "consonantLength": undefined, + "pitch": 3.609375, + "text": "ン", + "vowel": "N", + "vowelLength": 0.08312499999999999, + }, + { + "consonant": "n", + "consonantLength": 0.022625, + "pitch": 4.75, + "text": "ニ", + "vowel": "i", + "vowelLength": 0.10843749999999999, + }, + { + "consonant": "ch", + "consonantLength": 0.0400625, + "pitch": 4.0234375, + "text": "チ", + "vowel": "i", + "vowelLength": 0.10843749999999999, + }, + { + "consonant": "h", + "consonantLength": 0.0315, + "pitch": 4.796875, + "text": "ハ", + "vowel": "a", + "vowelLength": 0.1509375, + }, + ], + "pauseMora": undefined, + }, + ], + "intonationScale": 1, + "outputSamplingRate": 24000, + "outputStereo": false, + 
"pitchScale": 0, + "postPhonemeLength": 0.1, + "prePhonemeLength": 0.1, + "speedScale": 1, + "volumeScale": 1, +} +`; + +exports[`createOpenAPIEngineMock > frameSynthesisFrameSynthesisPost 1`] = `"394cfbc01397e0b6fcc3433d9537aa850e6131f7d89048da6889e7375fe03a24"`; + +exports[`createOpenAPIEngineMock > singFrameAudioQuerySingFrameAudioQueryPost 1`] = ` +{ + "f0": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 46.17422889791779, + 46.08055781881967, + 46.08055781881967, + 46.16797823967303, + 46.18048040243588, + 46.18048040243588, + 82.27312267254713, + 82.21745071316357, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + "outputSamplingRate": 24000, + "outputStereo": false, + "phonemes": [ + { + "frameLength": 9, + "noteId": "a", + "phoneme": "pau", + }, + { + "frameLength": 1, + "noteId": "b", + "phoneme": "t", + }, + { + "frameLength": 2, + "noteId": "b", + "phoneme": "e", + }, + { + "frameLength": 1, + "noteId": "c", + "phoneme": "s", + }, + { + "frameLength": 2, + "noteId": "c", + "phoneme": "u", + }, + { + "frameLength": 1, + "noteId": "d", + "phoneme": "t", + }, + { + "frameLength": 1, + "noteId": "d", + "phoneme": "o", + }, + { + "frameLength": 10, + "noteId": "e", + "phoneme": "pau", + }, + ], + "volume": [ + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7732211475542967, + 0.7702900494608796, + 0.7702900494608796, + 0.7730257409255916, + 0.7734165541957456, + 0.7734165541957456, + 0.7737647610411076, + 0.7727873601766926, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + 0.7635414345273557, + ], + "volumeScale": 1, +} +`; + +exports[`createOpenAPIEngineMock > synthesisSynthesisPost 1`] = 
`"23f4b910863418a7188648f7c5226a0f02b9d067b964be1690b69b1e9ffde7bc"`; + +exports[`createOpenAPIEngineMock > versionVersionGet 1`] = `"mock"`; diff --git a/tests/unit/mock/engineMock/index.spec.ts b/tests/unit/mock/engineMock/index.spec.ts new file mode 100644 index 0000000000..5855815692 --- /dev/null +++ b/tests/unit/mock/engineMock/index.spec.ts @@ -0,0 +1,68 @@ +import { hash } from "../../utils"; +import { createOpenAPIEngineMock } from "@/mock/engineMock"; + +describe("createOpenAPIEngineMock", () => { + const mock = createOpenAPIEngineMock(); + + it("versionVersionGet", async () => { + const response = await mock.versionVersionGet(); + expect(response).toMatchSnapshot(); + }); + + it("audioQueryAudioQueryPost", async () => { + const response = await mock.audioQueryAudioQueryPost({ + text: "こんにちは", + speaker: 0, + }); + expect(response).toMatchSnapshot(); + }); + + it("synthesisSynthesisPost", async () => { + const audioQuery = await mock.audioQueryAudioQueryPost({ + text: "こんにちは", + speaker: 0, + }); + const response = await mock.synthesisSynthesisPost({ + audioQuery, + speaker: 0, + }); + expect(await hash(await response.arrayBuffer())).toMatchSnapshot(); + }); + + it("singFrameAudioQuerySingFrameAudioQueryPost", async () => { + const response = await mock.singFrameAudioQuerySingFrameAudioQueryPost({ + speaker: 0, + score: { + notes: [ + { id: "a", key: undefined, frameLength: 10, lyric: "" }, + { id: "b", key: 30, frameLength: 3, lyric: "て" }, + { id: "c", key: 30, frameLength: 3, lyric: "す" }, + { id: "d", key: 40, frameLength: 1, lyric: "と" }, + { id: "e", key: undefined, frameLength: 10, lyric: "" }, + ], + }, + }); + expect(response).toMatchSnapshot(); + }); + + it("frameSynthesisFrameSynthesisPost", async () => { + const frameAudioQuery = + await mock.singFrameAudioQuerySingFrameAudioQueryPost({ + speaker: 0, + score: { + notes: [ + { id: "a", key: undefined, frameLength: 10, lyric: "" }, + { id: "b", key: 30, frameLength: 3, lyric: "て" }, + { id: "c", 
key: 30, frameLength: 3, lyric: "す" }, + { id: "d", key: 40, frameLength: 1, lyric: "と" }, + { id: "e", key: undefined, frameLength: 10, lyric: "" }, + ], + }, + }); + const response = await mock.frameSynthesisFrameSynthesisPost({ + frameAudioQuery, + speaker: 0, + }); + expect(await hash(await response.arrayBuffer())).toMatchSnapshot(); + }); +}); diff --git a/tests/unit/utils.ts b/tests/unit/utils.ts index 3bc85e6dd9..2c89db5c50 100644 --- a/tests/unit/utils.ts +++ b/tests/unit/utils.ts @@ -3,7 +3,7 @@ import { Component } from "vue"; // QPageContainerとQLayoutで囲うためのヘルパー関数。 // QPageはQLayout > QPageContainer > QPageの構造にしないとエラーになるため必要。 -export const wrapQPage = (page: Component) => { +export function wrapQPage(page: Component) { return { template: ` @@ -18,4 +18,11 @@ export const wrapQPage = (page: Component) => { QLayout, }, }; -}; +} + +/** バイナリからSHA-256ハッシュを計算する */ +export async function hash(data: ArrayBuffer): Promise { + const hashBuffer = await crypto.subtle.digest("SHA-256", data); + const hashArray = Array.from(new Uint8Array(hashBuffer)); + return hashArray.map((b) => b.toString(16).padStart(2, "0")).join(""); +} From a1c21a34c8bb15271228a6300a652a7be274c95b Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 23 Nov 2024 22:29:22 +0900 Subject: [PATCH 2/8] =?UTF-8?q?Fix:=20=E3=82=A2=E3=82=BB=E3=83=83=E3=83=88?= =?UTF-8?q?=E3=83=91=E3=82=B9=E3=82=92=E4=BF=AE=E6=AD=A3=E3=81=97=E3=81=A6?= =?UTF-8?q?=E3=82=A8=E3=83=B3=E3=82=B8=E3=83=B3=E3=83=A2=E3=83=83=E3=82=AF?= =?UTF-8?q?=E3=81=AE=E7=94=BB=E5=83=8F=E3=82=92=E6=AD=A3=E3=81=97=E3=81=8F?= =?UTF-8?q?=E5=8F=96=E5=BE=97=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git "a/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" 
"b/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" index d47c880093..acbe5560b5 100644 --- "a/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" +++ "b/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" @@ -28,7 +28,10 @@ async function getSpeakerImages(): Promise< }[] > { if (!speakerImages) { - const assetsPath = path.resolve(__dirname, "assets"); + const assetsPath = path.resolve( + __dirname, + "../../../src/mock/engineMock/assets", + ); const images = await fs.readdir(assetsPath); const icons = images.filter((image) => image.startsWith("icon")); icons.sort( From 62f5314b0162f53b25953e0c6d7c026076e46c55 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sat, 23 Nov 2024 22:37:25 +0900 Subject: [PATCH 3/8] =?UTF-8?q?Refactor:=20audioBuffer=E3=81=AE=E5=9E=8B?= =?UTF-8?q?=E5=AE=9A=E7=BE=A9=E3=82=92=E7=B0=A1=E7=B4=A0=E5=8C=96=E3=81=99?= =?UTF-8?q?=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/helpers/convertToWavFileData.ts | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/helpers/convertToWavFileData.ts b/src/helpers/convertToWavFileData.ts index 8c2b8ef6e0..424b2f514e 100644 --- a/src/helpers/convertToWavFileData.ts +++ b/src/helpers/convertToWavFileData.ts @@ -1,12 +1,8 @@ export const convertToWavFileData = ( - audioBuffer: - | AudioBuffer - | { - sampleRate: number; - length: number; - numberOfChannels: number; - getChannelData(channel: number): Float32Array; - }, + audioBuffer: Pick< + AudioBuffer, + "sampleRate" | "length" | "numberOfChannels" | "getChannelData" + >, ) => { const bytesPerSample = 4; // Float32 const formatCode = 3; // WAVE_FORMAT_IEEE_FLOAT From 0bde9688d09f74c48272971142e409c412f52b8c Mon Sep 17 
00:00:00 2001 From: Hiroshiba Date: Sat, 23 Nov 2024 22:47:46 +0900 Subject: [PATCH 4/8] =?UTF-8?q?Update:=20engineMock=E3=81=AEREADME.md?= =?UTF-8?q?=E3=82=92=E6=94=B9=E5=96=84=E3=81=97=E3=80=81=E5=AE=9F=E8=A3=85?= =?UTF-8?q?=E3=83=9D=E3=83=AA=E3=82=B7=E3=83=BC=E3=82=92=E6=98=8E=E7=A2=BA?= =?UTF-8?q?=E5=8C=96=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mock/engineMock/README.md | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/mock/engineMock/README.md b/src/mock/engineMock/README.md index 71a39a2a78..ace51fe679 100644 --- a/src/mock/engineMock/README.md +++ b/src/mock/engineMock/README.md @@ -3,37 +3,38 @@ ## 概要 通信を介さずに音声合成できるエンジンのモックです。 +エンジンのOpenAPIから自動生成されたインターフェイス`DefaultApi`を継承しています。 同じ入力には同じ出力を返し、別の入力には別の出力を返すようになっています。 -また出力を見たときにUIや処理の実装の異常に気付けるように、ある程度直感に合う出力を返すよう努力されています。 - +また出力を見たときにUIや処理の実装の異常に気付けるように、ある程度直感に合う出力を返すよう努力されています。 例:音量を下げると音声が小さくなる、音程と周波数が一致する、など。 モックの実装は気軽に破壊的変更しても問題ありません。 -## ビルド戦略 +## 実装ポリシー -ブラウザ版でも使えるようにすべく、ソフトウェアにも組み込める形で実装されています。 -ビルド時のモックエンジンの取り扱いポリシーはこんな感じです。 +ブラウザ版でも使えるように実装されています。 +モックエンジンの取り扱いポリシーはこんな感じです。 -- 重い処理が一切実行されないようにする - - 辞書の初期化、画像の読み込みなど -- なるべく重いファイルはビルドに含まれないようにする - - 形態素解析の辞書ファイルやダミー画像など +- Electronビルド成果物 + - モックエンジン関連の重いファイルはなるべく含まれないようにする + - 形態素解析の辞書ファイルやダミー画像など + - モックエンジン関連の重い処理が一切実行されないようにする + - 形態素解析の辞書の初期化、画像の読み込みなど ## ファイル構成 -- `talkModelMock.ts` - - トーク用の音声クエリを作るまでの処理周り -- `singModelMock.ts` - - ソング用の音声クエリを作るまでの処理周り -- `audioQueryMock.ts` - - 音声クエリ周り -- `synthesisMock.ts` - - 音声波形の合成周り -- `characterResourceMock.ts` - - キャラ名や画像などのリソース周り -- `phonemeMock.ts` - - 音素周り -- `manifestMock.ts` - - エンジンのマニフェスト周り +- `talkModelMock` + - トーク用の音声クエリを作るまでの処理 +- `singModelMock` + - ソング用の音声クエリを作るまでの処理 +- `audioQueryMock` + - 音声クエリを作る +- `synthesisMock` + - 音声波形の合成 +- `characterResourceMock` + - キャラ名や画像などのリソース +- `phonemeMock` + - 音素 +- `manifestMock` + - エンジンのマニフェスト From 
6ebdb4874b59f5a8f3795e8b74e56b9a1f3adcf3 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sun, 15 Dec 2024 22:47:32 +0900 Subject: [PATCH 5/8] =?UTF-8?q?=E8=87=AA=E5=89=8D=E3=81=AEkuromoji?= =?UTF-8?q?=E3=82=92=E4=BD=BF=E3=81=86=E3=82=88=E3=81=86=E3=81=AB=E3=81=99?= =?UTF-8?q?=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package-lock.json | 42 +++++++----------------------------------- package.json | 3 +-- 2 files changed, 8 insertions(+), 37 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3bbac99127..f3533798b3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,6 +23,7 @@ "glob": "11.0.0", "hotkeys-js": "3.13.6", "immer": "9.0.21", + "kuromoji": "github:VOICEVOX/kuromoji.js#0.0.1", "markdown-it": "13.0.2", "move-file": "3.0.0", "multistream": "4.1.0", @@ -56,7 +57,6 @@ "@types/async-lock": "1.4.0", "@types/encoding-japanese": "1.0.18", "@types/glob": "8.0.0", - "@types/kuromoji": "0.1.3", "@types/markdown-it": "12.2.0", "@types/multistream": "4.1.0", "@types/semver": "7.3.9", @@ -86,7 +86,6 @@ "eslint-plugin-storybook": "0.8.0", "eslint-plugin-vue": "9.26.0", "happy-dom": "15.11.6", - "kuromoji": "0.1.2", "license-checker-rseidelsohn": "4.3.0", "markdownlint-cli": "0.37.0", "playwright": "1.48.2", @@ -3355,12 +3354,6 @@ "@types/ms": "*" } }, - "node_modules/@types/doublearray": { - "version": "0.0.32", - "resolved": "https://registry.npmjs.org/@types/doublearray/-/doublearray-0.0.32.tgz", - "integrity": "sha512-HloTru3I3a55runIVqZX1YBQi2L5A4peNQPh33yshzB4ttt1qHCnHPkuhy9Djy/cTx7i5xJvxItKRPCmvnfpGw==", - "dev": true - }, "node_modules/@types/earcut": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/@types/earcut/-/earcut-2.1.4.tgz", @@ -3433,15 +3426,6 @@ "@types/node": "*" } }, - "node_modules/@types/kuromoji": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@types/kuromoji/-/kuromoji-0.1.3.tgz", - "integrity": 
"sha512-u+YwX6eJj6Fmm0F5qunsyA+X8HSiyRNNE5ON3itD3tERax4meq9tv+S7bjTMXkPjqbdBGUmH2maGDCuEvpODwg==", - "dev": true, - "dependencies": { - "@types/doublearray": "*" - } - }, "node_modules/@types/linkify-it": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/@types/linkify-it/-/linkify-it-5.0.0.tgz", @@ -6774,7 +6758,7 @@ "version": "0.0.2", "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz", "integrity": "sha512-aw55FtZzT6AmiamEj2kvmR6BuFqvYgKZUkfQ7teqVRNqD5UE0rw8IeW/3gieHNKQ5sPuDKlljWEn4bzv5+1bHw==", - "dev": true + "license": "MIT" }, "node_modules/earcut": { "version": "2.2.4", @@ -8151,7 +8135,6 @@ "version": "0.8.2", "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz", "integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==", - "dev": true, "license": "MIT" }, "node_modules/figures": { @@ -10178,21 +10161,20 @@ } }, "node_modules/kuromoji": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz", - "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==", - "dev": true, + "version": "0.0.1", + "resolved": "git+ssh://git@github.com/VOICEVOX/kuromoji.js.git#0e8d670cd3df64217d0502d3bb71f431531ff353", + "license": "Apache-2.0", "dependencies": { "async": "^2.0.1", "doublearray": "0.0.2", - "zlibjs": "^0.3.1" + "fflate": "^0.8.2" } }, "node_modules/kuromoji/node_modules/async": { "version": "2.6.4", "resolved": "https://registry.npmjs.org/async/-/async-2.6.4.tgz", "integrity": "sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==", - "dev": true, + "license": "MIT", "dependencies": { "lodash": "^4.17.14" } @@ -10354,7 +10336,6 @@ "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": 
"sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "dev": true, "license": "MIT" }, "node_modules/lodash.clonedeep": { @@ -17538,15 +17519,6 @@ "node": "*" } }, - "node_modules/zlibjs": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", - "integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==", - "dev": true, - "engines": { - "node": "*" - } - }, "node_modules/zod": { "version": "3.22.4", "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz", diff --git a/package.json b/package.json index bdc35ffc56..c705d416d1 100644 --- a/package.json +++ b/package.json @@ -57,6 +57,7 @@ "glob": "11.0.0", "hotkeys-js": "3.13.6", "immer": "9.0.21", + "kuromoji": "github:VOICEVOX/kuromoji.js#0.0.1", "markdown-it": "13.0.2", "move-file": "3.0.0", "multistream": "4.1.0", @@ -93,7 +94,6 @@ "@types/async-lock": "1.4.0", "@types/encoding-japanese": "1.0.18", "@types/glob": "8.0.0", - "@types/kuromoji": "0.1.3", "@types/markdown-it": "12.2.0", "@types/multistream": "4.1.0", "@types/semver": "7.3.9", @@ -123,7 +123,6 @@ "eslint-plugin-storybook": "0.8.0", "eslint-plugin-vue": "9.26.0", "happy-dom": "15.11.6", - "kuromoji": "0.1.2", "license-checker-rseidelsohn": "4.3.0", "markdownlint-cli": "0.37.0", "playwright": "1.48.2", From aecae0bbb0f8c75ce0737440a16e421e722ede37 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sun, 15 Dec 2024 23:07:10 +0900 Subject: [PATCH 6/8] =?UTF-8?q?=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88?= =?UTF-8?q?=E3=82=92=E5=BE=AE=E8=AA=BF=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mock/engineMock/README.md | 52 ++++++++++++++------------ src/mock/engineMock/index.ts | 9 +++-- src/mock/engineMock/manifestMock.ts | 4 +- src/mock/engineMock/singModelMock.ts | 1 - src/mock/engineMock/talkModelMock.ts | 55 +++++++++++++--------------- 5 files changed, 62 
insertions(+), 59 deletions(-) diff --git a/src/mock/engineMock/README.md b/src/mock/engineMock/README.md index ace51fe679..98713b840d 100644 --- a/src/mock/engineMock/README.md +++ b/src/mock/engineMock/README.md @@ -3,38 +3,42 @@ ## 概要 通信を介さずに音声合成できるエンジンのモックです。 -エンジンのOpenAPIから自動生成されたインターフェイス`DefaultApi`を継承しています。 同じ入力には同じ出力を返し、別の入力には別の出力を返すようになっています。 -また出力を見たときにUIや処理の実装の異常に気付けるように、ある程度直感に合う出力を返すよう努力されています。 +また出力を見たときにUIや処理の実装の異常に気付けるように、ある程度直感に合う出力を返すよう努力されています。 + 例:音量を下げると音声が小さくなる、音程と周波数が一致する、など。 モックの実装は気軽に破壊的変更しても問題ありません。 -## 実装ポリシー +## ビルド戦略 -ブラウザ版でも使えるように実装されています。 -モックエンジンの取り扱いポリシーはこんな感じです。 +ブラウザ版でも使えるようにすべく、ソフトウェアにも組み込める形で実装されています。 +ビルド時のモックエンジンの取り扱いポリシーはこんな感じです。 -- Electronビルド成果物 - - モックエンジン関連の重いファイルはなるべく含まれないようにする - - 形態素解析の辞書ファイルやダミー画像など - - モックエンジン関連の重い処理が一切実行されないようにする - - 形態素解析の辞書の初期化、画像の読み込みなど +- 重い処理が一切実行されないようにする + - 辞書の初期化、画像の読み込みなど +- なるべく重いファイルはビルドに含まれないようにする + - 形態素解析の辞書ファイルやダミー画像など ## ファイル構成 -- `talkModelMock` - - トーク用の音声クエリを作るまでの処理 -- `singModelMock` - - ソング用の音声クエリを作るまでの処理 -- `audioQueryMock` - - 音声クエリを作る -- `synthesisMock` - - 音声波形の合成 -- `characterResourceMock` - - キャラ名や画像などのリソース -- `phonemeMock` - - 音素 -- `manifestMock` - - エンジンのマニフェスト +- `talkModelMock.ts` + - トーク用の音声クエリを作るまでの処理周り +- `singModelMock.ts` + - ソング用の音声クエリを作るまでの処理周り +- `audioQueryMock.ts` + - 音声クエリ周り +- `synthesisMock.ts` + - 音声波形の合成周り +- `characterResourceMock.ts` + - キャラ名や画像などのリソース周り +- `phonemeMock.ts` + - 音素周り +- `manifestMock.ts` + - エンジンのマニフェスト周り + +## kuromoji.jsについて + +本家kuromoji.jsはパス操作周りでエラーが起こるので、フォーク版を使っています。 +mock用途以外にkuromoji.jsを使う予定はなく、もし動かなくなった際は依存を切ることも検討します。 diff --git a/src/mock/engineMock/index.ts b/src/mock/engineMock/index.ts index c3318aa2f4..96d8ac01fa 100644 --- a/src/mock/engineMock/index.ts +++ b/src/mock/engineMock/index.ts @@ -49,6 +49,7 @@ export function createOpenAPIEngineMock(): DefaultApiInterface { return "mock"; }, + // メタ情報 async engineManifestEngineManifestGet(): Promise { return getEngineManifestMock(); }, @@ -57,6 +58,7 @@ export function 
createOpenAPIEngineMock(): DefaultApiInterface { return { cpu: true, cuda: false, dml: false }; }, + // キャラクター情報 async isInitializedSpeakerIsInitializedSpeakerGet(): Promise { return true; }, @@ -85,6 +87,7 @@ export function createOpenAPIEngineMock(): DefaultApiInterface { return getSpeakerInfoMock(paload.speakerUuid); }, + // トーク系 async audioQueryAudioQueryPost( payload: AudioQueryAudioQueryPostRequest, ): Promise { @@ -145,6 +148,7 @@ export function createOpenAPIEngineMock(): DefaultApiInterface { return new Blob([buffer], { type: "audio/wav" }); }, + // ソング系 async singFrameAudioQuerySingFrameAudioQueryPost( payload: SingFrameAudioQuerySingFrameAudioQueryPostRequest, ): Promise { @@ -200,14 +204,13 @@ export function createOpenAPIEngineMock(): DefaultApiInterface { }, // 辞書系 - async getUserDictWordsUserDictGet(): Promise<{ [key: string]: UserDictWord; }> { - // 空の辞書を返す + // ダミーで空の辞書を返す return {}; }, }; - return mockApi as DefaultApiInterface; + return mockApi satisfies Partial as DefaultApiInterface; } diff --git a/src/mock/engineMock/manifestMock.ts b/src/mock/engineMock/manifestMock.ts index 7d645e6855..4a32cf957b 100644 --- a/src/mock/engineMock/manifestMock.ts +++ b/src/mock/engineMock/manifestMock.ts @@ -2,6 +2,8 @@ * エンジンマニフェストのモック。 */ +import { EngineManifest } from "@/openapi"; + /** エンジンマニフェストを返すモック */ export function getEngineManifestMock() { return { @@ -30,5 +32,5 @@ export function getEngineManifestMock() { manageLibrary: false, returnResourceUrl: true, }, - }; + } satisfies EngineManifest; } diff --git a/src/mock/engineMock/singModelMock.ts b/src/mock/engineMock/singModelMock.ts index f1c583fe03..7005f8586e 100644 --- a/src/mock/engineMock/singModelMock.ts +++ b/src/mock/engineMock/singModelMock.ts @@ -1,6 +1,5 @@ /** * ソング系の構造体を作るモック。 - * 値は適当だが、テストで使えるよう決定論的に決まるようにしたり、UIのバグに気づけるようある程度規則を持たせている。 */ import { moraToPhonemes } from "./phonemeMock"; diff --git a/src/mock/engineMock/talkModelMock.ts b/src/mock/engineMock/talkModelMock.ts index 
8d952c6f84..bcaaac3920 100644 --- a/src/mock/engineMock/talkModelMock.ts +++ b/src/mock/engineMock/talkModelMock.ts @@ -1,35 +1,29 @@ /** - * ソング系の構造体を作るモック。 - * 値は適当だが、テストで使えるよう決定論的に決まるようにしたり、UIのバグに気づけるようある程度規則を持たせている。 + * トーク系の構造体を作るモック。 */ -import kuromoji, { IpadicFeatures, Tokenizer } from "kuromoji"; +import { builder, IpadicFeatures, Tokenizer } from "kuromoji"; import { moraToPhonemes } from "./phonemeMock"; import { moraPattern } from "@/domain/japanese"; import { AccentPhrase, Mora } from "@/openapi"; -import packageJson from "@/../package.json"; + +/** Nodeとして動いてほしいかを判定する */ +const isNode = + // window.documentがなければNode + typeof window == "undefined" || + typeof window.document == "undefined" || + // happy-domのときはNode + typeof (window as { happyDOM?: unknown }).happyDOM != "undefined"; let _tokenizer: Tokenizer | undefined; /** kuromoji用の辞書のパスを取得する */ function getDicPath() { - // ブラウザのときはCDNから辞書を取得し、Nodeのときはローカルから取得する - - const pathForBrowser = `https://cdn.jsdelivr.net/npm/kuromoji@${packageJson.devDependencies.kuromoji}/dict`; - const pathForNode = "node_modules/kuromoji/dict"; - - // window.documentがなければNode - if (typeof window == "undefined" || typeof window.document == "undefined") { - return pathForNode; + if (isNode) { + return "node_modules/kuromoji/dict"; + } else { + return "https://cdn.jsdelivr.net/npm/kuromoji@0.1.2/dict"; } - - // happy-domのときはNode - if (typeof (window as { happyDOM?: unknown }).happyDOM != "undefined") { - return pathForNode; - } - - // それ以外はブラウザ - return pathForBrowser; } /** テキストをトークン列に変換するトークナイザーを取得する */ @@ -39,16 +33,17 @@ async function createOrGetTokenizer() { } return new Promise>((resolve, reject) => { - kuromoji - .builder({ dicPath: getDicPath() }) - .build((err: Error, tokenizer: Tokenizer) => { - if (err) { - reject(err); - } else { - _tokenizer = tokenizer; - resolve(tokenizer); - } - }); + builder({ + dicPath: getDicPath(), + nodeOrBrowser: isNode ? 
"node" : "browser", + }).build((err: Error, tokenizer: Tokenizer) => { + if (err) { + reject(err); + } else { + _tokenizer = tokenizer; + resolve(tokenizer); + } + }); }); } From 6c600a60d1c767d9779e373e67c215f28551f9f9 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sun, 15 Dec 2024 23:09:14 +0900 Subject: [PATCH 7/8] =?UTF-8?q?README.md=E3=81=AE=E6=96=87=E8=A8=80?= =?UTF-8?q?=E3=82=92=E5=BE=AE=E8=AA=BF=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mock/engineMock/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mock/engineMock/README.md b/src/mock/engineMock/README.md index 98713b840d..166e4043db 100644 --- a/src/mock/engineMock/README.md +++ b/src/mock/engineMock/README.md @@ -9,7 +9,7 @@ 例:音量を下げると音声が小さくなる、音程と周波数が一致する、など。 -モックの実装は気軽に破壊的変更しても問題ありません。 +モックの実装は気軽に破壊的変更をしても問題ありません。 ## ビルド戦略 @@ -41,4 +41,4 @@ ## kuromoji.jsについて 本家kuromoji.jsはパス操作周りでエラーが起こるので、フォーク版を使っています。 -mock用途以外にkuromoji.jsを使う予定はなく、もし動かなくなった際は依存を切ることも検討します。 +mock用途以外にkuromoji.jsを使う予定はなく、もし動かなくなった際は依存を外すことも検討します。 From 6a2e5650fed516d1a4cc0e3db6bb746aaa04c4b7 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Sun, 22 Dec 2024 00:05:39 +0900 Subject: [PATCH 8/8] =?UTF-8?q?TODO=E3=82=B3=E3=83=A1=E3=83=B3=E3=83=88?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" | 1 + 1 file changed, 1 insertion(+) diff --git "a/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" "b/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" index acbe5560b5..8c8df39ee8 100644 --- "a/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" +++ 
"b/tests/e2e/browser/\343\202\271\343\202\257\343\203\252\343\203\274\343\203\263\343\202\267\343\203\247\343\203\203\343\203\210.spec.ts" @@ -20,6 +20,7 @@ let speakerImages: /** * 差し替え用の立ち絵・アイコンを取得する。 + * TODO: エンジンモックを使ってこのコードを削除する。 */ async function getSpeakerImages(): Promise< {