Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ソング:ピッチ生成ステージを追加 #2341

Merged
merged 3 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 165 additions & 13 deletions src/store/singing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import {
EditorFrameAudioQueryKey,
EditorFrameAudioQuery,
TrackParameters,
SingingPitchKey,
SingingPitch,
} from "./type";
import {
buildSongTrackAudioFileNameFromRawData,
Expand Down Expand Up @@ -129,14 +131,15 @@ type PhraseRenderContext = Readonly<{

/**
 * Identifiers of the phrase rendering stages.
 * Each stage object carries exactly one of these values in its `id` field.
 */
type PhraseRenderStageId =
| "queryGeneration"
| "singingPitchGeneration"
| "singingVolumeGeneration"
| "singingVoiceSynthesis";

/**
* フレーズレンダリングのステージのインターフェイス。
* フレーズレンダラー内で順に実行される。
*/
type PhraseRenderBaseStage = Readonly<{
type PhraseRenderStage = Readonly<{
id: PhraseRenderStageId;

/**
Expand Down Expand Up @@ -172,6 +175,20 @@ type QuerySource = Readonly<{
keyRangeAdjustment: number;
}>;

/**
 * Data required to generate the singing pitch.
 * The whole object is hashed to derive the singing pitch key, so any change
 * to one of these fields results in a different key.
 */
type SingingPitchSource = Readonly<{
engineId: EngineId;
engineFrameRate: number;
tpqn: number;
tempos: Tempo[];
firstRestDuration: number;
notes: Note[];
keyRangeAdjustment: number;
// Query handed to pitch generation (a clone of the phrase's generated query).
queryForPitchGeneration: EditorFrameAudioQuery;
}>;

/**
* 歌唱ボリュームの生成に必要なデータ
*/
Expand Down Expand Up @@ -381,6 +398,13 @@ const calculateQueryKey = async (querySource: QuerySource) => {
return EditorFrameAudioQueryKey(hash);
};

/**
 * Derives the singing pitch key from the pitch source data by hashing the
 * source and wrapping the hash as a `SingingPitchKey`.
 */
const calculateSingingPitchKey = async (
  singingPitchSource: SingingPitchSource,
) => SingingPitchKey(await calculateHash(singingPitchSource));

const calculateSingingVolumeKey = async (
singingVolumeSource: SingingVolumeSource,
) => {
Expand Down Expand Up @@ -545,6 +569,7 @@ const sequences = new Map<SequenceId, Sequence & { trackId: TrackId }>();
const animationTimer = new AnimationTimer();

const queryCache = new Map<EditorFrameAudioQueryKey, EditorFrameAudioQuery>();
const singingPitchCache = new Map<SingingPitchKey, SingingPitch>();
const singingVolumeCache = new Map<SingingVolumeKey, SingingVolume>();
const singingVoiceCache = new Map<SingingVoiceKey, SingingVoice>();

Expand Down Expand Up @@ -745,6 +770,7 @@ export const singingStoreState: SingingStoreState = {
editorFrameRate: DEPRECATED_DEFAULT_EDITOR_FRAME_RATE,
phrases: new Map(),
phraseQueries: new Map(),
phraseSingingPitches: new Map(),
phraseSingingVolumes: new Map(),
sequencerZoomX: 0.5,
sequencerZoomY: 0.75,
Expand Down Expand Up @@ -1219,6 +1245,23 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
},
},

/**
 * Sets the singing pitch key of a phrase; passing `undefined` clears it.
 * The phrase is looked up with `getOrThrow`.
 */
SET_SINGING_PITCH_KEY_TO_PHRASE: {
  mutation(
    state,
    payload: {
      phraseKey: PhraseKey;
      singingPitchKey: SingingPitchKey | undefined;
    },
  ) {
    const targetPhrase = getOrThrow(state.phrases, payload.phraseKey);
    targetPhrase.singingPitchKey = payload.singingPitchKey;
  },
},

SET_SINGING_VOLUME_KEY_TO_PHRASE: {
mutation(
state,
Expand Down Expand Up @@ -1291,6 +1334,24 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
},
},

/**
 * Stores a singing pitch in the state under the given key.
 */
SET_PHRASE_SINGING_PITCH: {
  mutation(
    state,
    payload: { singingPitchKey: SingingPitchKey; singingPitch: SingingPitch },
  ) {
    const { singingPitchKey: key, singingPitch: pitch } = payload;
    state.phraseSingingPitches.set(key, pitch);
  },
},

/**
 * Removes the singing pitch stored under the given key from the state.
 */
DELETE_PHRASE_SINGING_PITCH: {
  mutation(state, payload: { singingPitchKey: SingingPitchKey }) {
    const keyToRemove = payload.singingPitchKey;
    state.phraseSingingPitches.delete(keyToRemove);
  },
},

SET_PHRASE_SINGING_VOLUME: {
mutation(
state,
Expand Down Expand Up @@ -1809,9 +1870,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
}
};

const generateQuery = async (
querySource: QuerySource,
): Promise<EditorFrameAudioQuery> => {
const generateQuery = async (querySource: QuerySource) => {
const notesForRequestToEngine = createNotesForRequestToEngine(
querySource.firstRestDuration,
lastRestDurationSeconds,
Expand All @@ -1836,7 +1895,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
return query;
};

const queryGenerationStage: PhraseRenderBaseStage = {
const queryGenerationStage: PhraseRenderStage = {
id: "queryGeneration",
shouldBeExecuted: async (context: PhraseRenderContext) => {
const track = getOrThrow(context.snapshot.tracks, context.trackId);
Expand Down Expand Up @@ -1877,7 +1936,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
const phrase = getOrThrow(state.phrases, context.phraseKey);
const phraseQueryKey = phrase.queryKey;
if (phraseQueryKey != undefined) {
mutations.DELETE_PHRASE_QUERY({ queryKey: phraseQueryKey });
throw new Error("The previous query has not been removed.");
}
mutations.SET_PHRASE_QUERY({ queryKey, query });
mutations.SET_QUERY_KEY_TO_PHRASE({
Expand All @@ -1887,6 +1946,97 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
},
};

/**
 * Builds the source data used to generate (and to key) the singing pitch of
 * the phrase referenced by the render context.
 *
 * The track, tpqn and tempos come from the context snapshot. The phrase and
 * its generated query come from the store state. The query is cloned with
 * `cloneWithUnwrapProxy` before being placed in the result.
 *
 * @param context Render context identifying the track and phrase.
 * @returns The source data for singing pitch generation.
 * @throws Error if the track has no singer or the phrase has no query key.
 *   Lookups go through `getOrThrow`, which is expected to throw on a missing
 *   key.
 */
const generateSingingPitchSource = (
  context: PhraseRenderContext,
): SingingPitchSource => {
  const track = getOrThrow(context.snapshot.tracks, context.trackId);
  if (track.singer == undefined) {
    throw new Error("track.singer is undefined.");
  }
  const phrase = getOrThrow(state.phrases, context.phraseKey);
  const phraseQueryKey = phrase.queryKey;
  if (phraseQueryKey == undefined) {
    throw new Error("phraseQueryKey is undefined.");
  }
  const query = getOrThrow(state.phraseQueries, phraseQueryKey);
  const clonedQuery = cloneWithUnwrapProxy(query);
  // TODO: apply the phoneme timing edit data
  return {
    engineId: track.singer.engineId,
    engineFrameRate: query.frameRate,
    tpqn: context.snapshot.tpqn,
    tempos: context.snapshot.tempos,
    firstRestDuration: phrase.firstRestDuration,
    notes: phrase.notes,
    keyRangeAdjustment: track.keyRangeAdjustment,
    queryForPitchGeneration: clonedQuery,
  };
};

/**
 * Produces the singing pitch for the given source data.
 * For now this returns the `f0` already contained in the source's query.
 */
const generateSingingPitch = async (
  singingPitchSource: SingingPitchSource,
) => {
  // TODO: support the pitch generation API
  const { queryForPitchGeneration } = singingPitchSource;
  return queryForPitchGeneration.f0;
};
Comment on lines +1976 to +1981
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ひとまずクエリ生成ステージで生成済みのピッチを返すようにしています。

Copy link
Member

@Hiroshiba Hiroshiba Nov 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

あっ ピッチ生成APIがまだないですね!!!!!

見てみたのですが、すでにコアにはピッチ生成機能があり、エンジンもコアのその機能の接続まではできてたのですが、WEB APIとして実装されてない雰囲気でした!!
実装までの道を確認しつつ、流れを書いてみたのでとりあえずメモまで!!


たぶん手順4つ(+オプション1つ)で実装が可能そう!

  1. tts_engine.py内のcreate_sing_volume_from_phoneme_and_f0のようにcreate_sing_f0_from_phonemeを作る。
    create_sing_volume_from_phoneme_and_f0
    https://github.com/VOICEVOX/voicevox_engine/blob/9506de28639a067cf0540aa1725586d15c9bcf2c/voicevox_engine/tts_pipeline/tts_engine.py#L657
    (ファイル名がtts_engine.pyですが、sing APIのための関数もここに含まれてます。。)

  2. sing_frame_volume WEB APIのようにsing_frame_f0 WEB APIを作る。
    sing_frame_volume
    https://github.com/VOICEVOX/voicevox_engine/blob/9506de28639a067cf0540aa1725586d15c9bcf2c/voicevox_engine/app/routers/tts_pipeline.py#L419

  3. いい感じにテストも作る。(たぶん結構コピペで作れる・・・はず)
    テストによってはスナップショットの更新も必要なはず。
    CONTRIBUTING.mdの↓のとこに書いてます!
    https://github.com/VOICEVOX/voicevox_engine/blob/9506de28639a067cf0540aa1725586d15c9bcf2c/CONTRIBUTING.md#L316-L317

  4. エディタ側のOpenAPI更新。

  5. (オプション)プレビュー版リリース作成(エディタ側のNightlyビルドに必要)
    これはヒホのタスク。Github Actions回すだけなはず。


/**
 * Phrase render stage that generates the singing pitch of a phrase and
 * records it in the store, reusing a cached pitch when one exists for the
 * same key.
 */
const singingPitchGenerationStage: PhraseRenderStage = {
  id: "singingPitchGeneration",

  // Needs to run when the phrase has no pitch key yet, or when its key
  // differs from the key computed from the current source data.
  // Never runs for a track without a singer.
  shouldBeExecuted: async (context: PhraseRenderContext) => {
    const { singer } = getOrThrow(context.snapshot.tracks, context.trackId);
    if (singer == undefined) {
      return false;
    }
    // Read the phrase's current key before awaiting the hash.
    const currentKey = getOrThrow(
      state.phrases,
      context.phraseKey,
    ).singingPitchKey;
    const expectedKey = await calculateSingingPitchKey(
      generateSingingPitchSource(context),
    );
    return !(currentKey != undefined && currentKey === expectedKey);
  },

  // Drops the stored pitch of the phrase (if any) and clears the phrase's key.
  deleteExecutionResult: (context: PhraseRenderContext) => {
    const staleKey = getOrThrow(
      state.phrases,
      context.phraseKey,
    ).singingPitchKey;
    if (staleKey == undefined) {
      return;
    }
    mutations.DELETE_PHRASE_SINGING_PITCH({ singingPitchKey: staleKey });
    mutations.SET_SINGING_PITCH_KEY_TO_PHRASE({
      phraseKey: context.phraseKey,
      singingPitchKey: undefined,
    });
  },

  // Obtains the pitch (from cache or by generating it) and stores it.
  // Throws if the previous result was not deleted beforehand.
  execute: async (context: PhraseRenderContext) => {
    const source = generateSingingPitchSource(context);
    const singingPitchKey = await calculateSingingPitchKey(source);

    const cachedPitch = singingPitchCache.get(singingPitchKey);
    let singingPitch: SingingPitch;
    if (cachedPitch == undefined) {
      singingPitch = await generateSingingPitch(source);
      logger.info(`Generated singing pitch.`);
      singingPitchCache.set(singingPitchKey, singingPitch);
    } else {
      singingPitch = cachedPitch;
      logger.info(`Loaded singing pitch from cache.`);
    }

    const existingKey = getOrThrow(
      state.phrases,
      context.phraseKey,
    ).singingPitchKey;
    if (existingKey != undefined) {
      throw new Error("The previous singing pitch has not been removed.");
    }
    mutations.SET_PHRASE_SINGING_PITCH({ singingPitchKey, singingPitch });
    mutations.SET_SINGING_PITCH_KEY_TO_PHRASE({
      phraseKey: context.phraseKey,
      singingPitchKey,
    });
  },
};

const generateSingingVolumeSource = (
context: PhraseRenderContext,
): SingingVolumeSource => {
Expand All @@ -1899,6 +2049,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
if (phraseQueryKey == undefined) {
throw new Error("phraseQueryKey is undefined.");
}
// TODO: ピッチ生成ステージで生成したピッチを使用するようにする
const query = getOrThrow(state.phraseQueries, phraseQueryKey);
const clonedQuery = cloneWithUnwrapProxy(query);
applyPitchEdit(
Expand Down Expand Up @@ -1960,7 +2111,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
return singingVolume;
};

const singingVolumeGenerationStage: PhraseRenderBaseStage = {
const singingVolumeGenerationStage: PhraseRenderStage = {
id: "singingVolumeGeneration",
shouldBeExecuted: async (context: PhraseRenderContext) => {
const track = getOrThrow(context.snapshot.tracks, context.trackId);
Expand Down Expand Up @@ -2007,9 +2158,9 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
const phrase = getOrThrow(state.phrases, context.phraseKey);
const phraseSingingVolumeKey = phrase.singingVolumeKey;
if (phraseSingingVolumeKey != undefined) {
mutations.DELETE_PHRASE_SINGING_VOLUME({
singingVolumeKey: phraseSingingVolumeKey,
});
throw new Error(
"The previous singing volume has not been removed.",
);
}
mutations.SET_PHRASE_SINGING_VOLUME({
singingVolumeKey,
Expand Down Expand Up @@ -2087,7 +2238,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
}
};

const singingVoiceSynthesisStage: PhraseRenderBaseStage = {
const singingVoiceSynthesisStage: PhraseRenderStage = {
id: "singingVoiceSynthesis",
shouldBeExecuted: async (context: PhraseRenderContext) => {
const track = getOrThrow(context.snapshot.tracks, context.trackId);
Expand Down Expand Up @@ -2132,7 +2283,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
const phrase = getOrThrow(state.phrases, context.phraseKey);
const phraseSingingVoiceKey = phrase.singingVoiceKey;
if (phraseSingingVoiceKey != undefined) {
phraseSingingVoices.delete(phraseSingingVoiceKey);
throw new Error("The previous singing voice has not been removed.");
}
phraseSingingVoices.set(singingVoiceKey, singingVoice);
mutations.SET_SINGING_VOICE_KEY_TO_PHRASE({
Expand All @@ -2142,8 +2293,9 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
},
};

const stages: readonly PhraseRenderBaseStage[] = [
const stages: readonly PhraseRenderStage[] = [
queryGenerationStage,
singingPitchGenerationStage,
singingVolumeGenerationStage,
singingVoiceSynthesisStage,
];
Expand Down
30 changes: 30 additions & 0 deletions src/store/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,11 @@ export type PhraseState =
*/
export type EditorFrameAudioQuery = FrameAudioQuery & { frameRate: number };

/**
 * Singing pitch.
 * NOTE(review): currently filled from the `f0` array of the generated query;
 * presumably one value per frame — confirm.
 */
export type SingingPitch = number[];

/**
* 歌唱ボリューム
*/
Expand All @@ -770,6 +775,11 @@ export const EditorFrameAudioQueryKey = (
id: string,
): EditorFrameAudioQueryKey => editorFrameAudioQueryKeySchema.parse(id);

/**
 * Branded string key that identifies a singing pitch.
 * The constructor function parses the given string with the zod schema.
 */
const singingPitchKeySchema = z.string().brand<"SingingPitchKey">();
export type SingingPitchKey = z.infer<typeof singingPitchKeySchema>;
export const SingingPitchKey = (id: string): SingingPitchKey =>
singingPitchKeySchema.parse(id);

const singingVolumeKeySchema = z.string().brand<"SingingVolumeKey">();
export type SingingVolumeKey = z.infer<typeof singingVolumeKeySchema>;
export const SingingVolumeKey = (id: string): SingingVolumeKey =>
Expand All @@ -794,6 +804,7 @@ export type Phrase = {
startTime: number;
state: PhraseState;
queryKey?: EditorFrameAudioQueryKey;
singingPitchKey?: SingingPitchKey;
singingVolumeKey?: SingingVolumeKey;
singingVoiceKey?: SingingVoiceKey;
sequenceId?: SequenceId;
Expand Down Expand Up @@ -839,6 +850,7 @@ export type SingingStoreState = {
editorFrameRate: number;
phrases: Map<PhraseKey, Phrase>;
phraseQueries: Map<EditorFrameAudioQueryKey, EditorFrameAudioQuery>;
phraseSingingPitches: Map<SingingPitchKey, SingingPitch>;
phraseSingingVolumes: Map<SingingVolumeKey, SingingVolume>;
sequencerZoomX: number;
sequencerZoomY: number;
Expand Down Expand Up @@ -999,6 +1011,13 @@ export type SingingStoreTypes = {
};
};

// Payload of the mutation that sets the singing pitch key of a phrase
// (`undefined` clears it).
SET_SINGING_PITCH_KEY_TO_PHRASE: {
mutation: {
phraseKey: PhraseKey;
singingPitchKey: SingingPitchKey | undefined;
};
};

SET_SINGING_VOLUME_KEY_TO_PHRASE: {
mutation: {
phraseKey: PhraseKey;
Expand Down Expand Up @@ -1031,6 +1050,17 @@ export type SingingStoreTypes = {
mutation: { queryKey: EditorFrameAudioQueryKey };
};

// Payload of the mutation that stores a singing pitch under its key.
SET_PHRASE_SINGING_PITCH: {
mutation: {
singingPitchKey: SingingPitchKey;
singingPitch: SingingPitch;
};
};

// Payload of the mutation that removes the singing pitch stored under a key.
DELETE_PHRASE_SINGING_PITCH: {
mutation: { singingPitchKey: SingingPitchKey };
};

SET_PHRASE_SINGING_VOLUME: {
mutation: {
singingVolumeKey: SingingVolumeKey;
Expand Down
Loading