VOICEVOX · Hiroshiba · Nov 5, 2024 · Nov 4, 2024 · Nov 4, 2024 · Nov 5, 2024
@@ -25,6 +25,8 @@ import {
   EditorFrameAudioQueryKey,
   EditorFrameAudioQuery,
   TrackParameters,
+  SingingPitchKey,
+  SingingPitch,
 } from "./type";
 import {
   buildSongTrackAudioFileNameFromRawData,
@@ -129,14 +131,15 @@ type PhraseRenderContext = Readonly<{
 
+/**
+ * Identifiers of the phrase render stages.
+ * Each stage object carries one of these values in its `id` field.
+ */
 type PhraseRenderStageId =
   | "queryGeneration"
+  | "singingPitchGeneration"
   | "singingVolumeGeneration"
   | "singingVoiceSynthesis";
 
 /**
  * フレーズレンダリングのステージのインターフェイス。
  * フレーズレンダラー内で順に実行される。
  */
-type PhraseRenderBaseStage = Readonly<{
+type PhraseRenderStage = Readonly<{
   id: PhraseRenderStageId;
 
   /**
@@ -172,6 +175,20 @@ type QuerySource = Readonly<{
   keyRangeAdjustment: number;
 }>;
 
+/**
+ * Data required to generate a singing pitch.
+ *
+ * The whole object is passed to calculateHash to derive the SingingPitchKey.
+ * `queryForPitchGeneration` is the query the pitch is generated from.
+ */
+type SingingPitchSource = Readonly<{
+  engineId: EngineId;
+  engineFrameRate: number;
+  tpqn: number;
+  tempos: Tempo[];
+  firstRestDuration: number;
+  notes: Note[];
+  keyRangeAdjustment: number;
+  queryForPitchGeneration: EditorFrameAudioQuery;
+}>;
+
 /**
  * 歌唱ボリュームの生成に必要なデータ
  */
@@ -381,6 +398,13 @@ const calculateQueryKey = async (querySource: QuerySource) => {
   return EditorFrameAudioQueryKey(hash);
 };
 
+/**
+ * Derives the key of a singing pitch by hashing the data it is generated from.
+ *
+ * @param singingPitchSource Data the singing pitch is generated from.
+ * @returns A promise resolving to the SingingPitchKey built from the hash.
+ */
+const calculateSingingPitchKey = async (
+  singingPitchSource: SingingPitchSource,
+) => SingingPitchKey(await calculateHash(singingPitchSource));
+
 const calculateSingingVolumeKey = async (
   singingVolumeSource: SingingVolumeSource,
 ) => {
@@ -545,6 +569,7 @@ const sequences = new Map<SequenceId, Sequence & { trackId: TrackId }>();
 const animationTimer = new AnimationTimer();
 
 const queryCache = new Map<EditorFrameAudioQueryKey, EditorFrameAudioQuery>();
+// Cache of singing pitches keyed by SingingPitchKey. It is looked up before a
+// pitch is generated and filled right after generation.
+const singingPitchCache = new Map<SingingPitchKey, SingingPitch>();
 const singingVolumeCache = new Map<SingingVolumeKey, SingingVolume>();
 const singingVoiceCache = new Map<SingingVoiceKey, SingingVoice>();
 
@@ -745,6 +770,7 @@ export const singingStoreState: SingingStoreState = {
   editorFrameRate: DEPRECATED_DEFAULT_EDITOR_FRAME_RATE,
   phrases: new Map(),
   phraseQueries: new Map(),
+  phraseSingingPitches: new Map(),
   phraseSingingVolumes: new Map(),
   sequencerZoomX: 0.5,
   sequencerZoomY: 0.75,
@@ -1219,6 +1245,23 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
     },
   },
 
+  // Stores the given singing pitch key on a phrase; passing undefined clears it.
+  SET_SINGING_PITCH_KEY_TO_PHRASE: {
+    mutation(
+      state,
+      payload: {
+        phraseKey: PhraseKey;
+        singingPitchKey: SingingPitchKey | undefined;
+      },
+    ) {
+      const targetPhrase = getOrThrow(state.phrases, payload.phraseKey);
+      targetPhrase.singingPitchKey = payload.singingPitchKey;
+    },
+  },
+
   SET_SINGING_VOLUME_KEY_TO_PHRASE: {
     mutation(
       state,
@@ -1291,6 +1334,24 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
     },
   },
 
+  // Registers a singing pitch in the store state under its key.
+  SET_PHRASE_SINGING_PITCH: {
+    mutation(
+      state,
+      payload: { singingPitchKey: SingingPitchKey; singingPitch: SingingPitch },
+    ) {
+      const { singingPitchKey, singingPitch } = payload;
+      state.phraseSingingPitches.set(singingPitchKey, singingPitch);
+    },
+  },
+
+  // Removes the singing pitch stored under the given key from the store state.
+  DELETE_PHRASE_SINGING_PITCH: {
+    mutation(state, payload: { singingPitchKey: SingingPitchKey }) {
+      state.phraseSingingPitches.delete(payload.singingPitchKey);
+    },
+  },
+
   SET_PHRASE_SINGING_VOLUME: {
     mutation(
       state,
@@ -1809,9 +1870,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         }
       };
 
-      const generateQuery = async (
-        querySource: QuerySource,
-      ): Promise<EditorFrameAudioQuery> => {
+      const generateQuery = async (querySource: QuerySource) => {
         const notesForRequestToEngine = createNotesForRequestToEngine(
           querySource.firstRestDuration,
           lastRestDurationSeconds,
@@ -1836,7 +1895,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         return query;
       };
 
-      const queryGenerationStage: PhraseRenderBaseStage = {
+      const queryGenerationStage: PhraseRenderStage = {
         id: "queryGeneration",
         shouldBeExecuted: async (context: PhraseRenderContext) => {
           const track = getOrThrow(context.snapshot.tracks, context.trackId);
@@ -1877,7 +1936,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
           const phrase = getOrThrow(state.phrases, context.phraseKey);
           const phraseQueryKey = phrase.queryKey;
           if (phraseQueryKey != undefined) {
-            mutations.DELETE_PHRASE_QUERY({ queryKey: phraseQueryKey });
+            throw new Error("The previous query has not been removed.");
           }
           mutations.SET_PHRASE_QUERY({ queryKey, query });
           mutations.SET_QUERY_KEY_TO_PHRASE({
@@ -1887,6 +1946,97 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         },
       };
 
+      /**
+       * Collects the data needed to generate the singing pitch of the phrase
+       * referenced by the render context.
+       *
+       * @param context Render context identifying the snapshot, track and phrase.
+       * @returns The source data for singing pitch generation.
+       * @throws Error when the track has no singer or the phrase has no query key.
+       */
+      const generateSingingPitchSource = (
+        context: PhraseRenderContext,
+      ): SingingPitchSource => {
+        const { snapshot, trackId, phraseKey } = context;
+
+        const track = getOrThrow(snapshot.tracks, trackId);
+        const singer = track.singer;
+        if (singer == undefined) {
+          throw new Error("track.singer is undefined.");
+        }
+
+        const phrase = getOrThrow(state.phrases, phraseKey);
+        const queryKey = phrase.queryKey;
+        if (queryKey == undefined) {
+          throw new Error("phraseQueryKey is undefined.");
+        }
+        const sourceQuery = getOrThrow(state.phraseQueries, queryKey);
+
+        // TODO: apply the phoneme timing edit data
+        return {
+          engineId: singer.engineId,
+          engineFrameRate: sourceQuery.frameRate,
+          tpqn: snapshot.tpqn,
+          tempos: snapshot.tempos,
+          firstRestDuration: phrase.firstRestDuration,
+          notes: phrase.notes,
+          keyRangeAdjustment: track.keyRangeAdjustment,
+          // NOTE(review): cloneWithUnwrapProxy presumably returns a plain copy
+          // detached from the store's query object; confirm against the helper.
+          queryForPitchGeneration: cloneWithUnwrapProxy(sourceQuery),
+        };
+      };
+
+      /**
+       * Produces the singing pitch for the given source.
+       * For now it simply returns the f0 of the query held by the source.
+       *
+       * @param singingPitchSource Data the singing pitch is generated from.
+       * @returns A promise resolving to the f0 array of the source's query.
+       */
+      const generateSingingPitch = async (
+        singingPitchSource: SingingPitchSource,
+      ) => {
+        // TODO: support the pitch generation API
+        const { f0 } = singingPitchSource.queryForPitchGeneration;
+        return f0;
+      };
+
+      /**
+       * Render stage that generates the singing pitch of a phrase and links it
+       * to the phrase through its singing pitch key.
+       */
+      const singingPitchGenerationStage: PhraseRenderStage = {
+        id: "singingPitchGeneration",
+
+        // Runs when the track has a singer and the key stored on the phrase
+        // differs from the key computed from the current source data.
+        shouldBeExecuted: async (context: PhraseRenderContext) => {
+          const { singer } = getOrThrow(
+            context.snapshot.tracks,
+            context.trackId,
+          );
+          if (singer == undefined) {
+            return false;
+          }
+          const currentKey = getOrThrow(
+            state.phrases,
+            context.phraseKey,
+          ).singingPitchKey;
+          const expectedKey = await calculateSingingPitchKey(
+            generateSingingPitchSource(context),
+          );
+          // An undefined key never equals a computed key, so this comparison
+          // also covers the "nothing generated yet" case.
+          return currentKey !== expectedKey;
+        },
+
+        // Drops the stored singing pitch and clears the key on the phrase.
+        deleteExecutionResult: (context: PhraseRenderContext) => {
+          const staleKey = getOrThrow(
+            state.phrases,
+            context.phraseKey,
+          ).singingPitchKey;
+          if (staleKey == undefined) {
+            return;
+          }
+          mutations.DELETE_PHRASE_SINGING_PITCH({
+            singingPitchKey: staleKey,
+          });
+          mutations.SET_SINGING_PITCH_KEY_TO_PHRASE({
+            phraseKey: context.phraseKey,
+            singingPitchKey: undefined,
+          });
+        },
+
+        // Obtains the singing pitch (from the cache when available) and stores
+        // it together with its key. The previous result must already be deleted.
+        execute: async (context: PhraseRenderContext) => {
+          const source = generateSingingPitchSource(context);
+          const singingPitchKey = await calculateSingingPitchKey(source);
+
+          const cachedPitch = singingPitchCache.get(singingPitchKey);
+          let singingPitch: SingingPitch;
+          if (cachedPitch == undefined) {
+            singingPitch = await generateSingingPitch(source);
+            logger.info(`Generated singing pitch.`);
+            singingPitchCache.set(singingPitchKey, singingPitch);
+          } else {
+            singingPitch = cachedPitch;
+            logger.info(`Loaded singing pitch from cache.`);
+          }
+
+          const phrase = getOrThrow(state.phrases, context.phraseKey);
+          if (phrase.singingPitchKey != undefined) {
+            throw new Error("The previous singing pitch has not been removed.");
+          }
+          mutations.SET_PHRASE_SINGING_PITCH({ singingPitchKey, singingPitch });
+          mutations.SET_SINGING_PITCH_KEY_TO_PHRASE({
+            phraseKey: context.phraseKey,
+            singingPitchKey,
+          });
+        },
+      };
+
       const generateSingingVolumeSource = (
         context: PhraseRenderContext,
       ): SingingVolumeSource => {
@@ -1899,6 +2049,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         if (phraseQueryKey == undefined) {
           throw new Error("phraseQueryKey is undefined.");
         }
+        // TODO: ピッチ生成ステージで生成したピッチを使用するようにする
         const query = getOrThrow(state.phraseQueries, phraseQueryKey);
         const clonedQuery = cloneWithUnwrapProxy(query);
         applyPitchEdit(
@@ -1960,7 +2111,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         return singingVolume;
       };
 
-      const singingVolumeGenerationStage: PhraseRenderBaseStage = {
+      const singingVolumeGenerationStage: PhraseRenderStage = {
         id: "singingVolumeGeneration",
         shouldBeExecuted: async (context: PhraseRenderContext) => {
           const track = getOrThrow(context.snapshot.tracks, context.trackId);
@@ -2007,9 +2158,9 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
           const phrase = getOrThrow(state.phrases, context.phraseKey);
           const phraseSingingVolumeKey = phrase.singingVolumeKey;
           if (phraseSingingVolumeKey != undefined) {
-            mutations.DELETE_PHRASE_SINGING_VOLUME({
-              singingVolumeKey: phraseSingingVolumeKey,
-            });
+            throw new Error(
+              "The previous singing volume has not been removed.",
+            );
           }
           mutations.SET_PHRASE_SINGING_VOLUME({
             singingVolumeKey,
@@ -2087,7 +2238,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         }
       };
 
-      const singingVoiceSynthesisStage: PhraseRenderBaseStage = {
+      const singingVoiceSynthesisStage: PhraseRenderStage = {
         id: "singingVoiceSynthesis",
         shouldBeExecuted: async (context: PhraseRenderContext) => {
           const track = getOrThrow(context.snapshot.tracks, context.trackId);
@@ -2132,7 +2283,7 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
           const phrase = getOrThrow(state.phrases, context.phraseKey);
           const phraseSingingVoiceKey = phrase.singingVoiceKey;
           if (phraseSingingVoiceKey != undefined) {
-            phraseSingingVoices.delete(phraseSingingVoiceKey);
+            throw new Error("The previous singing voice has not been removed.");
           }
           phraseSingingVoices.set(singingVoiceKey, singingVoice);
           mutations.SET_SINGING_VOICE_KEY_TO_PHRASE({
@@ -2142,8 +2293,9 @@ export const singingStore = createPartialStore<SingingStoreTypes>({
         },
       };
 
-      const stages: readonly PhraseRenderBaseStage[] = [
+      // Phrase render stages, listed in order: query, pitch, volume, voice.
+      const stages: readonly PhraseRenderStage[] = [
         queryGenerationStage,
+        singingPitchGenerationStage,
         singingVolumeGenerationStage,
         singingVoiceSynthesisStage,
       ];

@@ -750,6 +750,11 @@ export type PhraseState =
  */
 export type EditorFrameAudioQuery = FrameAudioQuery & { frameRate: number };
 
+/**
+ * Singing pitch, represented as an array of numbers.
+ */
+export type SingingPitch = number[];
+
 /**
  * 歌唱ボリューム
  */
@@ -770,6 +775,11 @@ export const EditorFrameAudioQueryKey = (
   id: string,
 ): EditorFrameAudioQueryKey => editorFrameAudioQueryKeySchema.parse(id);
 
+/**
+ * Branded string key that identifies a singing pitch.
+ * The `SingingPitchKey` function parses the given id with the zod schema and
+ * returns it as the branded type.
+ */
+const singingPitchKeySchema = z.string().brand<"SingingPitchKey">();
+export type SingingPitchKey = z.infer<typeof singingPitchKeySchema>;
+export const SingingPitchKey = (id: string): SingingPitchKey =>
+  singingPitchKeySchema.parse(id);
+
 const singingVolumeKeySchema = z.string().brand<"SingingVolumeKey">();
 export type SingingVolumeKey = z.infer<typeof singingVolumeKeySchema>;
 export const SingingVolumeKey = (id: string): SingingVolumeKey =>
@@ -794,6 +804,7 @@ export type Phrase = {
   startTime: number;
   state: PhraseState;
   queryKey?: EditorFrameAudioQueryKey;
+  singingPitchKey?: SingingPitchKey;
   singingVolumeKey?: SingingVolumeKey;
   singingVoiceKey?: SingingVoiceKey;
   sequenceId?: SequenceId;
@@ -839,6 +850,7 @@ export type SingingStoreState = {
   editorFrameRate: number;
   phrases: Map<PhraseKey, Phrase>;
   phraseQueries: Map<EditorFrameAudioQueryKey, EditorFrameAudioQuery>;
+  phraseSingingPitches: Map<SingingPitchKey, SingingPitch>;
   phraseSingingVolumes: Map<SingingVolumeKey, SingingVolume>;
   sequencerZoomX: number;
   sequencerZoomY: number;
@@ -999,6 +1011,13 @@ export type SingingStoreTypes = {
     };
   };
 
+  // Payload: the phrase to update and the singing pitch key to store on it
+  // (undefined is allowed by the payload type).
+  SET_SINGING_PITCH_KEY_TO_PHRASE: {
+    mutation: {
+      phraseKey: PhraseKey;
+      singingPitchKey: SingingPitchKey | undefined;
+    };
+  };
+
   SET_SINGING_VOLUME_KEY_TO_PHRASE: {
     mutation: {
       phraseKey: PhraseKey;
@@ -1031,6 +1050,17 @@ export type SingingStoreTypes = {
     mutation: { queryKey: EditorFrameAudioQueryKey };
   };
 
+  // Payload: a singing pitch and the key it is associated with.
+  SET_PHRASE_SINGING_PITCH: {
+    mutation: {
+      singingPitchKey: SingingPitchKey;
+      singingPitch: SingingPitch;
+    };
+  };
+
+  // Payload: the key of the singing pitch to delete.
+  DELETE_PHRASE_SINGING_PITCH: {
+    mutation: { singingPitchKey: SingingPitchKey };
+  };
+
   SET_PHRASE_SINGING_VOLUME: {
     mutation: {
       singingVolumeKey: SingingVolumeKey;