From f2206a26f50277eb793fdef77b65feeaabb01790 Mon Sep 17 00:00:00 2001 From: oxygen-dioxide <54425948+oxygen-dioxide@users.noreply.github.com> Date: Tue, 29 Aug 2023 14:19:22 +0800 Subject: [PATCH 1/2] energy and breathiness support --- OpenUtau.Core/DiffSinger/DiffSingerConfig.cs | 2 + OpenUtau.Core/DiffSinger/DiffSingerPitch.cs | 1 - .../DiffSinger/DiffSingerRenderer.cs | 47 ++--- OpenUtau.Core/DiffSinger/DiffSingerSinger.cs | 35 ++-- .../DiffSingerSpeakerEmbedManager.cs | 104 +++++++++++ .../DiffSinger/DiffSingerVariance.cs | 174 ++++++++++++++++++ 6 files changed, 308 insertions(+), 55 deletions(-) create mode 100644 OpenUtau.Core/DiffSinger/DiffSingerSpeakerEmbedManager.cs create mode 100644 OpenUtau.Core/DiffSinger/DiffSingerVariance.cs diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs index 2e1ab100d..480eabb5a 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs @@ -23,6 +23,8 @@ public class DsConfig { public int hiddenSize = 256; public bool useKeyShiftEmbed = false; public bool useSpeedEmbed = false; + public bool useEnergyEmbed = false; + public bool useBreathinessEmbed= false; public AugmentationArgs augmentationArgs; public string dur; public string linguistic; diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs index 21991e373..c616fe04e 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs @@ -3,7 +3,6 @@ using System.IO; using System.Linq; using System.Text; -using System.Transactions; using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime.Tensors; diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 5b216faa5..33f08acf0 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -153,39 +153,8 @@ float[] 
InvokeDiffsinger(RenderPhrase phrase,int speedup) { //speaker if(singer.dsConfig.speakers != null) { - var speakers = singer.dsConfig.speakers; - var hiddenSize = singer.dsConfig.hiddenSize; - var speakerEmbeds = singer.getSpeakerEmbeds(); - //get default speaker - var headDefaultSpk = speakers.IndexOf(phrase.phones[0].suffix); - var tailDefaultSpk = speakers.IndexOf(phrase.phones[^1].suffix); - var defaultSpkByFrame = Enumerable.Repeat(headDefaultSpk, headFrames).ToList(); - defaultSpkByFrame.AddRange(Enumerable.Range(0, phrase.phones.Length) - .SelectMany(phIndex => Enumerable.Repeat(speakers.IndexOf(phrase.phones[phIndex].suffix), durations[phIndex+1]))); - defaultSpkByFrame.AddRange(Enumerable.Repeat(tailDefaultSpk, tailFrames)); - //get speaker curves - NDArray spkCurves = np.zeros(totalFrames, speakers.Count); - foreach(var curve in phrase.curves) { - if(IsVoiceColorCurve(curve.Item1,out int subBankId) && subBankId < singer.Subbanks.Count) { - var spkId = speakers.IndexOf(singer.Subbanks[subBankId].Suffix); - spkCurves[":", spkId] = DiffSingerUtils.SampleCurve(phrase, curve.Item2, 0, - frameMs, totalFrames, headFrames, tailFrames, x => x * 0.01f) - .Select(f => (float)f).ToArray(); - } - } - foreach(int frameId in Enumerable.Range(0,totalFrames)) { - //standarization - var spkSum = spkCurves[frameId, ":"].ToArray().Sum(); - if (spkSum > 1) { - spkCurves[frameId, ":"] /= spkSum; - } else { - spkCurves[frameId,defaultSpkByFrame[frameId]] += 1 - spkSum; - } - } - var spkEmbedResult = np.dot(spkCurves, speakerEmbeds.T); - var spkEmbedTensor = new DenseTensor(spkEmbedResult.ToArray(), - new int[] { totalFrames, hiddenSize }) - .Reshape(new int[] { 1, totalFrames, hiddenSize }); + var speakerEmbedManager = singer.getSpeakerEmbedManager(); + var spkEmbedTensor = speakerEmbedManager.PhraseSpeakerEmbed(phrase, durations, frameMs, totalFrames, headFrames, tailFrames); acousticInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor)); } //gender @@ -222,6 
+191,18 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) { acousticInputs.Add(NamedOnnxValue.CreateFromTensor("velocity", velocityTensor)); } + //Variance: Energy and Breathiness + if(singer.dsConfig.useBreathinessEmbed || singer.dsConfig.useEnergyEmbed){ + var varianceResult = singer.getVariancePredictor().Process(phrase); + //TODO: let user edit variance curves + if(singer.dsConfig.useEnergyEmbed){ + acousticInputs.Add(NamedOnnxValue.CreateFromTensor("energy", varianceResult.energy)); + } + if(singer.dsConfig.useBreathinessEmbed){ + acousticInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness", varianceResult.breathiness)); + } + } + Tensor mel; var acousticOutputs = singer.getAcousticSession().Run(acousticInputs); mel = acousticOutputs.First().AsTensor().Clone(); diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs index c1478c7c7..0d40b0f95 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs @@ -40,8 +40,8 @@ class DiffSingerSinger : USinger { public InferenceSession acousticSession = null; public DsVocoder vocoder = null; public DsPitch pitchPredictor = null; - public NDArray speakerEmbeds = null; - + public DiffSingerSpeakerEmbedManager speakerEmbedManager = null; + public DsVariance variancePredictor = null; public DiffSingerSinger(Voicebank voicebank) { this.voicebank = voicebank; @@ -130,30 +130,23 @@ public DsPitch getPitchPredictor(){ } return pitchPredictor; } - - public NDArray loadSpeakerEmbed(string speaker) { - string path = Path.Join(Location, speaker + ".emb"); - if(File.Exists(path)) { - var reader = new BinaryReader(File.OpenRead(path)); - return np.array(Enumerable.Range(0, dsConfig.hiddenSize) - .Select(i => reader.ReadSingle())); - } else { - throw new Exception("Speaker embed file {path} not found"); + + public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){ + if(speakerEmbedManager is null) { + 
speakerEmbedManager = new DiffSingerSpeakerEmbedManager(dsConfig, Location); } + return speakerEmbedManager; } - public NDArray getSpeakerEmbeds() { - if(speakerEmbeds == null) { - if(dsConfig.speakers == null) { - return null; - } else { - speakerEmbeds = np.zeros(dsConfig.hiddenSize, dsConfig.speakers.Count); - foreach(var spkId in Enumerable.Range(0, dsConfig.speakers.Count)) { - speakerEmbeds[":", spkId] = loadSpeakerEmbed(dsConfig.speakers[spkId]); - } + public DsVariance getVariancePredictor(){ + if(variancePredictor is null) { + if(File.Exists(Path.Join(Location,"dsvariance", "dsconfig.yaml"))){ + variancePredictor = new DsVariance(Path.Join(Location, "dsvariance")); + return variancePredictor; } + variancePredictor = new DsVariance(Location); } - return speakerEmbeds; + return variancePredictor; } } } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSpeakerEmbedManager.cs b/OpenUtau.Core/DiffSinger/DiffSingerSpeakerEmbedManager.cs new file mode 100644 index 000000000..3aa6d71ff --- /dev/null +++ b/OpenUtau.Core/DiffSinger/DiffSingerSpeakerEmbedManager.cs @@ -0,0 +1,104 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; + +using Microsoft.ML.OnnxRuntime.Tensors; +using NumSharp; + +using OpenUtau.Core.Render; + +namespace OpenUtau.Core.DiffSinger +{ + public class DiffSingerSpeakerEmbedManager + { + DsConfig dsConfig; + string rootPath; + public NDArray speakerEmbeds = null; + const string VoiceColorHeader = DiffSingerUtils.VoiceColorHeader; + + public DiffSingerSpeakerEmbedManager(DsConfig dsConfig, string rootPath) { + this.dsConfig = dsConfig; + this.rootPath = rootPath; + } + public NDArray loadSpeakerEmbed(string speaker) { + string path = Path.Join(rootPath, speaker + ".emb"); + if(File.Exists(path)) { + var reader = new BinaryReader(File.OpenRead(path)); + return np.array(Enumerable.Range(0, dsConfig.hiddenSize) + .Select(i => reader.ReadSingle())); + } else { + throw new Exception("Speaker embed file {path} not 
found"); + } + } + + public NDArray getSpeakerEmbeds() { + if(speakerEmbeds == null) { + if(dsConfig.speakers == null) { + return null; + } else { + speakerEmbeds = np.zeros(dsConfig.hiddenSize, dsConfig.speakers.Count); + foreach(var spkId in Enumerable.Range(0, dsConfig.speakers.Count)) { + speakerEmbeds[":", spkId] = loadSpeakerEmbed(dsConfig.speakers[spkId]); + } + } + } + return speakerEmbeds; + } + + public bool IsVoiceColorCurve(string abbr, out int subBankId) { + subBankId = 0; + if (abbr.StartsWith(VoiceColorHeader) && int.TryParse(abbr.Substring(2), out subBankId)) {; + subBankId -= 1; + return true; + } else { + return false; + } + } + + public int getSpeakerIndexBySuffix(string suffix){ + var speakerIndex = dsConfig.speakers.IndexOf(suffix); + if(speakerIndex == -1){ + speakerIndex = 0; + } + return speakerIndex; + } + + public Tensor PhraseSpeakerEmbed(RenderPhrase phrase, IList durations, float frameMs, int totalFrames, int headFrames, int tailFrames){ + var singer = phrase.singer; + var hiddenSize = dsConfig.hiddenSize; + var speakerEmbeds = getSpeakerEmbeds(); + //get default speaker for each phoneme + var headDefaultSpk = getSpeakerIndexBySuffix(phrase.phones[0].suffix); + var tailDefaultSpk = getSpeakerIndexBySuffix(phrase.phones[^1].suffix); + var defaultSpkByFrame = Enumerable.Repeat(headDefaultSpk, headFrames).ToList(); + defaultSpkByFrame.AddRange(Enumerable.Range(0, phrase.phones.Length) + .SelectMany(phIndex => Enumerable.Repeat(getSpeakerIndexBySuffix(phrase.phones[phIndex].suffix), durations[phIndex+1]))); + defaultSpkByFrame.AddRange(Enumerable.Repeat(tailDefaultSpk, tailFrames)); + //get speaker curves + NDArray spkCurves = np.zeros(totalFrames, dsConfig.speakers.Count); + foreach(var curve in phrase.curves) { + if(IsVoiceColorCurve(curve.Item1,out int subBankId) && subBankId < singer.Subbanks.Count) { + var spkId = getSpeakerIndexBySuffix(singer.Subbanks[subBankId].Suffix); + spkCurves[":", spkId] += DiffSingerUtils.SampleCurve(phrase, 
curve.Item2, 0, + frameMs, totalFrames, headFrames, tailFrames, x => x * 0.01f) + .Select(f => (float)f).ToArray(); + } + } + foreach(int frameId in Enumerable.Range(0,totalFrames)) { + //standardization + var spkSum = spkCurves[frameId, ":"].ToArray().Sum(); + if (spkSum > 1) { + spkCurves[frameId, ":"] /= spkSum; + } else { + spkCurves[frameId,defaultSpkByFrame[frameId]] += 1 - spkSum; + } + } + var spkEmbedResult = np.dot(spkCurves, speakerEmbeds.T); + var spkEmbedTensor = new DenseTensor(spkEmbedResult.ToArray(), + new int[] { totalFrames, hiddenSize }) + .Reshape(new int[] { 1, totalFrames, hiddenSize }); + return spkEmbedTensor; + } + } +} \ No newline at end of file diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs new file mode 100644 index 000000000..4c81cd4ba --- /dev/null +++ b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs @@ -0,0 +1,174 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; + +using Serilog; +using Microsoft.ML.OnnxRuntime; +using Microsoft.ML.OnnxRuntime.Tensors; + +using OpenUtau.Api; +using OpenUtau.Core.Render; +using OpenUtau.Core.Util; + +namespace OpenUtau.Core.DiffSinger{ + public struct VarianceResult{ + public Tensor energy; + public Tensor breathiness; + } + public class DsVariance{ + string rootPath; + DsConfig dsConfig; + List phonemes; + InferenceSession linguisticModel; + InferenceSession varianceModel; + IG2p g2p; + float frameMs; + const float headMs = DiffSingerUtils.headMs; + const float tailMs = DiffSingerUtils.tailMs; + DiffSingerSpeakerEmbedManager speakerEmbedManager; + + + public DsVariance(string rootPath) + { + this.rootPath = rootPath; + dsConfig = Yaml.DefaultDeserializer.Deserialize( + File.ReadAllText(Path.Combine(rootPath, "dsconfig.yaml"), + Encoding.UTF8)); + //Load phonemes list + string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes); + phonemes = File.ReadLines(phonemesPath, 
Encoding.UTF8).ToList(); + //Load models + var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic); + linguisticModel = Onnx.getInferenceSession(linguisticModelPath); + var varianceModelPath = Path.Join(rootPath, dsConfig.variance); + varianceModel = Onnx.getInferenceSession(varianceModelPath); + frameMs = 1000f * dsConfig.hop_size / dsConfig.sample_rate; + //Load g2p + g2p = LoadG2p(rootPath); + } + + protected IG2p LoadG2p(string rootPath) { + var g2ps = new List(); + // Load dictionary from singer folder. + string file = Path.Combine(rootPath, "dsdict.yaml"); + if (File.Exists(file)) { + try { + g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build()); + } catch (Exception e) { + Log.Error(e, $"Failed to load {file}"); + } + } + return new G2pFallbacks(g2ps.ToArray()); + } + + public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){ + if(speakerEmbedManager is null) { + speakerEmbedManager = new DiffSingerSpeakerEmbedManager(dsConfig, rootPath); + } + return speakerEmbedManager; + } + + public VarianceResult Process(RenderPhrase phrase){ + int headFrames = (int)Math.Round(headMs / frameMs); + int tailFrames = (int)Math.Round(tailMs / frameMs); + //Linguistic Encoder + var linguisticInputs = new List(); + var tokens = phrase.phones + .Select(p => (Int64)phonemes.IndexOf(p.phoneme)) + .Prepend((Int64)phonemes.IndexOf("SP")) + .Append((Int64)phonemes.IndexOf("SP")) + .ToArray(); + var ph_dur = phrase.phones + .Select(p => (int)Math.Round(p.endMs / frameMs) - (int)Math.Round(p.positionMs / frameMs))//prevent cumulative error + .Prepend(headFrames) + .Append(tailFrames) + .ToArray(); + int totalFrames = ph_dur.Sum(); + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("tokens", + new DenseTensor(tokens, new int[] { tokens.Length }, false) + .Reshape(new int[] { 1, tokens.Length }))); + if(dsConfig.predict_dur){ + //if predict_dur is true, use word encode mode + var vowelIds = Enumerable.Range(0,phrase.phones.Length) + 
.Where(i=>g2p.IsVowel(phrase.phones[i].phoneme)) + .ToArray(); + var word_div = vowelIds.Zip(vowelIds.Skip(1),(a,b)=>(Int64)(b-a)) + .Prepend(vowelIds[0] + 1) + .Append(phrase.phones.Length - vowelIds[^1] + 1) + .ToArray(); + var word_dur = vowelIds.Zip(vowelIds.Skip(1), + (a,b)=>(Int64)(phrase.phones[b-1].endMs/frameMs) - (Int64)(phrase.phones[a].positionMs/frameMs)) + .Prepend((Int64)(phrase.phones[vowelIds[0]].positionMs/frameMs) - (Int64)(phrase.phones[0].positionMs/frameMs) + headFrames) + .Append((Int64)(phrase.notes[^1].endMs/frameMs) - (Int64)(phrase.phones[vowelIds[^1]].positionMs/frameMs) + tailFrames) + .ToArray(); + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_div", + new DenseTensor(word_div, new int[] { word_div.Length }, false) + .Reshape(new int[] { 1, word_div.Length }))); + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur", + new DenseTensor(word_dur, new int[] { word_dur.Length }, false) + .Reshape(new int[] { 1, word_dur.Length }))); + }else{ + //if predict_dur is false, use phoneme encode mode + linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur", + new DenseTensor(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false) + .Reshape(new int[] { 1, ph_dur.Length }))); + } + + var linguisticOutputs = linguisticModel.Run(linguisticInputs); + Tensor encoder_out = linguisticOutputs + .Where(o => o.Name == "encoder_out") + .First() + .AsTensor(); + + //Variance Predictor + var pitch = DiffSingerUtils.SampleCurve(phrase, phrase.pitches, 0, frameMs, totalFrames, headFrames, tailFrames, + x => x * 0.01) + .Select(f => (float)f).ToArray(); + var energy = Enumerable.Repeat(0f, totalFrames).ToArray(); + var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray(); + var retake = Enumerable.Repeat(true, totalFrames*2).ToArray(); + var speedup = Preferences.Default.DiffsingerSpeedup; + + var varianceInputs = new List(); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out)); + 
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur", + new DenseTensor(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false) + .Reshape(new int[] { 1, ph_dur.Length }))); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("pitch", + new DenseTensor(pitch, new int[] { pitch.Length }, false) + .Reshape(new int[] { 1, totalFrames }))); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy", + new DenseTensor(energy, new int[] { energy.Length }, false) + .Reshape(new int[] { 1, totalFrames }))); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness", + new DenseTensor(breathiness, new int[] { breathiness.Length }, false) + .Reshape(new int[] { 1, totalFrames }))); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("retake", + new DenseTensor(retake, new int[] { retake.Length }, false) + .Reshape(new int[] { 1, totalFrames, 2 }))); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("speedup", + new DenseTensor(new long[] { speedup }, new int[] { 1 },false))); + //Speaker + if(dsConfig.speakers != null) { + var speakerEmbedManager = getSpeakerEmbedManager(); + var spkEmbedTensor = speakerEmbedManager.PhraseSpeakerEmbed(phrase, ph_dur, frameMs, totalFrames, headFrames, tailFrames); + varianceInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor)); + } + var varianceOutputs = varianceModel.Run(varianceInputs); + Tensor energy_pred = varianceOutputs + .Where(o => o.Name == "energy_pred") + .First() + .AsTensor(); + Tensor breathiness_pred = varianceOutputs + .Where(o => o.Name == "breathiness_pred") + .First() + .AsTensor(); + return new VarianceResult{ + energy = energy_pred, + breathiness = breathiness_pred + }; + } + } +} \ No newline at end of file From d7d7074ed65ecea749468b7b2a23af607804e35a Mon Sep 17 00:00:00 2001 From: oxygen-dioxide <54425948+oxygen-dioxide@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:48:38 +0800 Subject: [PATCH 2/2] energy and breathiness curve expression --- 
.../DiffSinger/DiffSingerRenderer.cs | 39 ++++++++++++++++++- OpenUtau.Core/DiffSinger/DiffSingerUtils.cs | 1 + .../DiffSinger/DiffSingerVariance.cs | 8 ++-- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 33f08acf0..585bbab17 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -19,6 +19,7 @@ public class DiffSingerRenderer : IRenderer { const float headMs = DiffSingerUtils.headMs; const float tailMs = DiffSingerUtils.tailMs; const string VELC = DiffSingerUtils.VELC; + const string ENE = DiffSingerUtils.ENE; const string VoiceColorHeader = DiffSingerUtils.VoiceColorHeader; static readonly HashSet supportedExp = new HashSet(){ @@ -26,7 +27,9 @@ public class DiffSingerRenderer : IRenderer { Format.Ustx.PITD, Format.Ustx.GENC, Format.Ustx.CLR, + Format.Ustx.BREC, VELC, + ENE, }; static readonly object lockObj = new object(); @@ -196,10 +199,28 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) { var varianceResult = singer.getVariancePredictor().Process(phrase); //TODO: let user edit variance curves if(singer.dsConfig.useEnergyEmbed){ - acousticInputs.Add(NamedOnnxValue.CreateFromTensor("energy", varianceResult.energy)); + var energyCurve = phrase.curves.FirstOrDefault(curve => curve.Item1 == ENE); + IEnumerable userEnergy; + if(energyCurve!=null){ + userEnergy = DiffSingerUtils.SampleCurve(phrase, energyCurve.Item2, + 0, frameMs, totalFrames, headFrames, tailFrames, + x => x); + } else{ + userEnergy = Enumerable.Repeat(0d, totalFrames); + } + var energy = varianceResult.energy.Zip(userEnergy, (x,y)=>(float)Math.Min(x + y*12/100, 0)).ToArray(); + acousticInputs.Add(NamedOnnxValue.CreateFromTensor("energy", + new DenseTensor(energy, new int[] { energy.Length }) + .Reshape(new int[] { 1, energy.Length }))); } if(singer.dsConfig.useBreathinessEmbed){ - 
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness", varianceResult.breathiness)); + var userBreathiness = DiffSingerUtils.SampleCurve(phrase, phrase.breathiness, + 0, frameMs, totalFrames, headFrames, tailFrames, + x => x); + var breathiness = varianceResult.breathiness.Zip(userBreathiness, (x,y)=>(float)Math.Min(x + y*12/100, 0)).ToArray(); + acousticInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness", + new DenseTensor(breathiness, new int[] { breathiness.Length }) + .Reshape(new int[] { 1, breathiness.Length }))); } } @@ -224,6 +245,7 @@ public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) { public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings) { var result = new List { + //velocity new UExpressionDescriptor{ name="velocity (curve)", abbr=VELC, @@ -232,8 +254,19 @@ public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSe max=200, defaultValue=100, isFlag=false, + }, + //energy + new UExpressionDescriptor{ + name="energy (curve)", + abbr=ENE, + type=UExpressionType.Curve, + min=-100, + max=100, + defaultValue=0, + isFlag=false, } }; + //speakers var dsSinger = singer as DiffSingerSinger; if(dsSinger!=null && dsSinger.dsConfig.speakers != null) { result.AddRange(Enumerable.Zip( @@ -249,6 +282,8 @@ public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSe isFlag=false, })); } + //energy + return result.ToArray(); } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs b/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs index 756d6a3f8..89cec5a91 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs @@ -4,6 +4,7 @@ namespace OpenUtau.Core.DiffSinger { public static class DiffSingerUtils { public const string VELC = "velc"; + public const string ENE = "ene"; public const string VoiceColorHeader = "cl"; public const float headMs = 100; public const float tailMs = 100; diff --git 
a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs index 4c81cd4ba..879b6f9c2 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs @@ -14,8 +14,8 @@ namespace OpenUtau.Core.DiffSinger{ public struct VarianceResult{ - public Tensor energy; - public Tensor breathiness; + public float[] energy; + public float[] breathiness; } public class DsVariance{ string rootPath; @@ -166,8 +166,8 @@ public VarianceResult Process(RenderPhrase phrase){ .First() .AsTensor(); return new VarianceResult{ - energy = energy_pred, - breathiness = breathiness_pred + energy = energy_pred.ToArray(), + breathiness = breathiness_pred.ToArray() }; } }