Skip to content

Commit

Permalink
Merge pull request #60 from oxygen-dioxide/diffsinger
Browse files Browse the repository at this point in the history
Diffsinger
  • Loading branch information
oxygen-dioxide authored Aug 29, 2023
2 parents 90c0f52 + d7d7074 commit 91db461
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 55 deletions.
2 changes: 2 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public class DsConfig {
public int hiddenSize = 256;
public bool useKeyShiftEmbed = false;
public bool useSpeedEmbed = false;
public bool useEnergyEmbed = false;
public bool useBreathinessEmbed= false;
public AugmentationArgs augmentationArgs;
public string dur;
public string linguistic;
Expand Down
1 change: 0 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using System.IO;
using System.Linq;
using System.Text;
using System.Transactions;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;

Expand Down
82 changes: 49 additions & 33 deletions OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@ public class DiffSingerRenderer : IRenderer {
const float headMs = DiffSingerUtils.headMs;
const float tailMs = DiffSingerUtils.tailMs;
const string VELC = DiffSingerUtils.VELC;
const string ENE = DiffSingerUtils.ENE;
const string VoiceColorHeader = DiffSingerUtils.VoiceColorHeader;

static readonly HashSet<string> supportedExp = new HashSet<string>(){
Format.Ustx.DYN,
Format.Ustx.PITD,
Format.Ustx.GENC,
Format.Ustx.CLR,
Format.Ustx.BREC,
VELC,
ENE,
};

static readonly object lockObj = new object();
Expand Down Expand Up @@ -153,39 +156,8 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {

//speaker
if(singer.dsConfig.speakers != null) {
var speakers = singer.dsConfig.speakers;
var hiddenSize = singer.dsConfig.hiddenSize;
var speakerEmbeds = singer.getSpeakerEmbeds();
//get default speaker
var headDefaultSpk = speakers.IndexOf(phrase.phones[0].suffix);
var tailDefaultSpk = speakers.IndexOf(phrase.phones[^1].suffix);
var defaultSpkByFrame = Enumerable.Repeat(headDefaultSpk, headFrames).ToList();
defaultSpkByFrame.AddRange(Enumerable.Range(0, phrase.phones.Length)
.SelectMany(phIndex => Enumerable.Repeat(speakers.IndexOf(phrase.phones[phIndex].suffix), durations[phIndex+1])));
defaultSpkByFrame.AddRange(Enumerable.Repeat(tailDefaultSpk, tailFrames));
//get speaker curves
NDArray spkCurves = np.zeros<float>(totalFrames, speakers.Count);
foreach(var curve in phrase.curves) {
if(IsVoiceColorCurve(curve.Item1,out int subBankId) && subBankId < singer.Subbanks.Count) {
var spkId = speakers.IndexOf(singer.Subbanks[subBankId].Suffix);
spkCurves[":", spkId] = DiffSingerUtils.SampleCurve(phrase, curve.Item2, 0,
frameMs, totalFrames, headFrames, tailFrames, x => x * 0.01f)
.Select(f => (float)f).ToArray();
}
}
foreach(int frameId in Enumerable.Range(0,totalFrames)) {
//standarization
var spkSum = spkCurves[frameId, ":"].ToArray<float>().Sum();
if (spkSum > 1) {
spkCurves[frameId, ":"] /= spkSum;
} else {
spkCurves[frameId,defaultSpkByFrame[frameId]] += 1 - spkSum;
}
}
var spkEmbedResult = np.dot(spkCurves, speakerEmbeds.T);
var spkEmbedTensor = new DenseTensor<float>(spkEmbedResult.ToArray<float>(),
new int[] { totalFrames, hiddenSize })
.Reshape(new int[] { 1, totalFrames, hiddenSize });
var speakerEmbedManager = singer.getSpeakerEmbedManager();
var spkEmbedTensor = speakerEmbedManager.PhraseSpeakerEmbed(phrase, durations, frameMs, totalFrames, headFrames, tailFrames);
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor));
}
//gender
Expand Down Expand Up @@ -222,6 +194,36 @@ float[] InvokeDiffsinger(RenderPhrase phrase,int speedup) {
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("velocity", velocityTensor));
}

//Variance: Energy and Breathiness
if(singer.dsConfig.useBreathinessEmbed || singer.dsConfig.useEnergyEmbed){
var varianceResult = singer.getVariancePredictor().Process(phrase);
//TODO: let user edit variance curves
if(singer.dsConfig.useEnergyEmbed){
var energyCurve = phrase.curves.FirstOrDefault(curve => curve.Item1 == ENE);
IEnumerable<double> userEnergy;
if(energyCurve!=null){
userEnergy = DiffSingerUtils.SampleCurve(phrase, energyCurve.Item2,
0, frameMs, totalFrames, headFrames, tailFrames,
x => x);
} else{
userEnergy = Enumerable.Repeat(0d, totalFrames);
}
var energy = varianceResult.energy.Zip(userEnergy, (x,y)=>(float)Math.Min(x + y*12/100, 0)).ToArray();
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
new DenseTensor<float>(energy, new int[] { energy.Length })
.Reshape(new int[] { 1, energy.Length })));
}
if(singer.dsConfig.useBreathinessEmbed){
var userBreathiness = DiffSingerUtils.SampleCurve(phrase, phrase.breathiness,
0, frameMs, totalFrames, headFrames, tailFrames,
x => x);
var breathiness = varianceResult.breathiness.Zip(userBreathiness, (x,y)=>(float)Math.Min(x + y*12/100, 0)).ToArray();
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
new DenseTensor<float>(breathiness, new int[] { breathiness.Length })
.Reshape(new int[] { 1, breathiness.Length })));
}
}

Tensor<float> mel;
var acousticOutputs = singer.getAcousticSession().Run(acousticInputs);
mel = acousticOutputs.First().AsTensor<float>().Clone();
Expand All @@ -243,6 +245,7 @@ public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) {

public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings) {
var result = new List<UExpressionDescriptor> {
//velocity
new UExpressionDescriptor{
name="velocity (curve)",
abbr=VELC,
Expand All @@ -251,8 +254,19 @@ public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSe
max=200,
defaultValue=100,
isFlag=false,
},
//energy
new UExpressionDescriptor{
name="energy (curve)",
abbr=ENE,
type=UExpressionType.Curve,
min=-100,
max=100,
defaultValue=0,
isFlag=false,
}
};
//speakers
var dsSinger = singer as DiffSingerSinger;
if(dsSinger!=null && dsSinger.dsConfig.speakers != null) {
result.AddRange(Enumerable.Zip(
Expand All @@ -268,6 +282,8 @@ public UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSe
isFlag=false,
}));
}
//energy

return result.ToArray();
}

Expand Down
35 changes: 14 additions & 21 deletions OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ class DiffSingerSinger : USinger {
public InferenceSession acousticSession = null;
public DsVocoder vocoder = null;
public DsPitch pitchPredictor = null;
public NDArray speakerEmbeds = null;

public DiffSingerSpeakerEmbedManager speakerEmbedManager = null;
public DsVariance variancePredictor = null;

public DiffSingerSinger(Voicebank voicebank) {
this.voicebank = voicebank;
Expand Down Expand Up @@ -130,30 +130,23 @@ public DsPitch getPitchPredictor(){
}
return pitchPredictor;
}

public NDArray loadSpeakerEmbed(string speaker) {
string path = Path.Join(Location, speaker + ".emb");
if(File.Exists(path)) {
var reader = new BinaryReader(File.OpenRead(path));
return np.array<float>(Enumerable.Range(0, dsConfig.hiddenSize)
.Select(i => reader.ReadSingle()));
} else {
throw new Exception("Speaker embed file {path} not found");

public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
if(speakerEmbedManager is null) {
speakerEmbedManager = new DiffSingerSpeakerEmbedManager(dsConfig, Location);
}
return speakerEmbedManager;
}

public NDArray getSpeakerEmbeds() {
if(speakerEmbeds == null) {
if(dsConfig.speakers == null) {
return null;
} else {
speakerEmbeds = np.zeros<float>(dsConfig.hiddenSize, dsConfig.speakers.Count);
foreach(var spkId in Enumerable.Range(0, dsConfig.speakers.Count)) {
speakerEmbeds[":", spkId] = loadSpeakerEmbed(dsConfig.speakers[spkId]);
}
public DsVariance getVariancePredictor(){
if(variancePredictor is null) {
if(File.Exists(Path.Join(Location,"dsvariance", "dsconfig.yaml"))){
variancePredictor = new DsVariance(Path.Join(Location, "dsvariance"));
return variancePredictor;
}
variancePredictor = new DsVariance(Location);
}
return speakerEmbeds;
return variancePredictor;
}
}
}
104 changes: 104 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerSpeakerEmbedManager.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

using Microsoft.ML.OnnxRuntime.Tensors;
using NumSharp;

using OpenUtau.Core.Render;

namespace OpenUtau.Core.DiffSinger
{
    /// <summary>
    /// Loads the speaker embedding files listed in a <see cref="DsConfig"/> and mixes them,
    /// frame by frame, into a single speaker-embedding tensor for a phrase.
    /// </summary>
    public class DiffSingerSpeakerEmbedManager
    {
        readonly DsConfig dsConfig;
        readonly string rootPath;

        /// <summary>
        /// Cached embedding matrix of shape (hiddenSize, speakers.Count); one column per speaker.
        /// Stays null until <see cref="getSpeakerEmbeds"/> has loaded every speaker successfully.
        /// </summary>
        public NDArray speakerEmbeds = null;
        const string VoiceColorHeader = DiffSingerUtils.VoiceColorHeader;

        /// <summary>Creates a manager for the given configuration.</summary>
        /// <param name="dsConfig">Configuration providing <c>speakers</c> and <c>hiddenSize</c>.</param>
        /// <param name="rootPath">Directory in which the "&lt;speaker&gt;.emb" files are looked up.</param>
        public DiffSingerSpeakerEmbedManager(DsConfig dsConfig, string rootPath) {
            this.dsConfig = dsConfig;
            this.rootPath = rootPath;
        }

        /// <summary>
        /// Reads <c>hiddenSize</c> consecutive 32-bit floats from "&lt;rootPath&gt;/&lt;speaker&gt;.emb".
        /// </summary>
        /// <param name="speaker">Speaker name; ".emb" is appended to build the file name.</param>
        /// <returns>A one-dimensional array holding the values read from the file.</returns>
        /// <exception cref="FileNotFoundException">The embed file does not exist.</exception>
        public NDArray loadSpeakerEmbed(string speaker) {
            string path = Path.Join(rootPath, speaker + ".emb");
            if (!File.Exists(path)) {
                throw new FileNotFoundException($"Speaker embed file {path} not found", path);
            }
            // Dispose the reader (and its underlying file stream) once the values are read.
            using var reader = new BinaryReader(File.OpenRead(path));
            // Materialize before the reader is disposed; the lazy Select must not outlive it.
            IEnumerable<float> values = Enumerable.Range(0, dsConfig.hiddenSize)
                .Select(_ => reader.ReadSingle())
                .ToArray();
            return np.array<float>(values);
        }

        /// <summary>
        /// Returns the cached embedding matrix, loading every configured speaker on first use.
        /// </summary>
        /// <returns>
        /// A (hiddenSize, speakers.Count) matrix, or null when the configuration has no speakers.
        /// </returns>
        public NDArray getSpeakerEmbeds() {
            if (!(speakerEmbeds is null)) {
                return speakerEmbeds;
            }
            if (dsConfig.speakers == null) {
                return null;
            }
            // Fill a local matrix first so that a failed load does not leave a
            // half-initialized matrix cached for later calls.
            NDArray embeds = np.zeros<float>(dsConfig.hiddenSize, dsConfig.speakers.Count);
            for (int spkId = 0; spkId < dsConfig.speakers.Count; spkId++) {
                embeds[":", spkId] = loadSpeakerEmbed(dsConfig.speakers[spkId]);
            }
            speakerEmbeds = embeds;
            return speakerEmbeds;
        }

        /// <summary>
        /// Tests whether an expression abbreviation is <c>VoiceColorHeader</c> followed by an integer.
        /// </summary>
        /// <param name="abbr">Expression abbreviation to inspect.</param>
        /// <param name="subBankId">
        /// On success, the parsed number minus one (so it may be negative); otherwise 0.
        /// </param>
        /// <returns>True when <paramref name="abbr"/> has the header and a parsable number.</returns>
        public bool IsVoiceColorCurve(string abbr, out int subBankId) {
            subBankId = 0;
            if (abbr.StartsWith(VoiceColorHeader, StringComparison.Ordinal)
                && int.TryParse(abbr.Substring(VoiceColorHeader.Length), out subBankId)) {
                // The number in the abbreviation is 1-based; convert it to a 0-based index.
                subBankId -= 1;
                return true;
            }
            return false;
        }

        /// <summary>
        /// Looks up a suffix in <c>dsConfig.speakers</c>.
        /// </summary>
        /// <param name="suffix">Suffix to search for.</param>
        /// <returns>The index of the suffix, or 0 (the first speaker) when it is not listed.</returns>
        public int getSpeakerIndexBySuffix(string suffix) {
            var speakerIndex = dsConfig.speakers.IndexOf(suffix);
            return speakerIndex == -1 ? 0 : speakerIndex;
        }

        /// <summary>
        /// Builds the per-frame speaker embedding of a phrase by mixing the speaker embeddings
        /// with the weights given by the phrase's voice color curves.
        /// </summary>
        /// <param name="phrase">Phrase whose phonemes, curves and singer are used.</param>
        /// <param name="durations">
        /// Frame counts; phoneme <c>i</c> uses <c>durations[i + 1]</c>.
        /// NOTE(review): index 0 is presumably the head padding entry - confirm against the caller.
        /// </param>
        /// <param name="frameMs">Frame length, passed through to the curve sampler.</param>
        /// <param name="totalFrames">Number of rows in the result.</param>
        /// <param name="headFrames">Frames before the first phoneme; they use its default speaker.</param>
        /// <param name="tailFrames">Frames after the last phoneme; they use its default speaker.</param>
        /// <returns>A tensor of shape { 1, totalFrames, hiddenSize }.</returns>
        public Tensor<float> PhraseSpeakerEmbed(RenderPhrase phrase, IList<int> durations, float frameMs, int totalFrames, int headFrames, int tailFrames) {
            var singer = phrase.singer;
            var hiddenSize = dsConfig.hiddenSize;
            var speakerEmbeds = getSpeakerEmbeds();

            // Default speaker of every frame: head padding, each phoneme, then tail padding.
            var headDefaultSpk = getSpeakerIndexBySuffix(phrase.phones[0].suffix);
            var tailDefaultSpk = getSpeakerIndexBySuffix(phrase.phones[^1].suffix);
            var defaultSpkByFrame = Enumerable.Repeat(headDefaultSpk, headFrames).ToList();
            for (int phIndex = 0; phIndex < phrase.phones.Length; phIndex++) {
                var phoneSpk = getSpeakerIndexBySuffix(phrase.phones[phIndex].suffix);
                defaultSpkByFrame.AddRange(Enumerable.Repeat(phoneSpk, durations[phIndex + 1]));
            }
            defaultSpkByFrame.AddRange(Enumerable.Repeat(tailDefaultSpk, tailFrames));

            // Accumulate the user-drawn voice color curves into one weight column per speaker.
            NDArray spkCurves = np.zeros<float>(totalFrames, dsConfig.speakers.Count);
            foreach (var curve in phrase.curves) {
                // Reject negative ids (e.g. from "cl0") as well as ids past the last subbank.
                if (IsVoiceColorCurve(curve.Item1, out int subBankId)
                    && subBankId >= 0 && subBankId < singer.Subbanks.Count) {
                    var spkId = getSpeakerIndexBySuffix(singer.Subbanks[subBankId].Suffix);
                    spkCurves[":", spkId] += DiffSingerUtils.SampleCurve(phrase, curve.Item2, 0,
                        frameMs, totalFrames, headFrames, tailFrames, x => x * 0.01f)
                        .Select(f => (float)f).ToArray();
                }
            }

            // Standardization: make the weights of every frame sum to 1.
            for (int frameId = 0; frameId < totalFrames; frameId++) {
                var spkSum = spkCurves[frameId, ":"].ToArray<float>().Sum();
                if (spkSum > 1) {
                    // Too much weight: scale the whole row down.
                    spkCurves[frameId, ":"] /= spkSum;
                } else {
                    // Too little weight: give the remainder to the frame's default speaker.
                    spkCurves[frameId, defaultSpkByFrame[frameId]] += 1 - spkSum;
                }
            }

            // (totalFrames, speakers) x (speakers, hiddenSize) -> (totalFrames, hiddenSize)
            var spkEmbedResult = np.dot(spkCurves, speakerEmbeds.T);
            var spkEmbedTensor = new DenseTensor<float>(spkEmbedResult.ToArray<float>(),
                new int[] { totalFrames, hiddenSize })
                .Reshape(new int[] { 1, totalFrames, hiddenSize });
            return spkEmbedTensor;
        }
    }
}
1 change: 1 addition & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
namespace OpenUtau.Core.DiffSinger {
public static class DiffSingerUtils {
public const string VELC = "velc";
public const string ENE = "ene";
public const string VoiceColorHeader = "cl";
public const float headMs = 100;
public const float tailMs = 100;
Expand Down
Loading

0 comments on commit 91db461

Please sign in to comment.