
Commit

Merge pull request #512 from ai16z/fix/speech
fix: speech service fix
ponderingdemocritus authored Nov 22, 2024
2 parents a6c1b1b + 79f3ce4 commit dde12eb
Showing 5 changed files with 1,936 additions and 8,530 deletions.
5 changes: 0 additions & 5 deletions packages/client-discord/src/voice.ts
@@ -416,11 +416,6 @@ export class VoiceManager extends EventEmitter {
ServiceType.TRANSCRIPTION
);

console.log(
"transcriptionService: ",
transcriptionService
);

if (!transcriptionService) {
throw new Error(
"Transcription generation service not found"
1 change: 1 addition & 0 deletions packages/plugin-node/package.json
@@ -20,6 +20,7 @@
"cldr-segmentation": "2.2.1",
"command-exists": "1.2.9",
"csv-writer": "1.6.0",
"echogarden": "^2.0.5",
"espeak-ng": "1.0.2",
"ffmpeg-static": "5.2.0",
"fluent-ffmpeg": "2.1.3",
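The new `echogarden` dependency backs the local VITS fallback that `speech.ts` adds below. As a minimal sketch of the call shape used in this commit (the helper name `synthesizeLocally` is illustrative, not part of the diff; the engine and voice values are the ones the commit uses):

```ts
import { Readable } from "stream";
import * as Echogarden from "echogarden";

// Sketch of the local VITS synthesis path introduced in this commit.
// Echogarden.synthesize resolves with an "audio" value that is either a
// Buffer of encoded audio or a RawAudio object ({ audioChannels, sampleRate })
// holding 32-bit float samples.
async function synthesizeLocally(text: string): Promise<Readable> {
    const { audio } = await Echogarden.synthesize(text, {
        engine: "vits",
        voice: "en_US-hfc_female-medium",
    });

    if (audio instanceof Buffer) {
        return Readable.from(audio);
    }
    // RawAudio case: callers still need to convert the float samples to
    // 16-bit PCM and prepend a WAV header, as speech.ts does below.
    return Readable.from(Buffer.from(audio.audioChannels[0].buffer));
}
```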
342 changes: 226 additions & 116 deletions packages/plugin-node/src/services/speech.ts
@@ -1,14 +1,9 @@
import { PassThrough, Readable } from "stream";
import {
IAgentRuntime,
ISpeechService,
ITranscriptionService,
ServiceType,
} from "@ai16z/eliza";
import { IAgentRuntime, ISpeechService, ServiceType } from "@ai16z/eliza";
import { getWavHeader } from "./audioUtils.ts";
import { synthesize } from "../vendor/vits.ts";
import { Service } from "@ai16z/eliza";
import { validateNodeConfig } from "../enviroment.ts";
import * as Echogarden from "echogarden";

function prependWavHeader(
readable: Readable,
@@ -40,77 +35,141 @@ function prependWavHeader(

async function textToSpeech(runtime: IAgentRuntime, text: string) {
await validateNodeConfig(runtime);
const body = {
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
};
const options = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify(body),
};

const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
options
);

const status = response.status;
if (status != 200) {
console.log(`Received status ${status} from Eleven Labs API`);
const errorBodyString = await response.text();
throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
try {
const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify({
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting(
"ELEVENLABS_VOICE_STABILITY"
),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
}),
}
);
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").startsWith("pcm_")) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
const status = response.status;
if (status != 200) {
const errorBodyString = await response.text();
const errorBody = JSON.parse(errorBodyString);

// Check for quota exceeded error
if (
status === 401 &&
errorBody.detail?.status === "quota_exceeded"
) {
console.log("ElevenLabs quota exceeded, falling back to VITS");
throw new Error("QUOTA_EXCEEDED");
}

throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
);
return withHeader;
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (
runtime
.getSetting("ELEVENLABS_OUTPUT_FORMAT")
.startsWith("pcm_")
) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
);
return withHeader;
} else {
return readable;
}
} else {
return readable;
return new Readable({
read() {},
});
}
} catch (error) {
if (error.message === "QUOTA_EXCEEDED") {
// Fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
return wavStream;
}
} else {
return new Readable({
read() {},
});
throw error; // Re-throw other errors
}
}

@@ -124,53 +183,104 @@ export class SpeechService extends Service implements ISpeechService {
}

async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return textToSpeech(runtime, text);
}
const { audio } = await synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
try {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return await textToSpeech(runtime, text);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);
// Default to VITS if no ElevenLabs API key
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
return wavStream;
} catch (error) {
console.error("Speech generation error:", error);
// If ElevenLabs fails for any reason, fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
}
}
}
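Taken together, the `speech.ts` changes make ElevenLabs the primary path and Echogarden's VITS engine the fallback: a 401 response whose `detail.status` is `"quota_exceeded"` is converted into a `QUOTA_EXCEEDED` error and handled inside `textToSpeech`, and any other failure in `generate()` is caught and routed to VITS as well. A rough sketch of the resulting control flow, condensed from the diff (the `generateSpeech` wrapper and the `synthesizeLocally` helper from the sketch above are illustrative names, not part of the commit):

```ts
import { Readable } from "stream";
import { IAgentRuntime } from "@ai16z/eliza";

// Condensed, illustrative view of the fallback logic introduced in this commit.
// "textToSpeech" is the ElevenLabs streaming function shown in the diff;
// "synthesizeLocally" is the VITS sketch from the package.json note above.
async function generateSpeech(
    runtime: IAgentRuntime,
    text: string
): Promise<Readable> {
    try {
        if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
            // Primary path: stream audio from the ElevenLabs API.
            return await textToSpeech(runtime, text);
        }
        // No API key configured: go straight to local VITS.
        return await synthesizeLocally(text);
    } catch (error) {
        // Quota exhaustion (401 + detail.status === "quota_exceeded") and any
        // other ElevenLabs failure both end up on the local VITS path.
        console.error("Speech generation error:", error);
        return await synthesizeLocally(text);
    }
}
```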