
Commit

Merge pull request #512 from ai16z/fix/speech
fix: speech service fix
ponderingdemocritus authored Nov 22, 2024
2 parents a6c1b1b + 79f3ce4 commit dde12eb
Showing 5 changed files with 1,936 additions and 8,530 deletions.
5 changes: 0 additions & 5 deletions packages/client-discord/src/voice.ts
@@ -416,11 +416,6 @@ export class VoiceManager extends EventEmitter {
ServiceType.TRANSCRIPTION
);

console.log(
"transcriptionService: ",
transcriptionService
);

if (!transcriptionService) {
throw new Error(
"Transcription generation service not found"
1 change: 1 addition & 0 deletions packages/plugin-node/package.json
@@ -20,6 +20,7 @@
"cldr-segmentation": "2.2.1",
"command-exists": "1.2.9",
"csv-writer": "1.6.0",
"echogarden": "^2.0.5",
"espeak-ng": "1.0.2",
"ffmpeg-static": "5.2.0",
"fluent-ffmpeg": "2.1.3",
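The new `echogarden` dependency backs the local VITS fallback that `speech.ts` adds below. As a minimal sketch of the call shape used in this commit (the helper name `synthesizeLocally` is illustrative, not part of the diff; the engine and voice values are the ones the commit uses):

```ts
import { Readable } from "stream";
import * as Echogarden from "echogarden";

// Sketch of the local VITS synthesis path introduced in this commit.
// Echogarden.synthesize resolves with an "audio" value that is either a
// Buffer of encoded audio or a RawAudio object ({ audioChannels, sampleRate })
// holding 32-bit float samples.
async function synthesizeLocally(text: string): Promise<Readable> {
    const { audio } = await Echogarden.synthesize(text, {
        engine: "vits",
        voice: "en_US-hfc_female-medium",
    });

    if (audio instanceof Buffer) {
        return Readable.from(audio);
    }
    // RawAudio case: callers still need to convert the float samples to
    // 16-bit PCM and prepend a WAV header, as speech.ts does below.
    return Readable.from(Buffer.from(audio.audioChannels[0].buffer));
}
```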
342 changes: 226 additions & 116 deletions packages/plugin-node/src/services/speech.ts
@@ -1,14 +1,9 @@
import { PassThrough, Readable } from "stream";
import {
IAgentRuntime,
ISpeechService,
ITranscriptionService,
ServiceType,
} from "@ai16z/eliza";
import { IAgentRuntime, ISpeechService, ServiceType } from "@ai16z/eliza";
import { getWavHeader } from "./audioUtils.ts";
import { synthesize } from "../vendor/vits.ts";
import { Service } from "@ai16z/eliza";
import { validateNodeConfig } from "../enviroment.ts";
import * as Echogarden from "echogarden";

function prependWavHeader(
readable: Readable,
@@ -40,77 +35,141 @@ function prependWavHeader(

async function textToSpeech(runtime: IAgentRuntime, text: string) {
await validateNodeConfig(runtime);
const body = {
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
};
const options = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify(body),
};

const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
options
);

const status = response.status;
if (status != 200) {
console.log(`Received status ${status} from Eleven Labs API`);
const errorBodyString = await response.text();
throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
try {
const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify({
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting(
"ELEVENLABS_VOICE_STABILITY"
),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
}),
}
);
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").startsWith("pcm_")) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
const status = response.status;
if (status != 200) {
const errorBodyString = await response.text();
const errorBody = JSON.parse(errorBodyString);

// Check for quota exceeded error
if (
status === 401 &&
errorBody.detail?.status === "quota_exceeded"
) {
console.log("ElevenLabs quota exceeded, falling back to VITS");
throw new Error("QUOTA_EXCEEDED");
}

throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
);
return withHeader;
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (
runtime
.getSetting("ELEVENLABS_OUTPUT_FORMAT")
.startsWith("pcm_")
) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
);
return withHeader;
} else {
return readable;
}
} else {
return readable;
return new Readable({
read() {},
});
}
} catch (error) {
if (error.message === "QUOTA_EXCEEDED") {
// Fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
return wavStream;
}
} else {
return new Readable({
read() {},
});
throw error; // Re-throw other errors
}
}

@@ -124,53 +183,104 @@ export class SpeechService extends Service implements ISpeechService {
}

async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return textToSpeech(runtime, text);
}
const { audio } = await synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
try {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return await textToSpeech(runtime, text);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);
// Default to VITS if no ElevenLabs API key
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
return wavStream;
} catch (error) {
console.error("Speech generation error:", error);
// If ElevenLabs fails for any reason, fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
}
}
}
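Taken together, the `speech.ts` changes make ElevenLabs the primary path and Echogarden's VITS engine the fallback: a 401 response whose `detail.status` is `"quota_exceeded"` is converted into a `QUOTA_EXCEEDED` error and handled inside `textToSpeech`, and any other failure in `generate()` is caught and routed to VITS as well. A rough sketch of the resulting control flow, condensed from the diff (the `generateSpeech` wrapper and the `synthesizeLocally` helper from the sketch above are illustrative names, not part of the commit):

```ts
import { Readable } from "stream";
import { IAgentRuntime } from "@ai16z/eliza";

// Condensed, illustrative view of the fallback logic introduced in this commit.
// "textToSpeech" is the ElevenLabs streaming function shown in the diff;
// "synthesizeLocally" is the VITS sketch from the package.json note above.
async function generateSpeech(
    runtime: IAgentRuntime,
    text: string
): Promise<Readable> {
    try {
        if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
            // Primary path: stream audio from the ElevenLabs API.
            return await textToSpeech(runtime, text);
        }
        // No API key configured: go straight to local VITS.
        return await synthesizeLocally(text);
    } catch (error) {
        // Quota exhaustion (401 + detail.status === "quota_exceeded") and any
        // other ElevenLabs failure both end up on the local VITS path.
        console.error("Speech generation error:", error);
        return await synthesizeLocally(text);
    }
}
```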