fix CI & add missing changesets (#914)

livekit · Oct 15, 2024 · c1f2674 · c1f2674
1 parent 5a0f994
commit c1f2674
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 20 deletions.
diff --git a/.changeset/light-tools-jump.md b/.changeset/light-tools-jump.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-azure": minor
+---
+
+Azure TTS Prosody SSML support #912
diff --git a/livekit-agents/livekit/agents/pipeline/pipeline_agent.py b/livekit-agents/livekit/agents/pipeline/pipeline_agent.py
@@ -718,7 +718,7 @@ def _commit_user_question_if_needed() -> None:
 
             if tool_calls:
                 extra_tools_messages.append(
-                    ChatMessage.create_tool_calls(tool_calls, content=collected_text)
+                    ChatMessage.create_tool_calls(tool_calls, text=collected_text)
                 )
                 extra_tools_messages.extend(tool_calls_results_msg)
 

diff --git a/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py b/livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py
@@ -17,9 +17,10 @@
 from dataclasses import dataclass
 from typing import Literal
 
-import azure.cognitiveservices.speech as speechsdk  # type: ignore
 from livekit.agents import tts, utils
 
+import azure.cognitiveservices.speech as speechsdk  # type: ignore
+
 AZURE_SAMPLE_RATE: int = 16000
 AZURE_BITS_PER_SAMPLE: int = 16
 AZURE_NUM_CHANNELS: int = 1
@@ -65,7 +66,13 @@ def validate(self) -> None:
                     "Prosody volume must be one of 'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud'"
                 )
 
-        if self.pitch and self.pitch not in ["x-low", "low", "medium", "high", "x-high"]:
+        if self.pitch and self.pitch not in [
+            "x-low",
+            "low",
+            "medium",
+            "high",
+            "x-high",
+        ]:
             raise ValueError(
                 "Prosody pitch must be one of 'x-low', 'low', 'medium', 'high', 'x-high'"
             )
@@ -153,25 +160,22 @@ async def _main_task(self):
             stream=stream_callback,
         )
 
-        def _create_ssml_text(text: str, opts: _TTSOptions) -> str:
-            ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{opts.language or "en-US"}">'
-            prosody_ssml = "<prosody"
-            if opts.prosody.rate:
-                prosody_ssml += f' rate="{opts.prosody.rate}"'
-            if opts.prosody.volume:
-                prosody_ssml += f' volume="{opts.prosody.volume}"'
-            if opts.prosody.pitch:
-                prosody_ssml += f' pitch="{opts.prosody.pitch}"'
-            prosody_ssml += ">"
-            ssml += prosody_ssml
-            ssml += text
-            ssml += "</prosody></speak>"
-            return ssml
-
         def _synthesize() -> speechsdk.SpeechSynthesisResult:
             if self._opts.prosody:
-                ssml_text = _create_ssml_text(self._text, self._opts)
-                return synthesizer.speak_ssml_async(ssml_text).get()
+                ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{self._opts.language or "en-US"}">'
+                prosody_ssml = "<prosody"
+                if self._opts.prosody.rate:
+                    prosody_ssml += f' rate="{self._opts.prosody.rate}"'
+                if self._opts.prosody.volume:
+                    prosody_ssml += f' volume="{self._opts.prosody.volume}"'
+                if self._opts.prosody.pitch:
+                    prosody_ssml += f' pitch="{self._opts.prosody.pitch}"'
+                prosody_ssml += ">"
+                ssml += prosody_ssml
+                ssml += self._text
+                ssml += "</prosody></speak>"
+                return synthesizer.speak_ssml_async(ssml).get()  # type: ignore
+
             return synthesizer.speak_text_async(self._text).get()  # type: ignore
 
         result = None