Python: Text to audio #9625

Merged: 9 commits, Nov 12, 2024
Changes from 7 commits
2 changes: 2 additions & 0 deletions .github/workflows/python-integration-tests.yml
@@ -65,6 +65,7 @@ jobs:
 AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
 AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
 AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
 AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
 AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
 BING_API_KEY: ${{ secrets.BING_API_KEY }}
@@ -234,6 +235,7 @@ jobs:
 AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
 AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
 AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
 AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
 AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
 BING_API_KEY: ${{ secrets.BING_API_KEY }}
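For context, a minimal sketch (not part of this diff) of how the new secret is consumed: the AzureTextToAudio service used in the samples below resolves its endpoint, deployment, and API key from the environment when constructed without arguments, which is why the workflow only needs to export the variable.

# Minimal sketch, assuming env-based settings resolution as the samples rely on it.
import os

from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio

# In CI, the workflow above exports the secret as an environment variable.
print("AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT" in os.environ)

# No explicit arguments: configuration is read from env vars (or a .env file).
text_to_audio_service = AzureTextToAudio()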
python/samples/concepts/audio/01-chat_with_audio_input.py
@@ -4,7 +4,7 @@
 import logging
 import os
 
-from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
+from samples.concepts.audio.audio_recorder import AudioRecorder
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
 from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
     OpenAIChatPromptExecutionSettings,
@@ -17,6 +17,10 @@
 # to create a chat bot that can communicate with the user using audio input.
 # The user can engage in a long conversation with the chat bot by speaking to it.
 
+# Resources required for this sample:
+# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
+# 2. An Azure Speech to Text deployment (e.g. whisper).
+
 # Additional dependencies required for this sample:
 # - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
 # - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
97 changes: 97 additions & 0 deletions python/samples/concepts/audio/02-chat_with_audio_output.py
@@ -0,0 +1,97 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
    OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
# to create a chat bot that can communicate with the user using audio output.
# The chatbot will engage in a conversation with the user and respond using audio output.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)

system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    AudioPlayer(audio_content=audio_content).play()

    print(f"Mosscap:> {response.content}")

    history.add_assistant_message(response.content)

    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
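A hedged variation on the execution settings used above: the underlying OpenAI speech endpoint also accepts a voice and a playback speed, so matching fields presumably exist on the settings class; `voice` and `speed` here are assumptions, not confirmed by this diff.

# Sketch only: `voice` and `speed` are assumed fields mirroring the OpenAI speech API.
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
    OpenAITextToAudioExecutionSettings,
)

settings = OpenAITextToAudioExecutionSettings(
    response_format="wav",  # required here because AudioPlayer only handles wav data
    voice="alloy",  # assumed: one of the OpenAI speech voices
    speed=1.0,  # assumed: the OpenAI API accepts 0.25-4.0
)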
115 changes: 115 additions & 0 deletions python/samples/concepts/audio/03-chat_with_audio_input_output.py
@@ -0,0 +1,115 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
    OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.audio_content import AudioContent

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
# The chatbot will engage in a conversation with the user by audio only.
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
# samples/concepts/audio/02-chat_with_audio_output.py samples.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).
# 3. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()
audio_to_text_service = AzureAudioToText()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        print("User:> ", end="", flush=True)
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()
        user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
        print(user_input.text)
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if "exit" in user_input.text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input.text)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    print("Mosscap:> ", end="", flush=True)
    AudioPlayer(audio_content=audio_content).play(text=response.content)

    history.add_assistant_message(response.content)

    return True


async def main() -> None:
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )

    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
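To exercise the speech-to-text leg in isolation, a small sketch built only from calls that appear in the sample above; the wav path is a placeholder.

# Isolated sketch of the audio-to-text step (placeholder file path).
import asyncio

from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.contents.audio_content import AudioContent


async def transcribe(path: str) -> str:
    service = AzureAudioToText()  # configuration resolved from environment variables
    result = await service.get_text_content(AudioContent.from_audio_file(path))
    return result.text


print(asyncio.run(transcribe("output.wav")))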
99 changes: 99 additions & 0 deletions python/samples/concepts/audio/audio_player.py
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft. All rights reserved.

import io
import logging
import wave
from typing import ClassVar

import pyaudio

from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.kernel_pydantic import KernelBaseModel

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(KernelBaseModel):
"""A class to play an audio file to the default audio output device."""

# Audio replay parameters
CHUNK: ClassVar[int] = 1024

audio_content: AudioContent

def play(self, text: str | None = None) -> None:
"""Play the audio content to the default audio output device.

Args:
text (str, optional): The text to display while playing the audio. Defaults to None.
"""
audio_stream = io.BytesIO(self.audio_content.data)
with wave.open(audio_stream, "rb") as wf:
audio = pyaudio.PyAudio()
stream = audio.open(
format=audio.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True,
)

if text:
# Simulate the output of text while playing the audio
data_frames = []

data = wf.readframes(self.CHUNK)
while data:
data_frames.append(data)
data = wf.readframes(self.CHUNK)

if len(data_frames) < len(text):
logger.warning(
"The audio is too short to play the entire text. ",
"The text will be displayed without synchronization.",
)
print(text)
else:
for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
stream.write(data_frame)
print(text_frame, end="", flush=True)
print()
else:
data = wf.readframes(self.CHUNK)
while data:
stream.write(data)
data = wf.readframes(self.CHUNK)

stream.stop_stream()
stream.close()
audio.terminate()

def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
"""Zip the text and audio frames together so that they can be displayed in sync.

This is done by evenly distributing empty strings between each character and
append the remaining empty strings at the end.

Args:
text (str): The text to display while playing the audio.
audio_frames (list): The audio frames to play.

Returns:
zip: The zipped text and audio frames.
"""
text_frames = list(text)
empty_string_count = len(audio_frames) - len(text_frames)
empty_string_spacing = len(text_frames) // empty_string_count

modified_text_frames = []
current_empty_string_count = 0
for i, text_frame in enumerate(text_frames):
modified_text_frames.append(text_frame)
if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
modified_text_frames.append("")
current_empty_string_count += 1

if current_empty_string_count < empty_string_count:
modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

return zip(audio_frames, modified_text_frames)
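For intuition, a tiny worked example of the interleaving logic above (illustrative only): with six audio frames and four characters, two padding slots are distributed after the characters at indices 0 and 2, so every audio frame pairs with exactly one text frame.

# Worked example of _zip_text_and_audio's padding distribution.
audio_frames = [b"f0", b"f1", b"f2", b"f3", b"f4", b"f5"]  # 6 frames
text = "abcd"  # 4 characters

# empty_string_count = 6 - 4 = 2; empty_string_spacing = 4 // 2 = 2,
# so "" is appended after the characters at indices 0 and 2:
expected_text_frames = ["a", "", "b", "c", "", "d"]

# Each audio frame write is paired with printing one (possibly empty) text frame.
for frame, char in zip(audio_frames, expected_text_frames):
    print(f"play {frame!r} -> show {char!r}")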
python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_audio_to_text_execution_settings.py
@@ -14,7 +14,9 @@ class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
"""Request settings for OpenAI audio to text services."""

ai_model_id: str | None = Field(None, serialization_alias="model")
filename: str | None = None
filename: str | None = Field(
None, description="Do not set this manually. It is set by the service based on the audio content."
)
language: str | None = None
prompt: str | None = None
response_format: str | None = None
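The intent of the new description, sketched below: the service derives `filename` from the AudioContent it receives, so callers construct the content object and leave the field unset. The settings module path here is assumed from the naming pattern of the other settings imports in this PR.

# Sketch (module path assumed; `filename` is service-managed, never set by callers).
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
    OpenAIAudioToTextExecutionSettings,
)
from semantic_kernel.contents.audio_content import AudioContent

audio = AudioContent.from_audio_file("output.wav")  # placeholder path
settings = OpenAIAudioToTextExecutionSettings(language="en")  # no filename set here
# audio_to_text_service.get_text_content(audio, settings) fills settings.filename internally.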
python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_prompt_execution_settings.py
@@ -38,7 +38,9 @@ class OpenAIPromptExecutionSettings(PromptExecutionSettings):
 class OpenAITextPromptExecutionSettings(OpenAIPromptExecutionSettings):
     """Specific settings for the completions endpoint."""
 
-    prompt: str | None = None
+    prompt: str | None = Field(
+        None, description="Do not set this manually. It is set by the service based on the text content."
+    )
     best_of: int | None = Field(None, ge=1)
     echo: bool = False
     logprobs: int | None = Field(None, ge=0, le=5)
@@ -66,7 +68,9 @@ class OpenAIChatPromptExecutionSettings(OpenAIPromptExecutionSettings):
     ) = None
     function_call: str | None = None
     functions: list[dict[str, Any]] | None = None
-    messages: list[dict[str, Any]] | None = None
+    messages: list[dict[str, Any]] | None = Field(
+        None, description="Do not set this manually. It is set by the service based on the chat history."
+    )
     function_call_behavior: FunctionCallBehavior | None = Field(None, exclude=True)
     parallel_tool_calls: bool = True
     tools: list[dict[str, Any]] | None = Field(
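Likewise for chat, a sketch of the intended usage: `messages` is serialized from the ChatHistory by the service, so user code only ever passes the history, as the samples above do.

# Sketch: `messages` stays unset; the service builds it from the ChatHistory argument.
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
    OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

history = ChatHistory()
history.add_user_message("Hi there, who are you?")

settings = OpenAIChatPromptExecutionSettings(max_tokens=2000)  # no messages set here
# chat_service.get_chat_message_content(chat_history=history, settings=settings)
# converts `history` into settings.messages before calling the API.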