Python: Text to audio #9625

Merged: 9 commits, Nov 12, 2024
16 changes: 14 additions & 2 deletions .github/workflows/python-integration-tests.yml
@@ -64,13 +64,19 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
@@ -233,13 +239,19 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
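
The zero-argument constructors used in the samples below rely on these variables: when no explicit arguments are given, each connector resolves its deployment name, endpoint, API key, or model id from the environment (or a .env file). A minimal sketch of both styles, assuming the variables above are set; the explicit parameter names follow the connectors' usual convention and are not shown in this diff:

import os

from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudio

# Implicit configuration: the service reads AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME,
# AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT, and the API key from the environment.
azure_text_to_audio = AzureTextToAudio()

# Explicit configuration, e.g. inside a test harness that loads its own settings.
openai_text_to_audio = OpenAITextToAudio(
    ai_model_id=os.environ["OPENAI_TEXT_TO_AUDIO_MODEL_ID"],
    api_key=os.environ["OPENAI_API_KEY"],
)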
python/samples/concepts/audio/01-chat_with_audio_input.py
@@ -4,19 +4,22 @@
 import logging
 import os
 
-from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
-from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
-from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+from samples.concepts.audio.audio_recorder import AudioRecorder
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureAudioToText,
+    AzureChatCompletion,
     OpenAIChatPromptExecutionSettings,
 )
-from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
-from semantic_kernel.contents import ChatHistory
-from semantic_kernel.contents.audio_content import AudioContent
+from semantic_kernel.contents import AudioContent, ChatHistory
 
 # This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
 # to create a chat bot that can communicate with the user using audio input.
 # The user can engage in a long conversation with the chat bot by speaking to it.
 
+# Resources required for this sample:
+# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
+# 2. An Azure Speech to Text deployment (e.g. whisper).
+
 # Additional dependencies required for this sample:
 # - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
 # - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
95 changes: 95 additions & 0 deletions python/samples/concepts/audio/02-chat_with_audio_output.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
# to create a chat bot that can communicate with the user using audio output.
# The chatbot will engage in a conversation with the user and respond using audio output.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)

system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    AudioPlayer(audio_content=audio_content).play()

    print(f"Mosscap:> {response.content}")

    history.add_message(response)

    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
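
The player below assumes WAV input; when you only need the bytes, the AudioContent returned by get_audio_content can be written straight to disk. A minimal sketch, assuming the same AzureTextToAudio configuration as above (the data attribute is the same raw-bytes field that audio_player.py reads):

# Sketch: persist the synthesized speech instead of playing it.
import asyncio

from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudioExecutionSettings


async def save_speech(text: str, path: str = "greeting.wav") -> None:
    audio_content = await AzureTextToAudio().get_audio_content(
        text, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    # AudioContent.data holds the raw audio bytes (see audio_player.py below).
    with open(path, "wb") as f:
        f.write(audio_content.data)


asyncio.run(save_speech("Hello from Mosscap!"))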
112 changes: 112 additions & 0 deletions python/samples/concepts/audio/03-chat_with_audio_input_output.py
@@ -0,0 +1,112 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import (
    AzureAudioToText,
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import AudioContent, ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
# The chatbot will engage in a conversation with the user by audio only.
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
# samples/concepts/audio/02-chat_with_audio_output.py samples.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).
# 3. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()
audio_to_text_service = AzureAudioToText()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        print("User:> ", end="", flush=True)
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()
        user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
        print(user_input.text)
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if "exit" in user_input.text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input.text)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    print("Mosscap:> ", end="", flush=True)
    AudioPlayer(audio_content=audio_content).play(text=response.content)

    history.add_message(response)

    return True


async def main() -> None:
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )

    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
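
The sample is written against the Azure services, but the workflow file above also wires up OPENAI_AUDIO_TO_TEXT_MODEL_ID and OPENAI_TEXT_TO_AUDIO_MODEL_ID for OpenAI-hosted equivalents. A hedged sketch of the swap; the OpenAIAudioToText and OpenAITextToAudio names follow the naming convention of their Azure twins and should be verified against the package exports:

# Hedged sketch: the same chat loop against OpenAI-hosted models instead of Azure.
# Assumes OPENAI_API_KEY and the OPENAI_*_MODEL_ID variables from the workflow are set.
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIAudioToText,
    OpenAIChatCompletion,
    OpenAITextToAudio,
)

chat_service = OpenAIChatCompletion()
text_to_audio_service = OpenAITextToAudio()
audio_to_text_service = OpenAIAudioToText()
# The rest of 03-chat_with_audio_input_output.py works unchanged.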
99 changes: 99 additions & 0 deletions python/samples/concepts/audio/audio_player.py
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft. All rights reserved.

import io
import logging
import wave
from typing import ClassVar

import pyaudio
from pydantic import BaseModel

from semantic_kernel.contents import AudioContent

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(BaseModel):
    """A class to play an audio file to the default audio output device."""

    # Audio replay parameters
    CHUNK: ClassVar[int] = 1024

    audio_content: AudioContent

    def play(self, text: str | None = None) -> None:
        """Play the audio content to the default audio output device.

        Args:
            text (str, optional): The text to display while playing the audio. Defaults to None.
        """
        audio_stream = io.BytesIO(self.audio_content.data)
        with wave.open(audio_stream, "rb") as wf:
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
            )

            if text:
                # Simulate the output of text while playing the audio
                data_frames = []

                data = wf.readframes(self.CHUNK)
                while data:
                    data_frames.append(data)
                    data = wf.readframes(self.CHUNK)

                if len(data_frames) < len(text):
                    # Note: single message string; passing a second positional argument
                    # would be treated as a %-format argument by logging and raise a TypeError.
                    logger.warning(
                        "The audio is too short to play the entire text. "
                        "The text will be displayed without synchronization."
                    )
                    print(text)
                else:
                    for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
                        stream.write(data_frame)
                        print(text_frame, end="", flush=True)
                    print()
            else:
                data = wf.readframes(self.CHUNK)
                while data:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)

            stream.stop_stream()
            stream.close()
            audio.terminate()

    def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
        """Zip the text and audio frames together so that they can be displayed in sync.

        This is done by evenly distributing empty strings between the characters and
        appending any remaining empty strings at the end.

        Args:
            text (str): The text to display while playing the audio.
            audio_frames (list): The audio frames to play.

        Returns:
            zip: The zipped text and audio frames.
        """
        text_frames = list(text)
        empty_string_count = len(audio_frames) - len(text_frames)
        if empty_string_count <= 0:
            # Equal lengths (the caller already rules out more text than audio):
            # pair characters and frames one-to-one.
            return zip(audio_frames, text_frames)
        # max(1, ...) guards against a zero spacing (and a ZeroDivisionError below)
        # when the audio frames far outnumber the characters.
        empty_string_spacing = max(1, len(text_frames) // empty_string_count)

        modified_text_frames = []
        current_empty_string_count = 0
        for i, text_frame in enumerate(text_frames):
            modified_text_frames.append(text_frame)
            if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
                modified_text_frames.append("")
                current_empty_string_count += 1

        if current_empty_string_count < empty_string_count:
            modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

        return zip(audio_frames, modified_text_frames)
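
To make the synchronization scheme concrete, here is the pairing logic restated as a standalone function, together with a tiny worked example (hypothetical two-byte stand-ins for the real 1024-frame audio chunks):

# Illustration only: a pure-function restatement of _zip_text_and_audio.
def zip_text_and_audio(text: str, audio_frames: list) -> zip:
    text_frames = list(text)
    empty_string_count = len(audio_frames) - len(text_frames)
    empty_string_spacing = max(1, len(text_frames) // empty_string_count)
    modified, used = [], 0
    for i, ch in enumerate(text_frames):
        modified.append(ch)
        if used < empty_string_count and i % empty_string_spacing == 0:
            modified.append("")
            used += 1
    modified.extend([""] * (empty_string_count - used))
    return zip(audio_frames, modified)


frames = [b"f0", b"f1", b"f2", b"f3", b"f4", b"f5"]  # hypothetical chunks
print(list(zip_text_and_audio("hi!", frames)))
# [(b'f0', 'h'), (b'f1', ''), (b'f2', 'i'), (b'f3', ''), (b'f4', '!'), (b'f5', '')]
# Each chunk is written to the stream while its (possibly empty) character prints,
# so the three characters are spread evenly across the six chunks.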
python/samples/concepts/audio/audio_recorder.py
@@ -6,11 +6,10 @@
 
 import keyboard
 import pyaudio
+from pydantic import BaseModel
 
-from semantic_kernel.kernel_pydantic import KernelBaseModel
-
 
-class AudioRecorder(KernelBaseModel):
+class AudioRecorder(BaseModel):
     """A class to record audio from the microphone and save it to a WAV file.
 
     To start recording, press the spacebar. To stop recording, release the spacebar.
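
For reference, the class keeps its context-manager usage from sample 03: construction takes only the output path, recording runs while the spacebar is held, and the WAV file is written when the block exits.

# Usage, as in 03-chat_with_audio_input_output.py:
with AudioRecorder(output_filepath="output.wav") as recorder:
    recorder.start_recording()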