Python: Text to audio #9625

Merged: 9 commits, Nov 12, 2024
16 changes: 14 additions & 2 deletions .github/workflows/python-integration-tests.yml
@@ -64,13 +64,19 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
@@ -233,13 +239,19 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
-      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
+      AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
       OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
       OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
       OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
+      OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
+      OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
       OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
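
The zero-argument constructors used in the samples below rely on these variables: when no explicit arguments are given, each connector resolves its deployment name, endpoint, API key, or model id from the environment (or a .env file). A minimal sketch of both styles, assuming the variables above are set; the explicit parameter names follow the connectors' usual convention and are not shown in this diff:

import os

from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudio

# Implicit configuration: the service reads AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME,
# AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT, and the API key from the environment.
azure_text_to_audio = AzureTextToAudio()

# Explicit configuration, e.g. inside a test harness that loads its own settings.
openai_text_to_audio = OpenAITextToAudio(
    ai_model_id=os.environ["OPENAI_TEXT_TO_AUDIO_MODEL_ID"],
    api_key=os.environ["OPENAI_API_KEY"],
)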
python/samples/concepts/audio/01-chat_with_audio_input.py
@@ -4,19 +4,22 @@
 import logging
 import os
 
-from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
-from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
-from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+from samples.concepts.audio.audio_recorder import AudioRecorder
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureAudioToText,
+    AzureChatCompletion,
     OpenAIChatPromptExecutionSettings,
 )
-from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
-from semantic_kernel.contents import ChatHistory
-from semantic_kernel.contents.audio_content import AudioContent
+from semantic_kernel.contents import AudioContent, ChatHistory
 
 # This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
 # to create a chat bot that can communicate with the user using audio input.
 # The user can engage in a long conversation with the chat bot by speaking to it.
 
+# Resources required for this sample:
+# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
+# 2. An Azure Speech to Text deployment (e.g. whisper).
+
 # Additional dependencies required for this sample:
 # - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
 # - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
95 changes: 95 additions & 0 deletions python/samples/concepts/audio/02-chat_with_audio_output.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import (
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
# to create a chat bot that can communicate with the user using audio output.
# The chatbot will engage in a conversation with the user and respond using audio output.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)

system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    AudioPlayer(audio_content=audio_content).play()

    print(f"Mosscap:> {response.content}")

    history.add_message(response)

    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
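
The player below assumes WAV input; when you only need the bytes, the AudioContent returned by get_audio_content can be written straight to disk. A minimal sketch, assuming the same AzureTextToAudio configuration as above (the data attribute is the same raw-bytes field that audio_player.py reads):

# Sketch: persist the synthesized speech instead of playing it.
import asyncio

from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudioExecutionSettings


async def save_speech(text: str, path: str = "greeting.wav") -> None:
    audio_content = await AzureTextToAudio().get_audio_content(
        text, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    # AudioContent.data holds the raw audio bytes (see audio_player.py below).
    with open(path, "wb") as f:
        f.write(audio_content.data)


asyncio.run(save_speech("Hello from Mosscap!"))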
112 changes: 112 additions & 0 deletions python/samples/concepts/audio/03-chat_with_audio_input_output.py
@@ -0,0 +1,112 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import (
    AzureAudioToText,
    AzureChatCompletion,
    AzureTextToAudio,
    OpenAIChatPromptExecutionSettings,
    OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.contents import AudioContent, ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
# The chatbot will engage in a conversation with the user by audio only.
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
# samples/concepts/audio/02-chat_with_audio_output.py samples.

# Resources required for this sample:
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
# 2. An Azure Text to Speech deployment (e.g. tts).
# 3. An Azure Speech to Text deployment (e.g. whisper).

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")


system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
text_to_audio_service = AzureTextToAudio()
audio_to_text_service = AzureAudioToText()

history = ChatHistory()
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    try:
        print("User:> ", end="", flush=True)
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()
        user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
        print(user_input.text)
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if "exit" in user_input.text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input.text)

    # No need to stream the response since we can only pass the
    # response to the text to audio service as a whole
    response = await chat_service.get_chat_message_content(
        chat_history=history,
        settings=OpenAIChatPromptExecutionSettings(
            max_tokens=2000,
            temperature=0.7,
            top_p=0.8,
        ),
    )

    # Need to set the response format to wav since the audio player only supports wav files
    audio_content = await text_to_audio_service.get_audio_content(
        response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
    )
    print("Mosscap:> ", end="", flush=True)
    AudioPlayer(audio_content=audio_content).play(text=response.content)

    history.add_message(response)

    return True


async def main() -> None:
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )

    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
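
The sample is written against the Azure services, but the workflow file above also wires up OPENAI_AUDIO_TO_TEXT_MODEL_ID and OPENAI_TEXT_TO_AUDIO_MODEL_ID for OpenAI-hosted equivalents. A hedged sketch of the swap; the OpenAIAudioToText and OpenAITextToAudio names follow the naming convention of their Azure twins and should be verified against the package exports:

# Hedged sketch: the same chat loop against OpenAI-hosted models instead of Azure.
# Assumes OPENAI_API_KEY and the OPENAI_*_MODEL_ID variables from the workflow are set.
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIAudioToText,
    OpenAIChatCompletion,
    OpenAITextToAudio,
)

chat_service = OpenAIChatCompletion()
text_to_audio_service = OpenAITextToAudio()
audio_to_text_service = OpenAIAudioToText()
# The rest of 03-chat_with_audio_input_output.py works unchanged.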
99 changes: 99 additions & 0 deletions python/samples/concepts/audio/audio_player.py
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft. All rights reserved.

import io
import logging
import wave
from typing import ClassVar

import pyaudio
from pydantic import BaseModel

from semantic_kernel.contents import AudioContent

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(BaseModel):
    """A class to play an audio file to the default audio output device."""

    # Audio replay parameters
    CHUNK: ClassVar[int] = 1024

    audio_content: AudioContent

    def play(self, text: str | None = None) -> None:
        """Play the audio content to the default audio output device.

        Args:
            text (str, optional): The text to display while playing the audio. Defaults to None.
        """
        audio_stream = io.BytesIO(self.audio_content.data)
        with wave.open(audio_stream, "rb") as wf:
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
            )

            if text:
                # Simulate the output of text while playing the audio
                data_frames = []

                data = wf.readframes(self.CHUNK)
                while data:
                    data_frames.append(data)
                    data = wf.readframes(self.CHUNK)

                if len(data_frames) < len(text):
                    # Note: single message string; passing a second positional argument
                    # would be treated as a %-format argument by logging and raise a TypeError.
                    logger.warning(
                        "The audio is too short to play the entire text. "
                        "The text will be displayed without synchronization."
                    )
                    print(text)
                else:
                    for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
                        stream.write(data_frame)
                        print(text_frame, end="", flush=True)
                    print()
            else:
                data = wf.readframes(self.CHUNK)
                while data:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)

            stream.stop_stream()
            stream.close()
            audio.terminate()

    def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
        """Zip the text and audio frames together so that they can be displayed in sync.

        This is done by evenly distributing empty strings between the characters and
        appending any remaining empty strings at the end.

        Args:
            text (str): The text to display while playing the audio.
            audio_frames (list): The audio frames to play.

        Returns:
            zip: The zipped text and audio frames.
        """
        text_frames = list(text)
        empty_string_count = len(audio_frames) - len(text_frames)
        if empty_string_count <= 0:
            # Equal lengths (the caller already rules out more text than audio):
            # pair characters and frames one-to-one.
            return zip(audio_frames, text_frames)
        # max(1, ...) guards against a zero spacing (and a ZeroDivisionError below)
        # when the audio frames far outnumber the characters.
        empty_string_spacing = max(1, len(text_frames) // empty_string_count)

        modified_text_frames = []
        current_empty_string_count = 0
        for i, text_frame in enumerate(text_frames):
            modified_text_frames.append(text_frame)
            if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
                modified_text_frames.append("")
                current_empty_string_count += 1

        if current_empty_string_count < empty_string_count:
            modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

        return zip(audio_frames, modified_text_frames)
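
To make the synchronization scheme concrete, here is the pairing logic restated as a standalone function, together with a tiny worked example (hypothetical two-byte stand-ins for the real 1024-frame audio chunks):

# Illustration only: a pure-function restatement of _zip_text_and_audio.
def zip_text_and_audio(text: str, audio_frames: list) -> zip:
    text_frames = list(text)
    empty_string_count = len(audio_frames) - len(text_frames)
    empty_string_spacing = max(1, len(text_frames) // empty_string_count)
    modified, used = [], 0
    for i, ch in enumerate(text_frames):
        modified.append(ch)
        if used < empty_string_count and i % empty_string_spacing == 0:
            modified.append("")
            used += 1
    modified.extend([""] * (empty_string_count - used))
    return zip(audio_frames, modified)


frames = [b"f0", b"f1", b"f2", b"f3", b"f4", b"f5"]  # hypothetical chunks
print(list(zip_text_and_audio("hi!", frames)))
# [(b'f0', 'h'), (b'f1', ''), (b'f2', 'i'), (b'f3', ''), (b'f4', '!'), (b'f5', '')]
# Each chunk is written to the stream while its (possibly empty) character prints,
# so the three characters are spread evenly across the six chunks.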
python/samples/concepts/audio/audio_recorder.py
@@ -6,11 +6,10 @@
 
 import keyboard
 import pyaudio
+from pydantic import BaseModel
 
-from semantic_kernel.kernel_pydantic import KernelBaseModel
-
 
-class AudioRecorder(KernelBaseModel):
+class AudioRecorder(BaseModel):
     """A class to record audio from the microphone and save it to a WAV file.
 
     To start recording, press the spacebar. To stop recording, release the spacebar.
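
For reference, the class keeps its context-manager usage from sample 03: construction takes only the output path, recording runs while the spacebar is held, and the WAV file is written when the block exits.

# Usage, as in 03-chat_with_audio_input_output.py:
with AudioRecorder(output_filepath="output.wav") as recorder:
    recorder.start_recording()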