Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: Text to audio #9625

Merged
9 commits merged on Nov 12, 2024
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions python/samples/concepts/audio/01-chat_with_audio_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@
import os

from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai import AzureAudioToText, AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
TaoChenOSU marked this conversation as resolved.
Show resolved Hide resolved
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent, ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
# to create a chat bot that can communicate with the user using audio input.
Expand Down
10 changes: 4 additions & 6 deletions python/samples/concepts/audio/02-chat_with_audio_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
import logging

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
from semantic_kernel.connectors.ai.open_ai import (
AzureChatCompletion,
AzureTextToAudio,
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
Expand Down Expand Up @@ -82,7 +80,7 @@ async def chat() -> bool:

print(f"Mosscap:> {response.content}")

history.add_assistant_message(response.content)
history.add_message(response)

return True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,14 @@

from samples.concepts.audio.audio_player import AudioPlayer
from samples.concepts.audio.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
from semantic_kernel.connectors.ai.open_ai import (
AzureAudioToText,
AzureChatCompletion,
AzureTextToAudio,
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent, ChatHistory

# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
# services to create a chat bot that can communicate with the user using both audio input and output.
Expand Down Expand Up @@ -95,7 +92,7 @@ async def chat() -> bool:
print("Mosscap:> ", end="", flush=True)
AudioPlayer(audio_content=audio_content).play(text=response.content)

history.add_assistant_message(response.content)
history.add_message(response)

return True

Expand Down
6 changes: 3 additions & 3 deletions python/samples/concepts/audio/audio_player.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from typing import ClassVar

import pyaudio
from pydantic import BaseModel

from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.kernel_pydantic import KernelBaseModel
from semantic_kernel.contents import AudioContent

logging.basicConfig(level=logging.WARNING)
logger: logging.Logger = logging.getLogger(__name__)


class AudioPlayer(KernelBaseModel):
class AudioPlayer(BaseModel):
"""A class to play an audio file to the default audio output device."""

# Audio replay parameters
Expand Down
5 changes: 2 additions & 3 deletions python/samples/concepts/audio/audio_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@

import keyboard
import pyaudio
from pydantic import BaseModel

from semantic_kernel.kernel_pydantic import KernelBaseModel


class AudioRecorder(KernelBaseModel):
class AudioRecorder(BaseModel):
"""A class to record audio from the microphone and save it to a WAV file.

To start recording, press the spacebar. To stop recording, release the spacebar.
Expand Down
24 changes: 20 additions & 4 deletions python/samples/concepts/setup/ALL_SETTINGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,43 @@ OpenAI | [OpenAIChatCompletion](../../../semantic_kernel/connectors/ai/open_ai/s
| | | ai_model_id | OPENAI_TEXT_TO_IMAGE_MODEL_ID | Yes
| | | api_key | OPENAI_API_KEY | Yes
| | | org_id | OPENAI_ORG_ID | No
| | [OpenAITextToAudio](../../../semantic_kernel/connectors/ai/open_ai/services/open_ai_text_to_audio.py)
| | | ai_model_id | OPENAI_TEXT_TO_AUDIO_MODEL_ID | Yes
| | | api_key | OPENAI_API_KEY | Yes
| | | org_id | OPENAI_ORG_ID | No
| | [OpenAIAudioToText](../../../semantic_kernel/connectors/ai/open_ai/services/open_ai_audio_to_text.py)
| | | ai_model_id | OPENAI_AUDIO_TO_TEXT_MODEL_ID | Yes
| | | api_key | OPENAI_API_KEY | Yes
| | | org_id | OPENAI_ORG_ID | No
Azure OpenAI | [AzureOpenAIChatCompletion](../../../semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py) | | | | [AzureOpenAISettings](../../../semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py)
| | | deployment_name | AZURE_OPENAI_CHAT_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
| | [AzureOpenAITextCompletion](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_completion.py)
| | | deployment_name | AZURE_OPENAI_TEXT_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
| | [AzureOpenAITextEmbedding](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_embedding.py)
| | | deployment_name | AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
| | [AzureTextToImage](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_to_image.py)
| | | deployment_name | AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
| | [AzureTextToAudio](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_to_audio.py)
| | | deployment_name | AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
| | [AzureAudioToText](../../../semantic_kernel/connectors/ai/open_ai/services/azure_audio_to_text.py)
| | | deployment_name | AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME | Yes
| | | api_key | AZURE_OPENAI_API_KEY | No
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes

## Memory Service Settings used across SK:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
from typing import Any

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.text_content import TextContent
from semantic_kernel.contents import AudioContent, TextContent
TaoChenOSU marked this conversation as resolved.
Show resolved Hide resolved
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase


Expand Down
20 changes: 20 additions & 0 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,39 @@
DataSourceFieldsMapping,
ExtraBody,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
OpenAIAudioToTextExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
OpenAIEmbeddingPromptExecutionSettings,
OpenAIPromptExecutionSettings,
OpenAITextPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
OpenAITextToImageExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion
from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage

__all__ = [
"ApiKeyAuthentication",
"AzureAISearchDataSource",
"AzureAISearchDataSourceParameters",
"AzureAudioToText",
"AzureChatCompletion",
"AzureChatPromptExecutionSettings",
"AzureCosmosDBDataSource",
Expand All @@ -40,17 +54,23 @@
"AzureEmbeddingDependency",
"AzureTextCompletion",
"AzureTextEmbedding",
"AzureTextToAudio",
"AzureTextToImage",
"ConnectionStringAuthentication",
"DataSourceFieldsMapping",
"DataSourceFieldsMapping",
"ExtraBody",
"OpenAIAudioToText",
"OpenAIAudioToTextExecutionSettings",
"OpenAIChatCompletion",
"OpenAIChatPromptExecutionSettings",
"OpenAIEmbeddingPromptExecutionSettings",
"OpenAIPromptExecutionSettings",
"OpenAITextCompletion",
"OpenAITextEmbedding",
"OpenAITextPromptExecutionSettings",
"OpenAITextToAudio",
"OpenAITextToAudioExecutionSettings",
"OpenAITextToImage",
"OpenAITextToImageExecutionSettings",
]
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,10 @@
from typing_extensions import override # pragma: no cover

from semantic_kernel.connectors.ai.audio_to_text_client_base import AudioToTextClientBase
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
OpenAIAudioToTextExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai import OpenAIAudioToTextExecutionSettings
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.text_content import TextContent
from semantic_kernel.contents import AudioContent, TextContent


class OpenAIAudioToTextBase(OpenAIHandler, AudioToTextClientBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,15 @@
from openai.types.images_response import ImagesResponse
from pydantic import BaseModel

from semantic_kernel.connectors.ai.open_ai.exceptions.content_filter_ai_exception import ContentFilterAIException
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
from semantic_kernel.connectors.ai.open_ai import (
OpenAIAudioToTextExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
OpenAIEmbeddingPromptExecutionSettings,
OpenAIPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
OpenAITextToImageExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.exceptions.content_filter_ai_exception import ContentFilterAIException
from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.connectors.utils.structured_output_schema import generate_structured_output_response_format_schema
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@
else:
from typing_extensions import override # pragma: no cover

from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai import OpenAITextToAudioExecutionSettings
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent


class OpenAITextToAudioBase(OpenAIHandler, TextToAudioClientBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Any

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase


Expand Down
2 changes: 2 additions & 0 deletions python/semantic_kernel/contents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.contents.annotation_content import AnnotationContent
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_call_content import FunctionCallContent
Expand All @@ -16,6 +17,7 @@

__all__ = [
"AnnotationContent",
"AudioContent",
"AuthorRole",
"ChatHistory",
"ChatMessageContent",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import pytest

from semantic_kernel.connectors.ai.audio_to_text_client_base import AudioToTextClientBase
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai import AzureAudioToText, OpenAIAudioToText
from tests.integration.utils import is_service_setup_for_testing

# There is only the whisper model available on Azure OpenAI for audio to text. And that model is
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest

from semantic_kernel.connectors.ai.audio_to_text_client_base import AudioToTextClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent
from tests.integration.audio_to_text.audio_to_text_test_base import AudioToTextTestBase

pytestmark = pytest.mark.parametrize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent
from tests.integration.text_to_audio.text_to_audio_test_base import TextToAudioTestBase

pytestmark = pytest.mark.parametrize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

import pytest

from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudio
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
from tests.integration.utils import is_service_setup_for_testing

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from openai.resources.audio.transcriptions import AsyncTranscriptions
from openai.types.audio import Transcription

from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.connectors.ai.open_ai import AzureAudioToText
from semantic_kernel.contents import AudioContent
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError, ServiceInvalidRequestError


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from openai import AsyncAzureOpenAI, _legacy_response
from openai.resources.audio.speech import AsyncSpeech

from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@
from openai.resources.audio.transcriptions import AsyncTranscriptions
from openai.types.audio import Transcription

from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
OpenAIAudioToTextExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai import OpenAIAudioToTextExecutionSettings
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents import AudioContent
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError, ServiceInvalidRequestError


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@
from openai import AsyncClient, _legacy_response
from openai.resources.audio.speech import AsyncSpeech

from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
from semantic_kernel.connectors.ai.open_ai import OpenAITextToAudio, OpenAITextToAudioExecutionSettings
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError


Expand Down
Loading
Loading