diff --git a/OpenAI_API/Audio/AudioRequest.cs b/OpenAI_API/Audio/AudioRequest.cs
new file mode 100644
index 0000000..be74589
--- /dev/null
+++ b/OpenAI_API/Audio/AudioRequest.cs
@@ -0,0 +1,56 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Newtonsoft.Json;
+using static OpenAI_API.Audio.TextToSpeechRequest;
+
+namespace OpenAI_API.Audio
+{
+    public class AudioRequest
+    {
+        /// <summary>
+        /// The model to use for this request. Currently only whisper-1, the default, is supported.
+        /// </summary>
+        [JsonProperty("model")]
+        public string Model { get; set; } = OpenAI_API.Models.Model.DefaultTranscriptionModel;
+
+        /// <summary>
+        /// An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language for transcriptions, or English for translations.
+        /// </summary>
+        [JsonProperty("prompt", DefaultValueHandling = DefaultValueHandling.Ignore)]
+        public string Prompt { get; set; } = null;
+
+        /// <summary>
+        /// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
+        /// </summary>
+        [JsonProperty("language", DefaultValueHandling = DefaultValueHandling.Ignore)]
+        public string Language { get; set; } = null;
+
+        /// <summary>
+        /// The format of the transcript output. Should be one of the options in <see cref="ResponseFormats"/>.
+        /// </summary>
+        [JsonProperty("response_format", DefaultValueHandling = DefaultValueHandling.Ignore)]
+        public string ResponseFormat { get; set; } = null;
+
+        /// <summary>
+        /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+        /// </summary>
+        [JsonProperty("temperature", DefaultValueHandling = DefaultValueHandling.Ignore)]
+        public double Temperature { get; set; } = 0;
+
+        /// <summary>
+        /// The supported formats of the transcript output.
+        /// </summary>
+        public static class ResponseFormats
+        {
+#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
+            public const string JSON = "json";
+            public const string Text = "text";
+            public const string SRT = "srt";
+            public const string VerboseJson = "verbose_json";
+            public const string VTT = "vtt";
+#pragma warning restore CS1591 // Missing XML comment for publicly visible type or member
+        }
+    }
+}
diff --git a/OpenAI_API/Audio/AudioResult.cs b/OpenAI_API/Audio/AudioResult.cs
new file mode 100644
index 0000000..5c77983
--- /dev/null
+++ b/OpenAI_API/Audio/AudioResult.cs
@@ -0,0 +1,32 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace OpenAI_API.Audio
+{
+    /// <summary>
+    /// Represents a verbose_json output from the OpenAI Transcribe or Translate endpoints.
+    /// </summary>
+    public class AudioResultVerbose : ApiResultBase
+    {
+        public double duration { get; set; }
+        public string language { get; set; }
+        public List<Segment> segments { get; set; }
+        public string task { get; set; }
+        public string text { get; set; }
+
+        public class Segment
+        {
+            public double avg_logprob { get; set; }
+            public double compression_ratio { get; set; }
+            public double end { get; set; }
+            public int id { get; set; }
+            public double no_speech_prob { get; set; }
+            public int seek { get; set; }
+            public double start { get; set; }
+            public double temperature { get; set; }
+            public string text { get; set; }
+            public List<int> tokens { get; set; }
+        }
+    }
+}
diff --git a/OpenAI_API/Audio/ITextToSpeechEndpoint.cs b/OpenAI_API/Audio/ITextToSpeechEndpoint.cs
index 83ab097..2eed94e 100644
--- a/OpenAI_API/Audio/ITextToSpeechEndpoint.cs
+++ b/OpenAI_API/Audio/ITextToSpeechEndpoint.cs
@@ -30,7 +30,7 @@ public interface ITextToSpeechEndpoint
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>.</param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variants: tts-1 is optimized for real time text to speech use cases, and tts-1-hd is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null);
+	Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, double? speed = null, string responseFormat = null, Model model = null);
 
 	/// <summary>
 	/// Calls the API to create speech from text, and saves the audio file to disk.
@@ -50,7 +50,7 @@ public interface ITextToSpeechEndpoint
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>.</param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variants: tts-1 is optimized for real time text to speech use cases, and tts-1-hd is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	Task SaveSpeechToFileAsync(string input, string localPath, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null);
+	Task SaveSpeechToFileAsync(string input, string localPath, string voice = null, double? speed = null, string responseFormat = null, Model model = null);
 }
diff --git a/OpenAI_API/Audio/ITranscriptionEndpoint.cs b/OpenAI_API/Audio/ITranscriptionEndpoint.cs
new file mode 100644
index 0000000..384ac6d
--- /dev/null
+++ b/OpenAI_API/Audio/ITranscriptionEndpoint.cs
@@ -0,0 +1,81 @@
+using System.IO;
+using System.Threading.Tasks;
+
+namespace OpenAI_API.Audio
+{
+    /// <summary>
+    /// Transcribe audio into text, with optional translation into English.
+    /// </summary>
+    public interface ITranscriptionEndpoint
+    {
+        /// <summary>
+        /// This allows you to set default parameters for every request, for example to set a default language. For every request, if you do not have a parameter set on the request but do have it set here as a default, the request will automatically pick up the default value.
+        /// </summary>
+        AudioRequest DefaultRequestArgs { get; set; }
+
+        /// <summary>
+        /// Gets the transcription of the audio stream, in the specified format
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and JSON formats, try <see cref="GetTextAsync(Stream, string, string, string, double?)"/> or <see cref="GetWithDetailsAsync(Stream, string, string, string, double?)"/> instead.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text, in the requested format</returns>
+        Task<string> GetAsFormatAsync(Stream audioStream, string filename, string responseFormat, string language = null, string prompt = null, double? temperature = null);
+
+        /// <summary>
+        /// Gets the transcription of the audio file, in the specified format
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and JSON formats, try <see cref="GetTextAsync(string, string, string, double?)"/> or <see cref="GetWithDetailsAsync(string, string, string, double?)"/> instead.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text, in the requested format</returns>
+        Task<string> GetAsFormatAsync(string audioFilePath, string responseFormat, string language = null, string prompt = null, double? temperature = null);
+
+        /// <summary>
+        /// Gets the transcription of the audio stream, with full metadata
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>The transcription result, with full metadata</returns>
+        Task<AudioResultVerbose> GetWithDetailsAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null);
+
+        /// <summary>
+        /// Gets the transcription of the audio file, with full metadata
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>The transcription result, with full metadata</returns>
+        Task<AudioResultVerbose> GetWithDetailsAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null);
+
+        /// <summary>
+        /// Gets the transcription of the audio stream as a text string
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text</returns>
+        Task<string> GetTextAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null);
+
+        /// <summary>
+        /// Gets the transcription of the audio file as a text string
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text</returns>
+        Task<string> GetTextAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null);
+    }
+}
\ No newline at end of file
diff --git a/OpenAI_API/Audio/TextToSpeechEndpoint.cs b/OpenAI_API/Audio/TextToSpeechEndpoint.cs
index 04c76d8..91a8a74 100644
--- a/OpenAI_API/Audio/TextToSpeechEndpoint.cs
+++ b/OpenAI_API/Audio/TextToSpeechEndpoint.cs
@@ -24,7 +24,7 @@ public class TextToSpeechEndpoint : EndpointBase, ITextToSpeechEndpoint
 	public TextToSpeechRequest DefaultTTSRequestArgs { get; set; } = new TextToSpeechRequest();
 
 	/// <summary>
-	/// Constructor of the api endpoint. Rather than instantiating this yourself, access it through an instance of <see cref="OpenAIAPI"/> as <see cref="OpenAIAPI.TextToSpeech"/>.
+	/// Constructor of the api endpoint. Rather than instantiating this yourself, access it through an instance of <see cref="OpenAIAPI"/> as <see cref="OpenAIAPI.TextToSpeech"/>.
 	/// </summary>
 	/// <param name="api">Pass in the instance of the api</param>
 	internal TextToSpeechEndpoint(OpenAIAPI api) : base(api) { }
@@ -48,7 +48,7 @@ public async Task<Stream> GetSpeechAsStreamAsync(TextToSpeechRequest request)
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>.</param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variants: tts-1 is optimized for real time text to speech use cases, and tts-1-hd is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	public async Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null)
+	public async Task<Stream> GetSpeechAsStreamAsync(string input, string voice = null, double? speed = null, string responseFormat = null, Model model = null)
 	{
 		var request = new TextToSpeechRequest()
 		{
@@ -87,7 +87,7 @@ public async Task SaveSpeechToFileAsync(TextToSpeechRequest request, string localPath)
 	/// <param name="responseFormat">The default response format is "mp3", but other formats are available in <see cref="TextToSpeechRequest.ResponseFormats"/>.</param>
 	/// <param name="model">TTS is an AI model that converts text to natural sounding spoken text. OpenAI offers two different model variants: tts-1 is optimized for real time text to speech use cases, and tts-1-hd is optimized for quality.</param>
 	/// <returns>A stream of the audio file in the requested format.</returns>
-	public async Task SaveSpeechToFileAsync(string input, string localPath, string voice = null, decimal? speed = null, string responseFormat = null, Model model = null)
+	public async Task SaveSpeechToFileAsync(string input, string localPath, string voice = null, double? speed = null, string responseFormat = null, Model model = null)
 	{
 		var request = new TextToSpeechRequest()
 		{
diff --git a/OpenAI_API/Audio/TextToSpeechRequest.cs b/OpenAI_API/Audio/TextToSpeechRequest.cs
index 1387d41..f62956f 100644
--- a/OpenAI_API/Audio/TextToSpeechRequest.cs
+++ b/OpenAI_API/Audio/TextToSpeechRequest.cs
@@ -35,7 +35,7 @@ public class TextToSpeechRequest
 	/// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
 	/// </summary>
 	[JsonProperty("speed", DefaultValueHandling = DefaultValueHandling.Ignore)]
-	public decimal? Speed { get; set; } = null;
+	public double? Speed { get; set; } = null;
 
 	/// <summary>
 	/// Supported voices are alloy, echo, fable, onyx, nova, and shimmer. Previews of the voices are available in the Text to Speech guide. See <see cref="Voices"/>.
diff --git a/OpenAI_API/Audio/TranscriptionEndpoint.cs b/OpenAI_API/Audio/TranscriptionEndpoint.cs
new file mode 100644
index 0000000..8de241c
--- /dev/null
+++ b/OpenAI_API/Audio/TranscriptionEndpoint.cs
@@ -0,0 +1,191 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Net.Http;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace OpenAI_API.Audio
+{
+    /// <summary>
+    /// Transcribe audio into text, with optional translation into English.
+    /// </summary>
+    public class TranscriptionEndpoint : EndpointBase, ITranscriptionEndpoint
+    {
+        /// <inheritdoc/>
+        protected override string Endpoint
+        {
+            get
+            {
+                if (TranslateToEnglish)
+                {
+                    return "audio/translations";
+                }
+                else
+                {
+                    return "audio/transcriptions";
+                }
+            }
+        }
+
+        /// <summary>
+        /// Constructor of the api endpoint. Rather than instantiating this yourself, access it through an instance of <see cref="OpenAIAPI"/> as <see cref="OpenAIAPI.Transcriptions"/> or <see cref="OpenAIAPI.Translations"/>.
+        /// </summary>
+        /// <param name="api">Pass in the instance of the api</param>
+        /// <param name="translate">If <see langword="true"/>, the response will translate non-English audio into English. Otherwise the returned text will be in the spoken language.</param>
+        internal TranscriptionEndpoint(OpenAIAPI api, bool translate) : base(api)
+        {
+            TranslateToEnglish = translate;
+        }
+
+        /// <summary>
+        /// This allows you to set default parameters for every request, for example to set a default language. For every request, if you do not have a parameter set on the request but do have it set here as a default, the request will automatically pick up the default value.
+        /// </summary>
+        public AudioRequest DefaultRequestArgs { get; set; } = new AudioRequest();
+
+        /// <summary>
+        /// If <see langword="true"/>, the response will translate non-English audio into English. Otherwise the returned text will be in the spoken language.
+        /// </summary>
+        private bool TranslateToEnglish { get; }
+
+        /// <summary>
+        /// Gets the transcription of the audio stream as a text string
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text</returns>
+        public async Task<string> GetTextAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null)
+            => await GetAsFormatAsync(audioStream, filename, AudioRequest.ResponseFormats.Text, language, prompt, temperature);
+
+        /// <summary>
+        /// Gets the transcription of the audio file as a text string
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text</returns>
+        public async Task<string> GetTextAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null)
+        {
+            using (var fileStream = File.OpenRead(audioFilePath))
+            {
+                return await GetTextAsync(fileStream, Path.GetFileName(audioFilePath), language, prompt, temperature);
+            }
+        }
+
+        /// <summary>
+        /// Gets the transcription of the audio stream, with full metadata
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>The transcription result, with full metadata</returns>
+        public async Task<AudioResultVerbose> GetWithDetailsAsync(Stream audioStream, string filename, string language = null, string prompt = null, double? temperature = null)
+        {
+            var request = new AudioRequest()
+            {
+                Language = language ?? DefaultRequestArgs.Language,
+                Model = DefaultRequestArgs.Model,
+                Prompt = prompt ?? DefaultRequestArgs.Prompt,
+                Temperature = temperature ?? DefaultRequestArgs.Temperature
+            };
+            request.ResponseFormat = AudioRequest.ResponseFormats.VerboseJson;
+            MultipartFormDataContent content;
+            using (var memoryStream = new MemoryStream())
+            {
+                audioStream.CopyTo(memoryStream);
+                content = new MultipartFormDataContent
+                {
+                    { new StringContent(request.Model), "model" },
+                    { new StringContent(request.ResponseFormat), "response_format" },
+                    { new ByteArrayContent(memoryStream.ToArray()), "file", filename }
+                };
+                if (!string.IsNullOrEmpty(request.Language))
+                    content.Add(new StringContent(request.Language), "language");
+                if (!string.IsNullOrEmpty(request.Prompt))
+                    content.Add(new StringContent(request.Prompt), "prompt");
+                if (request.Temperature != 0)
+                    content.Add(new StringContent(request.Temperature.ToString()), "temperature");
+            }
+            return await HttpPost<AudioResultVerbose>(Url, content);
+        }
+
+        /// <summary>
+        /// Gets the transcription of the audio file, with full metadata
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>The transcription result, with full metadata</returns>
+        public async Task<AudioResultVerbose> GetWithDetailsAsync(string audioFilePath, string language = null, string prompt = null, double? temperature = null)
+        {
+            using (var fileStream = File.OpenRead(audioFilePath))
+            {
+                return await GetWithDetailsAsync(fileStream, Path.GetFileName(audioFilePath), language, prompt, temperature);
+            }
+        }
+
+        /// <summary>
+        /// Gets the transcription of the audio stream, in the specified format
+        /// </summary>
+        /// <param name="audioStream">The stream containing audio data, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="filename">The name of the audio file in the stream. This does not have to be real, but it must contain the correct file extension. For example, "file.mp3" if you are supplying an mp3 audio stream.</param>
+        /// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and JSON formats, try <see cref="GetTextAsync(Stream, string, string, string, double?)"/> or <see cref="GetWithDetailsAsync(Stream, string, string, string, double?)"/> instead.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text, in the requested format</returns>
+        public async Task<string> GetAsFormatAsync(Stream audioStream, string filename, string responseFormat, string language = null, string prompt = null, double? temperature = null)
+        {
+            var request = new AudioRequest()
+            {
+                Language = language ?? DefaultRequestArgs.Language,
+                Model = DefaultRequestArgs.Model,
+                Prompt = prompt ?? DefaultRequestArgs.Prompt,
+                Temperature = temperature ?? DefaultRequestArgs.Temperature,
+                ResponseFormat = responseFormat ?? DefaultRequestArgs.ResponseFormat
+            };
+            MultipartFormDataContent content;
+            using (var memoryStream = new MemoryStream())
+            {
+                audioStream.CopyTo(memoryStream);
+                content = new MultipartFormDataContent
+                {
+                    { new StringContent(request.Model), "model" },
+                    { new StringContent(request.ResponseFormat), "response_format" },
+                    { new ByteArrayContent(memoryStream.ToArray()), "file", filename }
+                };
+                if (!string.IsNullOrEmpty(request.Language))
+                    content.Add(new StringContent(request.Language), "language");
+                if (!string.IsNullOrEmpty(request.Prompt))
+                    content.Add(new StringContent(request.Prompt), "prompt");
+                if (request.Temperature != 0)
+                    content.Add(new StringContent(request.Temperature.ToString()), "temperature");
+            }
+            return await HttpGetContent(Url, HttpMethod.Post, content);
+        }
+
+        /// <summary>
+        /// Gets the transcription of the audio file, in the specified format
+        /// </summary>
+        /// <param name="audioFilePath">The local path to the audio file, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.</param>
+        /// <param name="responseFormat">The format of the response. Suggested values are <see cref="AudioRequest.ResponseFormats.SRT"/> or <see cref="AudioRequest.ResponseFormats.VTT"/>. For text and JSON formats, try <see cref="GetTextAsync(string, string, string, double?)"/> or <see cref="GetWithDetailsAsync(string, string, string, double?)"/> instead.</param>
+        /// <param name="language">The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.</param>
+        /// <param name="prompt">An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.</param>
+        /// <param name="temperature">The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.</param>
+        /// <returns>A string of the transcribed text, in the requested format</returns>
+        public async Task<string> GetAsFormatAsync(string audioFilePath, string responseFormat, string language = null, string prompt = null, double? temperature = null)
+        {
+            using (var fileStream = File.OpenRead(audioFilePath))
+            {
+                return await GetAsFormatAsync(fileStream, Path.GetFileName(audioFilePath), responseFormat, language, prompt, temperature);
+            }
+        }
+    }
+}
diff --git a/OpenAI_API/IOpenAIAPI.cs b/OpenAI_API/IOpenAIAPI.cs
index 08d3e04..d79c150 100644
--- a/OpenAI_API/IOpenAIAPI.cs
+++ b/OpenAI_API/IOpenAIAPI.cs
@@ -67,8 +67,18 @@ public interface IOpenAIAPI
 	IImageGenerationEndpoint ImageGenerations { get; }
 
 	/// <summary>
-	/// The Endpoint for the Text to Speech API. This allows you to generate audio from text.
+	/// The endpoint for the Text to Speech API. This allows you to generate audio from text.
 	/// </summary>
 	ITextToSpeechEndpoint TextToSpeech { get; }
-}
+
+	/// <summary>
+	/// The endpoint for the audio transcription API. This allows you to generate text from audio.
+	/// </summary>
+	ITranscriptionEndpoint Transcriptions { get; }
+
+	/// <summary>
+	/// The endpoint for the audio translation API. This allows you to generate English text from audio in other languages.
+	/// </summary>
+	ITranscriptionEndpoint Translations { get; }
+}
\ No newline at end of file
diff --git a/OpenAI_API/OpenAIAPI.cs b/OpenAI_API/OpenAIAPI.cs
index 5269824..349f08f 100644
--- a/OpenAI_API/OpenAIAPI.cs
+++ b/OpenAI_API/OpenAIAPI.cs
@@ -52,6 +52,8 @@ public OpenAIAPI(APIAuthentication apiKeys = null)
 		Moderation = new ModerationEndpoint(this);
 		ImageGenerations = new ImageGenerationEndpoint(this);
 		TextToSpeech = new TextToSpeechEndpoint(this);
+		Transcriptions = new TranscriptionEndpoint(this, false);
+		Translations = new TranscriptionEndpoint(this, true);
 	}
 
 	/// <summary>
@@ -108,5 +110,15 @@ public static OpenAIAPI ForAzure(string YourResourceName, string deploymentId, APIAuthentication apiKey = null)
 	/// The Endpoint for the Text to Speech API. This allows you to generate audio from text.
 	/// </summary>
 	public ITextToSpeechEndpoint TextToSpeech { get; }
+
+	/// <summary>
+	/// The endpoint for the audio transcription API. This allows you to generate text from audio.
+	/// </summary>
+	public ITranscriptionEndpoint Transcriptions { get; }
+
+	/// <summary>
+	/// The endpoint for the audio translation API. This allows you to generate English text from audio in other languages.
+	/// </summary>
+	public ITranscriptionEndpoint Translations { get; }
 }
diff --git a/OpenAI_Tests/OpenAI_Tests.csproj b/OpenAI_Tests/OpenAI_Tests.csproj
index f26766e..4ffcdb4 100644
--- a/OpenAI_Tests/OpenAI_Tests.csproj
+++ b/OpenAI_Tests/OpenAI_Tests.csproj
@@ -19,6 +19,12 @@
+    <None Update="chinese-test.m4a">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+    <None Update="english-test.m4a">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
This is a test.", ResponseFormat = format, }; - var api = new OpenAI_API.OpenAIAPI(); using (var result = await api.TextToSpeech.GetSpeechAsStreamAsync(request)) { Assert.IsNotNull(result); diff --git a/OpenAI_Tests/TranscriptionTests.cs b/OpenAI_Tests/TranscriptionTests.cs new file mode 100644 index 0000000..65e6811 --- /dev/null +++ b/OpenAI_Tests/TranscriptionTests.cs @@ -0,0 +1,115 @@ +using Newtonsoft.Json; +using NUnit.Framework; +using OpenAI_API.Audio; +using OpenAI_API.Chat; +using OpenAI_API.Completions; +using OpenAI_API.Models; +using OpenAI_API.Moderation; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using static OpenAI_API.Audio.TextToSpeechRequest; +using static OpenAI_API.Chat.ChatMessage; + +namespace OpenAI_Tests +{ + public class TranscriptionTests + { + [SetUp] + public void Setup() + { + OpenAI_API.APIAuthentication.Default = new OpenAI_API.APIAuthentication(Environment.GetEnvironmentVariable("TEST_OPENAI_SECRET_KEY")); + } + + [Test] + public async Task EnglishTranscribeToText() + { + var api = new OpenAI_API.OpenAIAPI(); + + string result = await api.Transcriptions.GetTextAsync("english-test.m4a"); + Assert.IsNotNull(result); + Assert.AreEqual("Hello, this is a test of the transcription function. Is it coming out okay?", result.Trim()); + + result = await api.Transcriptions.GetTextAsync("english-test.m4a", "en"); + Assert.IsNotNull(result); + Assert.AreEqual("Hello, this is a test of the transcription function. Is it coming out okay?", result.Trim()); + } + + [Test] + public async Task ChineseTranscribeToText() + { + var api = new OpenAI_API.OpenAIAPI(); + string result = await api.Transcriptions.GetTextAsync("chinese-test.m4a"); + Assert.IsNotNull(result); + Assert.AreEqual("你好,我的名字是初培。我会说一点点普通话。你呢?", result.Trim()); + + result = await api.Transcriptions.GetTextAsync("chinese-test.m4a", "zh"); + Assert.IsNotNull(result); + Assert.AreEqual("你好,我的名字是初培。我会说一点点普通话。你呢?", result.Trim()); + } + + [Test] + public async Task ChineseTranslateToEnglishText() + { + var api = new OpenAI_API.OpenAIAPI(); + string result = await api.Translations.GetTextAsync("chinese-test.m4a"); + Assert.IsNotNull(result); + Assert.AreEqual("Hello, my name is Chu Pei. I can speak a little Mandarin. How about you?", result.Trim()); + } + + [TestCase("json", "\"text\": ")] + [TestCase("srt", "00:00:00,000")] + [TestCase("vtt", "00:00:00.000")] + public async Task TranscribeToFormat(string format, string searchFor) + { + var api = new OpenAI_API.OpenAIAPI(); + string result = await api.Transcriptions.GetAsFormatAsync("english-test.m4a", format); + Assert.IsNotNull(result); + Assert.IsNotEmpty(result); + Assert.True(result.Contains("Hello, this is a test of the transcription function. 
+
+#### Transcription (Speech to Text)
+
+The Audio Transcription API allows you to generate text from audio, in any of the supported languages. It is accessed via `OpenAIAPI.Transcriptions`:
+
+```csharp
+string resultText = await api.Transcriptions.GetTextAsync("path/to/file.mp3");
+```
+
+You can ask for verbose results, which will give you segment- and token-level information, as well as the standard OpenAI metadata such as processing time:
+
+```csharp
+AudioResultVerbose result = await api.Transcriptions.GetWithDetailsAsync("path/to/file.m4a");
+Console.WriteLine(result.ProcessingTime.TotalMilliseconds); // 496ms
+Console.WriteLine(result.text); // "Hello, this is a test of the transcription function."
+Console.WriteLine(result.language); // "english"
+Console.WriteLine(result.segments[0].no_speech_prob); // 0.03712
+// etc
+```
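+
+The verbose result also exposes per-segment timing, so you can print a rough time-stamped transcript. A small sketch using only the fields shown above:
+
+```csharp
+foreach (var segment in result.segments)
+{
+	// start and end are offsets into the audio, in seconds
+	Console.WriteLine($"[{segment.start:0.00}s - {segment.end:0.00}s] {segment.text}");
+}
+```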
+
+You can also ask for results in SRT or VTT format, which is useful for generating subtitles for videos:
+
+```csharp
+string result = await api.Transcriptions.GetAsFormatAsync("path/to/file.m4a", AudioRequest.ResponseFormats.SRT);
+```
+
+Additional parameters such as temperature, prompt, language, etc. can be specified either per-request or as a default:
+
+```csharp
+// inline
+result = await api.Transcriptions.GetTextAsync("conversation.mp3", "en", "This is a transcript of a conversation between a medical doctor and her patient: ", 0.3);
+
+// set defaults
+api.Transcriptions.DefaultRequestArgs.Language = "en";
+```
+
+Instead of providing a local file on disk, you can provide a stream of audio bytes. This can be useful for streaming audio from the microphone or another source without having to write it to disk first. Please note that you must specify a filename, which does not have to exist, but which must have an accurate extension for the type of audio you are sending. OpenAI uses the filename extension to determine what format your audio stream is in.
+
+```csharp
+using (var audioStream = File.OpenRead("path-here.mp3"))
+{
+	return await api.Transcriptions.GetTextAsync(audioStream, "file.mp3");
+}
+```
+
+#### Translations (Non-English Speech to English Text)
+
+Translations allow you to transcribe speech in any of the supported languages into English text. OpenAI does not support translating into any other language, only English. Translations are accessed via `OpenAIAPI.Translations`, and they support all of the same functionality as the Transcriptions.
+
+```csharp
+string result = await api.Translations.GetTextAsync("chinese-example.m4a");
+```
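+
+Because the same calls are available, you can also ask for English subtitles or full metadata from non-English audio. A brief sketch, reusing the placeholder file name from above:
+
+```csharp
+// English SRT subtitles generated from foreign-language audio
+string srt = await api.Translations.GetAsFormatAsync("chinese-example.m4a", AudioRequest.ResponseFormats.SRT);
+
+// verbose results work here too
+AudioResultVerbose details = await api.Translations.GetWithDetailsAsync("chinese-example.m4a");
+Console.WriteLine(details.text);
+```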
+
 ### Embeddings
 The Embedding API is accessed via `OpenAIAPI.Embeddings`: