Skip to content

Commit

Permalink
feat: Added client implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
HavenDV committed Nov 15, 2023
1 parent bb16d85 commit 007d9ce
Show file tree
Hide file tree
Showing 85 changed files with 6,143 additions and 5 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 tryAGI
Copyright (c) 2023 tryAGI and Stephen Hodgson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
154 changes: 154 additions & 0 deletions src/libs/OpenAI/Client/Audio/AudioEndpoint.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
using System;
using System.Globalization;
using System.IO;
using System.Net.Http;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
using OpenAI.Extensions;

namespace OpenAI.Audio
{
    /// <summary>
    /// Transforms audio into text.<br/>
    /// <see href="https://platform.openai.com/docs/api-reference/audio"/>
    /// </summary>
    public sealed class AudioEndpoint : BaseEndPoint
    {
        /// <summary>
        /// Minimal shape of the payload returned by the transcription/translation
        /// endpoints when <see cref="AudioResponseFormat.Json"/> is requested: { "text": "..." }.
        /// </summary>
        private class AudioResponse
        {
            public AudioResponse(string text)
            {
                Text = text;
            }

            [JsonPropertyName("text")]
            public string Text { get; }
        }

        /// <inheritdoc />
        public AudioEndpoint(OpenAIClient api) : base(api) { }

        /// <inheritdoc />
        protected override string Root => "audio";

        /// <summary>
        /// Generates audio from the input text.
        /// </summary>
        /// <param name="request"><see cref="SpeechRequest"/>.</param>
        /// <param name="chunkCallback">Optional, partial chunk <see cref="ReadOnlyMemory{T}"/> callback to stream audio as it arrives.</param>
        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
        /// <returns>The complete generated audio as a <see cref="ReadOnlyMemory{T}"/> of bytes.</returns>
        public async Task<ReadOnlyMemory<byte>> CreateSpeechAsync(SpeechRequest request, Func<ReadOnlyMemory<byte>, Task> chunkCallback = null, CancellationToken cancellationToken = default)
        {
            var jsonContent = JsonSerializer.Serialize(request, OpenAIClient.JsonSerializationOptions).ToJsonStringContent(EnableDebug);
            var response = await Api.Client.PostAsync(GetUrl("/speech"), jsonContent, cancellationToken).ConfigureAwait(false);
            await response.CheckResponseAsync(cancellationToken).ConfigureAwait(false);
            await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false);
            await using var memoryStream = new MemoryStream();
            int bytesRead;
            var totalBytesRead = 0;
            var buffer = new byte[8192];

            while ((bytesRead = await responseStream.ReadAsync(buffer, cancellationToken).ConfigureAwait(false)) > 0)
            {
                await memoryStream.WriteAsync(new ReadOnlyMemory<byte>(buffer, 0, bytesRead), cancellationToken).ConfigureAwait(false);

                if (chunkCallback != null)
                {
                    try
                    {
                        // Slice the accumulated buffer for the just-written chunk instead of
                        // copying it again. The callback is best-effort by design: a failing
                        // consumer must not abort the download.
                        await chunkCallback(new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), totalBytesRead, bytesRead)).ConfigureAwait(false);
                    }
                    catch (Exception e)
                    {
                        // NOTE(review): consider routing this through a logger rather than the console.
                        Console.WriteLine(e);
                    }
                }

                totalBytesRead += bytesRead;
            }

            return new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead);
        }

        /// <summary>
        /// Transcribes audio into the input language.
        /// </summary>
        /// <param name="request"><see cref="AudioTranscriptionRequest"/>. Disposed before the request is sent.</param>
        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
        /// <returns>The transcribed text.</returns>
        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
        {
            using var content = new MultipartFormDataContent();
            using var audioData = new MemoryStream();
            await request.Audio.CopyToAsync(audioData, cancellationToken).ConfigureAwait(false);
            content.Add(new ByteArrayContent(audioData.ToArray()), "file", request.AudioName);
            content.Add(new StringContent(request.Model), "model");

            if (!string.IsNullOrWhiteSpace(request.Prompt))
            {
                content.Add(new StringContent(request.Prompt), "prompt");
            }

            var responseFormat = request.ResponseFormat;
            // Invariant lower-casing: the API expects names like "verbose_json" regardless of host culture.
            content.Add(new StringContent(responseFormat.ToString().ToLowerInvariant()), "response_format");

            if (request.Temperature.HasValue)
            {
                // Invariant culture so the decimal separator is always '.', never a locale-specific ','.
                content.Add(new StringContent(request.Temperature.Value.ToString(CultureInfo.InvariantCulture)), "temperature");
            }

            if (!string.IsNullOrWhiteSpace(request.Language))
            {
                content.Add(new StringContent(request.Language), "language");
            }

            // The audio stream has been fully copied into the form content; release it now.
            request.Dispose();

            var response = await Api.Client.PostAsync(GetUrl("/transcriptions"), content, cancellationToken).ConfigureAwait(false);
            // NOTE(review): unlike CreateSpeechAsync there is no explicit CheckResponseAsync here —
            // presumably ReadAsStringAsync handles error responses; confirm in the extension.
            var responseAsString = await response.ReadAsStringAsync(EnableDebug, cancellationToken).ConfigureAwait(false);

            return responseFormat == AudioResponseFormat.Json
                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                : responseAsString;
        }

        /// <summary>
        /// Translates audio into English.
        /// </summary>
        /// <param name="request"><see cref="AudioTranslationRequest"/>. Disposed before the request is sent.</param>
        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
        /// <returns>The translated text.</returns>
        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
        {
            using var content = new MultipartFormDataContent();
            using var audioData = new MemoryStream();
            await request.Audio.CopyToAsync(audioData, cancellationToken).ConfigureAwait(false);
            content.Add(new ByteArrayContent(audioData.ToArray()), "file", request.AudioName);
            content.Add(new StringContent(request.Model), "model");

            if (!string.IsNullOrWhiteSpace(request.Prompt))
            {
                content.Add(new StringContent(request.Prompt), "prompt");
            }

            var responseFormat = request.ResponseFormat;
            // Invariant lower-casing: the API expects names like "verbose_json" regardless of host culture.
            content.Add(new StringContent(responseFormat.ToString().ToLowerInvariant()), "response_format");

            if (request.Temperature.HasValue)
            {
                // Invariant culture so the decimal separator is always '.', never a locale-specific ','.
                content.Add(new StringContent(request.Temperature.Value.ToString(CultureInfo.InvariantCulture)), "temperature");
            }

            // The audio stream has been fully copied into the form content; release it now.
            request.Dispose();

            var response = await Api.Client.PostAsync(GetUrl("/translations"), content, cancellationToken).ConfigureAwait(false);
            var responseAsString = await response.ReadAsStringAsync(EnableDebug, cancellationToken).ConfigureAwait(false);

            return responseFormat == AudioResponseFormat.Json
                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                : responseAsString;
        }
    }
}
11 changes: 11 additions & 0 deletions src/libs/OpenAI/Client/Audio/AudioResponseFormat.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
namespace OpenAI.Audio
{
    /// <summary>
    /// The format of the transcript output returned by the audio endpoints.<br/>
    /// Member names are lower-cased (invariant) when sent as the "response_format"
    /// form field, so the underscore in <see cref="Verbose_Json"/> is intentional:
    /// it serializes as "verbose_json".
    /// </summary>
    public enum AudioResponseFormat
    {
        /// <summary>JSON payload with a single "text" field. Default.</summary>
        Json = 0,
        /// <summary>Verbose JSON ("verbose_json") — presumably JSON with extra metadata; confirm against the API docs.</summary>
        Verbose_Json,
        /// <summary>Plain text.</summary>
        Text,
        /// <summary>SubRip subtitle format.</summary>
        Srt,
        /// <summary>WebVTT subtitle format.</summary>
        Vtt
    }
}
173 changes: 173 additions & 0 deletions src/libs/OpenAI/Client/Audio/AudioTranscriptionRequest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
using System;
using System.IO;

namespace OpenAI.Audio
{
    /// <summary>
    /// Request payload for the audio transcription endpoint.
    /// Owns the <see cref="Audio"/> stream and disposes it via <see cref="Dispose()"/>.
    /// </summary>
    public sealed class AudioTranscriptionRequest : IDisposable
    {
        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="audioPath">
        /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
        /// </param>
        /// <param name="model">
        /// ID of the model to use.
        /// </param>
        /// <param name="prompt">
        /// Optional, An optional text to guide the model's style or continue a previous audio segment.<br/>
        /// The prompt should be in English.
        /// </param>
        /// <param name="responseFormat">
        /// Optional, The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.<br/>
        /// Defaults to json.
        /// </param>
        /// <param name="temperature">
        /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
        /// while lower values like 0.2 will make it more focused and deterministic. If set to 0,
        /// the model will use log probability to automatically increase the temperature until certain thresholds are hit.<br/>
        /// Defaults to 0
        /// </param>
        /// <param name="language">
        /// Optional, The language of the input audio.
        /// Supplying the input language in ISO-639-1 format will improve accuracy and latency.
        /// </param>
        public AudioTranscriptionRequest(
            string audioPath,
            string model = null,
            string prompt = null,
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null,
            string language = null)
            : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language)
        {
        }

        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="audio">
        /// The audio stream to transcribe. Ownership transfers to this request; it is
        /// closed when <see cref="Dispose()"/> is called.
        /// </param>
        /// <param name="audioName">
        /// The name of the audio file to transcribe. Falls back to "audio.wav" when
        /// null or whitespace, since the multipart form field needs a file name.
        /// </param>
        /// <param name="model">
        /// ID of the model to use. Only whisper-1 is currently available.
        /// </param>
        /// <param name="prompt">
        /// Optional, An optional text to guide the model's style or continue a previous audio segment.<br/>
        /// The prompt should be in English.
        /// </param>
        /// <param name="responseFormat">
        /// Optional, The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.<br/>
        /// Defaults to json.
        /// </param>
        /// <param name="temperature">
        /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
        /// while lower values like 0.2 will make it more focused and deterministic. If set to 0,
        /// the model will use log probability to automatically increase the temperature until certain thresholds are hit.<br/>
        /// Defaults to 0
        /// </param>
        /// <param name="language">
        /// Optional, The language of the input audio.
        /// Supplying the input language in ISO-639-1 format will improve accuracy and latency.
        /// </param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="audio"/> is null.</exception>
        public AudioTranscriptionRequest(
            Stream audio,
            string audioName,
            string model = null,
            string prompt = null,
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null,
            string language = null)
        {
            Audio = audio ?? throw new ArgumentNullException(nameof(audio));

            if (string.IsNullOrWhiteSpace(audioName))
            {
                audioName = "audio.wav";
            }

            AudioName = audioName;
            Model = string.IsNullOrWhiteSpace(model) ? Models.Model.Whisper1 : model;
            Prompt = prompt;
            ResponseFormat = responseFormat;
            Temperature = temperature;
            Language = language;
        }

        /// <summary>
        /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
        /// </summary>
        public Stream Audio { get; }

        /// <summary>
        /// The name of the audio file to transcribe.
        /// </summary>
        public string AudioName { get; }

        /// <summary>
        /// ID of the model to use. Only whisper-1 is currently available.
        /// </summary>
        public string Model { get; }

        /// <summary>
        /// Optional, An optional text to guide the model's style or continue a previous audio segment.<br/>
        /// The prompt should be in English.
        /// </summary>
        public string Prompt { get; }

        /// <summary>
        /// Optional, The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.<br/>
        /// Defaults to json.
        /// </summary>
        public AudioResponseFormat ResponseFormat { get; }

        /// <summary>
        /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
        /// while lower values like 0.2 will make it more focused and deterministic. If set to 0,
        /// the model will use log probability to automatically increase the temperature until certain thresholds are hit.<br/>
        /// Defaults to 0
        /// </summary>
        public float? Temperature { get; }

        /// <summary>
        /// Optional, The language of the input audio.
        /// Supplying the input language in ISO-639-1 format will improve accuracy and latency.
        /// </summary>
        public string Language { get; }

        /// <summary>
        /// Disposes the owned <see cref="Audio"/> stream.
        /// No finalizer is declared: this type holds only managed resources
        /// (Stream.Dispose also closes the stream, so the previous Close+Dispose
        /// pair was redundant), and the underlying stream types perform their
        /// own unmanaged cleanup.
        /// </summary>
        public void Dispose()
        {
            Audio?.Dispose();
        }
    }
}
Loading

0 comments on commit 007d9ce

Please sign in to comment.