// compressed-audio-input.cpp
// Forked from Azure-Samples/cognitive-services-speech-sdk.
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include <iostream> // cin, cout
#include <speechapi_cxx.h>
using namespace Microsoft::CognitiveServices::Speech;
using namespace Microsoft::CognitiveServices::Speech::Audio;
// Opens the file named by compressedFileName for binary reading.
// Returns the resulting FILE handle as an opaque pointer; the pointer is
// null when fopen() fails.
static void* OpenCompressedFile(const std::string& compressedFileName)
{
    const char* path = compressedFileName.c_str();
    return fopen(path, "rb");
}
// Close callback for the pull audio stream: closes the FILE handle passed as
// an opaque pointer.
// A null pointer (for example when the file could not be opened) is ignored,
// because calling fclose() on a null stream is undefined behavior.
static void closeStream(void* fp)
{
    if (fp == nullptr)
    {
        return;
    }
    fclose(static_cast<FILE*>(fp));
}
// Read callback for the pull audio stream.
//   stream  - opaque pointer to the FILE handle to read from.
//   ptr     - destination buffer for the bytes read.
//   bufSize - maximum number of bytes to read into ptr.
// Returns the number of bytes read by fread(), or 0 when the stream is null
// or its end-of-file indicator is already set.
static int ReadCompressedBinaryData(void *stream, uint8_t *ptr, uint32_t bufSize)
{
    FILE* const source = static_cast<FILE*>(stream);

    // Nothing to deliver without a stream or once EOF has been reached.
    if (source == nullptr || feof(source))
    {
        return 0;
    }

    const size_t bytesRead = fread(ptr, 1, bufSize, source);
    return static_cast<int>(bytesRead);
}
void recognizeSpeech(const std::string& compressedFileName)
{
std::shared_ptr<SpeechRecognizer> recognizer;
std::shared_ptr<PullAudioInputStream> pullAudioStream;
// Creates an instance of a speech config with specified subscription key and service region.
// Replace with your own subscription key and service region (e.g., "westus").
auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion");
AudioStreamContainerFormat inputFormat;
if (compressedFileName.find(".mp3") == (compressedFileName.size() - 4))
{
inputFormat = AudioStreamContainerFormat::MP3;
}
else if (compressedFileName.find(".opus") == (compressedFileName.size() - 5))
{
inputFormat = AudioStreamContainerFormat::OGG_OPUS;
}
else
{
std::cout << "Only Opus and MP3 input files are currently supported" << std::endl;
return;
}
pullAudioStream = AudioInputStream::CreatePullStream(
AudioStreamFormat::GetCompressedFormat(inputFormat),
OpenCompressedFile(compressedFileName),
ReadCompressedBinaryData,
closeStream
);
recognizer = SpeechRecognizer::FromConfig(config, AudioConfig::FromStreamInput(pullAudioStream));
std::cout << "Recognizing ..." << std::endl;
// Starts speech recognition, and returns after a single utterance is recognized. The end of a
// single utterance is determined by listening for silence at the end or until a maximum of 15
// seconds of audio is processed. The task returns the recognition text as result.
// Note: Since RecognizeOnceAsync() returns only a single utterance, it is suitable only for single
// shot recognition like command or query.
// For long-running multi-utterance recognition, use StartContinuousRecognitionAsync() instead.
auto result = recognizer->RecognizeOnceAsync().get();
// Checks result.
if (result->Reason == ResultReason::RecognizedSpeech) {
std::cout << "We recognized: " << result->Text << std::endl;
}
else if (result->Reason == ResultReason::NoMatch) {
std::cout << "NOMATCH: Speech could not be recognized." << std::endl;
}
else if (result->Reason == ResultReason::Canceled) {
auto cancellation = CancellationDetails::FromResult(result);
std::cout << "CANCELED: Reason=" << (int)cancellation->Reason << std::endl;
if (cancellation->Reason == CancellationReason::Error) {
std::cout << "CANCELED: ErrorCode= " << (int)cancellation->ErrorCode << std::endl;
std::cout << "CANCELED: ErrorDetails=" << cancellation->ErrorDetails << std::endl;
std::cout << "CANCELED: Did you update the subscription info?" << std::endl;
}
}
}
// Entry point: expects exactly one command-line argument, the path of the
// compressed audio file to recognize. Prints a usage line otherwise.
// The exit code is 0 in both cases.
int main(int argc, char **argv)
{
    const bool hasFileArgument = (argc == 2);
    if (hasFileArgument)
    {
        // Switch from the default "C" locale to the one configured in the environment.
        setlocale(LC_ALL, "");
        recognizeSpeech(argv[1]);
    }
    else
    {
        std::cout << "Usage: ./compressed-audio-input <filename>" << std::endl;
    }
    return 0;
}