Merge branch 'master' into anzr299-patch-2

eaidova authored Dec 20, 2024
2 parents 6ff80b7 + 4d18f8b commit 6a36e94
Showing 40 changed files with 923 additions and 72 deletions.
85 changes: 85 additions & 0 deletions samples/cpp/whisper_speech_recognition/README.md
@@ -33,6 +33,91 @@ timestamps: [0, 2] text: How are you doing today?

See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.

# Whisper pipeline usage

```c++
#include "openvino/genai/whisper_pipeline.hpp"

ov::genai::WhisperPipeline pipeline(model_dir, "CPU");
// The pipeline expects normalized audio with a sample rate of 16 kHz
ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
auto result = pipeline.generate(raw_speech);
// How are you doing today?
```

### Transcription

The Whisper pipeline predicts the language of the source audio automatically.

```c++
ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
auto result = pipeline.generate(raw_speech);
// How are you doing today?
raw_speech = read_wav("fr_sample.wav");
result = pipeline.generate(raw_speech);
// Il s'agit d'une entité très complexe qui consiste...
```

If the source audio language is known in advance, it can be specified as an argument to the `generate` method:

```c++
ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
auto result = pipeline.generate(raw_speech, ov::genai::language("<|en|>"));
// How are you doing today?

raw_speech = read_wav("fr_sample.wav");
result = pipeline.generate(raw_speech, ov::genai::language("<|fr|>"));
// Il s'agit d'une entité très complexe qui consiste...
```

### Translation

By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":

```c++
ov::genai::RawSpeechInput raw_speech = read_wav("fr_sample.wav");
auto result = pipeline.generate(raw_speech, ov::genai::task("translate"));
// It is a very complex entity that consists...
```

### Timestamps prediction

The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:

```c++
ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
auto result = pipeline.generate(raw_speech, ov::genai::return_timestamps(true));

std::cout << std::setprecision(2);
for (auto& chunk : *result.chunks) {
    std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
}
// timestamps: [0, 2] text: How are you doing today?
```

### Long-form audio transcription

The Whisper model is designed to work on audio samples of up to 30 seconds in duration. The Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length.
The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
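
The pipeline applies this internally, so callers simply pass the full audio to `generate`. Purely as an illustration of the idea, a rough sketch (`transcribe_long` is a hypothetical helper; the real implementation also stitches tokens across window boundaries):

```c++
#include <algorithm>
#include <sstream>
#include <string>

#include "openvino/genai/whisper_pipeline.hpp"

std::string transcribe_long(ov::genai::WhisperPipeline& pipeline,
                            const ov::genai::RawSpeechInput& raw_speech) {
    constexpr size_t sample_rate = 16000;        // rate the pipeline expects
    constexpr size_t window = 30 * sample_rate;  // Whisper's 30-second receptive field
    std::stringstream text;
    // Transcribe consecutive 30-second slices one after the other.
    for (size_t start = 0; start < raw_speech.size(); start += window) {
        size_t end = std::min(raw_speech.size(), start + window);
        ov::genai::RawSpeechInput chunk(raw_speech.begin() + start, raw_speech.begin() + end);
        text << pipeline.generate(chunk);
    }
    return text.str();
}
```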

### Initial prompt and hotwords

The Whisper pipeline accepts `initial_prompt` and `hotwords` generation arguments:
* `initial_prompt`: initial prompt tokens passed as a previous transcription (after the `<|startofprev|>` token) to the first processing window
* `hotwords`: hotwords tokens passed as a previous transcription (after the `<|startofprev|>` token) to all processing windows

The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:

```c++
auto result = pipeline.generate(raw_speech);
// He has gone and gone for good answered Paul Icrom who...

result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
// He has gone and gone for good answered Polychrome who...
```
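
Since `hotwords` follows the same pattern, a usage sketch (the expected output mirrors the `initial_prompt` example above; unlike `initial_prompt`, the hint is applied to every processing window):

```c++
result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
// He has gone and gone for good answered Polychrome who...
```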


### Troubleshooting

#### Empty or rubbish output
@@ -28,6 +28,7 @@ int main(int argc, char* argv[]) try {

    std::cout << result << "\n";

    std::cout << std::setprecision(2);
    for (auto& chunk : *result.chunks) {
        std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
    }
87 changes: 87 additions & 0 deletions samples/python/whisper_speech_recognition/README.md
@@ -40,6 +40,93 @@ timestamps: [0, 2] text: How are you doing today?

See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.

# Whisper pipeline usage

```python
import openvino_genai
import librosa

def read_wav(filepath):
    raw_speech, samplerate = librosa.load(filepath, sr=16000)
    return raw_speech.tolist()

pipe = openvino_genai.WhisperPipeline(model_dir, "CPU")
# The pipeline expects normalized audio with a sample rate of 16 kHz
raw_speech = read_wav('how_are_you_doing_today.wav')
result = pipe.generate(raw_speech)
# How are you doing today?
```

### Transcription

The Whisper pipeline predicts the language of the source audio automatically.

```python
raw_speech = read_wav('how_are_you_doing_today.wav')
result = pipe.generate(raw_speech)
# How are you doing today?

raw_speech = read_wav('fr_sample.wav')
result = pipe.generate(raw_speech)
# Il s'agit d'une entité très complexe qui consiste...
```

If the source audio language is known in advance, it can be specified as an argument to the `generate` method:

```python
raw_speech = read_wav("how_are_you_doing_today.wav")
result = pipe.generate(raw_speech, language="<|en|>")
# How are you doing today?

raw_speech = read_wav("fr_sample.wav")
result = pipe.generate(raw_speech, language="<|fr|>")
# Il s'agit d'une entité très complexe qui consiste...
```

### Translation

By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":

```python
raw_speech = read_wav("fr_sample.wav")
result = pipe.generate(raw_speech, task="translate")
# It is a very complex entity that consists...
```

### Timestamps prediction

The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:

```python
raw_speech = read_wav("how_are_you_doing_today.wav")
result = pipe.generate(raw_speech, return_timestamps=True)

for chunk in result.chunks:
print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
# timestamps: [0.00, 2.00] text: How are you doing today?
```

### Long-form audio transcription

The Whisper model is designed to work on audio samples of up to 30 seconds in duration. The Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length.
The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
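
The pipeline applies this internally, so callers simply pass the full audio to `generate`. Purely as an illustration of the idea, a rough sketch (`transcribe_long` is a hypothetical helper reusing the `librosa` import from the first example; the real implementation also stitches tokens across window boundaries):

```python
def transcribe_long(pipe, filepath, sample_rate=16000, window_seconds=30):
    # Load arbitrarily long audio at the 16 kHz rate the pipeline expects.
    raw_speech, _ = librosa.load(filepath, sr=sample_rate)
    window = window_seconds * sample_rate
    texts = []
    # Transcribe consecutive 30-second slices one after the other.
    for start in range(0, len(raw_speech), window):
        chunk = raw_speech[start:start + window].tolist()
        texts.append(str(pipe.generate(chunk)))
    return " ".join(texts)
```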

### Initial prompt and hotwords

The Whisper pipeline accepts `initial_prompt` and `hotwords` generation arguments:
* `initial_prompt`: initial prompt tokens passed as a previous transcription (after the `<|startofprev|>` token) to the first processing window
* `hotwords`: hotwords tokens passed as a previous transcription (after the `<|startofprev|>` token) to all processing windows

The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:

```python
result = pipe.generate(raw_speech)
# He has gone and gone for good answered Paul Icrom who...

result = pipe.generate(raw_speech, initial_prompt="Polychrome")
# He has gone and gone for good answered Polychrome who...
```
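
Since `hotwords` follows the same pattern, a usage sketch (assuming the Python binding exposes a `hotwords` keyword argument like the other generation parameters; unlike `initial_prompt`, the hint is applied to every processing window):

```python
result = pipe.generate(raw_speech, hotwords="Polychrome")
# He has gone and gone for good answered Polychrome who...
```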

### Troubleshooting

#### Empty or rubbish output
@@ -18,7 +18,7 @@ def main():
    parser.add_argument("wav_file_path")
    args = parser.parse_args()

device = "CPU" # GPU can be used as well
device = "CPU" # GPU, NPU can be used as well
    pipe = openvino_genai.WhisperPipeline(args.model_dir, device)

    config = pipe.get_generation_config()
@@ -34,8 +34,9 @@

    print(result)

-    for chunk in result.chunks:
-        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
+    if result.chunks:
+        for chunk in result.chunks:
+            print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")


if "__main__" == __name__:
@@ -19,7 +19,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler {
        DDIM,
        EULER_DISCRETE,
        FLOW_MATCH_EULER_DISCRETE,
-        PNDM
+        PNDM,
+        EULER_ANCESTRAL_DISCRETE
    };

static std::shared_ptr<Scheduler> from_config(const std::filesystem::path& scheduler_config_path,
34 changes: 33 additions & 1 deletion src/cpp/include/openvino/genai/whisper_generation_config.hpp
@@ -3,8 +3,8 @@

#pragma once

-#include <optional>
#include <filesystem>
+#include <optional>

#include "openvino/genai/tokenizer.hpp"
#include "openvino/runtime/compiled_model.hpp"
@@ -46,6 +46,9 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
    // Transcribe token id.
    int64_t transcribe_token_id = 50359;

    // Corresponds to the "<|startofprev|>" token.
    int64_t prev_sot_token_id = 50361;

    // No timestamps token id.
    int64_t no_timestamps_token_id = 50363;

@@ -75,6 +78,32 @@
    // Note that a segment of text refers to a sequence of one or more words, rather than individual words.
    bool return_timestamps = false;

    /*
     * Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
     * window. Can be used to steer the model to use particular spellings or styles.
     *
     * Example:
     *   auto result = pipeline.generate(raw_speech);
     *   // He has gone and gone for good answered Paul Icrom who...
     *
     *   auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
     *   // He has gone and gone for good answered Polychrome who...
     */
    std::optional<std::string> initial_prompt = std::nullopt;

    /*
     * Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows.
     * Can be used to steer the model to use particular spellings or styles.
     *
     * Example:
     *   auto result = pipeline.generate(raw_speech);
     *   // He has gone and gone for good answered Paul Icrom who...
     *
     *   auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
     *   // He has gone and gone for good answered Polychrome who...
     */
    std::optional<std::string> hotwords = std::nullopt;

    // A list containing tokens that will be suppressed at the beginning of the sampling process.
    std::vector<int64_t> begin_suppress_tokens;

@@ -111,9 +140,12 @@ static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
static constexpr ov::Property<int64_t> transcribe_token_id{"transcribe_token_id"};
static constexpr ov::Property<int64_t> translate_token_id{"translate_token_id"};
static constexpr ov::Property<int64_t> no_timestamps_token_id{"no_timestamps_token_id"};
static constexpr ov::Property<int64_t> prev_sot_token_id{"prev_sot_token_id"};
static constexpr ov::Property<std::string> language{"language"};
static constexpr ov::Property<std::string> task{"task"};
static constexpr ov::Property<bool> return_timestamps{"return_timestamps"};
static constexpr ov::Property<std::string> initial_prompt{"initial_prompt"};
static constexpr ov::Property<std::string> hotwords{"hotwords"};
static constexpr ov::Property<std::map<std::string, int64_t>> lang_to_id{"lang_to_id"};

} // namespace genai
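
These properties are passed to `generate` as key-value arguments and, assuming the usual OpenVINO property pattern, several can be combined in one call; a short sketch based on the README examples above:

```c++
auto result = pipeline.generate(raw_speech,
                                ov::genai::language("<|en|>"),
                                ov::genai::return_timestamps(true),
                                ov::genai::initial_prompt("Polychrome"));
```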