From 37469ea7b26a75b1506c407081b94243725dd1f6 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 24 Oct 2024 17:15:07 +0200 Subject: [PATCH] Support return timestamps --- .../whisper_speech_recognition.cpp | 2 +- src/cpp/src/whisper_pipeline_static.cpp | 38 ++++++++++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index 48770dd61b..7b6d5f131c 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try { std::filesystem::path models_path = argv[1]; std::string wav_file_path = argv[2]; - std::string device = "CPU"; // GPU can be used as well + std::string device = "CPU"; // GPU can be used as well ov::genai::WhisperPipeline pipeline(models_path, device); diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index 470a95fa61..3ac4d9ac3b 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -23,7 +23,7 @@ void fill_tensor(ov::Tensor tensor, T fill_val) { } template -void copy_to_tensor(const std::vector& src_vec, ov::Tensor dst_tensor) { +void copy_to_tensor(const std::vector& src_vec, ov::Tensor&& dst_tensor) { auto* dst_ptr = dst_tensor.data(); OPENVINO_ASSERT(src_vec.size() == dst_tensor.get_size()); std::copy(src_vec.begin(), src_vec.end(), dst_ptr); @@ -47,7 +47,7 @@ ov::Tensor encode(ov::InferRequest& request, } // FIXME: Duplicate from llm_pipeline_static.cpp - need to reuse instead of copy-paste -ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, size_t end_pos) { +ov::Tensor make_tensor_slice(ov::Tensor& tensor, size_t dim, size_t start_pos, size_t end_pos) { ov::Shape start_shape(std::vector(tensor.get_shape().size(), 0u)); start_shape[dim] = start_pos; ov::Shape end_shape = tensor.get_shape(); @@ -100,16 +100,40 @@ void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, con } } +void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder, + const std::vector& init_ids, + const int64_t pad_token) { + auto input_ids_tensor = decoder.get_tensor("input_ids"); + auto attention_mask_tensor = decoder.get_tensor("attention_mask"); + + const size_t seq_length = input_ids_tensor.get_shape()[1]; + + OPENVINO_ASSERT(seq_length >= init_ids.size()); + + const size_t pad_size = seq_length - init_ids.size(); + + // pad right + // input_ids [token, token, token, pad_token] + // attention_mask [1, 1, 1, 0] + // normally pad left expected to work, but it didn't + auto input_ids_data = input_ids_tensor.data(); + std::copy(init_ids.begin(), init_ids.end(), input_ids_data); + std::fill_n(input_ids_data + init_ids.size(), pad_size, static_cast(pad_token)); + + auto attention_mask_data = attention_mask_tensor.data(); + std::fill_n(attention_mask_data, init_ids.size(), 1u); + std::fill_n(attention_mask_data + init_ids.size(), pad_size, 0u); +} + int64_t decode(ov::Tensor& encoder_hidden_state, ov::InferRequest& decoder, - const std::vector& input_ids, + const std::vector& init_ids, const ov::genai::WhisperGenerationConfig& config, const bool apply_logit_processors = true, const bool return_timestamps = false) { // NB: Fill decoder inputs encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states")); - copy_to_tensor(input_ids, decoder.get_tensor("input_ids")); - fill_tensor(decoder.get_tensor("attention_mask"), 1); + set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id); decoder.infer(); @@ -166,6 +190,8 @@ void zero_past_key_values(ov::InferRequest& request) { } void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) { + // initial mask size of 3 tested with 4 init tokens + // it may change in case of fewer init tokens // NB: Prepare attetion mask to be in a format [1, 1, 1, 0, 0, 0, 0, ..., 1] auto attention_mask = decoder_with_past.get_tensor("attention_mask"); auto* attention_mask_ptr = attention_mask.data(); @@ -224,7 +250,7 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta const size_t max_new_tokens, const bool return_timestamps, const std::shared_ptr streamer) { - int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config); + int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps); std::vector output_tokens{output_token}; const size_t timestamp_begin = config.no_timestamps_token_id + 1;