From 37469ea7b26a75b1506c407081b94243725dd1f6 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov <alexander.suvorov@intel.com>
Date: Thu, 24 Oct 2024 17:15:07 +0200
Subject: [PATCH] Support return timestamps

---
 .../whisper_speech_recognition.cpp            |  2 +-
 src/cpp/src/whisper_pipeline_static.cpp       | 38 ++++++++++++++++---
 2 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
index 48770dd61b..7b6d5f131c 100644
--- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
+++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {
 
     std::filesystem::path models_path = argv[1];
     std::string wav_file_path = argv[2];
-    std::string device = "CPU"; // GPU can be used as well
+    std::string device = "CPU";  // GPU can be used as well
 
     ov::genai::WhisperPipeline pipeline(models_path, device);
 
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index 470a95fa61..3ac4d9ac3b 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -23,7 +23,7 @@ void fill_tensor(ov::Tensor tensor, T fill_val) {
 }
 
 template <typename T>
-void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
+void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor&& dst_tensor) {
     auto* dst_ptr = dst_tensor.data<T>();
     OPENVINO_ASSERT(src_vec.size() == dst_tensor.get_size());
     std::copy(src_vec.begin(), src_vec.end(), dst_ptr);
@@ -47,7 +47,7 @@ ov::Tensor encode(ov::InferRequest& request,
 }
 
 // FIXME: Duplicate from llm_pipeline_static.cpp - need to reuse instead of copy-paste
-ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, size_t end_pos) {
+ov::Tensor make_tensor_slice(ov::Tensor& tensor, size_t dim, size_t start_pos, size_t end_pos) {
     ov::Shape start_shape(std::vector<size_t>(tensor.get_shape().size(), 0u));
     start_shape[dim] = start_pos;
     ov::Shape end_shape = tensor.get_shape();
@@ -100,16 +100,40 @@ void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, con
     }
 }
 
+void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder,
+                                          const std::vector<int32_t>& init_ids,
+                                          const int64_t pad_token) {
+    auto input_ids_tensor = decoder.get_tensor("input_ids");
+    auto attention_mask_tensor = decoder.get_tensor("attention_mask");
+
+    const size_t seq_length = input_ids_tensor.get_shape()[1];
+
+    OPENVINO_ASSERT(seq_length >= init_ids.size());
+
+    const size_t pad_size = seq_length - init_ids.size();
+
+    // pad right
+    // input_ids [token, token, token, pad_token]
+    // attention_mask [1, 1, 1, 0]
+    // normally pad left expected to work, but it didn't
+    auto input_ids_data = input_ids_tensor.data<int32_t>();
+    std::copy(init_ids.begin(), init_ids.end(), input_ids_data);
+    std::fill_n(input_ids_data + init_ids.size(), pad_size, static_cast<int32_t>(pad_token));
+
+    auto attention_mask_data = attention_mask_tensor.data<ov::float16>();
+    std::fill_n(attention_mask_data, init_ids.size(), 1u);
+    std::fill_n(attention_mask_data + init_ids.size(), pad_size, 0u);
+}
+
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
-               const std::vector<int32_t>& input_ids,
+               const std::vector<int32_t>& init_ids,
                const ov::genai::WhisperGenerationConfig& config,
                const bool apply_logit_processors = true,
                const bool return_timestamps = false) {
     // NB: Fill decoder inputs
     encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
-    copy_to_tensor(input_ids, decoder.get_tensor("input_ids"));
-    fill_tensor<ov::float16>(decoder.get_tensor("attention_mask"), 1);
+    set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
     decoder.infer();
 
@@ -166,6 +190,8 @@ void zero_past_key_values(ov::InferRequest& request) {
 }
 
 void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) {
+    // initial mask size of 3 tested with 4 init tokens
+    // it may change in case of fewer init tokens
     // NB: Prepare attetion mask to be in a format [1, 1, 1, 0, 0, 0, 0, ..., 1]
     auto attention_mask = decoder_with_past.get_tensor("attention_mask");
     auto* attention_mask_ptr = attention_mask.data<ov::float16>();
@@ -224,7 +250,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                                   const size_t max_new_tokens,
                                                   const bool return_timestamps,
                                                   const std::shared_ptr<ov::genai::StreamerBase> streamer) {
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
     std::vector<int64_t> output_tokens{output_token};
 
     const size_t timestamp_begin = config.no_timestamps_token_id + 1;