Support return timestamps
as-suvorov committed Oct 24, 2024
1 parent fb34668 commit 37469ea
Showing 2 changed files with 33 additions and 7 deletions.
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {
 
     std::filesystem::path models_path = argv[1];
     std::string wav_file_path = argv[2];
-    std::string device = "CPU"; // GPU can be used as well
+    std::string device = "CPU";  // GPU can be used as well
 
     ov::genai::WhisperPipeline pipeline(models_path, device);
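For context, an editor's sketch that is not part of the commit: with timestamp support in place, the sample could request timestamps roughly as below. The `return_timestamps` flag, the `utils::audio::read_wav` helper, and the `chunks`/`start_ts`/`end_ts` names are assumptions about the ov::genai API, not confirmed by this diff.

    // Hedged sketch: requesting timestamps from WhisperPipeline (names assumed).
    ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config();
    config.return_timestamps = true;  // assumed flag exercised by this commit

    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);  // assumed sample helper
    auto result = pipeline.generate(raw_speech, config);

    if (result.chunks.has_value()) {  // assumed std::optional of timestamped chunks
        for (const auto& chunk : *result.chunks) {
            std::cout << chunk.start_ts << "s -> " << chunk.end_ts << "s: " << chunk.text << "\n";
        }
    }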
src/cpp/src/whisper_pipeline_static.cpp (32 additions, 6 deletions)
@@ -23,7 +23,7 @@ void fill_tensor(ov::Tensor tensor, T fill_val) {
 }
 
 template <typename T>
-void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
+void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor&& dst_tensor) {
     auto* dst_ptr = dst_tensor.data<T>();
     OPENVINO_ASSERT(src_vec.size() == dst_tensor.get_size());
     std::copy(src_vec.begin(), src_vec.end(), dst_ptr);
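An editor's note on the `ov::Tensor&&` change: `InferRequest::get_tensor` returns the tensor handle by value, so call sites pass a temporary, and an rvalue-reference parameter binds that temporary without copying the handle. This is a reading of the intent, not something the commit states. The call shape in question:

    // The temporary returned by get_tensor() binds directly to ov::Tensor&&.
    // ov::Tensor is a shared handle, so writes still reach the request's buffer.
    copy_to_tensor(init_ids, decoder.get_tensor("input_ids"));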
@@ -47,7 +47,7 @@ ov::Tensor encode(ov::InferRequest& request,
 }
 
 // FIXME: Duplicate from llm_pipeline_static.cpp - need to reuse instead of copy-paste
-ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, size_t end_pos) {
+ov::Tensor make_tensor_slice(ov::Tensor& tensor, size_t dim, size_t start_pos, size_t end_pos) {
     ov::Shape start_shape(std::vector<size_t>(tensor.get_shape().size(), 0u));
     start_shape[dim] = start_pos;
     ov::Shape end_shape = tensor.get_shape();
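The hunk ends before the function's tail. In OpenVINO, a zero-copy view like this is typically finished with `ov::Tensor`'s region-of-interest constructor; a sketch under that assumption (the name `make_tensor_slice_sketch` is the editor's, and the diff does not show these closing lines):

    // Assumed completion of make_tensor_slice: the ov::Tensor ROI constructor
    // returns a zero-copy view of `tensor` spanning [start_pos, end_pos) on `dim`.
    ov::Tensor make_tensor_slice_sketch(ov::Tensor& tensor, size_t dim, size_t start_pos, size_t end_pos) {
        ov::Shape start_shape(std::vector<size_t>(tensor.get_shape().size(), 0u));
        start_shape[dim] = start_pos;
        ov::Shape end_shape = tensor.get_shape();
        end_shape[dim] = end_pos;
        return ov::Tensor(tensor, start_shape, end_shape);  // ROI view, shares memory
    }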
@@ -100,16 +100,40 @@ void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, con
     }
 }
 
+void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder,
+                                          const std::vector<int32_t>& init_ids,
+                                          const int64_t pad_token) {
+    auto input_ids_tensor = decoder.get_tensor("input_ids");
+    auto attention_mask_tensor = decoder.get_tensor("attention_mask");
+
+    const size_t seq_length = input_ids_tensor.get_shape()[1];
+
+    OPENVINO_ASSERT(seq_length >= init_ids.size());
+
+    const size_t pad_size = seq_length - init_ids.size();
+
+    // Pad on the right:
+    //   input_ids:      [token, token, token, pad_token]
+    //   attention_mask: [1, 1, 1, 0]
+    // Left padding would normally be expected to work, but it did not.
+    auto input_ids_data = input_ids_tensor.data<int32_t>();
+    std::copy(init_ids.begin(), init_ids.end(), input_ids_data);
+    std::fill_n(input_ids_data + init_ids.size(), pad_size, static_cast<int32_t>(pad_token));
+
+    auto attention_mask_data = attention_mask_tensor.data<ov::float16>();
+    std::fill_n(attention_mask_data, init_ids.size(), 1u);
+    std::fill_n(attention_mask_data + init_ids.size(), pad_size, 0u);
+}
+
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
-               const std::vector<int32_t>& input_ids,
+               const std::vector<int32_t>& init_ids,
                const ov::genai::WhisperGenerationConfig& config,
                const bool apply_logit_processors = true,
                const bool return_timestamps = false) {
     // NB: Fill decoder inputs
     encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
-    copy_to_tensor(input_ids, decoder.get_tensor("input_ids"));
-    fill_tensor<ov::float16>(decoder.get_tensor("attention_mask"), 1);
+    set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
     decoder.infer();
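To make the right-padding scheme concrete, here is a self-contained editor's illustration using plain std::vector in place of ov::Tensor; the token values are made up:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    int main() {
        const std::vector<int32_t> init_ids = {50258, 50259, 50359};  // made-up init tokens
        const int32_t pad_token = 50257;                              // made-up pad id
        const size_t seq_length = 8;                                  // static input length

        std::vector<int32_t> input_ids(seq_length);
        std::vector<float> attention_mask(seq_length);

        // Real tokens first, pad tokens after; the mask is 1 over real tokens only.
        std::copy(init_ids.begin(), init_ids.end(), input_ids.begin());
        std::fill(input_ids.begin() + init_ids.size(), input_ids.end(), pad_token);
        std::fill_n(attention_mask.begin(), init_ids.size(), 1.0f);
        std::fill(attention_mask.begin() + init_ids.size(), attention_mask.end(), 0.0f);

        // input_ids:      [50258, 50259, 50359, 50257, 50257, 50257, 50257, 50257]
        // attention_mask: [1, 1, 1, 0, 0, 0, 0, 0]
        return 0;
    }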
@@ -166,6 +190,8 @@ void zero_past_key_values(ov::InferRequest& request) {
 }
 
 void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder) {
+    // The initial mask size of 3 was tested with 4 init tokens;
+    // it may change if there are fewer init tokens.
     // NB: Prepare attention mask to be in a format [1, 1, 1, 0, 0, 0, 0, ..., 1]
     auto attention_mask = decoder_with_past.get_tensor("attention_mask");
     auto* attention_mask_ptr = attention_mask.data<ov::float16>();
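The [1, 1, 1, 0, ..., 0, 1] layout reads as: ones over the already-processed init tokens, zeros over unused slots, and a trailing 1 for the position currently being decoded. A standalone sketch of that construction (the interpretation of the trailing 1 is the editor's assumption; `make_past_mask` is a hypothetical helper):

    // Build an attention mask shaped [1, 1, 1, 0, ..., 0, 1]: ones over the
    // `init_size` leading positions, then a final 1 for the current token.
    std::vector<float> make_past_mask(size_t total_len, size_t init_size) {
        std::vector<float> mask(total_len, 0.0f);
        std::fill_n(mask.begin(), init_size, 1.0f);
        mask[total_len - 1] = 1.0f;  // slot for the token being decoded
        return mask;
    }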
@@ -224,7 +250,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                                   const size_t max_new_tokens,
                                                   const bool return_timestamps,
                                                   const std::shared_ptr<ov::genai::StreamerBase> streamer) {
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
     std::vector<int64_t> output_tokens{output_token};
 
     const size_t timestamp_begin = config.no_timestamps_token_id + 1;
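On `timestamp_begin`: in Whisper's vocabulary the timestamp tokens sit immediately after `<|notimestamps|>`, and each successive id represents a 0.02 s step. A sketch of the id-to-seconds conversion (the 0.02 s granularity is standard Whisper behavior, not shown in this diff; the helper name is the editor's):

    // Convert a Whisper timestamp token id to seconds, assuming ids at and
    // above `timestamp_begin` encode time in 0.02 s increments.
    double timestamp_token_to_seconds(int64_t token_id, int64_t timestamp_begin) {
        return static_cast<double>(token_id - timestamp_begin) * 0.02;
    }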
