From 937d4476d217a04e57f2e27bd1e62861fee7a803 Mon Sep 17 00:00:00 2001 From: Sofya Balandina Date: Sat, 16 Nov 2024 03:12:55 +0000 Subject: [PATCH] Fix wrong logits processing without applying of slice matmul (#1217) --- src/cpp/src/llm_pipeline.cpp | 5 ----- src/cpp/src/lm_encoding.cpp | 5 ++++- src/cpp/src/visual_language/pipeline.cpp | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 830fa7ac37..62a72b1cbd 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -269,18 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { SequenceGroup::Ptr sequence_group; if (is_chat_conversation && !m_is_cache_empty) { sequence_group = std::make_shared(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching); - sequence_group->update_processed_tokens_num(m_tokenized_chat_history.input_ids.get_shape().at(1) - 1); } else { size_t seq_len = input_ids.get_shape().at(1); size_t batch_offset = request_id * seq_len; const int64_t* prompt_start = input_ids.data() + batch_offset; std::vector tokenized_prompt(prompt_start, prompt_start + seq_len); - // in case of multi batch scenario, remove eos_token_id at start of prompt - auto real_prompt_start = std::find_if(tokenized_prompt.begin(), tokenized_prompt.end(), [&config](int64_t token) { return token != config.eos_token_id; }); - tokenized_prompt.erase(tokenized_prompt.begin(), real_prompt_start); sequence_group = std::make_shared(request_id, tokenized_prompt, config, block_size, enable_prefix_caching); - sequence_group->update_processed_tokens_num(tokenized_prompt.size() - 1); } sequence_group->set_sequence_group_ptr(sequence_group); diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 644aa369c6..c76d9f7edf 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -106,9 +106,12 @@ std::pair get_lm_encoded_results( auto logits = m_llm.get_tensor("logits"); int64_t sequence_len = logits.get_shape().at(1); - for (auto& sequence_group : sequence_groups) + for (auto& sequence_group : sequence_groups) { + sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len); sequence_group->schedule_tokens(sequence_len); + } + std::map beam_offets; for (size_t i = 0; i < sequence_groups.size(); i++) beam_offets.insert({sequence_groups.at(i)->get_request_id(), i}); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 92358f5810..9ece0ff754 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -105,7 +105,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { std::fill_n(prompt_ids.data(), prompt_ids.get_size(), 0); SequenceGroup::Ptr sequence_group = std::make_shared(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching); - sequence_group->update_processed_tokens_num(history_size); sequence_group->set_sequence_group_ptr(sequence_group); requests.push_back(sequence_group);