From 321e0211f27998214d0e06fbf0dce533b582b2e2 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 4 Jul 2024 10:28:54 +0000 Subject: [PATCH 01/10] Support chat mode for LLM Static pipeline - naive --- src/cpp/src/llm_pipeline_static.cpp | 55 +++++++++++++++++++++-------- src/cpp/src/llm_pipeline_static.hpp | 11 +++--- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ec123aa167..0fc5c36901 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -8,6 +8,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include + namespace { std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { @@ -75,18 +77,15 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void fill_tensor(ov::Tensor tensor, int64_t fill_val) { +void fill_tensor(ov::Tensor tensor, int64_t fill_val, int32_t offset = 0) { int64_t* tensor_data = tensor.data(); - std::fill(tensor_data, tensor_data + tensor.get_size(), fill_val); + std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); } -void copy_with_left_offset(const ov::Tensor& orig, ov::Tensor& padded) { - const auto orig_size = orig.get_size(); - const auto padded_size = padded.get_size(); - const auto kLeftOffset = padded_size - orig_size; +void copy_with_offset(const ov::Tensor& orig, const int32_t offset, ov::Tensor& padded) { int64_t* orig_data = orig.data(); int64_t* padded_data = padded.data(); - std::copy(orig_data, orig_data + orig_size, padded_data + kLeftOffset); + std::copy(orig_data, orig_data + orig.get_size(), padded_data + offset); } ov::AnyMap extract_config_or_empty(const ov::AnyMap& config, const std::string& config_name) { @@ -127,11 +126,12 @@ StaticLLMPipeline::StaticLLMPipeline( // (1) Read the template model - this will be kvcache model auto kvcache_model = core.read_model(path / "openvino_model.xml"); // (2) TODO: Expose KV-cache input and output layers from kvcache model + ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Clone the model - this will be prefill auto prefill_model = kvcache_model->clone(); prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); // (4) Reshape both models to static shape - m_kvcache_desc = KVCacheDesc { 1024u, 0u }; + m_kvcache_desc = KVCacheDesc { 1024, 0u }; const uint32_t max_prompt_size = m_kvcache_desc.total_size; const uint32_t max_kvcache_size = m_kvcache_desc.total_size; reshape_to_static(prefill_model, max_prompt_size, max_kvcache_size); @@ -156,6 +156,14 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(path, path.string(), device, config) { } +void StaticLLMPipeline::start_chat() { + m_is_chat_conversation = true; +}; +void StaticLLMPipeline::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; + void StaticLLMPipeline::prepare_for_new_conversation() { fill_tensor(m_prefill_request.get_tensor("input_ids"), m_tokenizer.get_pad_token_id()); fill_tensor(m_prefill_request.get_tensor("position_ids"), 0u); @@ -175,9 +183,25 @@ DecodedResults StaticLLMPipeline::generate( } OPENVINO_ASSERT(std::holds_alternative(inputs)); - auto tokenized_input = m_tokenizer.encode(std::get(inputs)); + auto& prompt = std::get(inputs); + + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, 
add_generation_prompt); + } + + auto tokenized_input = m_tokenizer.encode(prompt); auto encoded_results = generate(tokenized_input, config, streamer); - return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + + if (m_is_chat_conversation) { + // Tail of chat template is missing in KV cache. + // Find the tail to concatenate it with the next input prompt. + auto answer = decoded_results.texts[0]; + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + return decoded_results; } EncodedResults StaticLLMPipeline::generate( @@ -224,20 +248,22 @@ EncodedResults StaticLLMPipeline::generate( results.scores.resize(1u); results.tokens.resize(1u); - // NB: Check if input prompt less than maximum size + // NB: Check if there is enough space in KV-cache to process input prompt auto prompt_len = input_ids.get_size(); if (prompt_len > m_kvcache_desc.total_size) { OPENVINO_THROW("Currently static pipeline only process up to " + std::to_string(m_kvcache_desc.total_size) + " tokens"); } - // NB: Reset tensors on every generate call - chat conversation isn't supported yet! + // NB: From the "generate" perspective, every prompt is treated as start of new conversation, + // but in case the real chat, prompt contains information about past conversation context prepare_for_new_conversation(); auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); - copy_with_left_offset(input_ids, padded_input_ids); + const auto offset = padded_input_ids.get_size() - input_ids.get_size(); + copy_with_offset(input_ids, offset, padded_input_ids); auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); - copy_with_left_offset(attention_mask, padded_attention_mask); + fill_tensor(padded_attention_mask, 1u, offset); auto padded_position_ids = m_prefill_request.get_tensor("position_ids"); auto* padded_pos_data = padded_position_ids.data(); @@ -254,7 +280,6 @@ EncodedResults StaticLLMPipeline::generate( padded_attention_mask.copy_to(m_kvcache_request.get_tensor("attention_mask")); - // Inputs: input_ids, attention_mask, position_ids, ... // Outputs: logits, ... 
const auto kStartInputKVCacheLayers = 3u; diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 2ec40c2152..c883c8004b 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -35,12 +35,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { StreamerVariant streamer ) override; - void start_chat() override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; - void finish_chat() override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; + void start_chat() override; + void finish_chat() override; private: void prepare_for_new_conversation(); @@ -54,6 +50,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; + + bool m_is_chat_conversation = false; + ChatHistory m_history; }; } // namespace genai From 6e8c2e0a81878c70eed95997067ee05bfed35036 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Fri, 5 Jul 2024 09:40:40 +0000 Subject: [PATCH 02/10] Fix bug with first missing generated token --- src/cpp/src/llm_pipeline_static.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 084e83cf8f..5847e63a2a 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -246,6 +246,7 @@ EncodedResults StaticLLMPipeline::generate( ov::genai::EncodedResults results; // NB: Only batch=1 is supported now results.scores.resize(1u); + results.scores[0] = 0u; results.tokens.resize(1u); // NB: Check if there is enough space in KV-cache to process input prompt @@ -274,6 +275,7 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += prompt_len; int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; } @@ -311,7 +313,6 @@ EncodedResults StaticLLMPipeline::generate( last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); results.tokens[0].push_back(last_token); - results.scores[0] = 0u; if (streamer_ptr && streamer_ptr->put(last_token)) { break; From 361aa057ccca4aa091d60fcd96520c4e72a697e8 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 15 Jul 2024 14:15:18 +0000 Subject: [PATCH 03/10] Small updates * Adapt for new start_chat(const string&) signature * Handle "ignore_eos" config option --- src/cpp/src/llm_pipeline_static.cpp | 13 +++++++------ src/cpp/src/llm_pipeline_static.hpp | 3 +-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index cd6035a0a9..9ef411e534 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -176,7 +176,10 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(path, path.string(), device, config) { } -void StaticLLMPipeline::start_chat() { +void StaticLLMPipeline::start_chat(const std::string& system_message) { + if (system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } m_is_chat_conversation = true; }; void StaticLLMPipeline::finish_chat() { @@ -216,8 +219,6 @@ DecodedResults StaticLLMPipeline::generate( DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), 
encoded_results.scores}; if (m_is_chat_conversation) { - // Tail of chat template is missing in KV cache. - // Find the tail to concatenate it with the next input prompt. auto answer = decoded_results.texts[0]; m_history.push_back({{"role", "assistant"}, {"content", answer}}); } @@ -275,8 +276,8 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently static pipeline only process up to " + std::to_string(m_kvcache_desc.total_size) + " tokens"); } - // NB: From the "generate" perspective, every prompt is treated as start of new conversation, - // but in case the real chat, prompt contains information about past conversation context + // NB: From the "generate" perspective, every call is treated as start of new conversation, + // but if continuation is needed, prompt contains information about the entire conversation. prepare_for_new_conversation(); auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); @@ -338,7 +339,7 @@ EncodedResults StaticLLMPipeline::generate( break; } - if (last_token == m_generation_config.eos_token_id) { + if (last_token == config.eos_token_id && !config.ignore_eos) { break; } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index c883c8004b..85488e1880 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -35,9 +35,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { StreamerVariant streamer ) override; - void start_chat() override; + void start_chat(const std::string& system_message) override; void finish_chat() override; - private: void prepare_for_new_conversation(); From 7d826a341138cde6c36f704d2c81c58d41d594a1 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 15 Jul 2024 14:16:21 +0000 Subject: [PATCH 04/10] Extend chat_sample to accept device string --- samples/cpp/chat_sample/chat_sample.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index d9d9c2b2de..e5e89c203d 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -4,15 +4,14 @@ #include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (2 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + if (2 != argc && 3 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); } std::string prompt; std::string model_path = argv[1]; + std::string device = argc == 3 ? 
argv[2] : "CPU"; + ov::genai::LLMPipeline pipe(model_path, device); - std::string device = "CPU"; // GPU can be used as well - ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::genai::GenerationConfig config; config.max_new_tokens = 100; std::function streamer = [](std::string word) { From 389dcdf8fec335b6de5d0d06026b219122f0fb8f Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Tue, 16 Jul 2024 09:33:33 +0000 Subject: [PATCH 05/10] Emphasize that device is optional in chat_sample --- samples/cpp/chat_sample/chat_sample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index e5e89c203d..d8cb8305bb 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -5,7 +5,7 @@ int main(int argc, char* argv[]) try { if (2 != argc && 3 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " (optional)"); } std::string prompt; std::string model_path = argv[1]; From 758fd05bb18438e918dc0eaaf1f164398b4ee609 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Wed, 17 Jul 2024 09:09:06 +0000 Subject: [PATCH 06/10] Revert chat_sample.cpp changes --- samples/cpp/chat_sample/chat_sample.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index d8cb8305bb..ae4dad88a2 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -4,14 +4,15 @@ #include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (2 != argc && 3 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " (optional)"); + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); } std::string prompt; std::string model_path = argv[1]; - std::string device = argc == 3 ? 
argv[2] : "CPU"; - ov::genai::LLMPipeline pipe(model_path, device); + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(model_path, "CPU"); + ov::genai::GenerationConfig config; config.max_new_tokens = 100; std::function streamer = [](std::string word) { From b1e53a4fc454a036e4b824da5f39b929adc04a73 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Wed, 17 Jul 2024 09:16:24 +0000 Subject: [PATCH 07/10] Change offset integer type --- src/cpp/src/llm_pipeline_static.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 9ef411e534..05e9960966 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -77,7 +77,7 @@ void reshape_to_static(std::shared_ptr model, model->reshape(new_shapes); } -void fill_tensor(ov::Tensor tensor, int64_t fill_val, int32_t offset = 0) { +void fill_tensor(ov::Tensor tensor, int64_t fill_val, size_t offset = 0u) { int64_t* tensor_data = tensor.data(); std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); } @@ -281,7 +281,7 @@ EncodedResults StaticLLMPipeline::generate( prepare_for_new_conversation(); auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); - const auto offset = padded_input_ids.get_size() - input_ids.get_size(); + const size_t offset = padded_input_ids.get_size() - input_ids.get_size(); copy_with_offset(input_ids, offset, padded_input_ids); auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); From d20e86542176511e62d9b0eda2be7fe33a496377 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Wed, 17 Jul 2024 11:24:34 +0100 Subject: [PATCH 08/10] Update src/cpp/src/llm_pipeline_static.cpp Co-authored-by: Pavel Esir --- src/cpp/src/llm_pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 05e9960966..376e280487 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -177,7 +177,7 @@ StaticLLMPipeline::StaticLLMPipeline( } void StaticLLMPipeline::start_chat(const std::string& system_message) { - if (system_message.empty()) { + if (!system_message.empty()) { m_history.push_back({{"role", "system"}, {"content", system_message}}); } m_is_chat_conversation = true; From bb47c8cfe7d2000f970185d442f9d8b9eac342ff Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Wed, 17 Jul 2024 11:34:15 +0100 Subject: [PATCH 09/10] Update src/cpp/src/llm_pipeline_static.cpp Co-authored-by: Pavel Esir --- src/cpp/src/llm_pipeline_static.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 376e280487..9954b6a45e 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -182,6 +182,7 @@ void StaticLLMPipeline::start_chat(const std::string& system_message) { } m_is_chat_conversation = true; }; + void StaticLLMPipeline::finish_chat() { m_is_chat_conversation = false; m_history.clear(); From 6e1a66527b2b2ef18358c89195c444865a4f685c Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Wed, 17 Jul 2024 12:20:16 +0100 Subject: [PATCH 10/10] Update llm_pipeline_static.cpp --- src/cpp/src/llm_pipeline_static.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 9954b6a45e..3f50d30ec9 100644 
--- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -108,7 +108,7 @@ ov::AnyMap extract_config_or_default(const ov::AnyMap& config, const std::string { "NPUW_FOLD", "YES" }, { "NPUW_DCOFF_TYPE", "f16" }, { "NPUW_DCOFF_SCALE", "YES" }, - { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, { "NPUW_PARALLEL_COMPILE", "YES" }, { "NPUW_FUNCALL_ASYNC", "YES" } };
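
For reference, the chat flow introduced by this series can be driven the same way as the existing chat_sample: start_chat() switches the pipeline into history-accumulating mode, each generate() call re-applies the chat template to the full history and re-runs prefill (the static pipeline resets its tensors on every call, so the prompt must carry the whole conversation), and finish_chat() clears the history. Below is a minimal sketch under those assumptions; the "NPU" device string, the model-path argument and the prompt loop are illustrative, not part of the patches, and the templated conversation still has to fit the 1024-token prompt budget of the static KV-cache (larger prompts throw, as in the hunks above).

    // Hypothetical driver for the chat mode added in this series (not part of the patches).
    // Assumes a model exported for NPU so that the static pipeline is the one selected.
    #include <cstdlib>
    #include <functional>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    #include "openvino/genai/llm_pipeline.hpp"

    int main(int argc, char* argv[]) try {
        if (argc != 2) {
            throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR>");
        }
        ov::genai::LLMPipeline pipe(argv[1], "NPU");

        ov::genai::GenerationConfig config;
        config.max_new_tokens = 100;

        // Print tokens as they arrive; returning false lets generation continue.
        std::function<bool(std::string)> streamer = [](std::string word) {
            std::cout << word << std::flush;
            return false;
        };

        pipe.start_chat();  // from here on, prompts and answers accumulate in the pipeline's history
        std::string prompt;
        std::cout << "question:\n";
        while (std::getline(std::cin, prompt) && !prompt.empty()) {
            pipe.generate(prompt, config, streamer);
            std::cout << "\n----------\n" << "question:\n";
        }
        pipe.finish_chat();  // drops the accumulated history
        return 0;
    } catch (const std::exception& error) {
        std::cerr << error.what() << '\n';
        return EXIT_FAILURE;
    }

Because prefill is redone from scratch on every turn, the per-turn cost grows with the accumulated history until the 1024-token limit is reached, at which point the pipeline refuses the prompt rather than evicting old context.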