From 0e91fae93975d01d690c1265ee9bdad83cf3ee2c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 16 Dec 2024 22:24:20 +0400 Subject: [PATCH 01/18] [Streamer] Handle stop strings in case of sampler --- src/cpp/src/continuous_batching_impl.cpp | 13 +-- src/cpp/src/llm_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 2 +- src/cpp/src/sampler.cpp | 5 +- src/cpp/src/sequence_group.hpp | 22 +++-- .../speculative_decoding_impl.cpp | 4 +- src/cpp/src/text_callback_streamer.cpp | 93 +++++++++++++++---- src/cpp/src/text_callback_streamer.hpp | 8 +- 8 files changed, 104 insertions(+), 45 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 1e42f5b2d9..442fd6f7c5 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -246,8 +246,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector& streamer) { return streamer; }, - [this](const std::function& streamer) -> std::shared_ptr { - return std::make_unique(m_tokenizer, streamer); + [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { + return sampling_params.size() == 1 ? std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); } }, streamer); @@ -275,8 +275,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - if (!continue_generation) { - drop_requests(); - } else { - OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); - } + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index f663b27dd9..89e71f21f8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -273,7 +273,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback); + streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); } auto batch_size = input_ids.get_shape().at(0); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index cb83209b4b..01a06230d0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -967,7 +967,7 @@ EncodedResults StaticLLMPipeline::generate( } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback); + streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); } if (!config.is_greedy_decoding()) { diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f77463d767..f1abc862e2 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -578,8 +578,6 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen if (!sampling_params.stop_strings.empty()) { int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); 
if (num_matched_last_tokens) { - if (!sampling_params.include_stop_str_in_output) - running_sequence->remove_last_tokens(num_matched_last_tokens); running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); @@ -886,8 +884,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, // Notify handle after sampling is done. // For non-streaming this is effective only when the generation is finished. OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request); - size_t num_output_token_to_push = num_tokens_to_process - max_removed_tokens_per_request + 1; - sequence_group->notify_handle(num_output_token_to_push); + sequence_group->notify_handle(); } else { // we are in prompt processing phase when prompt is split into chunks and processed step by step } diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 6755255fe8..c8b4c59486 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -221,6 +221,8 @@ class SequenceGroup { // flag to enable/disable token generation, e.g. in speculative decoding scenario bool m_is_gen_paused = false; + size_t m_num_streamed_tokens = 0; + SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) : m_request_id(request_id), @@ -612,7 +614,7 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } - void notify_handle(size_t num_output_token_to_push = 0) { + void notify_handle() { if (out_of_memory()) { set_generation_status(GenerationStatus::IGNORED); } else if (has_finished()) { @@ -626,10 +628,18 @@ class SequenceGroup { } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) { // We can stream only when one sequence is returned and we don't use stop strings that would be excluded from the output // (after stop string is detected its tokens are already sent) - if (num_total_seqs() == 1 && - (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) { - if (num_output_token_to_push) - push_partial_outputs(num_output_token_to_push); + if (num_total_seqs() == 1) { + const auto generated_len = m_sequences.front()->get_generated_len(); + // speculative decoding draft handling + if (generated_len < m_num_streamed_tokens) { + m_num_streamed_tokens = generated_len; + } + OPENVINO_ASSERT(generated_len >= m_num_streamed_tokens); + auto delta = generated_len - m_num_streamed_tokens; + + size_t num_output_token_to_push = generated_len - m_num_streamed_tokens; + push_partial_outputs(num_output_token_to_push); + m_num_streamed_tokens += (num_output_token_to_push); } else if (has_finished() || out_of_memory()) { push_outputs(); } @@ -661,4 +671,4 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } }; -} +} \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index e4f3b1ad1f..fd9bf00785 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -199,8 +199,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< [](const std::shared_ptr& streamer) { return streamer; }, - [this](const std::function& streamer) -> std::shared_ptr { - return std::make_unique(m_tokenizer, streamer); 
+ [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { + return sampling_params.size() == 1 ? std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); } }, streamer); diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 314a7ffa4d..46b4c666b9 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -6,32 +6,84 @@ namespace ov { namespace genai { -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { +std::vector encode_and_process_stop_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { + // encode stop_string + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + size_t tensor_size = ov_encoded_stop_string.get_size(); + std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; + std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); + // remove special symbols + for (const auto& token_id : source_encoded_stop_string) { + if (token_id != tokenizer.get_bos_token_id() && + token_id != tokenizer.get_eos_token_id() && + token_id != tokenizer.get_pad_token_id()) { + encoded_stop_string.push_back(token_id); + } + } + return encoded_stop_string; +} + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings) { m_tokenizer = tokenizer; on_finalized_subword_callback = callback; + for (const auto& stop_string : stop_strings) { + auto encoded_stop_string = encode_and_process_stop_string(stop_string, m_tokenizer); + m_max_stop_string_len = std::max(encoded_stop_string.size(), m_max_stop_string_len); + m_stop_strings.insert(stop_string); + } } bool TextCallbackStreamer::put(int64_t token) { std::stringstream res; - m_tokens_cache.push_back(token); - std::string text = m_tokenizer.decode(m_tokens_cache); - if (!text.empty() && '\n' == text.back() && text.size() > print_len) { - // Flush the cache after the new line symbol - res << std::string_view{text.data() + print_len, text.size() - print_len}; - m_tokens_cache.clear(); - print_len = 0; - return on_finalized_subword_callback(res.str()); - } + m_tokens_cache_stop_string.push_back(token); + if (m_tokens_cache_stop_string.size() > m_max_stop_string_len || token == m_tokenizer.get_eos_token_id()) { + std::vector buffer(m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); + std::string text = m_tokenizer.decode(buffer); + std::string activated_stop_string = ""; + for (const auto& stop_string : m_stop_strings) { + if (text.find(stop_string) != std::string::npos) { + activated_stop_string = stop_string; + break; + } + } + + + if (activated_stop_string.empty() && token != m_tokenizer.get_eos_token_id()) { + m_tokens_cache.push_back(m_tokens_cache_stop_string.front()); + m_tokens_cache_stop_string.pop_front(); + } else { + m_tokens_cache.insert(m_tokens_cache.end(), m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); + m_tokens_cache_stop_string.clear(); + } + + text = m_tokenizer.decode(m_tokens_cache); + if (!activated_stop_string.empty()) { + auto pos = text.find(activated_stop_string); + if (pos != std::string::npos) { + text.replace(pos, activated_stop_string.length(), ""); + } + m_tokens_cache.clear(); + } + + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + res << 
std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } + - constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. - if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { - // Don't print incomplete text - return on_finalized_subword_callback(res.str()); - } else if (text.size() > print_len) { - // It is possible to have a shorter text after adding new token. - // Print to output only if text length is increaesed. - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); + constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. + if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } else { + // It is possible to have a shorter text after adding new token. + // Print to output only if text length is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } } return on_finalized_subword_callback(res.str()); @@ -39,7 +91,8 @@ bool TextCallbackStreamer::put(int64_t token) { void TextCallbackStreamer::end() { std::stringstream res; - std::string text = m_tokenizer.decode(m_tokens_cache); + std::vector buffer(m_tokens_cache.begin(), m_tokens_cache.end()); + std::string text = m_tokenizer.decode(buffer); if (text.size() <= print_len) return ; res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index a03b0deccb..ae353d27d5 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -3,6 +3,8 @@ #pragma once +#include + #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/tokenizer.hpp" @@ -14,14 +16,16 @@ class TextCallbackStreamer: public StreamerBase { bool put(int64_t token) override; void end() override; - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings = {}); std::function on_finalized_subword_callback = [](std::string words)->bool { return false; }; protected: Tokenizer m_tokenizer; std::vector m_tokens_cache; - size_t print_len = 0; + std::list m_tokens_cache_stop_string; + size_t print_len = 0, m_max_stop_string_len = 0; + std::set m_stop_strings; }; } // namespace genai From 22d6fe1f3cc6a03158d5e81b546c8841e336259a Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 11:12:50 +0400 Subject: [PATCH 02/18] Handle output sequence --- .../speculative_decoding_lm.cpp | 3 +- src/cpp/src/sampler.cpp | 7 +++- src/cpp/src/sequence_group.hpp | 37 +++++++++++++------ 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index dc6761879c..3c2c5b04a9 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -34,7 +34,8 @@ int main(int argc, char* argv[]) try { main_model_path, main_device, 
ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::scheduler_config(scheduler_config) + ); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f1abc862e2..b689f4883c 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -400,7 +400,9 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) - candidate.m_sequence->remove_last_tokens(num_last_matched_tokens - 1); + if (!m_parameters.include_stop_str_in_output) { + candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); + } } // try to finish candidate @@ -578,6 +580,9 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen if (!sampling_params.stop_strings.empty()) { int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { + if (!sampling_params.include_stop_str_in_output) { + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); + } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index c8b4c59486..e5c63901dd 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -44,6 +44,8 @@ class Sequence { static std::mutex m_counter_mutex; size_t _make_hash(size_t content_length); + // num tokens to remove from result. 
Used in case of match by stop_string + size_t m_token_cnt_to_ignore = 0; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; @@ -128,24 +130,35 @@ class Sequence { GenerationOutput get_last_generation_output(size_t token_cnt = 1) { GenerationOutput output; - OPENVINO_ASSERT(m_generated_ids.size()); - output.score = get_cumulative_log_probs(); + if (token_cnt > 0) { + OPENVINO_ASSERT(m_generated_ids.size()); + output.score = get_cumulative_log_probs(); - auto generated_token_id = get_generated_ids(); - auto generated_log_probs = get_generated_log_probs(); + auto generated_token_id = get_generated_ids(); + auto generated_log_probs = get_generated_log_probs(); - OPENVINO_ASSERT(get_generated_len() >= token_cnt); - auto offset = get_generated_len() - token_cnt; + OPENVINO_ASSERT(get_generated_len() >= token_cnt); + auto offset = get_generated_len() - token_cnt; - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); + auto offset_back = get_generated_len() - m_token_cnt_to_ignore; + if (m_token_cnt_to_ignore) + auto a = 0; + m_token_cnt_to_ignore = 0; - output.generated_ids = token_id; - output.generated_log_probs = log_probs; - output.finish_reason = get_finish_reason(); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); + + output.generated_ids = token_id; + output.generated_log_probs = log_probs; + output.finish_reason = get_finish_reason(); + } return output; } + void set_num_token_token_cnt_to_ignore(size_t k) { + m_token_cnt_to_ignore = k; + } + size_t get_generated_len() const { return m_generated_ids.size(); } @@ -621,7 +634,7 @@ class SequenceGroup { set_generation_status(GenerationStatus::FINISHED); } // For beam search streaming is not available, so we notify only upon finishing - if(m_sampling_params.is_beam_search()) { + if (m_sampling_params.is_beam_search()) { if (has_finished() || out_of_memory()) { push_outputs(); } From dcb80ddf0ab3c0f2e71ce27ac937a3cf04046c08 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 12:35:48 +0400 Subject: [PATCH 03/18] one more --- src/cpp/include/openvino/genai/generation_handle.hpp | 1 + src/cpp/src/generation_handle.cpp | 3 ++- src/cpp/src/sampler.cpp | 4 ++-- src/cpp/src/sequence_group.hpp | 11 ++++------- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 7ff172e645..2e4f94a524 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -57,6 +57,7 @@ struct GenerationOutput { std::vector generated_log_probs; float score; GenerationFinishReason finish_reason; + size_t token_cnt_to_ignore; }; using GenerationOutputs = std::unordered_map; diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index a1dd467523..e6968e481b 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -46,7 +46,8 @@ void add_partial_result(std::unordered_map& partial_ } else { auto generated_len = iteration_result.second.generated_ids.size(); OPENVINO_ASSERT(generated_len == iteration_result.second.generated_log_probs.size()); - for (size_t i = 0; i < generated_len; ++i) { + OPENVINO_ASSERT(generated_len >= 
iteration_result.second.token_cnt_to_ignore); + for (size_t i = 0; i < generated_len - iteration_result.second.token_cnt_to_ignore; ++i) { partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[i]); partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[i]); } diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index b689f4883c..49d83ec6bd 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -401,7 +401,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); + candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); } } @@ -581,7 +581,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index e5c63901dd..0a332d776b 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -139,18 +139,15 @@ class Sequence { OPENVINO_ASSERT(get_generated_len() >= token_cnt); auto offset = get_generated_len() - token_cnt; + auto offset_back = get_generated_len(); - auto offset_back = get_generated_len() - m_token_cnt_to_ignore; - if (m_token_cnt_to_ignore) - auto a = 0; - m_token_cnt_to_ignore = 0; - - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); output.generated_ids = token_id; output.generated_log_probs = log_probs; output.finish_reason = get_finish_reason(); + output.token_cnt_to_ignore = m_token_cnt_to_ignore; } return output; } From 87379b1d2da2ca00dacd46f54dd044fe12c18797 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 12:42:43 +0400 Subject: [PATCH 04/18] Update sampler.cpp --- src/cpp/src/sampler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 49d83ec6bd..b689f4883c 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -401,7 +401,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); + 
candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); } } @@ -581,7 +581,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); From 713f091d3792badd8cda12d7e05e32105410b67f Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 00:10:26 +0400 Subject: [PATCH 05/18] Handle stop strings in Sampler & Seq Group --- .../openvino/genai/generation_handle.hpp | 1 - src/cpp/src/continuous_batching_impl.cpp | 6 +- src/cpp/src/generation_handle.cpp | 3 +- src/cpp/src/llm_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 2 +- src/cpp/src/sampler.cpp | 176 ++++++++++-------- src/cpp/src/sampler.hpp | 4 +- src/cpp/src/sequence_group.hpp | 48 ++--- .../speculative_decoding_impl.cpp | 4 +- src/cpp/src/text_callback_streamer.cpp | 95 +++------- src/cpp/src/text_callback_streamer.hpp | 10 +- 11 files changed, 159 insertions(+), 192 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 2e4f94a524..7ff172e645 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -57,7 +57,6 @@ struct GenerationOutput { std::vector generated_log_probs; float score; GenerationFinishReason finish_reason; - size_t token_cnt_to_ignore; }; using GenerationOutputs = std::unordered_map; diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 442fd6f7c5..7f648b6ffc 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -246,8 +246,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector& streamer) { return streamer; }, - [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { - return sampling_params.size() == 1 ? 
std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); } }, streamer); @@ -285,7 +285,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (!streamer_ptr->put(gen_token)) { + if (streamer_ptr->put(gen_token)) { break; } } diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index e6968e481b..a1dd467523 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -46,8 +46,7 @@ void add_partial_result(std::unordered_map& partial_ } else { auto generated_len = iteration_result.second.generated_ids.size(); OPENVINO_ASSERT(generated_len == iteration_result.second.generated_log_probs.size()); - OPENVINO_ASSERT(generated_len >= iteration_result.second.token_cnt_to_ignore); - for (size_t i = 0; i < generated_len - iteration_result.second.token_cnt_to_ignore; ++i) { + for (size_t i = 0; i < generated_len; ++i) { partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[i]); partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[i]); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 89e71f21f8..f663b27dd9 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -273,7 +273,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); + streamer_ptr = std::make_shared(m_tokenizer, *callback); } auto batch_size = input_ids.get_shape().at(0); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 01a06230d0..cb83209b4b 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -967,7 +967,7 @@ EncodedResults StaticLLMPipeline::generate( } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); + streamer_ptr = std::make_shared(m_tokenizer, *callback); } if (!config.is_greedy_decoding()) { diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 49d83ec6bd..a0eb8f4980 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -85,75 +85,65 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin return clean_text; } +std::vector encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { + // encode stop_string + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + size_t tensor_size = ov_encoded_stop_string.get_size(); + std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; + std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); + // remove special symbols + for (const auto& token_id : source_encoded_stop_string) { + if (token_id != tokenizer.get_bos_token_id() && + token_id != tokenizer.get_eos_token_id() && + token_id 
!= tokenizer.get_pad_token_id()) { + encoded_stop_string.push_back(token_id); + } + } + return encoded_stop_string; +} + +struct MatchStopStringResult { + size_t to_remove = 0; + int64_t last_token_id = 0; + bool is_to_update_last_token = false; + bool is_matched = false; +}; + // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. -int match_stop_string(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set & stop_strings) { - /* - For catching stop_string hit we run comparisons character-wise to catch cases where stop string - overlaps with part of another token on both sides or is just a part of a single token. - For every stop_string we iterate over generated tokens starting from the last one and going backwards. - Every token is wrapped with prefix tokens to ensure tokenizer doesn't remove prefix whitespace of the actual token. - After that all tokens are decoded and prefix is removed from the decoded text, so we end up with decoded token. - Its characters are compared to the stop_string character at a current_position - (position of a character in the stop_string counting from the last one) - at the beginning position is 0. - When characters match we increase current_position and check if we have a full match already, if not we continue. - If we have already matched some characters (current_position > 0) and next character is not matching - before we reach the full match, then we reset current_position to 0. - */ - std::string prefix = "a"; - auto prefix_ov = tokenizer.encode(prefix).input_ids; - std::vector prefix_tokens(prefix_ov.data(), prefix_ov.data() + prefix_ov.get_size()); - std::string suffix = "b"; - auto suffix_ov = tokenizer.encode(suffix).input_ids; - std::vector suffix_tokens(suffix_ov.data(), suffix_ov.data() + suffix_ov.get_size()); - - // Since whitespace can be added at the beginning of the suffix we also try to capture that behavior here - // and get suffix string that will actually be part of the decoded string so we can remove it correctly - auto wrapped_suffix_tokens = suffix_tokens; - wrapped_suffix_tokens.insert(wrapped_suffix_tokens.begin(), prefix_tokens.begin(), prefix_tokens.end()); - std::string wrapped_suffix = tokenizer.decode(wrapped_suffix_tokens); - auto wrapper_pos = wrapped_suffix.find(prefix); - suffix = wrapped_suffix.substr(wrapper_pos + prefix.size()); - - for (auto stop_string: stop_strings) { - int current_position = 0; - int num_matched_tokens = 0; - // Getting reverse iterator to check tokens starting from the last one generated and going backwards - auto generated_tokens_rit = generated_tokens.rbegin(); - std::vector tokens_buffer; - while (generated_tokens_rit != generated_tokens.rend()) { - num_matched_tokens++; - tokens_buffer.insert(tokens_buffer.begin(), *generated_tokens_rit); - - std::vector wrapped_tokens = wrap_tokens(tokens_buffer, prefix_tokens, suffix_tokens); - std::string wrapped_text = tokenizer.decode(wrapped_tokens); - std::string clean_text = clean_wrapped_text(wrapped_text, prefix, suffix); - - if (clean_text == "" || (clean_text.size() >= 3 && (clean_text.compare(clean_text.size() - 3, 3, "�") == 0))) { - generated_tokens_rit++; - continue; - } else { - tokens_buffer.clear(); - } - // Checking clean_text characters starting from the last one - for (auto clean_text_rit = clean_text.rbegin(); clean_text_rit != clean_text.rend(); clean_text_rit++) { - // On character match increment current_position for the next comparisons - if (*clean_text_rit == 
*(stop_string.rbegin() + current_position)) { - current_position++; - // If this is the last character from the stop_string we have a match - if ((stop_string.rbegin() + current_position) == stop_string.rend()) { - return num_matched_tokens; - } - } else if (current_position) { - // Already found matching characters, but the last one didn't match, so we reset current_position - current_position = 0; - // Looking for the match will start over from this character so we decrement iterator - clean_text_rit--; +MatchStopStringResult match_stop_string(Tokenizer& tokenizer, + const TokenIds& generated_tokens, + const std::pair>& stop_strings, + bool is_include_to_output) { + MatchStopStringResult result; + if (generated_tokens.size() >= stop_strings.first) { + size_t offset = generated_tokens.size() - stop_strings.first; + TokenIds buffer(generated_tokens.begin() + offset, generated_tokens.end()); + std::string decoded_buffer = tokenizer.decode(buffer); + for (const auto& stop_string : stop_strings.second) { + auto pos = decoded_buffer.find(stop_string); + if (pos != std::string::npos) { + result.is_matched = true; + + auto stop_string_len = is_include_to_output ? stop_string.length() : 0; + decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); + + auto encoded_buffer = encode_and_process_string(decoded_buffer, tokenizer); + if (buffer == encoded_buffer) { + return result; + } else if (encoded_buffer.size() > 0) { + result.last_token_id = encoded_buffer.back(); + result.is_to_update_last_token = 0; + encoded_buffer.pop_back(); } + + result.to_remove = buffer.size() - encoded_buffer.size(); + buffer = TokenIds(buffer.begin(), buffer.begin() + encoded_buffer.size()); + OPENVINO_ASSERT(buffer == encoded_buffer); + return result; } - generated_tokens_rit++; } } - return 0; + return result; } // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. @@ -245,7 +235,9 @@ std::map Sampler::GroupBeamSearcher::get_beam_idxs() { return next_beams; } -void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) { +void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, + SamplerOutput& sampler_output, + const std::pair>& stop_strings) { assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; @@ -392,21 +384,19 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa // There's probably a better way to do that, than copying whole vector... 
std::vector token_ids = candidate.m_sequence->get_generated_ids(); token_ids.push_back(candidate.m_token_id); - int num_last_matched_tokens = match_stop_string(m_tokenizer, token_ids, m_sequence_group->get_sampling_parameters().stop_strings); - if (num_last_matched_tokens) { + auto match_result = match_stop_string(m_tokenizer, token_ids, stop_strings, m_parameters.include_stop_str_in_output); + if (match_result.is_matched) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= group_size) continue; - if(!m_parameters.include_stop_str_in_output) { - // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) - if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); - } - } + // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) + candidate.m_sequence->remove_last_tokens(match_result.to_remove); + + candidate.m_token_id = match_result.last_token_id; // try to finish candidate - try_to_finish_candidate(group, candidate, m_parameters.include_stop_str_in_output); + try_to_finish_candidate(group, candidate); continue; } } @@ -578,11 +568,20 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen } if (!sampling_params.stop_strings.empty()) { - int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); - if (num_matched_last_tokens) { - if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); + auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id()); + auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); + if (match_result.is_matched) { + if (match_result.to_remove > 0) { + if (match_result.to_remove > 1) { + running_sequence->remove_last_tokens(match_result.to_remove - 1); + } + auto log_prob = running_sequence->get_generated_log_probs().back(); + running_sequence->remove_last_tokens(1); + if (match_result.is_to_update_last_token) { + running_sequence->append_token(match_result.last_token_id, log_prob); + } } + running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); @@ -744,6 +743,19 @@ float get_p_prime(Sequence::Ptr& running_sequence, return p_prime; } +std::pair> +process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer) { + std::pair> result; + for (const auto& stop_string : stop_strings) { + auto encoded_stop_string = encode_and_process_string(stop_string, tokenizer); + if (result.first < encoded_stop_string.size()) { + result.first = encoded_stop_string.size(); + } + result.second.insert(stop_string); + } + return result; +} + SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled) { @@ -767,6 +779,12 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, if (!m_logit_processors.count(request_id)) { m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())}); } + if (!m_stop_strings.count(request_id)) { + auto processed_stop_string = process_stop_strings(sampling_params.stop_strings, m_tokenizer); + 
m_stop_strings.insert({request_id, processed_stop_string}); + sequence_group->set_stream_window_size(processed_stop_string.first); + } + auto& stop_strings = m_stop_strings.at(request_id); auto& logit_processor = m_logit_processors.at(request_id); const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); @@ -876,7 +894,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, } // current algorithm already adds new tokens to running sequences and - m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output); + m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output, stop_strings); // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 0f7876cbf9..2ebe75ad97 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -57,6 +57,8 @@ class Sampler { std::mt19937 rng_engine; // { request_id, logit_processor } std::map m_logit_processors; + // { request_id, { max_encoded_len, { stop_strings }}} + std::map>> m_stop_strings; Tokenizer m_tokenizer; @@ -115,7 +117,7 @@ class Sampler::GroupBeamSearcher { public: explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer); - void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output); + void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output, const std::pair>& stop_strings); void finalize(SamplerOutput& sampler_output); std::map get_beam_idxs(); }; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 0a332d776b..73884b7a69 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -44,8 +44,6 @@ class Sequence { static std::mutex m_counter_mutex; size_t _make_hash(size_t content_length); - // num tokens to remove from result. 
Used in case of match by stop_string - size_t m_token_cnt_to_ignore = 0; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; @@ -128,7 +126,7 @@ class Sequence { } } - GenerationOutput get_last_generation_output(size_t token_cnt = 1) { + GenerationOutput get_last_generation_output(size_t token_cnt = 1, size_t num_token_to_ignore = 0) { GenerationOutput output; if (token_cnt > 0) { OPENVINO_ASSERT(m_generated_ids.size()); @@ -137,25 +135,25 @@ class Sequence { auto generated_token_id = get_generated_ids(); auto generated_log_probs = get_generated_log_probs(); + if (get_generated_len() < token_cnt) { + auto a = 0; + } OPENVINO_ASSERT(get_generated_len() >= token_cnt); - auto offset = get_generated_len() - token_cnt; - auto offset_back = get_generated_len(); + if (get_generated_len() > num_token_to_ignore) { + auto offset = get_generated_len() - token_cnt - num_token_to_ignore; + auto offset_back = get_generated_len() - num_token_to_ignore; - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); - output.generated_ids = token_id; - output.generated_log_probs = log_probs; - output.finish_reason = get_finish_reason(); - output.token_cnt_to_ignore = m_token_cnt_to_ignore; + output.generated_ids = token_id; + output.generated_log_probs = log_probs; + output.finish_reason = get_finish_reason(); + } } return output; } - void set_num_token_token_cnt_to_ignore(size_t k) { - m_token_cnt_to_ignore = k; - } - size_t get_generated_len() const { return m_generated_ids.size(); } @@ -231,7 +229,7 @@ class SequenceGroup { // flag to enable/disable token generation, e.g. in speculative decoding scenario bool m_is_gen_paused = false; - size_t m_num_streamed_tokens = 0; + size_t m_num_streamed_tokens = 0, m_stream_window_size = 0; SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) @@ -466,6 +464,10 @@ class SequenceGroup { size_t get_num_tokens_to_validate() { return m_num_validation_tokens; } + + void set_stream_window_size(size_t k) { + m_stream_window_size = k; + } size_t get_num_available_tokens_for_batching() const { OPENVINO_ASSERT(!has_finished(), "Internal error: this function cannot be called on finished sequence group"); @@ -613,7 +615,7 @@ class SequenceGroup { for (auto& sequence : m_sequences) { // todo: check seq.is_finished() to generate without several // or is it ok to use padding? 
- auto output = sequence->get_last_generation_output(token_cnt); + auto output = sequence->get_last_generation_output(token_cnt, m_stream_window_size); if (m_sampling_params.echo && !m_has_echoed) { output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end()); output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end()); @@ -640,14 +642,18 @@ class SequenceGroup { // (after stop string is detected its tokens are already sent) if (num_total_seqs() == 1) { const auto generated_len = m_sequences.front()->get_generated_len(); + if (has_finished()) { + m_stream_window_size = 0; + } + if (generated_len <= (m_num_streamed_tokens + m_stream_window_size)) { + return; + } // speculative decoding draft handling if (generated_len < m_num_streamed_tokens) { m_num_streamed_tokens = generated_len; } - OPENVINO_ASSERT(generated_len >= m_num_streamed_tokens); - auto delta = generated_len - m_num_streamed_tokens; - - size_t num_output_token_to_push = generated_len - m_num_streamed_tokens; + OPENVINO_ASSERT(generated_len >= (m_num_streamed_tokens + m_stream_window_size)); + size_t num_output_token_to_push = generated_len - m_num_streamed_tokens - m_stream_window_size; push_partial_outputs(num_output_token_to_push); m_num_streamed_tokens += (num_output_token_to_push); } else if (has_finished() || out_of_memory()) { diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index fd9bf00785..e4f3b1ad1f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -199,8 +199,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< [](const std::shared_ptr& streamer) { return streamer; }, - [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { - return sampling_params.size() == 1 ? 
std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); } }, streamer); diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 46b4c666b9..5938b55f6c 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -6,84 +6,32 @@ namespace ov { namespace genai { -std::vector encode_and_process_stop_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { - // encode stop_string - ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; - size_t tensor_size = ov_encoded_stop_string.get_size(); - std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; - std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); - // remove special symbols - for (const auto& token_id : source_encoded_stop_string) { - if (token_id != tokenizer.get_bos_token_id() && - token_id != tokenizer.get_eos_token_id() && - token_id != tokenizer.get_pad_token_id()) { - encoded_stop_string.push_back(token_id); - } - } - return encoded_stop_string; -} - -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings) { +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { m_tokenizer = tokenizer; on_finalized_subword_callback = callback; - for (const auto& stop_string : stop_strings) { - auto encoded_stop_string = encode_and_process_stop_string(stop_string, m_tokenizer); - m_max_stop_string_len = std::max(encoded_stop_string.size(), m_max_stop_string_len); - m_stop_strings.insert(stop_string); - } } bool TextCallbackStreamer::put(int64_t token) { std::stringstream res; - m_tokens_cache_stop_string.push_back(token); - if (m_tokens_cache_stop_string.size() > m_max_stop_string_len || token == m_tokenizer.get_eos_token_id()) { - std::vector buffer(m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); - std::string text = m_tokenizer.decode(buffer); - std::string activated_stop_string = ""; - for (const auto& stop_string : m_stop_strings) { - if (text.find(stop_string) != std::string::npos) { - activated_stop_string = stop_string; - break; - } - } - - - if (activated_stop_string.empty() && token != m_tokenizer.get_eos_token_id()) { - m_tokens_cache.push_back(m_tokens_cache_stop_string.front()); - m_tokens_cache_stop_string.pop_front(); - } else { - m_tokens_cache.insert(m_tokens_cache.end(), m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); - m_tokens_cache_stop_string.clear(); - } - - text = m_tokenizer.decode(m_tokens_cache); - if (!activated_stop_string.empty()) { - auto pos = text.find(activated_stop_string); - if (pos != std::string::npos) { - text.replace(pos, activated_stop_string.length(), ""); - } - m_tokens_cache.clear(); - } - - if (!text.empty() && '\n' == text.back() && text.size() > print_len) { - // Flush the cache after the new line symbol - res << std::string_view{text.data() + print_len, text.size() - print_len}; - m_tokens_cache.clear(); - print_len = 0; - return on_finalized_subword_callback(res.str()); - } - + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + res << 
std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } - constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. - if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { - // Don't print incomplete text - return on_finalized_subword_callback(res.str()); - } else { - // It is possible to have a shorter text after adding new token. - // Print to output only if text length is increaesed. - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } + constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. + if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text length is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); } return on_finalized_subword_callback(res.str()); @@ -91,8 +39,7 @@ bool TextCallbackStreamer::put(int64_t token) { void TextCallbackStreamer::end() { std::stringstream res; - std::vector buffer(m_tokens_cache.begin(), m_tokens_cache.end()); - std::string text = m_tokenizer.decode(buffer); + std::string text = m_tokenizer.decode(m_tokens_cache); if (text.size() <= print_len) return ; res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; @@ -105,4 +52,4 @@ void TextCallbackStreamer::end() { ov::genai::StreamerBase::~StreamerBase() = default; } // namespace genai -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index ae353d27d5..6f0872ad1b 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -3,8 +3,6 @@ #pragma once -#include - #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/tokenizer.hpp" @@ -16,17 +14,15 @@ class TextCallbackStreamer: public StreamerBase { bool put(int64_t token) override; void end() override; - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings = {}); + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); std::function on_finalized_subword_callback = [](std::string words)->bool { return false; }; protected: Tokenizer m_tokenizer; std::vector m_tokens_cache; - std::list m_tokens_cache_stop_string; - size_t print_len = 0, m_max_stop_string_len = 0; - std::set m_stop_strings; + size_t print_len = 0; }; } // namespace genai -} // namespace ov +} // namespace ov \ No newline at end of file From 64d1b803887ccd14642101da1208cc6ba4de3a9a Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 14:11:23 +0400 Subject: [PATCH 06/18] ci --- src/cpp/src/sampler.cpp | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index a0eb8f4980..2b8a95b2b3 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -104,8 +104,8 @@ std::vector encode_and_process_string(const std::string& stop_string, o struct 
MatchStopStringResult { size_t to_remove = 0; - int64_t last_token_id = 0; - bool is_to_update_last_token = false; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; bool is_matched = false; }; @@ -126,19 +126,19 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, auto stop_string_len = is_include_to_output ? stop_string.length() : 0; decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); - - auto encoded_buffer = encode_and_process_string(decoded_buffer, tokenizer); - if (buffer == encoded_buffer) { + if (decoded_buffer.empty()) { + result.to_remove = buffer.size(); return result; - } else if (encoded_buffer.size() > 0) { - result.last_token_id = encoded_buffer.back(); - result.is_to_update_last_token = 0; - encoded_buffer.pop_back(); } - result.to_remove = buffer.size() - encoded_buffer.size(); - buffer = TokenIds(buffer.begin(), buffer.begin() + encoded_buffer.size()); - OPENVINO_ASSERT(buffer == encoded_buffer); + // find token cnt to be removed from sequence by decoding token by token + std::string decoded_partially_string = ""; + for (size_t i = 0; i < buffer.size(); ++i) { + decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]}); + if (decoded_partially_string.find(decoded_buffer) != std::string::npos) { + result.to_remove = buffer.size() - i - 1; + } + } return result; } } @@ -393,7 +393,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) candidate.m_sequence->remove_last_tokens(match_result.to_remove); - candidate.m_token_id = match_result.last_token_id; + // candidate.m_token_id = match_result.last_token_id; // try to finish candidate try_to_finish_candidate(group, candidate); @@ -572,14 +572,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); if (match_result.is_matched) { if (match_result.to_remove > 0) { - if (match_result.to_remove > 1) { - running_sequence->remove_last_tokens(match_result.to_remove - 1); - } - auto log_prob = running_sequence->get_generated_log_probs().back(); - running_sequence->remove_last_tokens(1); - if (match_result.is_to_update_last_token) { - running_sequence->append_token(match_result.last_token_id, log_prob); - } + running_sequence->remove_last_tokens(match_result.to_remove); } running_sequence->set_status(SequenceStatus::FINISHED); From 449a53ee14a19ff1cfe12bbc56739edcd8d53de0 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 15:59:54 +0400 Subject: [PATCH 07/18] streaming --- src/cpp/src/continuous_batching_impl.cpp | 12 +++++++++--- src/cpp/src/lm_encoding.cpp | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 7f648b6ffc..2b48852458 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -275,7 +275,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (streamer_ptr->put(gen_token)) { + continue_generation = !streamer_ptr->put(gen_token); + if (!continue_generation) { break; } } @@ -296,7 +298,11 @@ 
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); + if (!continue_generation) { + drop_requests(); + } else { + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); + } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 3ab041fa58..5b1661831b 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -130,7 +130,7 @@ std::pair get_lm_encoded_results( if (streamer_ptr && generations.at(0).get()->can_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (!streamer_ptr->put(gen_token)) { + if (streamer_ptr->put(gen_token)) { break; } } From b8fdb8e3f96a8d15a639243d138f404cb70c5027 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 12:13:14 +0400 Subject: [PATCH 08/18] test --- src/cpp/src/sampler.cpp | 5 +++++ tests/python_tests/common.py | 28 ++++++++++++++++++++++++++++ tests/python_tests/test_sampling.py | 10 +++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 2b8a95b2b3..0e9eeff519 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -126,6 +126,10 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, auto stop_string_len = is_include_to_output ? stop_string.length() : 0; decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); + // to remove word splitting symbols from tail + while (decoded_buffer.back() == ' ' || decoded_buffer.back() == '\n') { + decoded_buffer.pop_back(); + } if (decoded_buffer.empty()) { result.to_remove = buffer.size(); return result; @@ -137,6 +141,7 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]}); if (decoded_partially_string.find(decoded_buffer) != std::string::npos) { result.to_remove = buffer.size() - i - 1; + break; } } return result; diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 50ee452f5c..7c97088abc 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -125,6 +125,34 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config +def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines" } + generation_config.include_stop_str_in_output = False + return generation_config + +def get_greedy_stop_strings_include_to_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines" } + generation_config.include_stop_str_in_output = True + return generation_config + +def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines", "anag" } + generation_config.include_stop_str_in_output = False + return 
generation_config + +def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines", "anag" } + generation_config.include_stop_str_in_output = True + return generation_config + def get_multinomial_temperature() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 9aa6931d85..d5df28bfd6 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -21,6 +21,8 @@ get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \ + get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \ generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ run_continuous_batching @@ -77,7 +79,9 @@ def test_eos_greedy(tmp_path): @pytest.mark.precommit @pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ], + get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ], ids=[ "greedy", "greedy_with_min_and_max_tokens", @@ -88,6 +92,10 @@ def test_eos_greedy(tmp_path): "beam", "beam_search_min_and_max_tokens", "beam_search_with_multiple_stop_strings_no_match", + "get_greedy_stop_strings_exclude_from_output", + "get_greedy_stop_strings_include_to_output", + "get_greedy_n_stop_strings_exclude_from_output", + "get_greedy_n_stop_strings_include_to_output" ]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): prompts = [ From df7b512ce541b3e25fb1892c554bad81c7e20d0c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 12:25:00 +0400 Subject: [PATCH 09/18] tokenizers --- src/cpp/src/sampler.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 0e9eeff519..1a80848264 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -87,18 +87,11 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin std::vector encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { // encode stop_string - ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + std::string stop_string_copy = stop_string; + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string_copy, ov::genai::add_special_tokens(false)).input_ids; size_t tensor_size = ov_encoded_stop_string.get_size(); - std::vector 
source_encoded_stop_string(tensor_size), encoded_stop_string; - std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); - // remove special symbols - for (const auto& token_id : source_encoded_stop_string) { - if (token_id != tokenizer.get_bos_token_id() && - token_id != tokenizer.get_eos_token_id() && - token_id != tokenizer.get_pad_token_id()) { - encoded_stop_string.push_back(token_id); - } - } + std::vector encoded_stop_string(tensor_size); + std::copy_n(ov_encoded_stop_string.data(), tensor_size, encoded_stop_string.begin()); return encoded_stop_string; } From 796e1489766bb9cd7a6c2f91f1176d1c703a9124 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 14:01:53 +0400 Subject: [PATCH 10/18] check --- samples/cpp/text_generation/greedy_causal_lm.cpp | 8 +++++++- tests/python_tests/common.py | 15 ++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index b5ca59095b..8c8c6d3ebe 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,7 +13,13 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; + config.max_new_tokens = 30; + + config.stop_strings = { "machines", "manage" }; + // anag + config.include_stop_str_in_output = false; + + std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 7c97088abc..163a00192e 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -142,14 +142,14 @@ def get_greedy_stop_strings_include_to_output() -> GenerationConfig: def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: generation_config = GenerationConfig() generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "anag" } + generation_config.stop_strings = { "machines", "manage" } generation_config.include_stop_str_in_output = False return generation_config def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: generation_config = GenerationConfig() generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "anag" } + generation_config.stop_strings = { "machines", "manage" } generation_config.include_stop_str_in_output = True return generation_config @@ -387,9 +387,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge # Note, that for fp32 / fp16 models scores are different less than 0.001 assert abs(hf_score - ov_score) < 0.02 - assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) - for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): - assert hf_text == ov_text + if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0: + assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert ov_text in hf_text + else: + assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert hf_text == ov_text def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): 
model.save_pretrained(models_path) From 7dbb3ad3dada51babcffc2b4187932244702df52 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 18:02:03 +0400 Subject: [PATCH 11/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index 8c8c6d3ebe..62d961a6b7 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,13 +13,8 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 30; + config.max_new_tokens = 100; - config.stop_strings = { "machines", "manage" }; - // anag - config.include_stop_str_in_output = false; - - std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From 617471ac4523e134b5e0b31d114606ad741cf6fa Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 18:02:27 +0400 Subject: [PATCH 12/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index 62d961a6b7..acdd8b690d 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,8 +13,7 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - + config.max_new_tokens = 100; std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From d2acb396ba21517dc37f3be766b077cbb9c4ddb9 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:24:35 +0400 Subject: [PATCH 13/18] Update speculative_decoding_lm.cpp --- .../cpp/speculative_decoding_lm/speculative_decoding_lm.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 2dc46f8a82..487296566b 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -33,8 +33,7 @@ int main(int argc, char* argv[]) try { main_model_path, main_device, ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config) - ); + ov::genai::scheduler_config(scheduler_config)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; From 4b7a767211b3541293f6cb28372ad4148b417c29 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:25:12 +0400 Subject: [PATCH 14/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index acdd8b690d..b5ca59095b 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,7 +13,7 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - 
config.max_new_tokens = 100; + config.max_new_tokens = 100; std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From 09307a40af3cb532036a356129ce811db76bbe12 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:26:50 +0400 Subject: [PATCH 15/18] Update sequence_group.hpp --- src/cpp/src/sequence_group.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 73884b7a69..198d3b53aa 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -135,9 +135,6 @@ class Sequence { auto generated_token_id = get_generated_ids(); auto generated_log_probs = get_generated_log_probs(); - if (get_generated_len() < token_cnt) { - auto a = 0; - } OPENVINO_ASSERT(get_generated_len() >= token_cnt); if (get_generated_len() > num_token_to_ignore) { auto offset = get_generated_len() - token_cnt - num_token_to_ignore; @@ -687,4 +684,4 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } }; -} \ No newline at end of file +} From fb812ec1d417ac73eabed7e01b1adef31098b552 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:29:48 +0400 Subject: [PATCH 16/18] Update sampler.cpp --- src/cpp/src/sampler.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 1a80848264..9c18dc7721 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -391,8 +391,6 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) candidate.m_sequence->remove_last_tokens(match_result.to_remove); - // candidate.m_token_id = match_result.last_token_id; - // try to finish candidate try_to_finish_candidate(group, candidate); continue; @@ -569,9 +567,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id()); auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); if (match_result.is_matched) { - if (match_result.to_remove > 0) { - running_sequence->remove_last_tokens(match_result.to_remove); - } + running_sequence->remove_last_tokens(match_result.to_remove); running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); @@ -937,6 +933,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig void Sampler::clear_request_info(uint64_t request_id) { m_beam_search_info.erase(request_id); m_logit_processors.erase(request_id); + m_stop_strings.erase(request_id); } int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) { From 1905fdda72c8b99bea78a9228be14937c43af109 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:47:01 +0400 Subject: [PATCH 17/18] Update lm_encoding.cpp --- src/cpp/src/lm_encoding.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index ebe57d6d01..cf163d5d3f 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -133,6 +133,8 @@ std::pair get_lm_encoded_results( SamplerOutput sampler_output = sampler.sample(sequence_groups, 
logits); stream_generated_tokens(); + + // "Generation" phase while (!active_sequence_groups.empty()) { size_t total_num_tokens = 0; @@ -237,4 +239,4 @@ std::pair get_lm_encoded_results( } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov From c7bc7b8d1a504360532dc9138928dc609b46f8cd Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:47:27 +0400 Subject: [PATCH 18/18] Update lm_encoding.cpp --- src/cpp/src/lm_encoding.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index cf163d5d3f..031214468e 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -133,7 +133,7 @@ std::pair get_lm_encoded_results( SamplerOutput sampler_output = sampler.sample(sequence_groups, logits); stream_generated_tokens(); - + // "Generation" phase while (!active_sequence_groups.empty()) {
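
The sampler changes in this series reduce to one rule for trimming a matched stop string: decode the tail buffer of generated token ids, cut the decoded text at the stop string (keeping it only when include_stop_str_in_output is set), strip trailing spaces and newlines, and then decode token by token to count how many trailing ids must be removed from the sequence before it is marked FINISHED. Below is a minimal Python sketch of that rule; decode(), the toy vocab table, and tokens_to_remove() are hypothetical stand-ins for ov::genai::Tokenizer and match_stop_string used only to illustrate the logic, not the real implementation.

# Minimal sketch of the stop-string trimming rule from match_stop_string.
# decode(), vocab, and tokens_to_remove() are hypothetical stand-ins; they
# are not part of openvino.genai.

def decode(token_ids, vocab):
    # stand-in for ov::genai::Tokenizer::decode
    return "".join(vocab[t] for t in token_ids)

def tokens_to_remove(buffer, stop_string, include_stop_str_in_output, vocab):
    decoded = decode(buffer, vocab)
    pos = decoded.find(stop_string)
    if pos == -1:
        return None  # no match, nothing to trim
    keep_len = pos + (len(stop_string) if include_stop_str_in_output else 0)
    # cut at the stop string and drop word-splitting symbols from the tail
    decoded = decoded[:keep_len].rstrip(" \n")
    if not decoded:
        return len(buffer)  # the whole buffer belongs to the stop string
    # decode token by token to find how many trailing ids to drop
    partial = ""
    for i, token_id in enumerate(buffer):
        partial += decode([token_id], vocab)
        if decoded in partial:
            return len(buffer) - i - 1
    return 0

# toy vocabulary: three token ids that decode to pieces of a sentence
vocab = {1: "intelligent ", 2: "machines", 3: " can"}
print(tokens_to_remove([1, 2, 3], "machines", False, vocab))  # 2 -> keep "intelligent "
print(tokens_to_remove([1, 2, 3], "machines", True, vocab))   # 1 -> keep "intelligent machines"

The two calls mirror the get_greedy_stop_strings_exclude_from_output and get_greedy_stop_strings_include_to_output test configurations added in common.py: with the stop string excluded, both the matching token and everything after it are dropped; with it included, only the tokens generated after the match are dropped.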