From 0e91fae93975d01d690c1265ee9bdad83cf3ee2c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Mon, 16 Dec 2024 22:24:20 +0400 Subject: [PATCH 01/18] [Streamer] Handle stop strings in case of sampler --- src/cpp/src/continuous_batching_impl.cpp | 13 +-- src/cpp/src/llm_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 2 +- src/cpp/src/sampler.cpp | 5 +- src/cpp/src/sequence_group.hpp | 22 +++-- .../speculative_decoding_impl.cpp | 4 +- src/cpp/src/text_callback_streamer.cpp | 93 +++++++++++++++---- src/cpp/src/text_callback_streamer.hpp | 8 +- 8 files changed, 104 insertions(+), 45 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 1e42f5b2d9..442fd6f7c5 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -246,8 +246,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector& streamer) { return streamer; }, - [this](const std::function& streamer) -> std::shared_ptr { - return std::make_unique(m_tokenizer, streamer); + [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { + return sampling_params.size() == 1 ? std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); } }, streamer); @@ -275,8 +275,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - if (!continue_generation) { - drop_requests(); - } else { - OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); - } + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index f663b27dd9..89e71f21f8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -273,7 +273,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback); + streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); } auto batch_size = input_ids.get_shape().at(0); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index cb83209b4b..01a06230d0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -967,7 +967,7 @@ EncodedResults StaticLLMPipeline::generate( } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback); + streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); } if (!config.is_greedy_decoding()) { diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f77463d767..f1abc862e2 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -578,8 +578,6 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen if (!sampling_params.stop_strings.empty()) { int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); 
if (num_matched_last_tokens) { - if (!sampling_params.include_stop_str_in_output) - running_sequence->remove_last_tokens(num_matched_last_tokens); running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); @@ -886,8 +884,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, // Notify handle after sampling is done. // For non-streaming this is effective only when the generation is finished. OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request); - size_t num_output_token_to_push = num_tokens_to_process - max_removed_tokens_per_request + 1; - sequence_group->notify_handle(num_output_token_to_push); + sequence_group->notify_handle(); } else { // we are in prompt processing phase when prompt is split into chunks and processed step by step } diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 6755255fe8..c8b4c59486 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -221,6 +221,8 @@ class SequenceGroup { // flag to enable/disable token generation, e.g. in speculative decoding scenario bool m_is_gen_paused = false; + size_t m_num_streamed_tokens = 0; + SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) : m_request_id(request_id), @@ -612,7 +614,7 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } - void notify_handle(size_t num_output_token_to_push = 0) { + void notify_handle() { if (out_of_memory()) { set_generation_status(GenerationStatus::IGNORED); } else if (has_finished()) { @@ -626,10 +628,18 @@ class SequenceGroup { } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) { // We can stream only when one sequence is returned and we don't use stop strings that would be excluded from the output // (after stop string is detected its tokens are already sent) - if (num_total_seqs() == 1 && - (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) { - if (num_output_token_to_push) - push_partial_outputs(num_output_token_to_push); + if (num_total_seqs() == 1) { + const auto generated_len = m_sequences.front()->get_generated_len(); + // speculative decoding draft handling + if (generated_len < m_num_streamed_tokens) { + m_num_streamed_tokens = generated_len; + } + OPENVINO_ASSERT(generated_len >= m_num_streamed_tokens); + auto delta = generated_len - m_num_streamed_tokens; + + size_t num_output_token_to_push = generated_len - m_num_streamed_tokens; + push_partial_outputs(num_output_token_to_push); + m_num_streamed_tokens += (num_output_token_to_push); } else if (has_finished() || out_of_memory()) { push_outputs(); } @@ -661,4 +671,4 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } }; -} +} \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index e4f3b1ad1f..fd9bf00785 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -199,8 +199,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< [](const std::shared_ptr& streamer) { return streamer; }, - [this](const std::function& streamer) -> std::shared_ptr { - return std::make_unique(m_tokenizer, streamer); 
+ [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { + return sampling_params.size() == 1 ? std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); } }, streamer); diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 314a7ffa4d..46b4c666b9 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -6,32 +6,84 @@ namespace ov { namespace genai { -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { +std::vector encode_and_process_stop_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { + // encode stop_string + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + size_t tensor_size = ov_encoded_stop_string.get_size(); + std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; + std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); + // remove special symbols + for (const auto& token_id : source_encoded_stop_string) { + if (token_id != tokenizer.get_bos_token_id() && + token_id != tokenizer.get_eos_token_id() && + token_id != tokenizer.get_pad_token_id()) { + encoded_stop_string.push_back(token_id); + } + } + return encoded_stop_string; +} + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings) { m_tokenizer = tokenizer; on_finalized_subword_callback = callback; + for (const auto& stop_string : stop_strings) { + auto encoded_stop_string = encode_and_process_stop_string(stop_string, m_tokenizer); + m_max_stop_string_len = std::max(encoded_stop_string.size(), m_max_stop_string_len); + m_stop_strings.insert(stop_string); + } } bool TextCallbackStreamer::put(int64_t token) { std::stringstream res; - m_tokens_cache.push_back(token); - std::string text = m_tokenizer.decode(m_tokens_cache); - if (!text.empty() && '\n' == text.back() && text.size() > print_len) { - // Flush the cache after the new line symbol - res << std::string_view{text.data() + print_len, text.size() - print_len}; - m_tokens_cache.clear(); - print_len = 0; - return on_finalized_subword_callback(res.str()); - } + m_tokens_cache_stop_string.push_back(token); + if (m_tokens_cache_stop_string.size() > m_max_stop_string_len || token == m_tokenizer.get_eos_token_id()) { + std::vector buffer(m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); + std::string text = m_tokenizer.decode(buffer); + std::string activated_stop_string = ""; + for (const auto& stop_string : m_stop_strings) { + if (text.find(stop_string) != std::string::npos) { + activated_stop_string = stop_string; + break; + } + } + + + if (activated_stop_string.empty() && token != m_tokenizer.get_eos_token_id()) { + m_tokens_cache.push_back(m_tokens_cache_stop_string.front()); + m_tokens_cache_stop_string.pop_front(); + } else { + m_tokens_cache.insert(m_tokens_cache.end(), m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); + m_tokens_cache_stop_string.clear(); + } + + text = m_tokenizer.decode(m_tokens_cache); + if (!activated_stop_string.empty()) { + auto pos = text.find(activated_stop_string); + if (pos != std::string::npos) { + text.replace(pos, activated_stop_string.length(), ""); + } + m_tokens_cache.clear(); + } + + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + res << 
std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } + - constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. - if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { - // Don't print incomplete text - return on_finalized_subword_callback(res.str()); - } else if (text.size() > print_len) { - // It is possible to have a shorter text after adding new token. - // Print to output only if text length is increaesed. - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); + constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. + if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } else { + // It is possible to have a shorter text after adding new token. + // Print to output only if text length is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } } return on_finalized_subword_callback(res.str()); @@ -39,7 +91,8 @@ bool TextCallbackStreamer::put(int64_t token) { void TextCallbackStreamer::end() { std::stringstream res; - std::string text = m_tokenizer.decode(m_tokens_cache); + std::vector buffer(m_tokens_cache.begin(), m_tokens_cache.end()); + std::string text = m_tokenizer.decode(buffer); if (text.size() <= print_len) return ; res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index a03b0deccb..ae353d27d5 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -3,6 +3,8 @@ #pragma once +#include + #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/tokenizer.hpp" @@ -14,14 +16,16 @@ class TextCallbackStreamer: public StreamerBase { bool put(int64_t token) override; void end() override; - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings = {}); std::function on_finalized_subword_callback = [](std::string words)->bool { return false; }; protected: Tokenizer m_tokenizer; std::vector m_tokens_cache; - size_t print_len = 0; + std::list m_tokens_cache_stop_string; + size_t print_len = 0, m_max_stop_string_len = 0; + std::set m_stop_strings; }; } // namespace genai From 22d6fe1f3cc6a03158d5e81b546c8841e336259a Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 11:12:50 +0400 Subject: [PATCH 02/18] Handle output sequence --- .../speculative_decoding_lm.cpp | 3 +- src/cpp/src/sampler.cpp | 7 +++- src/cpp/src/sequence_group.hpp | 37 +++++++++++++------ 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index dc6761879c..3c2c5b04a9 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -34,7 +34,8 @@ int main(int argc, char* argv[]) try { main_model_path, main_device, 
ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::scheduler_config(scheduler_config) + ); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f1abc862e2..b689f4883c 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -400,7 +400,9 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) - candidate.m_sequence->remove_last_tokens(num_last_matched_tokens - 1); + if (!m_parameters.include_stop_str_in_output) { + candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); + } } // try to finish candidate @@ -578,6 +580,9 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen if (!sampling_params.stop_strings.empty()) { int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { + if (!sampling_params.include_stop_str_in_output) { + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); + } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index c8b4c59486..e5c63901dd 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -44,6 +44,8 @@ class Sequence { static std::mutex m_counter_mutex; size_t _make_hash(size_t content_length); + // num tokens to remove from result. 
Used in case of match by stop_string + size_t m_token_cnt_to_ignore = 0; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; @@ -128,24 +130,35 @@ class Sequence { GenerationOutput get_last_generation_output(size_t token_cnt = 1) { GenerationOutput output; - OPENVINO_ASSERT(m_generated_ids.size()); - output.score = get_cumulative_log_probs(); + if (token_cnt > 0) { + OPENVINO_ASSERT(m_generated_ids.size()); + output.score = get_cumulative_log_probs(); - auto generated_token_id = get_generated_ids(); - auto generated_log_probs = get_generated_log_probs(); + auto generated_token_id = get_generated_ids(); + auto generated_log_probs = get_generated_log_probs(); - OPENVINO_ASSERT(get_generated_len() >= token_cnt); - auto offset = get_generated_len() - token_cnt; + OPENVINO_ASSERT(get_generated_len() >= token_cnt); + auto offset = get_generated_len() - token_cnt; - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); + auto offset_back = get_generated_len() - m_token_cnt_to_ignore; + if (m_token_cnt_to_ignore) + auto a = 0; + m_token_cnt_to_ignore = 0; - output.generated_ids = token_id; - output.generated_log_probs = log_probs; - output.finish_reason = get_finish_reason(); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); + + output.generated_ids = token_id; + output.generated_log_probs = log_probs; + output.finish_reason = get_finish_reason(); + } return output; } + void set_num_token_token_cnt_to_ignore(size_t k) { + m_token_cnt_to_ignore = k; + } + size_t get_generated_len() const { return m_generated_ids.size(); } @@ -621,7 +634,7 @@ class SequenceGroup { set_generation_status(GenerationStatus::FINISHED); } // For beam search streaming is not available, so we notify only upon finishing - if(m_sampling_params.is_beam_search()) { + if (m_sampling_params.is_beam_search()) { if (has_finished() || out_of_memory()) { push_outputs(); } From dcb80ddf0ab3c0f2e71ce27ac937a3cf04046c08 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 12:35:48 +0400 Subject: [PATCH 03/18] one more --- src/cpp/include/openvino/genai/generation_handle.hpp | 1 + src/cpp/src/generation_handle.cpp | 3 ++- src/cpp/src/sampler.cpp | 4 ++-- src/cpp/src/sequence_group.hpp | 11 ++++------- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 7ff172e645..2e4f94a524 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -57,6 +57,7 @@ struct GenerationOutput { std::vector generated_log_probs; float score; GenerationFinishReason finish_reason; + size_t token_cnt_to_ignore; }; using GenerationOutputs = std::unordered_map; diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index a1dd467523..e6968e481b 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -46,7 +46,8 @@ void add_partial_result(std::unordered_map& partial_ } else { auto generated_len = iteration_result.second.generated_ids.size(); OPENVINO_ASSERT(generated_len == iteration_result.second.generated_log_probs.size()); - for (size_t i = 0; i < generated_len; ++i) { + OPENVINO_ASSERT(generated_len >= 
iteration_result.second.token_cnt_to_ignore); + for (size_t i = 0; i < generated_len - iteration_result.second.token_cnt_to_ignore; ++i) { partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[i]); partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[i]); } diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index b689f4883c..49d83ec6bd 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -401,7 +401,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); + candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); } } @@ -581,7 +581,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index e5c63901dd..0a332d776b 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -139,18 +139,15 @@ class Sequence { OPENVINO_ASSERT(get_generated_len() >= token_cnt); auto offset = get_generated_len() - token_cnt; + auto offset_back = get_generated_len(); - auto offset_back = get_generated_len() - m_token_cnt_to_ignore; - if (m_token_cnt_to_ignore) - auto a = 0; - m_token_cnt_to_ignore = 0; - - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); output.generated_ids = token_id; output.generated_log_probs = log_probs; output.finish_reason = get_finish_reason(); + output.token_cnt_to_ignore = m_token_cnt_to_ignore; } return output; } From 87379b1d2da2ca00dacd46f54dd044fe12c18797 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 17 Dec 2024 12:42:43 +0400 Subject: [PATCH 04/18] Update sampler.cpp --- src/cpp/src/sampler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 49d83ec6bd..b689f4883c 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -401,7 +401,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa if(!m_parameters.include_stop_str_in_output) { // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); + 
candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens - 1); } } @@ -581,7 +581,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); if (num_matched_last_tokens) { if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); + running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens - 1); } running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); From 713f091d3792badd8cda12d7e05e32105410b67f Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 00:10:26 +0400 Subject: [PATCH 05/18] Handle stop strings in Sampler & Seq Group --- .../openvino/genai/generation_handle.hpp | 1 - src/cpp/src/continuous_batching_impl.cpp | 6 +- src/cpp/src/generation_handle.cpp | 3 +- src/cpp/src/llm_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 2 +- src/cpp/src/sampler.cpp | 176 ++++++++++-------- src/cpp/src/sampler.hpp | 4 +- src/cpp/src/sequence_group.hpp | 48 ++--- .../speculative_decoding_impl.cpp | 4 +- src/cpp/src/text_callback_streamer.cpp | 95 +++------- src/cpp/src/text_callback_streamer.hpp | 10 +- 11 files changed, 159 insertions(+), 192 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 2e4f94a524..7ff172e645 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -57,7 +57,6 @@ struct GenerationOutput { std::vector generated_log_probs; float score; GenerationFinishReason finish_reason; - size_t token_cnt_to_ignore; }; using GenerationOutputs = std::unordered_map; diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 442fd6f7c5..7f648b6ffc 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -246,8 +246,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector& streamer) { return streamer; }, - [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { - return sampling_params.size() == 1 ? 
std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); } }, streamer); @@ -285,7 +285,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (!streamer_ptr->put(gen_token)) { + if (streamer_ptr->put(gen_token)) { break; } } diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index e6968e481b..a1dd467523 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -46,8 +46,7 @@ void add_partial_result(std::unordered_map& partial_ } else { auto generated_len = iteration_result.second.generated_ids.size(); OPENVINO_ASSERT(generated_len == iteration_result.second.generated_log_probs.size()); - OPENVINO_ASSERT(generated_len >= iteration_result.second.token_cnt_to_ignore); - for (size_t i = 0; i < generated_len - iteration_result.second.token_cnt_to_ignore; ++i) { + for (size_t i = 0; i < generated_len; ++i) { partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[i]); partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[i]); } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 89e71f21f8..f663b27dd9 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -273,7 +273,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); + streamer_ptr = std::make_shared(m_tokenizer, *callback); } auto batch_size = input_ids.get_shape().at(0); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 01a06230d0..cb83209b4b 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -967,7 +967,7 @@ EncodedResults StaticLLMPipeline::generate( } else if (auto streamer_obj = std::get_if>(&streamer)) { streamer_ptr = *streamer_obj; } else if (auto callback = std::get_if>(&streamer)) { - streamer_ptr = std::make_shared(m_tokenizer, *callback, generation_config->stop_strings); + streamer_ptr = std::make_shared(m_tokenizer, *callback); } if (!config.is_greedy_decoding()) { diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 49d83ec6bd..a0eb8f4980 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -85,75 +85,65 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin return clean_text; } +std::vector encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { + // encode stop_string + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + size_t tensor_size = ov_encoded_stop_string.get_size(); + std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; + std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); + // remove special symbols + for (const auto& token_id : source_encoded_stop_string) { + if (token_id != tokenizer.get_bos_token_id() && + token_id != tokenizer.get_eos_token_id() && + token_id 
!= tokenizer.get_pad_token_id()) { + encoded_stop_string.push_back(token_id); + } + } + return encoded_stop_string; +} + +struct MatchStopStringResult { + size_t to_remove = 0; + int64_t last_token_id = 0; + bool is_to_update_last_token = false; + bool is_matched = false; +}; + // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. -int match_stop_string(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set & stop_strings) { - /* - For catching stop_string hit we run comparisons character-wise to catch cases where stop string - overlaps with part of another token on both sides or is just a part of a single token. - For every stop_string we iterate over generated tokens starting from the last one and going backwards. - Every token is wrapped with prefix tokens to ensure tokenizer doesn't remove prefix whitespace of the actual token. - After that all tokens are decoded and prefix is removed from the decoded text, so we end up with decoded token. - Its characters are compared to the stop_string character at a current_position - (position of a character in the stop_string counting from the last one) - at the beginning position is 0. - When characters match we increase current_position and check if we have a full match already, if not we continue. - If we have already matched some characters (current_position > 0) and next character is not matching - before we reach the full match, then we reset current_position to 0. - */ - std::string prefix = "a"; - auto prefix_ov = tokenizer.encode(prefix).input_ids; - std::vector prefix_tokens(prefix_ov.data(), prefix_ov.data() + prefix_ov.get_size()); - std::string suffix = "b"; - auto suffix_ov = tokenizer.encode(suffix).input_ids; - std::vector suffix_tokens(suffix_ov.data(), suffix_ov.data() + suffix_ov.get_size()); - - // Since whitespace can be added at the beginning of the suffix we also try to capture that behavior here - // and get suffix string that will actually be part of the decoded string so we can remove it correctly - auto wrapped_suffix_tokens = suffix_tokens; - wrapped_suffix_tokens.insert(wrapped_suffix_tokens.begin(), prefix_tokens.begin(), prefix_tokens.end()); - std::string wrapped_suffix = tokenizer.decode(wrapped_suffix_tokens); - auto wrapper_pos = wrapped_suffix.find(prefix); - suffix = wrapped_suffix.substr(wrapper_pos + prefix.size()); - - for (auto stop_string: stop_strings) { - int current_position = 0; - int num_matched_tokens = 0; - // Getting reverse iterator to check tokens starting from the last one generated and going backwards - auto generated_tokens_rit = generated_tokens.rbegin(); - std::vector tokens_buffer; - while (generated_tokens_rit != generated_tokens.rend()) { - num_matched_tokens++; - tokens_buffer.insert(tokens_buffer.begin(), *generated_tokens_rit); - - std::vector wrapped_tokens = wrap_tokens(tokens_buffer, prefix_tokens, suffix_tokens); - std::string wrapped_text = tokenizer.decode(wrapped_tokens); - std::string clean_text = clean_wrapped_text(wrapped_text, prefix, suffix); - - if (clean_text == "" || (clean_text.size() >= 3 && (clean_text.compare(clean_text.size() - 3, 3, "�") == 0))) { - generated_tokens_rit++; - continue; - } else { - tokens_buffer.clear(); - } - // Checking clean_text characters starting from the last one - for (auto clean_text_rit = clean_text.rbegin(); clean_text_rit != clean_text.rend(); clean_text_rit++) { - // On character match increment current_position for the next comparisons - if (*clean_text_rit == 
*(stop_string.rbegin() + current_position)) { - current_position++; - // If this is the last character from the stop_string we have a match - if ((stop_string.rbegin() + current_position) == stop_string.rend()) { - return num_matched_tokens; - } - } else if (current_position) { - // Already found matching characters, but the last one didn't match, so we reset current_position - current_position = 0; - // Looking for the match will start over from this character so we decrement iterator - clean_text_rit--; +MatchStopStringResult match_stop_string(Tokenizer& tokenizer, + const TokenIds& generated_tokens, + const std::pair>& stop_strings, + bool is_include_to_output) { + MatchStopStringResult result; + if (generated_tokens.size() >= stop_strings.first) { + size_t offset = generated_tokens.size() - stop_strings.first; + TokenIds buffer(generated_tokens.begin() + offset, generated_tokens.end()); + std::string decoded_buffer = tokenizer.decode(buffer); + for (const auto& stop_string : stop_strings.second) { + auto pos = decoded_buffer.find(stop_string); + if (pos != std::string::npos) { + result.is_matched = true; + + auto stop_string_len = is_include_to_output ? stop_string.length() : 0; + decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); + + auto encoded_buffer = encode_and_process_string(decoded_buffer, tokenizer); + if (buffer == encoded_buffer) { + return result; + } else if (encoded_buffer.size() > 0) { + result.last_token_id = encoded_buffer.back(); + result.is_to_update_last_token = 0; + encoded_buffer.pop_back(); } + + result.to_remove = buffer.size() - encoded_buffer.size(); + buffer = TokenIds(buffer.begin(), buffer.begin() + encoded_buffer.size()); + OPENVINO_ASSERT(buffer == encoded_buffer); + return result; } - generated_tokens_rit++; } } - return 0; + return result; } // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. @@ -245,7 +235,9 @@ std::map Sampler::GroupBeamSearcher::get_beam_idxs() { return next_beams; } -void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) { +void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, + SamplerOutput& sampler_output, + const std::pair>& stop_strings) { assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; @@ -392,21 +384,19 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa // There's probably a better way to do that, than copying whole vector... 
std::vector token_ids = candidate.m_sequence->get_generated_ids(); token_ids.push_back(candidate.m_token_id); - int num_last_matched_tokens = match_stop_string(m_tokenizer, token_ids, m_sequence_group->get_sampling_parameters().stop_strings); - if (num_last_matched_tokens) { + auto match_result = match_stop_string(m_tokenizer, token_ids, stop_strings, m_parameters.include_stop_str_in_output); + if (match_result.is_matched) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= group_size) continue; - if(!m_parameters.include_stop_str_in_output) { - // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) - if (!m_parameters.include_stop_str_in_output) { - candidate.m_sequence->set_num_token_token_cnt_to_ignore(num_last_matched_tokens); - } - } + // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) + candidate.m_sequence->remove_last_tokens(match_result.to_remove); + + candidate.m_token_id = match_result.last_token_id; // try to finish candidate - try_to_finish_candidate(group, candidate, m_parameters.include_stop_str_in_output); + try_to_finish_candidate(group, candidate); continue; } } @@ -578,11 +568,20 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen } if (!sampling_params.stop_strings.empty()) { - int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); - if (num_matched_last_tokens) { - if (!sampling_params.include_stop_str_in_output) { - running_sequence->set_num_token_token_cnt_to_ignore(num_matched_last_tokens); + auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id()); + auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); + if (match_result.is_matched) { + if (match_result.to_remove > 0) { + if (match_result.to_remove > 1) { + running_sequence->remove_last_tokens(match_result.to_remove - 1); + } + auto log_prob = running_sequence->get_generated_log_probs().back(); + running_sequence->remove_last_tokens(1); + if (match_result.is_to_update_last_token) { + running_sequence->append_token(match_result.last_token_id, log_prob); + } } + running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); dropped_seq_ids.push_back(running_sequence->get_id()); @@ -744,6 +743,19 @@ float get_p_prime(Sequence::Ptr& running_sequence, return p_prime; } +std::pair> +process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer) { + std::pair> result; + for (const auto& stop_string : stop_strings) { + auto encoded_stop_string = encode_and_process_string(stop_string, tokenizer); + if (result.first < encoded_stop_string.size()) { + result.first = encoded_stop_string.size(); + } + result.second.insert(stop_string); + } + return result; +} + SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled) { @@ -767,6 +779,12 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, if (!m_logit_processors.count(request_id)) { m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())}); } + if (!m_stop_strings.count(request_id)) { + auto processed_stop_string = process_stop_strings(sampling_params.stop_strings, m_tokenizer); + 
m_stop_strings.insert({request_id, processed_stop_string}); + sequence_group->set_stream_window_size(processed_stop_string.first); + } + auto& stop_strings = m_stop_strings.at(request_id); auto& logit_processor = m_logit_processors.at(request_id); const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); @@ -876,7 +894,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, } // current algorithm already adds new tokens to running sequences and - m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output); + m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output, stop_strings); // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 0f7876cbf9..2ebe75ad97 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -57,6 +57,8 @@ class Sampler { std::mt19937 rng_engine; // { request_id, logit_processor } std::map m_logit_processors; + // { request_id, { max_encoded_len, { stop_strings }}} + std::map>> m_stop_strings; Tokenizer m_tokenizer; @@ -115,7 +117,7 @@ class Sampler::GroupBeamSearcher { public: explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer); - void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output); + void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output, const std::pair>& stop_strings); void finalize(SamplerOutput& sampler_output); std::map get_beam_idxs(); }; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 0a332d776b..73884b7a69 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -44,8 +44,6 @@ class Sequence { static std::mutex m_counter_mutex; size_t _make_hash(size_t content_length); - // num tokens to remove from result. 
Used in case of match by stop_string - size_t m_token_cnt_to_ignore = 0; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; @@ -128,7 +126,7 @@ class Sequence { } } - GenerationOutput get_last_generation_output(size_t token_cnt = 1) { + GenerationOutput get_last_generation_output(size_t token_cnt = 1, size_t num_token_to_ignore = 0) { GenerationOutput output; if (token_cnt > 0) { OPENVINO_ASSERT(m_generated_ids.size()); @@ -137,25 +135,25 @@ class Sequence { auto generated_token_id = get_generated_ids(); auto generated_log_probs = get_generated_log_probs(); + if (get_generated_len() < token_cnt) { + auto a = 0; + } OPENVINO_ASSERT(get_generated_len() >= token_cnt); - auto offset = get_generated_len() - token_cnt; - auto offset_back = get_generated_len(); + if (get_generated_len() > num_token_to_ignore) { + auto offset = get_generated_len() - token_cnt - num_token_to_ignore; + auto offset_back = get_generated_len() - num_token_to_ignore; - std::vector token_id(generated_token_id.begin() + offset, generated_token_id.end()); - std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.end()); + std::vector token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back); + std::vector log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back); - output.generated_ids = token_id; - output.generated_log_probs = log_probs; - output.finish_reason = get_finish_reason(); - output.token_cnt_to_ignore = m_token_cnt_to_ignore; + output.generated_ids = token_id; + output.generated_log_probs = log_probs; + output.finish_reason = get_finish_reason(); + } } return output; } - void set_num_token_token_cnt_to_ignore(size_t k) { - m_token_cnt_to_ignore = k; - } - size_t get_generated_len() const { return m_generated_ids.size(); } @@ -231,7 +229,7 @@ class SequenceGroup { // flag to enable/disable token generation, e.g. in speculative decoding scenario bool m_is_gen_paused = false; - size_t m_num_streamed_tokens = 0; + size_t m_num_streamed_tokens = 0, m_stream_window_size = 0; SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) @@ -466,6 +464,10 @@ class SequenceGroup { size_t get_num_tokens_to_validate() { return m_num_validation_tokens; } + + void set_stream_window_size(size_t k) { + m_stream_window_size = k; + } size_t get_num_available_tokens_for_batching() const { OPENVINO_ASSERT(!has_finished(), "Internal error: this function cannot be called on finished sequence group"); @@ -613,7 +615,7 @@ class SequenceGroup { for (auto& sequence : m_sequences) { // todo: check seq.is_finished() to generate without several // or is it ok to use padding? 
- auto output = sequence->get_last_generation_output(token_cnt); + auto output = sequence->get_last_generation_output(token_cnt, m_stream_window_size); if (m_sampling_params.echo && !m_has_echoed) { output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end()); output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end()); @@ -640,14 +642,18 @@ class SequenceGroup { // (after stop string is detected its tokens are already sent) if (num_total_seqs() == 1) { const auto generated_len = m_sequences.front()->get_generated_len(); + if (has_finished()) { + m_stream_window_size = 0; + } + if (generated_len <= (m_num_streamed_tokens + m_stream_window_size)) { + return; + } // speculative decoding draft handling if (generated_len < m_num_streamed_tokens) { m_num_streamed_tokens = generated_len; } - OPENVINO_ASSERT(generated_len >= m_num_streamed_tokens); - auto delta = generated_len - m_num_streamed_tokens; - - size_t num_output_token_to_push = generated_len - m_num_streamed_tokens; + OPENVINO_ASSERT(generated_len >= (m_num_streamed_tokens + m_stream_window_size)); + size_t num_output_token_to_push = generated_len - m_num_streamed_tokens - m_stream_window_size; push_partial_outputs(num_output_token_to_push); m_num_streamed_tokens += (num_output_token_to_push); } else if (has_finished() || out_of_memory()) { diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index fd9bf00785..e4f3b1ad1f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -199,8 +199,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< [](const std::shared_ptr& streamer) { return streamer; }, - [this, &sampling_params](const std::function& streamer) -> std::shared_ptr { - return sampling_params.size() == 1 ? 
std::make_unique(m_tokenizer, streamer, sampling_params.begin()->stop_strings) : std::make_unique(m_tokenizer, streamer); + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); } }, streamer); diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 46b4c666b9..5938b55f6c 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -6,84 +6,32 @@ namespace ov { namespace genai { -std::vector encode_and_process_stop_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { - // encode stop_string - ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; - size_t tensor_size = ov_encoded_stop_string.get_size(); - std::vector source_encoded_stop_string(tensor_size), encoded_stop_string; - std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); - // remove special symbols - for (const auto& token_id : source_encoded_stop_string) { - if (token_id != tokenizer.get_bos_token_id() && - token_id != tokenizer.get_eos_token_id() && - token_id != tokenizer.get_pad_token_id()) { - encoded_stop_string.push_back(token_id); - } - } - return encoded_stop_string; -} - -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings) { +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { m_tokenizer = tokenizer; on_finalized_subword_callback = callback; - for (const auto& stop_string : stop_strings) { - auto encoded_stop_string = encode_and_process_stop_string(stop_string, m_tokenizer); - m_max_stop_string_len = std::max(encoded_stop_string.size(), m_max_stop_string_len); - m_stop_strings.insert(stop_string); - } } bool TextCallbackStreamer::put(int64_t token) { std::stringstream res; - m_tokens_cache_stop_string.push_back(token); - if (m_tokens_cache_stop_string.size() > m_max_stop_string_len || token == m_tokenizer.get_eos_token_id()) { - std::vector buffer(m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); - std::string text = m_tokenizer.decode(buffer); - std::string activated_stop_string = ""; - for (const auto& stop_string : m_stop_strings) { - if (text.find(stop_string) != std::string::npos) { - activated_stop_string = stop_string; - break; - } - } - - - if (activated_stop_string.empty() && token != m_tokenizer.get_eos_token_id()) { - m_tokens_cache.push_back(m_tokens_cache_stop_string.front()); - m_tokens_cache_stop_string.pop_front(); - } else { - m_tokens_cache.insert(m_tokens_cache.end(), m_tokens_cache_stop_string.begin(), m_tokens_cache_stop_string.end()); - m_tokens_cache_stop_string.clear(); - } - - text = m_tokenizer.decode(m_tokens_cache); - if (!activated_stop_string.empty()) { - auto pos = text.find(activated_stop_string); - if (pos != std::string::npos) { - text.replace(pos, activated_stop_string.length(), ""); - } - m_tokens_cache.clear(); - } - - if (!text.empty() && '\n' == text.back() && text.size() > print_len) { - // Flush the cache after the new line symbol - res << std::string_view{text.data() + print_len, text.size() - print_len}; - m_tokens_cache.clear(); - print_len = 0; - return on_finalized_subword_callback(res.str()); - } - + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + res << 
std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } - constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. - if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { - // Don't print incomplete text - return on_finalized_subword_callback(res.str()); - } else { - // It is possible to have a shorter text after adding new token. - // Print to output only if text length is increaesed. - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } + constexpr char replacement[] = "\xef\xbf\xbd"; // MSVC with /utf-8 fails to compile � directly with newline in string literal error. + if (text.size() >= 3 && text.compare(text.size() - 3, 3, replacement) == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text length is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); } return on_finalized_subword_callback(res.str()); @@ -91,8 +39,7 @@ bool TextCallbackStreamer::put(int64_t token) { void TextCallbackStreamer::end() { std::stringstream res; - std::vector buffer(m_tokens_cache.begin(), m_tokens_cache.end()); - std::string text = m_tokenizer.decode(buffer); + std::string text = m_tokenizer.decode(m_tokens_cache); if (text.size() <= print_len) return ; res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; @@ -105,4 +52,4 @@ void TextCallbackStreamer::end() { ov::genai::StreamerBase::~StreamerBase() = default; } // namespace genai -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index ae353d27d5..6f0872ad1b 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -3,8 +3,6 @@ #pragma once -#include - #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/tokenizer.hpp" @@ -16,17 +14,15 @@ class TextCallbackStreamer: public StreamerBase { bool put(int64_t token) override; void end() override; - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, const std::set& stop_strings = {}); + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); std::function on_finalized_subword_callback = [](std::string words)->bool { return false; }; protected: Tokenizer m_tokenizer; std::vector m_tokens_cache; - std::list m_tokens_cache_stop_string; - size_t print_len = 0, m_max_stop_string_len = 0; - std::set m_stop_strings; + size_t print_len = 0; }; } // namespace genai -} // namespace ov +} // namespace ov \ No newline at end of file From 64d1b803887ccd14642101da1208cc6ba4de3a9a Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 14:11:23 +0400 Subject: [PATCH 06/18] ci --- src/cpp/src/sampler.cpp | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index a0eb8f4980..2b8a95b2b3 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -104,8 +104,8 @@ std::vector encode_and_process_string(const std::string& stop_string, o struct 
MatchStopStringResult { size_t to_remove = 0; - int64_t last_token_id = 0; - bool is_to_update_last_token = false; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; bool is_matched = false; }; @@ -126,19 +126,19 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, auto stop_string_len = is_include_to_output ? stop_string.length() : 0; decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); - - auto encoded_buffer = encode_and_process_string(decoded_buffer, tokenizer); - if (buffer == encoded_buffer) { + if (decoded_buffer.empty()) { + result.to_remove = buffer.size(); return result; - } else if (encoded_buffer.size() > 0) { - result.last_token_id = encoded_buffer.back(); - result.is_to_update_last_token = 0; - encoded_buffer.pop_back(); } - result.to_remove = buffer.size() - encoded_buffer.size(); - buffer = TokenIds(buffer.begin(), buffer.begin() + encoded_buffer.size()); - OPENVINO_ASSERT(buffer == encoded_buffer); + // find token cnt to be removed from sequence by decoding token by token + std::string decoded_partially_string = ""; + for (size_t i = 0; i < buffer.size(); ++i) { + decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]}); + if (decoded_partially_string.find(decoded_buffer) != std::string::npos) { + result.to_remove = buffer.size() - i - 1; + } + } return result; } } @@ -393,7 +393,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) candidate.m_sequence->remove_last_tokens(match_result.to_remove); - candidate.m_token_id = match_result.last_token_id; + // candidate.m_token_id = match_result.last_token_id; // try to finish candidate try_to_finish_candidate(group, candidate); @@ -572,14 +572,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); if (match_result.is_matched) { if (match_result.to_remove > 0) { - if (match_result.to_remove > 1) { - running_sequence->remove_last_tokens(match_result.to_remove - 1); - } - auto log_prob = running_sequence->get_generated_log_probs().back(); - running_sequence->remove_last_tokens(1); - if (match_result.is_to_update_last_token) { - running_sequence->append_token(match_result.last_token_id, log_prob); - } + running_sequence->remove_last_tokens(match_result.to_remove); } running_sequence->set_status(SequenceStatus::FINISHED); From 449a53ee14a19ff1cfe12bbc56739edcd8d53de0 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 18 Dec 2024 15:59:54 +0400 Subject: [PATCH 07/18] streaming --- src/cpp/src/continuous_batching_impl.cpp | 12 +++++++++--- src/cpp/src/lm_encoding.cpp | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 7f648b6ffc..2b48852458 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -275,7 +275,8 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (streamer_ptr->put(gen_token)) { + continue_generation = !streamer_ptr->put(gen_token); + if (!continue_generation) { break; } } @@ -296,7 +298,11 @@ 
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); + if (!continue_generation) { + drop_requests(); + } else { + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); + } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 3ab041fa58..5b1661831b 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -130,7 +130,7 @@ std::pair get_lm_encoded_results( if (streamer_ptr && generations.at(0).get()->can_read()) { std::unordered_map token = generations.at(0).get()->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { - if (!streamer_ptr->put(gen_token)) { + if (streamer_ptr->put(gen_token)) { break; } } From b8fdb8e3f96a8d15a639243d138f404cb70c5027 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 12:13:14 +0400 Subject: [PATCH 08/18] test --- src/cpp/src/sampler.cpp | 5 +++++ tests/python_tests/common.py | 28 ++++++++++++++++++++++++++++ tests/python_tests/test_sampling.py | 10 +++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 2b8a95b2b3..0e9eeff519 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -126,6 +126,10 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, auto stop_string_len = is_include_to_output ? stop_string.length() : 0; decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len); + // to remove word splitting symbols from tail + while (decoded_buffer.back() == ' ' || decoded_buffer.back() == '\n') { + decoded_buffer.pop_back(); + } if (decoded_buffer.empty()) { result.to_remove = buffer.size(); return result; @@ -137,6 +141,7 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer, decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]}); if (decoded_partially_string.find(decoded_buffer) != std::string::npos) { result.to_remove = buffer.size() - i - 1; + break; } } return result; diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 50ee452f5c..7c97088abc 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -125,6 +125,34 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config.include_stop_str_in_output = True return generation_config +def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines" } + generation_config.include_stop_str_in_output = False + return generation_config + +def get_greedy_stop_strings_include_to_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines" } + generation_config.include_stop_str_in_output = True + return generation_config + +def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines", "anag" } + generation_config.include_stop_str_in_output = False + return 
generation_config + +def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + generation_config.stop_strings = { "machines", "anag" } + generation_config.include_stop_str_in_output = True + return generation_config + def get_multinomial_temperature() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 9aa6931d85..d5df28bfd6 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -21,6 +21,8 @@ get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \ + get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \ generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ run_continuous_batching @@ -77,7 +79,9 @@ def test_eos_greedy(tmp_path): @pytest.mark.precommit @pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ], + get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ], ids=[ "greedy", "greedy_with_min_and_max_tokens", @@ -88,6 +92,10 @@ def test_eos_greedy(tmp_path): "beam", "beam_search_min_and_max_tokens", "beam_search_with_multiple_stop_strings_no_match", + "get_greedy_stop_strings_exclude_from_output", + "get_greedy_stop_strings_include_to_output", + "get_greedy_n_stop_strings_exclude_from_output", + "get_greedy_n_stop_strings_include_to_output" ]) def test_individual_generation_configs_deterministic(tmp_path, generation_config): prompts = [ From df7b512ce541b3e25fb1892c554bad81c7e20d0c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 12:25:00 +0400 Subject: [PATCH 09/18] tokenizers --- src/cpp/src/sampler.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 0e9eeff519..1a80848264 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -87,18 +87,11 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin std::vector encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) { // encode stop_string - ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string).input_ids; + std::string stop_string_copy = stop_string; + ov::Tensor ov_encoded_stop_string = tokenizer.encode(stop_string_copy, ov::genai::add_special_tokens(false)).input_ids; size_t tensor_size = ov_encoded_stop_string.get_size(); - std::vector 
source_encoded_stop_string(tensor_size), encoded_stop_string; - std::copy_n(ov_encoded_stop_string.data(), tensor_size, source_encoded_stop_string.begin()); - // remove special symbols - for (const auto& token_id : source_encoded_stop_string) { - if (token_id != tokenizer.get_bos_token_id() && - token_id != tokenizer.get_eos_token_id() && - token_id != tokenizer.get_pad_token_id()) { - encoded_stop_string.push_back(token_id); - } - } + std::vector encoded_stop_string(tensor_size); + std::copy_n(ov_encoded_stop_string.data(), tensor_size, encoded_stop_string.begin()); return encoded_stop_string; } From 796e1489766bb9cd7a6c2f91f1176d1c703a9124 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 14:01:53 +0400 Subject: [PATCH 10/18] check --- samples/cpp/text_generation/greedy_causal_lm.cpp | 8 +++++++- tests/python_tests/common.py | 15 ++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index b5ca59095b..8c8c6d3ebe 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,7 +13,13 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; + config.max_new_tokens = 30; + + config.stop_strings = { "machines", "manage" }; + // anag + config.include_stop_str_in_output = false; + + std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 7c97088abc..163a00192e 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -142,14 +142,14 @@ def get_greedy_stop_strings_include_to_output() -> GenerationConfig: def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig: generation_config = GenerationConfig() generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "anag" } + generation_config.stop_strings = { "machines", "manage" } generation_config.include_stop_str_in_output = False return generation_config def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig: generation_config = GenerationConfig() generation_config.max_new_tokens = 30 - generation_config.stop_strings = { "machines", "anag" } + generation_config.stop_strings = { "machines", "manage" } generation_config.include_stop_str_in_output = True return generation_config @@ -387,9 +387,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge # Note, that for fp32 / fp16 models scores are different less than 0.001 assert abs(hf_score - ov_score) < 0.02 - assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) - for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): - assert hf_text == ov_text + if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0: + assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert ov_text in hf_text + else: + assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert hf_text == ov_text def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): 
model.save_pretrained(models_path) From 7dbb3ad3dada51babcffc2b4187932244702df52 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 18:02:03 +0400 Subject: [PATCH 11/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index 8c8c6d3ebe..62d961a6b7 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,13 +13,8 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 30; + config.max_new_tokens = 100; - config.stop_strings = { "machines", "manage" }; - // anag - config.include_stop_str_in_output = false; - - std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From 617471ac4523e134b5e0b31d114606ad741cf6fa Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Thu, 19 Dec 2024 18:02:27 +0400 Subject: [PATCH 12/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index 62d961a6b7..acdd8b690d 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,8 +13,7 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - + config.max_new_tokens = 100; std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From d2acb396ba21517dc37f3be766b077cbb9c4ddb9 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:24:35 +0400 Subject: [PATCH 13/18] Update speculative_decoding_lm.cpp --- .../cpp/speculative_decoding_lm/speculative_decoding_lm.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 2dc46f8a82..487296566b 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -33,8 +33,7 @@ int main(int argc, char* argv[]) try { main_model_path, main_device, ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config) - ); + ov::genai::scheduler_config(scheduler_config)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; From 4b7a767211b3541293f6cb28372ad4148b417c29 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:25:12 +0400 Subject: [PATCH 14/18] Update greedy_causal_lm.cpp --- samples/cpp/text_generation/greedy_causal_lm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/text_generation/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp index acdd8b690d..b5ca59095b 100644 --- a/samples/cpp/text_generation/greedy_causal_lm.cpp +++ b/samples/cpp/text_generation/greedy_causal_lm.cpp @@ -13,7 +13,7 @@ int main(int argc, char* argv[]) try { ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; - 
config.max_new_tokens = 100; + config.max_new_tokens = 100; std::string result = pipe.generate(prompt, config); std::cout << result << std::endl; } catch (const std::exception& error) { From 09307a40af3cb532036a356129ce811db76bbe12 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:26:50 +0400 Subject: [PATCH 15/18] Update sequence_group.hpp --- src/cpp/src/sequence_group.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 73884b7a69..198d3b53aa 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -135,9 +135,6 @@ class Sequence { auto generated_token_id = get_generated_ids(); auto generated_log_probs = get_generated_log_probs(); - if (get_generated_len() < token_cnt) { - auto a = 0; - } OPENVINO_ASSERT(get_generated_len() >= token_cnt); if (get_generated_len() > num_token_to_ignore) { auto offset = get_generated_len() - token_cnt - num_token_to_ignore; @@ -687,4 +684,4 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } }; -} \ No newline at end of file +} From fb812ec1d417ac73eabed7e01b1adef31098b552 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:29:48 +0400 Subject: [PATCH 16/18] Update sampler.cpp --- src/cpp/src/sampler.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 1a80848264..9c18dc7721 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -391,8 +391,6 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) candidate.m_sequence->remove_last_tokens(match_result.to_remove); - // candidate.m_token_id = match_result.last_token_id; - // try to finish candidate try_to_finish_candidate(group, candidate); continue; @@ -569,9 +567,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id()); auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output); if (match_result.is_matched) { - if (match_result.to_remove > 0) { - running_sequence->remove_last_tokens(match_result.to_remove); - } + running_sequence->remove_last_tokens(match_result.to_remove); running_sequence->set_status(SequenceStatus::FINISHED); running_sequence->set_finish_reason(GenerationFinishReason::STOP); @@ -937,6 +933,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig void Sampler::clear_request_info(uint64_t request_id) { m_beam_search_info.erase(request_id); m_logit_processors.erase(request_id); + m_stop_strings.erase(request_id); } int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) { From 1905fdda72c8b99bea78a9228be14937c43af109 Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:47:01 +0400 Subject: [PATCH 17/18] Update lm_encoding.cpp --- src/cpp/src/lm_encoding.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index ebe57d6d01..cf163d5d3f 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -133,6 +133,8 @@ std::pair get_lm_encoded_results( SamplerOutput sampler_output = sampler.sample(sequence_groups, 
logits); stream_generated_tokens(); + + // "Generation" phase while (!active_sequence_groups.empty()) { size_t total_num_tokens = 0; @@ -237,4 +239,4 @@ std::pair get_lm_encoded_results( } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov From c7bc7b8d1a504360532dc9138928dc609b46f8cd Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Fri, 20 Dec 2024 14:47:27 +0400 Subject: [PATCH 18/18] Update lm_encoding.cpp --- src/cpp/src/lm_encoding.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index cf163d5d3f..031214468e 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -133,7 +133,7 @@ std::pair get_lm_encoded_results( SamplerOutput sampler_output = sampler.sample(sequence_groups, logits); stream_generated_tokens(); - + // "Generation" phase while (!active_sequence_groups.empty()) {
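
The sampler changes in this series reduce to one rule for trimming a matched stop string: decode the tail buffer of generated token ids, cut the decoded text at the stop string (keeping it only when include_stop_str_in_output is set), strip trailing spaces and newlines, and then decode token by token to count how many trailing ids must be removed from the sequence before it is marked FINISHED. Below is a minimal Python sketch of that rule; decode(), the toy vocab table, and tokens_to_remove() are hypothetical stand-ins for ov::genai::Tokenizer and match_stop_string used only to illustrate the logic, not the real implementation.

# Minimal sketch of the stop-string trimming rule from match_stop_string.
# decode(), vocab, and tokens_to_remove() are hypothetical stand-ins; they
# are not part of openvino.genai.

def decode(token_ids, vocab):
    # stand-in for ov::genai::Tokenizer::decode
    return "".join(vocab[t] for t in token_ids)

def tokens_to_remove(buffer, stop_string, include_stop_str_in_output, vocab):
    decoded = decode(buffer, vocab)
    pos = decoded.find(stop_string)
    if pos == -1:
        return None  # no match, nothing to trim
    keep_len = pos + (len(stop_string) if include_stop_str_in_output else 0)
    # cut at the stop string and drop word-splitting symbols from the tail
    decoded = decoded[:keep_len].rstrip(" \n")
    if not decoded:
        return len(buffer)  # the whole buffer belongs to the stop string
    # decode token by token to find how many trailing ids to drop
    partial = ""
    for i, token_id in enumerate(buffer):
        partial += decode([token_id], vocab)
        if decoded in partial:
            return len(buffer) - i - 1
    return 0

# toy vocabulary: three token ids that decode to pieces of a sentence
vocab = {1: "intelligent ", 2: "machines", 3: " can"}
print(tokens_to_remove([1, 2, 3], "machines", False, vocab))  # 2 -> keep "intelligent "
print(tokens_to_remove([1, 2, 3], "machines", True, vocab))   # 1 -> keep "intelligent machines"

The two calls mirror the get_greedy_stop_strings_exclude_from_output and get_greedy_stop_strings_include_to_output test configurations added in common.py: with the stop string excluded, both the matching token and everything after it are dropped; with it included, only the tokens generated after the match are dropped.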