diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp deleted file mode 100644 index a0262c0dc8..0000000000 --- a/src/cpp/src/group_beam_searcher.cpp +++ /dev/null @@ -1,455 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include - -#include "openvino/genai/llm_pipeline.hpp" -#include "utils.hpp" -#include "lm_encoding.hpp" - -namespace { - -// Modified Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurrence in haystack -std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { - if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token - return {haystack.begin(), haystack.end()}; - } - std::vector partial_match_table(needle.size() + 1, -1); - int cnd = 0; - for (size_t pos = 1; pos < needle.size(); ++pos) { - if (needle.at(pos) == needle.at(size_t(cnd))) { - partial_match_table.at(pos) = partial_match_table.at(size_t(cnd)); - } else { - partial_match_table.at(pos) = cnd; - while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) { - cnd = partial_match_table.at(size_t(cnd)); - } - } - ++cnd; - } - partial_match_table.back() = cnd; - std::vector res; - size_t haystack_id = 0; - int needle_id = 0; - while (haystack_id < haystack.size() - 1) { - if (needle.at(size_t(needle_id)) == haystack.at(haystack_id)) { - ++haystack_id; - ++needle_id; - if (needle_id == int(needle.size())) { - res.push_back(haystack.at(haystack_id)); - needle_id = partial_match_table.at(size_t(needle_id)); - } - } else { - needle_id = partial_match_table.at(size_t(needle_id)); - if (needle_id < 0) { - ++haystack_id; - ++needle_id; - } - } - } - return res; -} - -struct Token { - float log_prob; - int64_t idx; -}; - -std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx) { - if (logits.get_shape().at(0) <= batch_idx) { - throw std::runtime_error("logits batch size doesn't match the number of beams"); - } - size_t vocab_size = logits.get_shape().back(); - size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size; - size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size; - const float* beam_logits = logits.data() + batch_offset + sequence_offset; - float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size); - float log_sum = std::log( - std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_logit); - })); - std::vector tokens; - tokens.reserve(vocab_size); - for (size_t idx = 0; idx < vocab_size; ++idx) { - tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)}); - } - return tokens; -} - -struct Beam { - float score = -std::numeric_limits::infinity(); // The bigger, the better - std::vector tokens; - size_t global_beam_idx = 0; -}; - -bool greater(const Beam& left, const Beam& right) { - return left.score > right.score; -} - -struct Parameters { - std::vector> prompts; - int64_t eos_token_id; - size_t n_groups = 3; - size_t group_size = 5; - float diversity_penalty = 1.0; - size_t max_new_tokens = 20; - ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC; - float length_penalty = 1.0; - size_t no_repeat_ngram_size = std::numeric_limits::max(); - - std::function early_finish = [](const Beam&) { - return false; - }; -}; - -struct Group { - std::vector ongoing; // Best beams in front - std::vector min_heap; // The worst of the best completed beams is the first - 
bool done = false; - - void finish(Beam&& beam, const Parameters& parameters) { - beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); - - min_heap.push_back(std::move(beam)); - std::push_heap(min_heap.begin(), min_heap.end(), greater); - if (min_heap.size() > parameters.group_size) { - std::pop_heap(min_heap.begin(), min_heap.end(), greater); - min_heap.pop_back(); - } - } - void is_done(const Parameters& parameters) { - if (min_heap.size() < parameters.group_size) { - return; - } - size_t cur_len = ongoing.front().tokens.size(); - float best_sum_logprobs = ongoing.front().score; - float worst_score = min_heap.front().score; - switch (parameters.stop_criteria) { - case ov::genai::StopCriteria::EARLY: - done = true; - return; - case ov::genai::StopCriteria::HEURISTIC: { - float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); - done = worst_score >= highest_attainable_score; - return; - } - case ov::genai::StopCriteria::NEVER: { - size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len; - float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); - done = worst_score >= highest_attainable_score; - return; - } - default: - throw std::runtime_error("Never reached"); - } - } -}; - -// GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search -// algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values -// are used for next inference. select_next_tokens() returns empty, if all groups are completed -struct GroupBeamSearcher { - Parameters parameters; - std::vector> prompts_groups; - - GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { - if (parameters.no_repeat_ngram_size == 0) { - throw std::runtime_error("no_repeat_ngram_size must be positive"); - } - for (std::vector& prompts_groups : prompts_groups) { - prompts_groups.resize(parameters.n_groups); - for (Group& group : prompts_groups) { - group.ongoing.resize(parameters.group_size); - group.ongoing.front().score = 0.0; - } - } - } - - std::pair, std::vector> select_next_tokens(const ov::Tensor& logits) { - std::vector next_tokens; - std::vector next_beams; - - const size_t promts_size = parameters.prompts.size(); - - next_tokens.reserve(promts_size * parameters.n_groups * parameters.group_size); - next_beams.reserve(promts_size * parameters.n_groups * parameters.group_size); - - size_t beam_count = 0; - size_t prompt_id = 0; - for (std::vector& groups : prompts_groups) { - for (Group& group : groups) { - if (group.done) { - continue; - } - for (Beam& beam : group.ongoing) { - // beam.tokens.empty() holds for the first select_next_tokens() call. 
- // Every beam is constructed from the single batch at first call - if (beam.tokens.empty()) { - beam.global_beam_idx = prompt_id; - } else { - beam.global_beam_idx = beam_count; - ++beam_count; - } - } - } - - prompt_id += 1; - } - - for (int prompt_id = 0; prompt_id < promts_size; prompt_id++) { - const std::vector prompt = parameters.prompts[prompt_id]; - std::vector& groups = prompts_groups[prompt_id]; - auto [prompt_next_tokens, prompt_next_beams] = select_prompt_next_tokens(logits, prompt, groups); - - next_tokens.insert(next_tokens.end(), prompt_next_tokens.begin(), prompt_next_tokens.end()); - next_beams.insert(next_beams.end(), prompt_next_beams.begin(), prompt_next_beams.end()); - } - - return {next_tokens, next_beams}; - } - - std::pair, std::vector> select_prompt_next_tokens(const ov::Tensor& logits, - const std::vector& prompt, - std::vector& groups) { - std::vector next_tokens; - std::vector next_beams; - next_tokens.reserve(parameters.n_groups * parameters.group_size); - next_beams.reserve(parameters.n_groups * parameters.group_size); - - for (auto group = groups.begin(); group != groups.end(); ++group) { - if (group->done) { - continue; - } - std::vector candidates; - candidates.reserve(parameters.group_size * 2 * parameters.group_size); - for (const Beam& beam : group->ongoing) { - std::vector tokens = log_softmax(logits, beam.global_beam_idx); - for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) { - for (const Beam& prev_beam : prev_group->ongoing) { - if (prev_beam.tokens.size() > beam.tokens.size()) { - tokens.at(size_t(prev_beam.tokens.back())).log_prob -= parameters.diversity_penalty; - } - } - } - std::vector full_text{prompt}; - full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end()); - if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) { - auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1; - for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) { - tokens.at(size_t(banned_token)).log_prob = -std::numeric_limits::infinity(); - } - } - std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) { - return left.log_prob > right.log_prob; // Most probable tokens in front - }); - size_t add_count = 0; - for (Token token : tokens) { - Beam new_candidate = beam; - new_candidate.score += token.log_prob; - new_candidate.tokens.push_back(token.idx); - if (parameters.early_finish(new_candidate)) { - group->finish(std::move(new_candidate), parameters); - } else { - candidates.push_back(std::move(new_candidate)); - ++add_count; - if (add_count == 2 * parameters.group_size) { - break; - } - } - } - } - // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam - if (candidates.size() < 2 * parameters.group_size) { - throw std::runtime_error("No beams left to search"); - } - auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size); - std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); - group->ongoing.clear(); - for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { - if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { - // If beam_token does not belong to top num_beams tokens, it should not be added - if (cand_idx >= parameters.group_size) { - continue; - } - group->finish(std::move(candidates.at(cand_idx)), parameters); - } else { - group->ongoing.push_back(std::move(candidates.at(cand_idx))); - if (group->ongoing.size() == 
parameters.group_size) { - break; - } - } - } - group->is_done(parameters); - if (!group->done) { - for (const Beam& beam : group->ongoing) { - next_tokens.push_back(beam.tokens.back()); - next_beams.push_back(int32_t(beam.global_beam_idx)); - } - } - } - return {next_tokens, next_beams}; - } -}; - -// Consume group_beam_searcher because beams are consumed -std::vector>> finalize(GroupBeamSearcher&& group_beam_searcher) { - std::vector>> finalized; - finalized.resize(group_beam_searcher.prompts_groups.size()); - - for (size_t prompt_id = 0; prompt_id < group_beam_searcher.prompts_groups.size(); prompt_id++) { - std::vector& groups = group_beam_searcher.prompts_groups.at(prompt_id); - finalized.at(prompt_id).reserve(groups.size()); - - for (Group& group : groups) { - if (!group.done) { - for (Beam& beam : group.ongoing) { - group.finish(std::move(beam), group_beam_searcher.parameters); - } - } - finalized.at(prompt_id).push_back(std::move(group.min_heap)); - } - } - - return finalized; -} - -void reset_all_inputs_to_empty_tensors(ov::InferRequest& request) { - request.set_tensor("input_ids", ov::Tensor(ov::element::i64, {0, 0})); - request.set_tensor("beam_idx", ov::Tensor(ov::element::i32, {0})); - if (request.get_compiled_model().inputs().size() == 4) - request.set_tensor("position_ids", ov::Tensor(ov::element::i64, {0, 0})); -} -} // namespace - -namespace ov { -namespace genai { - -std::pair beam_search(ov::InferRequest& lm, - ov::Tensor input_ids, - ov::Tensor attention_mask, - GenerationConfig config, - std::optional position_ids, - std::optional selected_beam_idx) { - OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, - "number of beams should be divisible by number of groups"); - - auto batch_size = input_ids.get_shape().at(0); - auto sequence_length = input_ids.get_shape().at(1); - - // Initialize beam search. - const int64_t* prompt_data = input_ids.data(); - std::vector> prompts; - prompts.reserve(batch_size); - for (size_t batch = 0; batch < batch_size; batch++) { - size_t batch_offset = batch * sequence_length; - const int64_t* prompt_start = prompt_data + batch_offset; - prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - } - - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - if (position_ids.has_value()) - lm.set_tensor("position_ids", *position_ids); - - ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); - auto beam_data = beam_idx.data(); - if (selected_beam_idx.has_value()) - beam_data[0] = *selected_beam_idx; - else - std::fill_n(beam_data, batch_size, 0); - lm.set_tensor("beam_idx", beam_idx); - - Parameters parameters{std::move(prompts)}; - parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); - parameters.eos_token_id = config.eos_token_id; - parameters.n_groups = config.num_beam_groups; - parameters.group_size = config.num_beams / config.num_beam_groups; - parameters.diversity_penalty = config.diversity_penalty; - parameters.length_penalty = config.length_penalty; - parameters.stop_criteria = config.stop_criteria; - parameters.no_repeat_ngram_size = config.no_repeat_ngram_size; - GroupBeamSearcher group_beam_searcher{parameters}; - - std::vector next_tokens; - std::vector next_beams; - - // Reserve for performance counters. 
- std::vector new_token_times; - std::vector batch_sizes; - new_token_times.reserve(parameters.max_new_tokens); - batch_sizes.reserve(parameters.max_new_tokens); - - for (size_t length_count = 0; ; ++length_count) { - lm.infer(); - - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - new_token_times.emplace_back(std::chrono::steady_clock::now()); - batch_sizes.emplace_back(batch_size); - - if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { - // Break the cycle before masks are extended in update_attention_mask_with_beams. - // If generation is continued, attention_mask length should be equal to KV cache size. - break; - } - - size_t running_batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {running_batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {running_batch_size}, next_beams.data()}); - - // Set auxiliary inputs - update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); - if (position_ids.has_value()) - update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - } - - reset_all_inputs_to_empty_tensors(lm); - - auto scores_comparator = [](Beam& left, Beam& right) { - return (left.score > right.score); - }; - - auto result = finalize(std::move(group_beam_searcher)); - ov::genai::EncodedResults results; - int32_t res_selected_beam_idx = 0; - results.scores.reserve(config.num_return_sequences * result.size()); - results.tokens.reserve(config.num_return_sequences * result.size()); - auto& raw_perf_counters = results.perf_metrics.raw_metrics; - raw_perf_counters.m_new_token_times = new_token_times; - raw_perf_counters.m_batch_sizes = batch_sizes; - - // align output with HF - for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { - auto prompt_group = result.at(prompt_id); - std::vector> plain_beams; - plain_beams.reserve(parameters.n_groups * parameters.group_size); - for (std::vector& group : prompt_group) { - for (Beam& beam : group) { - plain_beams.push_back(beam); - } - } - assert(config.num_return_sequences <= plain_beams.size()); - std::partial_sort( - plain_beams.begin(), - plain_beams.begin() + config.num_return_sequences, - plain_beams.end(), - scores_comparator - ); - res_selected_beam_idx = plain_beams.at(0).get().global_beam_idx; - for ( - auto beam = plain_beams.begin(); - beam != plain_beams.begin() + config.num_return_sequences; - ++beam - ) { - results.scores.push_back(beam->get().score); - results.tokens.push_back(std::move(beam->get().tokens)); - } - } - - return {results, res_selected_beam_idx}; -} - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index f663b27dd9..4636e57270 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -24,27 +24,22 @@ namespace ov { namespace genai { -std::pair beam_search( - ov::InferRequest& lm, - ov::Tensor prompts, - ov::Tensor attention_mask, - GenerationConfig config, - std::optional position_ids, - std::optional selected_beam_idx -); - class StatefulLLMPipeline final : public LLMPipelineImplBase { public: ov::InferRequest m_model_runner; bool is_chat_conversation = false; bool m_trust_encoded_history = true; - std::optional m_selected_beam = std::nullopt; ChatHistory m_history; std::string m_templated_chat_history = {}; std::vector m_tokenized_chat_history; ov::genai::utils::GenerationChatInputsType 
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-    size_t m_to_remove_from_hist = 0;
     size_t m_kv_cache_seq_length_axis = 2;
+    // Tail of previous output in chat mode is missing in KV cache, so let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If the sequence contains symbols that the tokenizer can encode ambiguously, the KV cache has to be trimmed.
+    // With beam search in chat mode the last model answer has to be removed from the KV cache and the best answer added to the history instead,
+    // so keep track of how many tokens to trim from the KV cache and how many history tokens to leave untouched.
+    ov::genai::utils::HistoryRemoveManager m_to_remove_from_hist = {0, 0};

     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -156,8 +151,15 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

         if (m_tokenized_chat_history.empty()) {
             encoded_input = new_chat_tokens;
-        } else if (last_same_hist_token != SIZE_MAX) {
-            m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+        } else if (last_same_hist_token != SIZE_MAX || m_to_remove_from_hist.is_kv_cache_need_to_update()) {
+            // is_kv_cache_need_to_update() is true here when beam search was used on the previous turn:
+            // in beam search mode the whole last model answer is removed from the KV cache and the best answer is added to the history directly.
+            // Any difference between the model answer and the decoded answer is smaller than the entire history anyway, so rely on m_to_remove_from_hist.
+            if (m_to_remove_from_hist.is_kv_cache_need_to_update()) {
+                last_same_hist_token = m_to_remove_from_hist.last_hist_token_to_unchange;
+            } else {
+                m_to_remove_from_hist.num_token_to_remove_from_kv_cache = m_tokenized_chat_history.size() - last_same_hist_token;
+            }

             ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
                                                {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token},
@@ -170,12 +172,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                                      {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token});
             new_tensor.copy_to(encoded_input.input_ids);
             encoded_input.attention_mask = new_attention_mask;
-
-            m_selected_beam = std::nullopt;
+            m_last_disappeared_token = std::nullopt;
         } else {
             encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+
+            // no need to add m_last_disappeared_token to encoded_input, it was already kept by subtract_chat_tokenized_inputs
+            if (m_last_disappeared_token.has_value() && *m_last_disappeared_token == encoded_input.input_ids.data<int64_t>()[0]) {
+                m_last_disappeared_token = std::nullopt;
+            }
         }
         m_templated_chat_history = new_templated_chat_history;
+        m_tokenized_chat_history.clear();
         m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
         std::copy_n(new_chat_tokens.input_ids.data<int64_t>(),
                     new_chat_tokens.input_ids.get_size(),
@@ -257,6 +264,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
             std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));

+        // Tail of previous output in chat mode is missing in KV cache.
+ if (is_chat_conversation && m_last_disappeared_token.has_value()) { + attention_mask = ov::genai::utils::push_front_inputs(attention_mask, std::vector{1}); + input_ids = ov::genai::utils::push_front_inputs(input_ids, std::vector{*m_last_disappeared_token}); + } + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; // If eos_token_id was not provided, take value from default m_generation_config @@ -288,7 +301,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { "(input_ids, attention_mask, position_ids, beam_idx) " "but you have '" + std::to_string(num_inputs) + "' inputs"); - ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_kv_cache_seq_length_axis, m_adapter_controller); + ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist.num_token_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller); size_t kv_cache_len = 0; ov::Tensor concatenated_attention_mask; @@ -298,10 +311,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { // Between subsequent runs attention_mask should not be modified. auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); auto prompt_len = attention_mask.get_shape()[1]; - kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist; + + kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist.num_token_to_remove_from_kv_cache; ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; - auto start_atten_hst = atten_mask_history.data() + kv_cache_len * (*m_selected_beam); + auto start_atten_hst = atten_mask_history.data(); + std::copy(start_atten_hst, start_atten_hst + kv_cache_len, new_atten_mask.data()); std::copy(attention_mask.data(), attention_mask.data() + prompt_len, @@ -311,6 +326,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { concatenated_attention_mask = attention_mask; } + size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1]; + bool position_ids_available = (num_inputs == 4); std::optional position_ids = std::nullopt; if (position_ids_available) { @@ -324,48 +341,54 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { if (is_chat_conversation && !m_trust_encoded_history) { m_trust_encoded_history = true; - m_to_remove_from_hist = 0; + m_to_remove_from_hist.reset(); } - ov::genai::EncodedResults result; - if (config.is_beam_search() && is_chat_conversation) { - std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask, - config, position_ids, m_selected_beam); - } else { - std::vector requests; - size_t block_size = 1; - bool enable_prefix_caching = false; - - for (size_t request_id = 0; request_id < batch_size; request_id++) { - SequenceGroup::Ptr sequence_group; - if (is_chat_conversation) { - ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()); - sequence_group = std::make_shared(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching); - } else { - size_t seq_len = input_ids.get_shape().at(1); - size_t batch_offset = request_id * seq_len; - const int64_t* prompt_start = input_ids.data() + batch_offset; - std::vector tokenized_prompt(prompt_start, prompt_start + seq_len); + std::vector requests; + size_t block_size = 1; + bool enable_prefix_caching = false; - sequence_group = std::make_shared(request_id, tokenized_prompt, config, 
block_size, enable_prefix_caching);
-            }
+        for (size_t request_id = 0; request_id < batch_size; request_id++) {
+            SequenceGroup::Ptr sequence_group;
+            if (is_chat_conversation) {
+                ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
+            } else {
+                size_t seq_len = input_ids.get_shape().at(1);
+                size_t batch_offset = request_id * seq_len;
+                const int64_t* prompt_start = input_ids.data<int64_t>() + batch_offset;
+                std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
-                sequence_group->set_sequence_group_ptr(sequence_group);
-                requests.push_back(sequence_group);
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
             }
-            Sampler sampler = Sampler(m_tokenizer);
-            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
-                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
+            sequence_group->set_sequence_group_ptr(sequence_group);
+            requests.push_back(sequence_group);
         }
+        Sampler sampler = Sampler(m_tokenizer);
+        ov::genai::EncodedResults result = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+                                                                             streamer_ptr, sampler, requests, position_ids, std::nullopt);
+
         if (is_chat_conversation) {
+            // forcibly remove the last answer from the KV cache
+            if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+                m_to_remove_from_hist.last_hist_token_to_unchange = m_tokenized_chat_history.size();
+                m_to_remove_from_hist.num_token_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+            }
+
+            // in chat mode there will be only one request
+            if (requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH)
+                m_last_disappeared_token = result.tokens[0].back();
+
             std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
         } else {
             reset_kv_state();
-            m_selected_beam = std::nullopt;
         }

+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
         auto stop_time = std::chrono::steady_clock::now();

         // If is called without tokenization then that stat will not be reported.
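For context on the chat branch above: the two counters it fills in belong to the HistoryRemoveManager struct introduced in utils.hpp further down in this diff. Below is a minimal standalone sketch of that bookkeeping after a beam-search turn; the struct mirrors the one added by this patch, while the driver around it and the token counts are purely hypothetical and not part of the pipeline code.

#include <cstddef>
#include <cstdio>

// Mirrors the HistoryRemoveManager added to src/cpp/src/utils.hpp by this patch.
struct HistoryRemoveManager {
    size_t num_token_to_remove_from_kv_cache = 0;
    size_t last_hist_token_to_unchange = 0;

    bool is_kv_cache_need_to_update() {
        return (last_hist_token_to_unchange > 0 || num_token_to_remove_from_kv_cache > 0);
    }

    void reset() {
        num_token_to_remove_from_kv_cache = 0;
        last_hist_token_to_unchange = 0;
    }
};

int main() {
    HistoryRemoveManager to_remove;          // freshly reset: nothing to trim
    size_t tokenized_history_len = 37;       // hypothetical: history length before this answer
    size_t prev_attn_mask_size = 40;         // hypothetical: attention_mask length before decoding
    size_t attn_mask_after_generate = 52;    // hypothetical: attention_mask length after decoding

    // After a beam-search answer in chat mode the freshly generated tail is scheduled for
    // removal from the KV cache, while the history up to the pre-answer length stays untouched.
    to_remove.last_hist_token_to_unchange = tokenized_history_len;
    to_remove.num_token_to_remove_from_kv_cache = attn_mask_after_generate - prev_attn_mask_size;

    std::printf("trim %zu tokens from the KV cache, keep history up to token %zu\n",
                to_remove.num_token_to_remove_from_kv_cache,
                to_remove.last_hist_token_to_unchange);
    return 0;
}

On the next user turn, is_kv_cache_need_to_update() reports true, so the pipeline trims the recorded number of tokens from the KV cache and re-feeds the chosen best answer instead of trusting the cached one.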
@@ -379,10 +402,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { void start_chat(const std::string& system_message) override { is_chat_conversation = true; - m_selected_beam = std::nullopt; m_trust_encoded_history = true; - m_to_remove_from_hist = 0; + m_to_remove_from_hist.reset(); m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; + m_last_disappeared_token = std::nullopt; if (!m_tokenized_chat_history.empty()) { reset_kv_state(); m_history = {}; @@ -400,10 +423,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { void finish_chat() override { is_chat_conversation = false; - m_selected_beam = std::nullopt; m_trust_encoded_history = true; - m_to_remove_from_hist = 0; + m_to_remove_from_hist.reset(); m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF; + m_last_disappeared_token = std::nullopt; if (!m_tokenized_chat_history.empty()) { reset_kv_state(); m_history.clear(); diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 3ab041fa58..6b2b8806de 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -9,12 +9,11 @@ #include #include +#include "utils.hpp" +#include "debug_utils.hpp" #include "lm_encoding.hpp" #include "openvino/genai/perf_metrics.hpp" -#include "debug_utils.hpp" - -#include "utils.hpp" namespace ov { namespace genai { @@ -51,7 +50,7 @@ void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector get_lm_encoded_results( +EncodedResults get_lm_encoded_results( ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask, @@ -59,8 +58,7 @@ std::pair get_lm_encoded_results( Sampler& sampler, std::vector sequence_groups, std::optional position_ids, - std::optional m_embedding, - std::optional selected_beam_idx + std::optional m_embedding ) { std::vector generations; for (SequenceGroup::Ptr sequence_group : sequence_groups) { @@ -88,10 +86,7 @@ std::pair get_lm_encoded_results( ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); auto beam_data = beam_idx.data(); - if (selected_beam_idx.has_value()) - beam_data[0] = *selected_beam_idx; - else - std::fill_n(beam_data, batch_size, 0); + std::fill_n(beam_data, batch_size, 0); m_llm.set_tensor("beam_idx", beam_idx); const auto infer_start = std::chrono::steady_clock::now(); @@ -109,7 +104,6 @@ std::pair get_lm_encoded_results( for (auto& sequence_group : sequence_groups) { sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len); sequence_group->schedule_tokens(sequence_len); - } std::map beam_offets; @@ -172,7 +166,7 @@ std::pair get_lm_encoded_results( // apply strides to shift to a next sequence input_ids_data += num_scheduled_tokens; - // for different sequences iteration of beams started from 0, but we collect it to one input_ids# + // for different sequences iteration of beams started from 0, but we collect it to one input_ids next_beams.push_back(beam_idxs[sequence->get_id()] + beam_offets.at(sequence_group->get_request_id())); } } @@ -187,23 +181,19 @@ std::pair get_lm_encoded_results( if (m_embedding.has_value()) { const ov::Tensor& embed_prompt_tensor = (*m_embedding).infer(new_input_ids); - - m_llm.get_tensor("inputs_embeds").set_shape(embed_prompt_tensor.get_shape()); m_llm.set_tensor("inputs_embeds", embed_prompt_tensor); } else { - m_llm.get_tensor("input_ids").set_shape(new_input_ids.get_shape()); m_llm.set_tensor("input_ids", new_input_ids); } + m_llm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, 
{total_num_tokens}, next_beams.data()}); + update_attention_mask_with_beams(m_llm.get_tensor("attention_mask"), next_beams); if (position_ids.has_value()) { update_position_ids(m_llm.get_tensor("position_ids"), m_llm.get_tensor("attention_mask")); } - m_llm.get_tensor("beam_idx").set_shape({ total_num_tokens }); - m_llm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()}); - const auto infer_start = std::chrono::steady_clock::now(); m_llm.infer(); const auto infer_end = std::chrono::steady_clock::now(); @@ -228,26 +218,20 @@ std::pair get_lm_encoded_results( if (streamer_ptr) { streamer_ptr->end(); } - - size_t next_selected_beam = 0; - for (size_t i = 0; i < sequence_groups.size(); i++) { - auto request = sequence_groups[i]; - auto generation_outputs = generations[i]->read_all(); - - std::sort(generation_outputs.begin(), generation_outputs.end(), [] (const GenerationOutput& r1, const GenerationOutput& r2) { - return r1.score > r2.score; - }); - - auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size()); - for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { - const auto& generation_output = generation_outputs[generation_output_idx]; - results.tokens.push_back(std::move(generation_output.generated_ids)); - results.scores.push_back(generation_output.score); + + for (auto& sequence_group : sequence_groups) { + // sequences is sorted by cumulative_log_prob with length_penalty + auto outputs = sequence_group->get_finished_sequences(); + + auto num_outputs = std::min(sequence_group->get_sampling_parameters().num_return_sequences, outputs.size()); + for (size_t output_idx = 0; output_idx < num_outputs; ++output_idx) { + const auto& output = outputs[output_idx]; + results.tokens.push_back(output->get_generated_ids()); + results.scores.push_back(output->get_cumulative_score_with_length_penalty(sequence_group->get_sampling_parameters())); } - // next_selected_beam = sampler.last_selected_beam(request); } - return {results, next_selected_beam}; + return results; } } // namespace genai diff --git a/src/cpp/src/lm_encoding.hpp b/src/cpp/src/lm_encoding.hpp index fa6692ede0..0a342f0a37 100644 --- a/src/cpp/src/lm_encoding.hpp +++ b/src/cpp/src/lm_encoding.hpp @@ -8,13 +8,9 @@ namespace ov { namespace genai { -std::pair get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask, +EncodedResults get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask, const std::shared_ptr& streamer_ptr, Sampler& sampler, std::vector sequence_groups, - std::optional position_ids, std::optional m_embedding, std::optional selected_beam_idx); - -void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector next_beams); - -void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); + std::optional position_ids, std::optional m_embedding); } } diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 6755255fe8..3c30ce52de 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -167,7 +167,7 @@ class Sequence { m_generated_log_probs[idx] = log_prob; } - float get_beam_search_score(const ov::genai::GenerationConfig& sampling_params) const { + float get_cumulative_score_with_length_penalty(const ov::genai::GenerationConfig& sampling_params) const { float cumulative_log_prob = 
get_cumulative_log_probs(), current_length = get_generated_len(); float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); return score; @@ -339,7 +339,7 @@ class SequenceGroup { // do we need to sort sequences here or sampler can handle it for us? std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) { - return s1->get_beam_search_score(m_sampling_params) > s2->get_beam_search_score(m_sampling_params); + return s1->get_cumulative_score_with_length_penalty(m_sampling_params) > s2->get_cumulative_score_with_length_penalty(m_sampling_params); }); return finished_seqs; @@ -589,7 +589,7 @@ class SequenceGroup { output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end()); output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end()); } - output.score = m_sampling_params.is_beam_search() ? sequence->get_beam_search_score(m_sampling_params) : sequence->get_cumulative_log_probs(); + output.score = m_sampling_params.is_beam_search() ? sequence->get_cumulative_score_with_length_penalty(m_sampling_params) : sequence->get_cumulative_log_probs(); output.finish_reason = sequence->get_finish_reason(); outputs.emplace(sequence->get_grouped_id(), output); } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 3690920295..626806ddc0 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -381,6 +381,14 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se } } +ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector add_to_front) { + ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}}; + auto new_tensor_data = new_tensor.data(); + std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data); + std::copy(base_tensor.data(), base_tensor.data() + base_tensor.get_size(), new_tensor_data + add_to_front.size()); + return new_tensor; +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 57728cd0dc..8fc3b41e66 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -28,6 +28,21 @@ enum class GenerationChatInputsType { ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs }; +struct HistoryRemoveManager +{ + size_t num_token_to_remove_from_kv_cache = 0; + size_t last_hist_token_to_unchange = 0; + + bool is_kv_cache_need_to_update() { + return (last_hist_token_to_unchange > 0 || num_token_to_remove_from_kv_cache > 0); + } + + void reset() { + num_token_to_remove_from_kv_cache = 0; + last_hist_token_to_unchange = 0; + } +}; + Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); @@ -104,6 +119,8 @@ size_t get_seq_len_axis(std::shared_ptr model); void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional adapter_controller); +ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector add_to_front); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index dfdb1521ef..c7bad169c6 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -41,9 +41,10 @@ class InputsEmbedder::IInputsEmbedder { std::string 
m_templated_chat_history;
     // Tokenized chat history
     std::vector<int64_t> m_tokenized_chat_history;
-    // The number of elements, which need to remove from the end of KV cache
-    // removed elements will be added to inputs_ids
-    size_t m_to_remove_from_hist = 0;
+    // If the sequence contains symbols that the tokenizer can encode ambiguously, the KV cache has to be trimmed.
+    // With beam search in chat mode the last model answer has to be removed from the KV cache and the best answer added to the history instead,
+    // so keep track of how many tokens to trim from the KV cache and how many history tokens to leave untouched.
+    ov::genai::utils::HistoryRemoveManager m_to_remove_from_hist = {0, 0};

 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -61,17 +62,22 @@ class InputsEmbedder::IInputsEmbedder {
     }

     size_t get_amount_to_remove_from_hist() const {
-        return m_to_remove_from_hist;
+        return m_to_remove_from_hist.num_token_to_remove_from_kv_cache;
     }

-    void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
+    void update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool is_beam_search, size_t last_answer_len) {
+        if (is_beam_search) {
+            m_to_remove_from_hist.last_hist_token_to_unchange = m_tokenized_chat_history.size();
+            m_to_remove_from_hist.num_token_to_remove_from_kv_cache = last_answer_len;
+        } else {
+            m_to_remove_from_hist.reset();
+        }
         std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
-        m_to_remove_from_hist = 0;
     }

     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
-        m_to_remove_from_hist = 0;
+        m_to_remove_from_hist.reset();
         if (!m_tokenized_chat_history.empty()) {
             m_history.clear();
             m_templated_chat_history.clear();
@@ -94,7 +100,7 @@ class InputsEmbedder::IInputsEmbedder {

     virtual void finish_chat() {
         m_is_chat_conversation = false;
-        m_to_remove_from_hist = 0;
+        m_to_remove_from_hist.reset();

         m_history.clear();
         m_templated_chat_history.clear();
@@ -172,8 +178,15 @@ class InputsEmbedder::IInputsEmbedder {

         if (m_tokenized_chat_history.empty()) {
             encoded_input_ids = new_chat_tokens;
-        } else if (last_same_hist_token != SIZE_MAX) {
-            m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+        } else if (last_same_hist_token != SIZE_MAX || m_to_remove_from_hist.is_kv_cache_need_to_update()) {
+            // is_kv_cache_need_to_update() is true here when beam search was used on the previous turn:
+            // in beam search mode the whole last model answer is removed from the KV cache and the best answer is added to the history directly.
+            // Any difference between the model answer and the decoded answer is smaller than the entire history anyway, so rely on m_to_remove_from_hist.
+            if (m_to_remove_from_hist.is_kv_cache_need_to_update()) {
+                last_same_hist_token = m_to_remove_from_hist.last_hist_token_to_unchange;
+            } else {
+                m_to_remove_from_hist.num_token_to_remove_from_kv_cache = m_tokenized_chat_history.size() - last_same_hist_token;
+            }

             ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
                                                {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
@@ -1175,8 +1188,8 @@ std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
     return m_impl->get_tokenized_chat_history();
 }

-void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
-    return m_impl->update_tokenized_chat_history(encoded_result);
+void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result,
bool is_beam_search, size_t last_answer_len) { + return m_impl->update_tokenized_chat_history(encoded_result, is_beam_search, last_answer_len); } size_t InputsEmbedder::get_amount_to_remove_from_hist() const { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 5c5b9d2b81..8a3deafb54 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -43,7 +43,7 @@ class InputsEmbedder { // returns tokenized chat history std::vector get_tokenized_chat_history() const; // add new results to tokenized chat history - void update_tokenized_chat_history(std::vector encoded_result); + void update_tokenized_chat_history(std::vector encoded_result, bool is_beam_search, size_t last_answer_len); // returns amount of elements, which need to remove from the end of the KV cache size_t get_amount_to_remove_from_hist() const; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index b8e89a8e04..49db7c6f9e 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -163,8 +163,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist(); ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt); - Sampler sampler = Sampler(m_tokenizer); - std::vector requests; size_t request_id = 0; size_t block_size = 1; // not used @@ -204,10 +202,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), history_size); - ov::genai::EncodedResults encoded_result; - int32_t m_selected_beam = 0; - std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests, - position_ids, m_embedding, std::nullopt); + Sampler sampler = Sampler(m_tokenizer); + + ov::genai::EncodedResults encoded_result = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests, + position_ids, m_embedding); auto decode_start_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; @@ -217,6 +215,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { } auto decode_end_time = std::chrono::steady_clock::now(); + m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0], generation_config.is_beam_search(), m_language.get_tensor("attention_mask").get_shape()[1] - (history_size + inputs_embeds_size)); + std::string decoded_results = decoded.texts.at(0); if (m_is_chat_conversation) { m_inputs_embedder->update_chat_history(decoded_results); @@ -243,8 +243,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { decoded.perf_metrics.m_evaluated = false; decoded.perf_metrics.evaluate_statistics(generate_start_time); - m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]); - return decoded; }
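As a closing note on the renamed scoring helper: get_cumulative_score_with_length_penalty keeps exactly the formula the old get_beam_search_score used, the cumulative log-probability divided by the generated length raised to length_penalty. A small self-contained illustration with made-up per-token log-probabilities (the numbers are hypothetical, not taken from any test):

#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // Hypothetical per-token log-probabilities of one finished sequence.
    std::vector<float> generated_log_probs{-0.2f, -0.5f, -0.1f, -0.4f};
    float cumulative_log_prob = std::accumulate(generated_log_probs.begin(), generated_log_probs.end(), 0.0f);

    float length_penalty = 1.0f;  // GenerationConfig default
    // Same formula as Sequence::get_cumulative_score_with_length_penalty():
    float score = cumulative_log_prob / std::pow(static_cast<float>(generated_log_probs.size()), length_penalty);

    // Prints: cumulative_log_prob = -1.20, score = -0.30
    std::printf("cumulative_log_prob = %.2f, score = %.2f\n", cumulative_log_prob, score);
    return 0;
}

Because log-probabilities are negative, dividing by a larger length^length_penalty makes the score less negative, so values of length_penalty above zero favor longer finished sequences when SequenceGroup sorts them for num_return_sequences.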