From ba91fdea09a8bb1d1753090b3a8d25a3953f6a91 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Tue, 26 Mar 2024 13:51:14 +0100
Subject: [PATCH 01/97] initial generate

---
 text_generation/causal_lm/cpp/CMakeLists.txt  |  31 ++++-
 .../generate_pipeline/generate_pipeline.hpp   | 111 ++++++++++++++++++
 .../causal_lm/cpp/generate_pipeline/main.cpp  | 103 ++++++++++++++++
 .../generate_pipeline/sampling_parameters.hpp |  73 ++++++++++++
 4 files changed, 312 insertions(+), 6 deletions(-)
 create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
 create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/main.cpp
 create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp

diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
index 03aced09e0..b59953e29e 100644
--- a/text_generation/causal_lm/cpp/CMakeLists.txt
+++ b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -21,10 +21,29 @@ target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime)
 set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17)
 set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
 
-add_executable(speculative_decoding_lm speculative_decoding_lm.cpp)
-target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\")
-target_include_directories(speculative_decoding_lm PRIVATE ./)
+set(TARGET_NAME beam_search_sample)
+add_executable(${TARGET_NAME} beam_search_causal_lm.cpp)
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\")
 find_package(OpenVINO REQUIRED COMPONENTS Runtime)
-target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime)
-set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17)
-set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
+target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
+
+set(TARGET_NAME speculative_decoding_lm)
+add_executable(${TARGET_NAME} speculative_decoding_lm.cpp)
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\")
+find_package(OpenVINO REQUIRED COMPONENTS Runtime)
+target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
+
+set(TARGET_NAME generate_pipeline)
+add_executable(${TARGET_NAME} generate_pipeline/main.cpp)
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\")
+find_package(OpenVINO REQUIRED COMPONENTS Runtime)
+target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17)
+set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON)
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
new file mode 100644
index 0000000000..1d95921278
--- /dev/null
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -0,0 +1,111
@@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "sampling_parameters.hpp" + + +using GenerationResult = std::vector; + +class LLMEngine { + ov::InferRequest m_model_runner; + + GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) { + ov::Shape prompts_shape = prompts.get_shape(); + size_t batch_size = prompts_shape[0]; + OPENVINO_ASSERT(batch_size == 1); + + GenerationResult results; + results.reserve(sampling_params.max_new_tokens); + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + auto initial_seq_len = prompts.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", prompts); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + for (size_t i = 0; i < sampling_params.max_new_tokens; ++i) { + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + + size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; + OPENVINO_ASSERT(batch_size == 1); + // todo: implement for batch > 1 + + const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + m_model_runner.get_tensor("input_ids").data()[0] = out_token; + + m_model_runner.get_tensor("attention_mask").set_shape({batch_size, m_model_runner.get_tensor("attention_mask").get_shape()[1] + 1}); + std::fill_n(m_model_runner.get_tensor("attention_mask").data(), m_model_runner.get_tensor("attention_mask").get_size(), 1); + + m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + m_model_runner.get_tensor("position_ids").data()[0] = int64_t(initial_seq_len + i); + results.emplace_back(out_token); + } + return results; + } + + GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) { + // todo: implement + GenerationResult results; + results.reserve(10); + return results; + } + + GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) { + // todo: implement + GenerationResult results; + results.reserve(10); + return results; + } + +public: + LLMEngine(ov::InferRequest& request) : + m_model_runner(request) { + // todo + } + + // more high level interface + GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) { + if (sampling_params.is_gready_sampling()) { + return greedy_search(prompts, sampling_params); + } else if (sampling_params.is_beam_search()) { + return beam_search(prompts, sampling_params); + } else { // if (sampling_params.is_multimomial()) { + return multinomial_sampling(prompts, sampling_params); + } + } +}; + +class LLMPipeline { + ov::InferRequest m_model_runner; + std::string m_path; + SamplingParameters sampling_parameters; + +public: + LLMPipeline(std::string& path) : m_path(path) { + // load generation config from the 
file + // todo + } + + GenerationResult call() { + // will call generate inside itself + GenerationResult results; + results.reserve(10); + return results; + } + +}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp new file mode 100644 index 0000000000..c91df74be0 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "generate_pipeline.hpp" + +namespace { + +constexpr size_t BATCH_SIZE = 1; + +std::pair tokenize(ov::InferRequest& tokenizer, std::string prompt) { + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); + tokenizer.infer(); + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; +} + +std::string detokenize(ov::InferRequest& detokenizer, std::vector tokens) { + constexpr size_t BATCH_SIZE = 1; + detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); + detokenizer.infer(); + return detokenizer.get_output_tensor().data()[0]; +} + +std::string detokenize(ov::InferRequest& detokenizer, ov::Tensor tokens) { + detokenizer.set_input_tensor(tokens); + detokenizer.infer(); + return detokenizer.get_output_tensor().data()[0]; +} + +// The following reasons require TextStreamer to keep a cache of previous tokens: +// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", +// but detokenize(tokenize("prefix a")) == "prefix a" +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +struct TextStreamer { + ov::InferRequest detokenizer; + std::vector token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = detokenize(detokenizer, token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + + void end() { + std::string text = detokenize(detokenizer, token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + +} // namespace + +void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) { + TextStreamer text_streamer{std::move(detokenizer)}; + for (const auto& result: results) { + text_streamer.put(result); + } + text_streamer.end(); +} + +int main(int argc, char* argv[]) try { + std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + ov::Core core; + // core.add_extension("libuser_ov_extensions.so"); + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + // tokenizer and detokenizer work on CPU only + ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + + // The model can be 
compiled for GPU as well + std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + + auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]); + + SamplingParameters sampling_params = SamplingParameters::greedy(); + + LLMEngine engine(request); + GenerationResult generation_results = engine.generate(input_ids, sampling_params); + print_generation_results(generation_results, detokenizer); + +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp new file mode 100644 index 0000000000..39de510fd6 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp @@ -0,0 +1,73 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +enum class StopCriteria {early, heuristic, never}; + +// forward declaration +class Sequence; + +struct SamplingParameters { + // Generic + size_t max_new_tokens = 100; + bool ignore_eos = false; + int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now + + // Beam search specific + size_t n_groups = 1; + size_t group_size = 1; // beam_width + float diversity_penalty = 1.0f; // 0.0 means no diversity + StopCriteria stop_criteria = StopCriteria::heuristic; + float length_penalty = 1.0f; + size_t no_repeat_ngram_size = std::numeric_limits::max(); + std::function early_finish = [](const Sequence&) {return false; }; + + // Multinomial + float temperature = 0.0f; // by default we use greedy sampling + int top_k = -1; // maybe to assign vocab_size ? 
+ float top_p = 1.0f; // by default convsider all tokens + + static SamplingParameters greedy() { + SamplingParameters greedy_params; + greedy_params.temperature = 0.0f; + greedy_params.ignore_eos = true; + return greedy_params; + } + + static SamplingParameters beam_search() { + SamplingParameters beam_search; + beam_search.n_groups = 2; + beam_search.group_size = 2; + beam_search.max_new_tokens = 100; + beam_search.diversity_penalty = 2.0f; + return beam_search; + } + + static SamplingParameters multimomial() { + SamplingParameters multimomial; + multimomial.temperature = 0.8f; + multimomial.top_p = 0.8; + multimomial.top_k = 20; + return multimomial; + } + + bool is_gready_sampling() const { + return temperature == 0.0f && !is_beam_search(); + } + + bool is_beam_search() const { + return n_groups * group_size > 1; + } + + bool is_multimomial() const { + return temperature == 0.0f && !is_beam_search(); + } + +}; + +enum class SamplingAlgorithm{greedy, multinomial, baeam_search}; From 9d85a0ee18ce61078fe08f59bc2bac447675c5f6 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 28 Mar 2024 14:46:20 +0100 Subject: [PATCH 02/97] LLM pipeline --- .gitmodules | 3 + text_generation/causal_lm/cpp/CMakeLists.txt | 4 + .../generate_pipeline/generate_pipeline.hpp | 104 ++++++++++++++++-- .../causal_lm/cpp/generate_pipeline/main.cpp | 88 ++++----------- .../generate_pipeline/sampling_parameters.hpp | 32 +++++- thirdparty/nlohmann_json | 1 + 6 files changed, 151 insertions(+), 81 deletions(-) create mode 160000 thirdparty/nlohmann_json diff --git a/.gitmodules b/.gitmodules index f72fd83489..3bf6d7a4f0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "thirdparty/openvino_tokenizers"] path = thirdparty/openvino_tokenizers url = https://github.com/openvinotoolkit/openvino_tokenizers.git +[submodule "/home/epavel/devel/openvino.genai/thirdparty/nlohmann_json"] + path = /home/epavel/devel/openvino.genai/thirdparty/nlohmann_json + url = https://github.com/nlohmann/json.git diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index b59953e29e..6d3e1047fe 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -5,6 +5,8 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") + add_executable(greedy_causal_lm greedy_causal_lm.cpp) target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") @@ -45,5 +47,7 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 1d95921278..1124177229 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -77,6 +77,8 @@ 
class LLMEngine { m_model_runner(request) { // todo } + + LLMEngine() = default; // more high level interface GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) { @@ -90,22 +92,104 @@ class LLMEngine { } }; +std::pair tokenize(ov::InferRequest& tokenizer, std::string prompt) { + size_t batch_size = 1; + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + tokenizer.infer(); + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; +} + +std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, &prompts}); + tokenizer.infer(); + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; +} + +std::string detokenize(ov::InferRequest& detokenizer, std::vector tokens) { + size_t batch_size = 1; + detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + detokenizer.infer(); + return detokenizer.get_output_tensor().data()[0]; +} + +std::vector detokenize(ov::InferRequest& detokenizer, ov::Tensor tokens) { + detokenizer.set_input_tensor(tokens); + detokenizer.infer(); + auto res = detokenizer.get_output_tensor(); + + std::vector strings; + strings.reserve(res.get_shape()[0]); + for (int i = 0; i < res.get_shape()[0]; ++i) { + strings.emplace_back(res.data()[i]); + } + return strings; +} + + +// The following reasons require TextStreamer to keep a cache of previous tokens: +// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", +// but detokenize(tokenize("prefix a")) == "prefix a" +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +struct TextStreamer { + ov::InferRequest detokenizer; + std::vector token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = detokenize(detokenizer, token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + + void end() { + std::string text = detokenize(detokenizer, token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + class LLMPipeline { - ov::InferRequest m_model_runner; + LLMEngine m_model_runner; + ov::InferRequest m_tokenizer; + ov::InferRequest m_detokenizer; std::string m_path; - SamplingParameters sampling_parameters; + SamplingParameters m_sampling_parameters; public: LLMPipeline(std::string& path) : m_path(path) { - // load generation config from the file - // todo - } + if (std::filesystem::exists(m_path + "/generation_config.json")) { + m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json"); + } - GenerationResult call() { - // will call generate inside itself - GenerationResult results; - results.reserve(10); - return results; + ov::Core core; + // The model can be compiled for GPU as well + auto model_request = core.compile_model(m_path + "/openvino_model.xml", "CPU").create_infer_request(); + m_model_runner = 
LLMEngine(model_request); + + // tokenizer and detokenizer work on CPU only + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + m_tokenizer = core.compile_model(m_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + m_detokenizer = core.compile_model(m_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); } + std::string call(std::string text) { + auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); + + auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters); + + return detokenize(m_detokenizer, generate_results); + } }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index c91df74be0..1701fdddb5 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -9,60 +9,6 @@ namespace { constexpr size_t BATCH_SIZE = 1; -std::pair tokenize(ov::InferRequest& tokenizer, std::string prompt) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector tokens) { - constexpr size_t BATCH_SIZE = 1; - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -std::string detokenize(ov::InferRequest& detokenizer, ov::Tensor tokens) { - detokenizer.set_input_tensor(tokens); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; - } // namespace void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) { @@ -75,24 +21,28 @@ void print_generation_results(GenerationResult results, ov::InferRequest& detoke int main(int argc, char* argv[]) try { std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - ov::Core core; - // core.add_extension("libuser_ov_extensions.so"); - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - - // The model can be compiled for GPU as well - std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]); + LLMPipeline pipe(model_path); + std::cout << pipe.call("Alan Turing was a"); + + // ov::Core core; + // // core.add_extension("libuser_ov_extensions.so"); + // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + // // tokenizer and detokenizer work on CPU only + // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + + // // The model can be compiled for GPU as well + // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + + // auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]); - SamplingParameters sampling_params = SamplingParameters::greedy(); + // SamplingParameters sampling_params = SamplingParameters::greedy(); - LLMEngine engine(request); - GenerationResult generation_results = engine.generate(input_ids, sampling_params); - print_generation_results(generation_results, detokenizer); + // LLMEngine engine(request); + // GenerationResult generation_results = engine.generate(input_ids, sampling_params); + // 
print_generation_results(generation_results, detokenizer); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp index 39de510fd6..2ee4a88096 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp @@ -6,15 +6,20 @@ #include #include +#include +#include enum class StopCriteria {early, heuristic, never}; // forward declaration class Sequence; +// SamplingParameters is similar to HuggingFace GenerationConfig +// and has parameters that are not present in the original SamplingParameters for continous batching struct SamplingParameters { // Generic size_t max_new_tokens = 100; + size_t max_length = 100; // max_new tokens should have priority over max_new_tokens bool ignore_eos = false; int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now @@ -31,6 +36,28 @@ struct SamplingParameters { float temperature = 0.0f; // by default we use greedy sampling int top_k = -1; // maybe to assign vocab_size ? float top_p = 1.0f; // by default convsider all tokens + bool do_sample; + + // special tokens + int64_t bos_token_id = 0; + int64_t eos_token_id = 0; + int64_t pad_token_id = 0; + + SamplingParameters() = default; + + SamplingParameters(std::string json_path) { + std::ifstream f(json_path); + nlohmann::json data = nlohmann::json::parse(f); + + bos_token_id = data.value("bos_token_id", 0); + eos_token_id = data.value("eos_token_id", 0); + max_length = data.value("max_length", 0); + pad_token_id = data.value("pad_token_id", 0); + + temperature = data.value("temperature", 0.0f); + do_sample = data.value("do_sample", false); + top_p = data.value("top_p", 0.0f); + } static SamplingParameters greedy() { SamplingParameters greedy_params; @@ -53,11 +80,12 @@ struct SamplingParameters { multimomial.temperature = 0.8f; multimomial.top_p = 0.8; multimomial.top_k = 20; + multimomial.do_sample = 20; return multimomial; } bool is_gready_sampling() const { - return temperature == 0.0f && !is_beam_search(); + return !do_sample && !is_beam_search(); } bool is_beam_search() const { @@ -65,7 +93,7 @@ struct SamplingParameters { } bool is_multimomial() const { - return temperature == 0.0f && !is_beam_search(); + return do_sample; } }; diff --git a/thirdparty/nlohmann_json b/thirdparty/nlohmann_json new file mode 160000 index 0000000000..199dea11b1 --- /dev/null +++ b/thirdparty/nlohmann_json @@ -0,0 +1 @@ +Subproject commit 199dea11b17c533721b26249e2dcaee6ca1d51d3 From b21c6c1cf4e38340d402f58c7a3951b5e60eef80 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 2 Apr 2024 21:45:57 +0200 Subject: [PATCH 03/97] Added calculating for several batches --- .gitmodules | 4 +- text_generation/causal_lm/cpp/CMakeLists.txt | 16 +++++--- .../generate_pipeline/generate_pipeline.hpp | 40 +++++++++---------- .../causal_lm/cpp/generate_pipeline/main.cpp | 14 +++---- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/.gitmodules b/.gitmodules index 3bf6d7a4f0..97bc043641 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "thirdparty/openvino_tokenizers"] path = thirdparty/openvino_tokenizers url = https://github.com/openvinotoolkit/openvino_tokenizers.git -[submodule "/home/epavel/devel/openvino.genai/thirdparty/nlohmann_json"] - path = 
/home/epavel/devel/openvino.genai/thirdparty/nlohmann_json +[submodule "thirdparty/nlohmann_json"] + path = thirdparty/nlohmann_json url = https://github.com/nlohmann/json.git diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 6d3e1047fe..b197a1feb7 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -8,12 +8,14 @@ add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINAR add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +set(TARGET_NAME continuous_batching) +add_executable(${TARGET_NAME} continuous_batching/main.cpp) +target_include_directories(${TARGET_NAME} PRIVATE continuous_batching) +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") @@ -48,6 +50,8 @@ target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$ #include "sampling_parameters.hpp" +#include - -using GenerationResult = std::vector; +using GenerationResult = ov::Tensor; class LLMEngine { ov::InferRequest m_model_runner; @@ -15,10 +15,11 @@ class LLMEngine { GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; + // todo: implement for batch > 1 OPENVINO_ASSERT(batch_size == 1); - GenerationResult results; - results.reserve(sampling_params.max_new_tokens); + GenerationResult results = ov::Tensor{ov::element::i64, {batch_size, sampling_params.max_new_tokens}}; + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; @@ -37,23 +38,22 @@ class LLMEngine { m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); ov::Shape logits_shape = logits.get_shape(); - - size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; - OPENVINO_ASSERT(batch_size == 1); - // todo: implement for batch > 1 - - const float * logits_data = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - m_model_runner.get_tensor("input_ids").data()[0] = out_token; - m_model_runner.get_tensor("attention_mask").set_shape({batch_size, m_model_runner.get_tensor("attention_mask").get_shape()[1] + 1}); std::fill_n(m_model_runner.get_tensor("attention_mask").data(), m_model_runner.get_tensor("attention_mask").get_size(), 1); 
m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - m_model_runner.get_tensor("position_ids").data()[0] = int64_t(initial_seq_len + i); - results.emplace_back(out_token); + + for (size_t batch = 0; batch < batch_size; ++batch) { + const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + results.data()[sampling_params.max_new_tokens * batch + i] = out_token; + + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(initial_seq_len + i); + } } return results; } @@ -61,14 +61,12 @@ class LLMEngine { GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) { // todo: implement GenerationResult results; - results.reserve(10); return results; } GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) { // todo: implement GenerationResult results; - results.reserve(10); return results; } @@ -114,18 +112,18 @@ std::string detokenize(ov::InferRequest& detokenizer, std::vector token std::vector detokenize(ov::InferRequest& detokenizer, ov::Tensor tokens) { detokenizer.set_input_tensor(tokens); + auto shape = tokens.get_shape(); + auto data = tokens.data(); detokenizer.infer(); auto res = detokenizer.get_output_tensor(); std::vector strings; - strings.reserve(res.get_shape()[0]); for (int i = 0; i < res.get_shape()[0]; ++i) { strings.emplace_back(res.data()[i]); } return strings; } - // The following reasons require TextStreamer to keep a cache of previous tokens: // detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", // but detokenize(tokenize("prefix a")) == "prefix a" @@ -170,7 +168,7 @@ class LLMPipeline { public: LLMPipeline(std::string& path) : m_path(path) { - if (std::filesystem::exists(m_path + "/generation_config.json")) { + if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json"); } @@ -190,6 +188,6 @@ class LLMPipeline { auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters); - return detokenize(m_detokenizer, generate_results); + return detokenize(m_detokenizer, generate_results)[0]; } }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 1701fdddb5..88a4d3621c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -11,13 +11,13 @@ constexpr size_t BATCH_SIZE = 1; } // namespace -void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) { - TextStreamer text_streamer{std::move(detokenizer)}; - for (const auto& result: results) { - text_streamer.put(result); - } - text_streamer.end(); -} +// void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) { +// TextStreamer text_streamer{std::move(detokenizer)}; +// for (const auto& result: results) { +// text_streamer.put(result); +// } +// text_streamer.end(); +// } int main(int argc, char* argv[]) try { std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; From e52e90da58422dd65447fd13dac6490ed3cc8d4f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 3 Apr 2024 10:14:06 +0200 Subject: 
[PATCH 04/97] Greedy search works --- .../generate_pipeline/generate_pipeline.hpp | 98 +++++++++++++++++-- .../causal_lm/cpp/generate_pipeline/main.cpp | 43 +++----- .../generate_pipeline/sampling_parameters.hpp | 21 ++-- .../causal_lm/cpp/group_beam_searcher.hpp | 1 + 4 files changed, 122 insertions(+), 41 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 783b5a5474..e3bcc52473 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -6,8 +6,10 @@ #include #include "sampling_parameters.hpp" #include +#include "group_beam_searcher.hpp" -using GenerationResult = ov::Tensor; +// using GenerationResult = ov::Tensor; +using GenerationResult = std::vector>; class LLMEngine { ov::InferRequest m_model_runner; @@ -15,10 +17,9 @@ class LLMEngine { GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; - // todo: implement for batch > 1 OPENVINO_ASSERT(batch_size == 1); - GenerationResult results = ov::Tensor{ov::element::i64, {batch_size, sampling_params.max_new_tokens}}; + GenerationResult results(batch_size); auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); @@ -45,12 +46,13 @@ class LLMEngine { std::fill_n(m_model_runner.get_tensor("attention_mask").data(), m_model_runner.get_tensor("attention_mask").get_size(), 1); m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - + for (size_t batch = 0; batch < batch_size; ++batch) { const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - results.data()[sampling_params.max_new_tokens * batch + i] = out_token; + results[batch].emplace_back(out_token); + // todo: add exit criteria when pad or EOS is met m_model_runner.get_tensor("input_ids").data()[batch] = out_token; m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(initial_seq_len + i); } @@ -59,8 +61,72 @@ class LLMEngine { } GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) { - // todo: implement + ov::Shape prompts_shape = prompts.get_shape(); + size_t batch_size = prompts_shape[0]; + // todo: implement for batch > 1 + OPENVINO_ASSERT(batch_size == 1); + + // initialize inputs + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + auto initial_seq_len = prompts.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", prompts); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + const int64_t* prompt_data = prompts.data(); + + // todo: remove this duplicatino and use the same SamplingParameters for both greedy and beam + Parameters 
parameters{std::vector{prompt_data, prompt_data + prompts.get_size()}}; + parameters.n_groups = sampling_params.n_groups; + parameters.diversity_penalty = sampling_params.diversity_penalty; + parameters.group_size = sampling_params.group_size; + + GroupBeamSearcher group_beam_searcher{parameters}; + std::vector next_tokens; + std::vector next_beams; + for (size_t length_count = 0; length_count < sampling_params.max_new_tokens; ++length_count) { + m_model_runner.infer(); + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask"); + ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; + attention_mask.set_shape(mask_shape); + std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); + + m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); + } + + std::vector beams; + for (const std::vector& group : finalize(std::move(group_beam_searcher))) { + for (const Beam& beam : group) { + beams.emplace_back(beam); + // results.emplace_back(beam.tokens); + } + } + + auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); }; + std::sort(beams.begin(), beams.end(), compare_scores); + GenerationResult results; + for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.num_return_sequences; ++beam) { + results.emplace_back(beam->tokens); + } return results; } @@ -124,6 +190,23 @@ std::vector detokenize(ov::InferRequest& detokenizer, ov::Tensor to return strings; } +std::vector detokenize(ov::InferRequest& detokenizer, + std::vector> lines, + int64_t pad_token_idx) { + // todo: implement calling detokenizer in a single batch + + std::vector strings; + for (auto& line: lines){ + ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; + detokenizer.set_input_tensor(tokens); + detokenizer.infer(); + auto res = detokenizer.get_output_tensor(); + strings.emplace_back(res.data()[0]); + } + + return strings; +} + // The following reasons require TextStreamer to keep a cache of previous tokens: // detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", // but detokenize(tokenize("prefix a")) == "prefix a" @@ -171,6 +254,7 @@ class LLMPipeline { if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json"); } + m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json"); ov::Core core; // The model can be compiled for GPU as well @@ -188,6 +272,6 @@ class LLMPipeline { auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters); - return detokenize(m_detokenizer, generate_results)[0]; + return detokenize(m_detokenizer, generate_results, 0)[0]; } }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 88a4d3621c..12b85bbe02 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -11,38 +11,27 @@ constexpr size_t BATCH_SIZE = 1; } // namespace -// void print_generation_results(GenerationResult results, ov::InferRequest& detokenizer) { -// TextStreamer text_streamer{std::move(detokenizer)}; -// for (const auto& result: results) { -// text_streamer.put(result); -// } -// text_streamer.end(); -// } - int main(int argc, char* argv[]) try { + // PIPELINE std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); - std::cout << pipe.call("Alan Turing was a"); + // std::cout << pipe.call("Alan Turing was a"); - // ov::Core core; - // // core.add_extension("libuser_ov_extensions.so"); - // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // // tokenizer and detokenizer work on CPU only - // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - - // // The model can be compiled for GPU as well - // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + // GENERATE + ov::Core core; + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + + // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted + std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - // auto [input_ids, attention_mask] = tokenize(tokenizer, argv[1]); - - // SamplingParameters sampling_params = SamplingParameters::greedy(); - - // LLMEngine engine(request); - // GenerationResult generation_results = engine.generate(input_ids, sampling_params); - // print_generation_results(generation_results, detokenizer); + auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); + SamplingParameters sampling_params = SamplingParameters::beam_search(); + LLMEngine engine(request); + GenerationResult generation_results = 
engine.generate(input_ids, sampling_params); + std::cout << detokenize(detokenizer, generation_results[0]); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp index 2ee4a88096..b12a25bbe7 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp @@ -1,4 +1,3 @@ - // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 @@ -8,8 +7,7 @@ #include #include #include - -enum class StopCriteria {early, heuristic, never}; +#include // used only for StopCriteria // forward declaration class Sequence; @@ -18,15 +16,17 @@ class Sequence; // and has parameters that are not present in the original SamplingParameters for continous batching struct SamplingParameters { // Generic - size_t max_new_tokens = 100; + size_t max_new_tokens = 10; size_t max_length = 100; // max_new tokens should have priority over max_new_tokens bool ignore_eos = false; int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now + size_t num_return_sequences = 3; // Beam search specific size_t n_groups = 1; size_t group_size = 1; // beam_width float diversity_penalty = 1.0f; // 0.0 means no diversity + StopCriteria stop_criteria = StopCriteria::heuristic; float length_penalty = 1.0f; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -53,10 +53,17 @@ struct SamplingParameters { eos_token_id = data.value("eos_token_id", 0); max_length = data.value("max_length", 0); pad_token_id = data.value("pad_token_id", 0); + num_return_sequences = data.value("num_return_sequences", 1); temperature = data.value("temperature", 0.0f); do_sample = data.value("do_sample", false); top_p = data.value("top_p", 0.0f); + + // beam_search_params + n_groups = data.value("num_beam_groups", 1); + diversity_penalty = data.value("diversity_penalty", 1.0f); + int num_beams = data.value("num_beams", 1); + group_size = num_beams / n_groups; } static SamplingParameters greedy() { @@ -68,9 +75,9 @@ struct SamplingParameters { static SamplingParameters beam_search() { SamplingParameters beam_search; - beam_search.n_groups = 2; - beam_search.group_size = 2; - beam_search.max_new_tokens = 100; + beam_search.n_groups = 3; + beam_search.group_size = 5; + beam_search.max_new_tokens = 10; beam_search.diversity_penalty = 2.0f; return beam_search; } diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp index 6c97c869a3..b34cda05f5 100644 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp @@ -1,5 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once #include From 745a804ce9c7d96e5576e3a5d274e0e867a00526 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 4 Apr 2024 16:14:40 +0200 Subject: [PATCH 05/97] rename to GenerationConfig --- .../generate_pipeline/generate_pipeline.hpp | 16 ++++---- .../causal_lm/cpp/generate_pipeline/main.cpp | 38 ++++++++++++------- .../generate_pipeline/sampling_parameters.hpp | 23 +++++------ 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 
e3bcc52473..8e3d8109d2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -14,7 +14,7 @@ using GenerationResult = std::vector>; class LLMEngine { ov::InferRequest m_model_runner; - GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) { + GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; OPENVINO_ASSERT(batch_size == 1); @@ -60,7 +60,7 @@ class LLMEngine { return results; } - GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) { + GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; // todo: implement for batch > 1 @@ -120,7 +120,7 @@ class LLMEngine { } } - auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); }; + auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); GenerationResult results; @@ -130,7 +130,7 @@ class LLMEngine { return results; } - GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) { + GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { // todo: implement GenerationResult results; return results; @@ -145,7 +145,7 @@ class LLMEngine { LLMEngine() = default; // more high level interface - GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) { + GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) { if (sampling_params.is_gready_sampling()) { return greedy_search(prompts, sampling_params); } else if (sampling_params.is_beam_search()) { @@ -247,14 +247,14 @@ class LLMPipeline { ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; std::string m_path; - SamplingParameters m_sampling_parameters; + GenerationConfig m_sampling_parameters; public: LLMPipeline(std::string& path) : m_path(path) { if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { - m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json"); + m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json"); } - m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json"); + m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json"); ov::Core core; // The model can be compiled for GPU as well diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 12b85bbe02..57b0531cf3 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -15,23 +15,33 @@ int main(int argc, char* argv[]) try { // PIPELINE std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; LLMPipeline pipe(model_path); - // std::cout << pipe.call("Alan Turing was a"); + std::cout << pipe.call("Alan Turing was a"); // GENERATE - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", 
"CPU").create_infer_request(); - ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - - // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted - std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + // ov::Core core; + // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + + // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted + // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); - SamplingParameters sampling_params = SamplingParameters::beam_search(); - LLMEngine engine(request); - GenerationResult generation_results = engine.generate(input_ids, sampling_params); - std::cout << detokenize(detokenizer, generation_results[0]); + // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); + // GenerationConfig sampling_params = GenerationConfig::beam_search(); + // LLMEngine engine(request); + // GenerationResult generation_results = engine.generate(input_ids, sampling_params); + // std::cout << detokenize(detokenizer, generation_results[0]); + + // std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + // LLMPipeline pipe(model_path); + // GenerationConfig params; + // std::cout << pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + + // LLMEngine engine(request); + // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2); + // GenerationResult generation_results = engine.generate(input_ids, params); + // std::cout << detokenize(detokenizer, generation_results[0]); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp index b12a25bbe7..5cd42aa4b9 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp @@ -12,9 +12,9 @@ // forward declaration class Sequence; -// SamplingParameters is similar to HuggingFace GenerationConfig -// and has parameters that are not present in the original SamplingParameters for continous batching -struct SamplingParameters { +// Similar to HuggingFace GenerationConfig +// but has parameters that are not present in the original SamplingParameters for continous batching +struct GenerationConfig { // Generic size_t max_new_tokens = 10; size_t max_length = 100; // max_new tokens should have priority over max_new_tokens @@ -43,9 +43,9 @@ struct SamplingParameters { int64_t eos_token_id = 0; int64_t pad_token_id = 0; - SamplingParameters() = default; + GenerationConfig() = default; - SamplingParameters(std::string 
json_path) { + GenerationConfig(std::string json_path) { std::ifstream f(json_path); nlohmann::json data = nlohmann::json::parse(f); @@ -54,6 +54,7 @@ struct SamplingParameters { max_length = data.value("max_length", 0); pad_token_id = data.value("pad_token_id", 0); num_return_sequences = data.value("num_return_sequences", 1); + max_new_tokens = data.value("max_new_tokens", 100); temperature = data.value("temperature", 0.0f); do_sample = data.value("do_sample", false); @@ -66,15 +67,15 @@ struct SamplingParameters { group_size = num_beams / n_groups; } - static SamplingParameters greedy() { - SamplingParameters greedy_params; + static GenerationConfig greedy() { + GenerationConfig greedy_params; greedy_params.temperature = 0.0f; greedy_params.ignore_eos = true; return greedy_params; } - static SamplingParameters beam_search() { - SamplingParameters beam_search; + static GenerationConfig beam_search() { + GenerationConfig beam_search; beam_search.n_groups = 3; beam_search.group_size = 5; beam_search.max_new_tokens = 10; @@ -82,8 +83,8 @@ struct SamplingParameters { return beam_search; } - static SamplingParameters multimomial() { - SamplingParameters multimomial; + static GenerationConfig multimomial() { + GenerationConfig multimomial; multimomial.temperature = 0.8f; multimomial.top_p = 0.8; multimomial.top_k = 20; From 8895ed0b9cc6b8a72e553855fe65c6f27bb9416f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 5 Apr 2024 07:44:22 +0200 Subject: [PATCH 06/97] Add fluent interface --- text_generation/causal_lm/cpp/CMakeLists.txt | 23 +-- .../generate_pipeline/generate_pipeline.hpp | 55 ++++-- .../causal_lm/cpp/generate_pipeline/main.cpp | 73 +++++--- .../generate_pipeline/sampling_parameters.hpp | 171 +++++++++++++----- 4 files changed, 231 insertions(+), 91 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index b197a1feb7..34ff304ea3 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -8,22 +8,13 @@ add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINAR add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") -set(TARGET_NAME continuous_batching) -add_executable(${TARGET_NAME} continuous_batching/main.cpp) -target_include_directories(${TARGET_NAME} PRIVATE continuous_batching) -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(beam_search_causal_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +# add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +# target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +# target_include_directories(beam_search_causal_lm PRIVATE ./) +# find_package(OpenVINO REQUIRED COMPONENTS Runtime) +# 
target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) +# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) +# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME beam_search_sample) add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 8e3d8109d2..3f6641028d 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -11,7 +11,7 @@ // using GenerationResult = ov::Tensor; using GenerationResult = std::vector>; -class LLMEngine { +class LLMModel { ov::InferRequest m_model_runner; GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) { @@ -35,7 +35,7 @@ class LLMEngine { m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); m_model_runner.get_tensor("beam_idx").data()[0] = 0; - for (size_t i = 0; i < sampling_params.max_new_tokens; ++i) { + for (size_t i = 0; i < sampling_params.m_max_new_tokens; ++i) { m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); ov::Shape logits_shape = logits.get_shape(); @@ -85,14 +85,14 @@ class LLMEngine { // todo: remove this duplicatino and use the same SamplingParameters for both greedy and beam Parameters parameters{std::vector{prompt_data, prompt_data + prompts.get_size()}}; - parameters.n_groups = sampling_params.n_groups; - parameters.diversity_penalty = sampling_params.diversity_penalty; - parameters.group_size = sampling_params.group_size; + parameters.n_groups = sampling_params.m_num_groups; + parameters.diversity_penalty = sampling_params.m_diversity_penalty; + parameters.group_size = sampling_params.m_group_size; GroupBeamSearcher group_beam_searcher{parameters}; std::vector next_tokens; std::vector next_beams; - for (size_t length_count = 0; length_count < sampling_params.max_new_tokens; ++length_count) { + for (size_t length_count = 0; length_count < sampling_params.m_max_new_tokens; ++length_count) { m_model_runner.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); if (next_tokens.empty()) { @@ -124,7 +124,7 @@ class LLMEngine { std::sort(beams.begin(), beams.end(), compare_scores); GenerationResult results; - for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.num_return_sequences; ++beam) { + for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { results.emplace_back(beam->tokens); } return results; @@ -137,12 +137,12 @@ class LLMEngine { } public: - LLMEngine(ov::InferRequest& request) : + LLMModel(ov::InferRequest& request) : m_model_runner(request) { // todo } - LLMEngine() = default; + LLMModel() = default; // more high level interface GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) { @@ -165,6 +165,7 @@ std::pair tokenize(ov::InferRequest& tokenizer, std::str std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, &prompts}); + auto size_ = tokenizer.get_input_tensor().get_shape(); tokenizer.infer(); return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; } @@ -243,7 +244,7 @@ struct TextStreamer { }; class LLMPipeline { - LLMEngine m_model_runner; 
+ LLMModel m_model_runner; ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; std::string m_path; @@ -259,7 +260,7 @@ class LLMPipeline { ov::Core core; // The model can be compiled for GPU as well auto model_request = core.compile_model(m_path + "/openvino_model.xml", "CPU").create_infer_request(); - m_model_runner = LLMEngine(model_request); + m_model_runner = LLMModel(model_request); // tokenizer and detokenizer work on CPU only core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt @@ -267,6 +268,10 @@ class LLMPipeline { m_detokenizer = core.compile_model(m_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); } + GenerationConfig generation_config() const { + return m_sampling_parameters; + } + std::string call(std::string text) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); @@ -274,4 +279,32 @@ class LLMPipeline { return detokenize(m_detokenizer, generate_results, 0)[0]; } + + std::string call(std::string text, GenerationConfig sampling_parameters) { + auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); + + auto generate_results = m_model_runner.generate(input_ids, sampling_parameters); + + return detokenize(m_detokenizer, generate_results, 0)[0]; + } + + std::vector call(std::vector text, GenerationConfig sampling_parameters) { + auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); + + auto generate_results = m_model_runner.generate(input_ids, sampling_parameters); + + return detokenize(m_detokenizer, generate_results, 0); + } + + std::string operator()(std::string text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } + + std::vector operator()(std::vector text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } + + std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 57b0531cf3..711910d7b0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -12,33 +12,62 @@ constexpr size_t BATCH_SIZE = 1; } // namespace int main(int argc, char* argv[]) try { - // PIPELINE + // PIPELINE ex.1 std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); - std::cout << pipe.call("Alan Turing was a"); - + // LLMPipeline pipe(model_path); + // std::cout << pipe.call("Alan Turing was a"); + + // { + // // PIPELINE ex.2 + // LLMPipeline pipe(model_path); + // GenerationConfig config = pipe.generation_config(); + + // std::cout << pipe("Alan Turing was a", config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + + // // batched inputs + // auto results = pipe({"table is made of ", + // "Alan Turing was a", + // "1 + 1 = ", + // "Why is the Sun yellow?" 
+ // }, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + + // for (const auto& res: results) { + // std::cout << res << std::endl; + // } + // } + { + +GenerationConfig config = GenerationConfig().group_size(3).num_groups(3).diversity_penalty(1.2); +config.max_length(100); + + } + + { +GenerationConfig config; +config.m_bos_token_id = 0; +config.m_num_groups = 3; +config.m_group_size = 5; +config.m_max_new_tokens = 100; + } + // GENERATE - // ov::Core core; - // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - - // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted - // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + ov::Core core; + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + + // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted + std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); - // GenerationConfig sampling_params = GenerationConfig::beam_search(); - // LLMEngine engine(request); - // GenerationResult generation_results = engine.generate(input_ids, sampling_params); - // std::cout << detokenize(detokenizer, generation_results[0]); + auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); + GenerationConfig sampling_params = GenerationConfig::beam_search(); + LLMModel engine(request); + GenerationResult generation_results = engine.generate(input_ids, sampling_params); + std::cout << detokenize(detokenizer, generation_results[0]); - // std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - // LLMPipeline pipe(model_path); - // GenerationConfig params; - // std::cout << pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - // LLMEngine engine(request); + // LLMModel engine(request); // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2); // GenerationResult generation_results = engine.generate(input_ids, params); // std::cout << detokenize(detokenizer, generation_results[0]); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp index 5cd42aa4b9..991334501e 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp @@ -16,32 +16,119 @@ class Sequence; // but has parameters that are not present in the original SamplingParameters for 
continous batching struct GenerationConfig { // Generic - size_t max_new_tokens = 10; - size_t max_length = 100; // max_new tokens should have priority over max_new_tokens - bool ignore_eos = false; - int64_t eos_token = 2; // There's no way to extract special token values from the tokenizer for now - size_t num_return_sequences = 3; + size_t m_max_new_tokens = 10; + size_t m_max_length = 100; // max_new tokens should have priority over max_new_tokens + bool m_ignore_eos = false; + int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now + size_t m_num_return_sequences = 3; // Beam search specific - size_t n_groups = 1; - size_t group_size = 1; // beam_width - float diversity_penalty = 1.0f; // 0.0 means no diversity + size_t m_num_groups = 1; + size_t m_group_size = 1; // beam_width + float m_diversity_penalty = 1.0f; // 0.0 means no diversity + float m_repetition_penalty = 1.0f; StopCriteria stop_criteria = StopCriteria::heuristic; - float length_penalty = 1.0f; - size_t no_repeat_ngram_size = std::numeric_limits::max(); + float m_length_penalty = 1.0f; + size_t m_no_repeat_ngram_size = std::numeric_limits::max(); std::function early_finish = [](const Sequence&) {return false; }; // Multinomial - float temperature = 0.0f; // by default we use greedy sampling - int top_k = -1; // maybe to assign vocab_size ? - float top_p = 1.0f; // by default convsider all tokens - bool do_sample; + float m_temperature = 0.0f; // by default we use greedy sampling + int m_top_k = -1; // maybe to assign vocab_size ? + float m_top_p = 1.0f; // by default convsider all tokens + bool m_do_sample; // special tokens - int64_t bos_token_id = 0; - int64_t eos_token_id = 0; - int64_t pad_token_id = 0; + int64_t m_bos_token_id = 0; + int64_t m_eos_token_id = 0; + int64_t m_pad_token_id = 0; + + GenerationConfig& max_new_tokens(size_t max_new_tokens) { + this->m_max_new_tokens = max_new_tokens; + return *this; + } + + GenerationConfig& max_length(size_t max_length) { + this->m_max_length = max_length; + return *this; + } + + GenerationConfig& ignore_eos(bool ignore_eos) { + this->m_ignore_eos = ignore_eos; + return *this; + } + + GenerationConfig& eos_token(int64_t eos_token) { + this->m_eos_token = eos_token; + return *this; + } + + GenerationConfig& num_return_sequences(size_t num_return_sequences) { + this->m_num_return_sequences = num_return_sequences; + return *this; + } + + GenerationConfig& num_groups(size_t num_groups) { + this->m_num_groups = num_groups; + return *this; + } + + GenerationConfig& group_size(size_t group_size) { + this->m_group_size = group_size; + return *this; + } + GenerationConfig& diversity_penalty(float diversity_penalty) { + this->m_diversity_penalty = diversity_penalty; + return *this; + } + + GenerationConfig& length_penalty(float length_penalty) { + this->m_length_penalty = length_penalty; + return *this; + } + GenerationConfig& no_repeat_ngram_size(size_t no_repeat_ngram_size) { + this->m_no_repeat_ngram_size = no_repeat_ngram_size; + return *this; + } + + GenerationConfig& temperature(float temperature) { + this->m_temperature = temperature; + return *this; + } + GenerationConfig& top_k(size_t top_k) { + this->m_top_k = top_k; + return *this; + } + + GenerationConfig& top_p(size_t top_p) { + this->m_top_p = top_p; + return *this; + } + GenerationConfig& do_sample(bool do_sample) { + this->m_do_sample = do_sample; + return *this; + } + + GenerationConfig& repetition_penalty(float repetition_penalty) { + this->m_repetition_penalty = 
repetition_penalty; + return *this; + } + + GenerationConfig& bos_token_id(int64_t bos_token_id) { + this->m_bos_token_id = bos_token_id; + return *this; + } + + GenerationConfig& eos_token_id(int64_t eos_token_id) { + this->m_eos_token_id = eos_token_id; + return *this; + } + + GenerationConfig& pad_token_id(int64_t pad_token_id) { + this->m_pad_token_id = pad_token_id; + return *this; + } GenerationConfig() = default; @@ -49,59 +136,59 @@ struct GenerationConfig { std::ifstream f(json_path); nlohmann::json data = nlohmann::json::parse(f); - bos_token_id = data.value("bos_token_id", 0); - eos_token_id = data.value("eos_token_id", 0); - max_length = data.value("max_length", 0); - pad_token_id = data.value("pad_token_id", 0); - num_return_sequences = data.value("num_return_sequences", 1); - max_new_tokens = data.value("max_new_tokens", 100); + m_bos_token_id = data.value("bos_token_id", 0); + m_eos_token_id = data.value("eos_token_id", 0); + m_max_length = data.value("max_length", 0); + m_pad_token_id = data.value("pad_token_id", 0); + m_num_return_sequences = data.value("num_return_sequences", 1); + m_max_new_tokens = data.value("max_new_tokens", 100); - temperature = data.value("temperature", 0.0f); - do_sample = data.value("do_sample", false); - top_p = data.value("top_p", 0.0f); + m_temperature = data.value("temperature", 0.0f); + m_do_sample = data.value("do_sample", false); + m_top_p = data.value("top_p", 0.0f); // beam_search_params - n_groups = data.value("num_beam_groups", 1); - diversity_penalty = data.value("diversity_penalty", 1.0f); + m_num_groups = data.value("num_beam_groups", 1); + m_diversity_penalty = data.value("diversity_penalty", 1.0f); int num_beams = data.value("num_beams", 1); - group_size = num_beams / n_groups; + m_group_size = num_beams / m_num_groups; } static GenerationConfig greedy() { GenerationConfig greedy_params; - greedy_params.temperature = 0.0f; - greedy_params.ignore_eos = true; + greedy_params.m_temperature = 0.0f; + greedy_params.m_ignore_eos = true; return greedy_params; } static GenerationConfig beam_search() { GenerationConfig beam_search; - beam_search.n_groups = 3; - beam_search.group_size = 5; - beam_search.max_new_tokens = 10; - beam_search.diversity_penalty = 2.0f; + beam_search.m_num_groups = 3; + beam_search.m_group_size = 5; + beam_search.m_max_new_tokens = 10; + beam_search.m_diversity_penalty = 2.0f; return beam_search; } static GenerationConfig multimomial() { GenerationConfig multimomial; - multimomial.temperature = 0.8f; - multimomial.top_p = 0.8; - multimomial.top_k = 20; - multimomial.do_sample = 20; + multimomial.m_temperature = 0.8f; + multimomial.m_top_p = 0.8; + multimomial.m_top_k = 20; + multimomial.m_do_sample = 20; return multimomial; } bool is_gready_sampling() const { - return !do_sample && !is_beam_search(); + return !m_do_sample && !is_beam_search(); } bool is_beam_search() const { - return n_groups * group_size > 1; + return m_num_groups * m_group_size > 1; } bool is_multimomial() const { - return do_sample; + return m_do_sample; } }; From b24977d55eee46076fd604f08c0b2e3ac682f247 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 5 Apr 2024 12:03:28 +0200 Subject: [PATCH 07/97] Update text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp --- .../causal_lm/cpp/generate_pipeline/generate_pipeline.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 
3f6641028d..dce79f8935 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -139,7 +139,6 @@ class LLMModel { public: LLMModel(ov::InferRequest& request) : m_model_runner(request) { - // todo } LLMModel() = default; From c933ca0267e5e59d436a04b6fc3b19627d67b087 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 5 Apr 2024 12:18:22 +0200 Subject: [PATCH 08/97] cosmetic changes in main --- .../causal_lm/cpp/generate_pipeline/main.cpp | 67 ++++++++----------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 711910d7b0..01d358ae04 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -12,46 +12,37 @@ constexpr size_t BATCH_SIZE = 1; } // namespace int main(int argc, char* argv[]) try { - // PIPELINE ex.1 - std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - // LLMPipeline pipe(model_path); - // std::cout << pipe.call("Alan Turing was a"); + { + // PIPELINE Ex.1 + std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + LLMPipeline pipe(model_path); + std::cout << pipe.call("Alan Turing was a"); + } - // { - // // PIPELINE ex.2 - // LLMPipeline pipe(model_path); - // GenerationConfig config = pipe.generation_config(); + { + // PIPELINE Ex.2 + std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + LLMPipeline pipe(model_path); + GenerationConfig config = pipe.generation_config(); - // std::cout << pipe("Alan Turing was a", config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + std::cout << pipe("Alan Turing was a", config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - // // batched inputs - // auto results = pipe({"table is made of ", - // "Alan Turing was a", - // "1 + 1 = ", - // "Why is the Sun yellow?" - // }, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + // batched inputs + // todo: tokenizer fails for batched input strings + auto results = pipe({"table is made of ", + "Alan Turing was a", + "1 + 1 = ", + "Why is the Sun yellow?" 
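+                            // note: the fluent setters used below (temperature/top_k/do_sample/
+                            // repetition_penalty) each assign to the corresponding m_ field and
+                            // return *this, so the chained call mutates this `config` object in place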
+ }, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - // for (const auto& res: results) { - // std::cout << res << std::endl; - // } - // } - { - -GenerationConfig config = GenerationConfig().group_size(3).num_groups(3).diversity_penalty(1.2); -config.max_length(100); - - } - - { -GenerationConfig config; -config.m_bos_token_id = 0; -config.m_num_groups = 3; -config.m_group_size = 5; -config.m_max_new_tokens = 100; + for (const auto& res: results) { + std::cout << res << std::endl; + } } // GENERATE ov::Core core; + std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); @@ -61,16 +52,14 @@ config.m_max_new_tokens = 100; ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); - GenerationConfig sampling_params = GenerationConfig::beam_search(); + GenerationConfig config = GenerationConfig::beam_search(); LLMModel engine(request); - GenerationResult generation_results = engine.generate(input_ids, sampling_params); - std::cout << detokenize(detokenizer, generation_results[0]); + GenerationResult generation_results = engine.generate(input_ids, config); + std::cout << detokenize(detokenizer, generation_results[0]); - // LLMModel engine(request); - // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2); - // GenerationResult generation_results = engine.generate(input_ids, params); - // std::cout << detokenize(detokenizer, generation_results[0]); + generation_results = engine.generate(input_ids, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + std::cout << detokenize(detokenizer, generation_results[0]); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; From c43e901440b3cf27bcad48625da35c033dd775d0 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 10 Apr 2024 08:56:05 +0200 Subject: [PATCH 09/97] greedy search with batches and left padding works --- text_generation/causal_lm/cpp/CMakeLists.txt | 15 +- .../generate_pipeline/generate_pipeline.hpp | 180 +++++++++++++++--- .../causal_lm/cpp/generate_pipeline/main.cpp | 60 +++--- 3 files changed, 189 insertions(+), 66 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 34ff304ea3..7c75aad0af 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -8,13 +8,14 @@ add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINAR add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") -# add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -# target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -# target_include_directories(beam_search_causal_lm PRIVATE ./) -# find_package(OpenVINO REQUIRED COMPONENTS Runtime) -# target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -# 
set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +set(TARGET_NAME greedy_causal_lm) +add_executable(${TARGET_NAME} greedy_causal_lm.cpp) +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME beam_search_sample) add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index dce79f8935..10edab39de 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -10,30 +10,117 @@ // using GenerationResult = ov::Tensor; using GenerationResult = std::vector>; +using namespace std; + +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) { + const size_t batch_size = input_ids.get_shape().at(0); + const size_t sequence_length = input_ids.get_shape().at(1); + int64_t* inputs_data = input_ids.data(); + int64_t* attention_mask_data = attention_mask.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (inputs_data[batch_offset + sequence_length - 1] != pad_token) { + continue; + } + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + if (inputs_data[token_offset] == pad_token) { + continue; + } + + if (pad_tokens_number == 0) { + pad_tokens_number = sequence_length - i - 1; + } + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t seq_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * seq_length; + position_ids.data()[batch] = std::accumulate(start, start + seq_length - 1, 0); + } +} + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = 0; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +class LLModel { + ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask 
= ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; + } -class LLMModel { ov::InferRequest m_model_runner; + + GenerationResult greedy_search(ov::Tensor input_ids, GenerationConfig sampling_params) { + auto attention_mask = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + return greedy_search(input_ids, attention_mask, sampling_params); + } - GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) { - ov::Shape prompts_shape = prompts.get_shape(); + GenerationResult greedy_search(ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig sampling_params) { + ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; - OPENVINO_ASSERT(batch_size == 1); GenerationResult results(batch_size); - auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto initial_seq_len = prompts.get_shape()[1]; + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + // std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + initialize_position_ids(position_ids, attention_mask); - m_model_runner.set_tensor("input_ids", prompts); + size_t initial_seq_len = input_ids.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); m_model_runner.set_tensor("position_ids", position_ids); - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - m_model_runner.get_tensor("beam_idx").data()[0] = 0; + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + batch_size, 0); for (size_t i = 0; i < sampling_params.m_max_new_tokens; ++i) { m_model_runner.infer(); @@ -42,11 +129,10 @@ class LLMModel { size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - m_model_runner.get_tensor("attention_mask").set_shape({batch_size, m_model_runner.get_tensor("attention_mask").get_shape()[1] + 1}); - std::fill_n(m_model_runner.get_tensor("attention_mask").data(), m_model_runner.get_tensor("attention_mask").get_size(), 1); - - m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - + m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + // m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + update_position_ids(position_ids, attention_mask); + for (size_t batch = 0; batch < batch_size; ++batch) { const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; @@ -137,16 +223,16 @@ class LLMModel { } public: - LLMModel(ov::InferRequest& request) : + LLModel(ov::InferRequest& request) : 
m_model_runner(request) { } - LLMModel() = default; + LLModel() = default; // more high level interface - GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) { + GenerationResult generate(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params) { if (sampling_params.is_gready_sampling()) { - return greedy_search(prompts, sampling_params); + return greedy_search(prompts, attention_mask, sampling_params); } else if (sampling_params.is_beam_search()) { return beam_search(prompts, sampling_params); } else { // if (sampling_params.is_multimomial()) { @@ -159,13 +245,48 @@ std::pair tokenize(ov::InferRequest& tokenizer, std::str size_t batch_size = 1; tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); tokenizer.infer(); + + vector> input_ids_vec; + input_ids_vec.reserve(1); + auto res_tensor = tokenizer.get_tensor("input_ids"); + auto res_shape = res_tensor.get_shape(); + + for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + } + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; } std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, &prompts}); + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = tokenizer.get_input_tensor().get_shape(); tokenizer.infer(); + + pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")); + // fix mask filled with '2' instead of '0' + ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + + vector> input_ids_vec; + vector> atten_mask_vec; + + input_ids_vec.reserve(prompts.size()); + atten_mask_vec.reserve(prompts.size()); + auto res_tensor = tokenizer.get_tensor("input_ids"); + auto atten_tensor = tokenizer.get_tensor("attention_mask"); + auto res_shape = res_tensor.get_shape(); + + for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + + int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; + atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); + } + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; } @@ -201,7 +322,8 @@ std::vector detokenize(ov::InferRequest& detokenizer, detokenizer.set_input_tensor(tokens); detokenizer.infer(); auto res = detokenizer.get_output_tensor(); - strings.emplace_back(res.data()[0]); + auto res_str = res.data()[0]; + strings.emplace_back(res_str); } return strings; @@ -243,7 +365,7 @@ struct TextStreamer { }; class LLMPipeline { - LLMModel m_model_runner; + LLModel m_model_runner; ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; std::string m_path; @@ -254,12 +376,12 @@ class LLMPipeline { if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json"); } - m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json"); + // m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json"); ov::Core core; // The model can be compiled for 
GPU as well auto model_request = core.compile_model(m_path + "/openvino_model.xml", "CPU").create_infer_request(); - m_model_runner = LLMModel(model_request); + m_model_runner = LLModel(model_request); // tokenizer and detokenizer work on CPU only core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt @@ -274,7 +396,7 @@ class LLMPipeline { std::string call(std::string text) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, m_sampling_parameters); + auto generate_results = m_model_runner.generate(input_ids, attention_mask, m_sampling_parameters); return detokenize(m_detokenizer, generate_results, 0)[0]; } @@ -282,7 +404,7 @@ class LLMPipeline { std::string call(std::string text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, sampling_parameters); + auto generate_results = m_model_runner.generate(input_ids, attention_mask, sampling_parameters); return detokenize(m_detokenizer, generate_results, 0)[0]; } @@ -290,7 +412,7 @@ class LLMPipeline { std::vector call(std::vector text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, sampling_parameters); + auto generate_results = m_model_runner.generate(input_ids, attention_mask, sampling_parameters); return detokenize(m_detokenizer, generate_results, 0); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 01d358ae04..70b3a343dc 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -11,55 +11,55 @@ constexpr size_t BATCH_SIZE = 1; } // namespace +using namespace std; + int main(int argc, char* argv[]) try { { - // PIPELINE Ex.1 - std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); - std::cout << pipe.call("Alan Turing was a"); + // // PIPELINE Ex.1 + // std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + // LLMPipeline pipe(model_path); + // std::cout << pipe.call("table is made of"); } - { + { // PIPELINE Ex.2 - std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; LLMPipeline pipe(model_path); GenerationConfig config = pipe.generation_config(); - std::cout << pipe("Alan Turing was a", config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - - // batched inputs - // todo: tokenizer fails for batched input strings - auto results = pipe({"table is made of ", - "Alan Turing was a", + // batched inputs + auto results = pipe({"table is made of", + "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?" 
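+                        // note: these prompts are tokenized as one batch; tokenize() converts the
+                        // tokenizer's right padding into left padding (pad_left) and rewrites the
+                        // stray '2' values in the attention mask to '0' before greedy decoding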
- }, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + }, config.do_sample(false)); for (const auto& res: results) { - std::cout << res << std::endl; + cout << res << endl; + cout << "-------------------" << endl; } } - // GENERATE - ov::Core core; - std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + // // GENERATE + // ov::Core core; + // std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted - std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); + // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted + // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); + // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); - GenerationConfig config = GenerationConfig::beam_search(); - LLMModel engine(request); + // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); + // GenerationConfig config = GenerationConfig::beam_search(); + // LLMModel engine(request); - GenerationResult generation_results = engine.generate(input_ids, config); - std::cout << detokenize(detokenizer, generation_results[0]); + // GenerationResult generation_results = engine.generate(input_ids, config); + // std::cout << detokenize(detokenizer, generation_results[0]); - generation_results = engine.generate(input_ids, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - std::cout << detokenize(detokenizer, generation_results[0]); + // generation_results = engine.generate(input_ids, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); + // std::cout << detokenize(detokenizer, generation_results[0]); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; From 5a914f6abcf145581999671b7f712cf1121846ad Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 10 Apr 2024 11:28:20 +0200 Subject: [PATCH 10/97] combine LLModel with LLMPipeline --- .../generate_pipeline/generate_pipeline.hpp | 337 ++++++++---------- 1 file changed, 158 insertions(+), 179 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 10edab39de..82828179f9 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ 
b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include "sampling_parameters.hpp" #include #include "group_beam_searcher.hpp" @@ -22,21 +23,18 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& const size_t batch_offset = batch * sequence_length; // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != pad_token) { + if (inputs_data[batch_offset + sequence_length - 1] != pad_token) continue; - } size_t pad_tokens_number = 0; for (int i = sequence_length - 1; i >= 0; i--) { const size_t token_offset = batch_offset + i; - if (inputs_data[token_offset] == pad_token) { + if (inputs_data[token_offset] == pad_token) continue; - } - if (pad_tokens_number == 0) { + if (pad_tokens_number == 0) pad_tokens_number = sequence_length - i - 1; - } std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); @@ -76,170 +74,20 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti } } -class LLModel { - ov::Tensor extend_attention(ov::Tensor attention_mask) { - auto shape = attention_mask.get_shape(); - auto batch_size = shape[0]; - auto seq_len = shape[1]; - - ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; - auto old_data = attention_mask.data(); - auto new_data = new_atten_mask.data(); - for (size_t batch = 0; batch < batch_size; ++batch) { - std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); - new_data[batch * (seq_len + 1) + seq_len] = 1; - } - return new_atten_mask; - } - - ov::InferRequest m_model_runner; - - GenerationResult greedy_search(ov::Tensor input_ids, GenerationConfig sampling_params) { - auto attention_mask = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - return greedy_search(input_ids, attention_mask, sampling_params); - } - - GenerationResult greedy_search(ov::Tensor input_ids, - ov::Tensor attention_mask, - GenerationConfig sampling_params) { - ov::Shape prompts_shape = input_ids.get_shape(); - size_t batch_size = prompts_shape[0]; - - GenerationResult results(batch_size); - - auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - // std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - initialize_position_ids(position_ids, attention_mask); - - size_t initial_seq_len = input_ids.get_shape()[1]; - - m_model_runner.set_tensor("input_ids", input_ids); - m_model_runner.set_tensor("attention_mask", attention_mask); - m_model_runner.set_tensor("position_ids", position_ids); - - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - auto beam_data = m_model_runner.get_tensor("beam_idx").data(); - std::iota(beam_data, beam_data + batch_size, 0); - - for (size_t i = 0; i < sampling_params.m_max_new_tokens; ++i) { - m_model_runner.infer(); - auto logits = m_model_runner.get_tensor("logits"); - ov::Shape logits_shape = logits.get_shape(); - size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; - - m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - // m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - 
update_position_ids(position_ids, attention_mask); - - for (size_t batch = 0; batch < batch_size; ++batch) { - const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - results[batch].emplace_back(out_token); - - // todo: add exit criteria when pad or EOS is met - m_model_runner.get_tensor("input_ids").data()[batch] = out_token; - m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(initial_seq_len + i); - } - } - return results; - } - - GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) { - ov::Shape prompts_shape = prompts.get_shape(); - size_t batch_size = prompts_shape[0]; - // todo: implement for batch > 1 - OPENVINO_ASSERT(batch_size == 1); - - // initialize inputs - auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto initial_seq_len = prompts.get_shape()[1]; - - m_model_runner.set_tensor("input_ids", prompts); - m_model_runner.set_tensor("attention_mask", attention_mask); - m_model_runner.set_tensor("position_ids", position_ids); - - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - m_model_runner.get_tensor("beam_idx").data()[0] = 0; - - const int64_t* prompt_data = prompts.data(); - - // todo: remove this duplicatino and use the same SamplingParameters for both greedy and beam - Parameters parameters{std::vector{prompt_data, prompt_data + prompts.get_size()}}; - parameters.n_groups = sampling_params.m_num_groups; - parameters.diversity_penalty = sampling_params.m_diversity_penalty; - parameters.group_size = sampling_params.m_group_size; - - GroupBeamSearcher group_beam_searcher{parameters}; - std::vector next_tokens; - std::vector next_beams; - for (size_t length_count = 0; length_count < sampling_params.m_max_new_tokens; ++length_count) { - m_model_runner.infer(); - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask"); - ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; - attention_mask.set_shape(mask_shape); - std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); - - m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); - } - - std::vector beams; - for (const std::vector& group : finalize(std::move(group_beam_searcher))) { - for (const Beam& beam : group) { - beams.emplace_back(beam); - // results.emplace_back(beam.tokens); - } - } - - auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; - std::sort(beams.begin(), beams.end(), compare_scores); - - GenerationResult results; - for (auto beam = beams.begin(); beam 
!= beams.begin() + sampling_params.m_num_return_sequences; ++beam) { - results.emplace_back(beam->tokens); - } - return results; - } - - GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { - // todo: implement - GenerationResult results; - return results; - } - -public: - LLModel(ov::InferRequest& request) : - m_model_runner(request) { +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; } - - LLModel() = default; - - // more high level interface - GenerationResult generate(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params) { - if (sampling_params.is_gready_sampling()) { - return greedy_search(prompts, attention_mask, sampling_params); - } else if (sampling_params.is_beam_search()) { - return beam_search(prompts, sampling_params); - } else { // if (sampling_params.is_multimomial()) { - return multinomial_sampling(prompts, sampling_params); - } - } -}; + return new_atten_mask; +} std::pair tokenize(ov::InferRequest& tokenizer, std::string prompt) { size_t batch_size = 1; @@ -365,23 +213,21 @@ struct TextStreamer { }; class LLMPipeline { - LLModel m_model_runner; + ov::InferRequest m_model_runner; ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; std::string m_path; GenerationConfig m_sampling_parameters; public: - LLMPipeline(std::string& path) : m_path(path) { + LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}) : m_path(path) { if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json"); } - // m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json"); - + ov::Core core; - // The model can be compiled for GPU as well - auto model_request = core.compile_model(m_path + "/openvino_model.xml", "CPU").create_infer_request(); - m_model_runner = LLModel(model_request); + auto model_request = core.compile_model(m_path + "/openvino_model.xml", device, config).create_infer_request(); + m_model_runner = model_request; // tokenizer and detokenizer work on CPU only core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt @@ -396,7 +242,7 @@ class LLMPipeline { std::string call(std::string text) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, attention_mask, m_sampling_parameters); + auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); return detokenize(m_detokenizer, generate_results, 0)[0]; } @@ -404,7 +250,7 @@ class LLMPipeline { std::string call(std::string text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, attention_mask, sampling_parameters); + auto generate_results = generate(input_ids, attention_mask, sampling_parameters); return detokenize(m_detokenizer, generate_results, 0)[0]; } @@ -412,7 
+258,7 @@ class LLMPipeline { std::vector call(std::vector text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - auto generate_results = m_model_runner.generate(input_ids, attention_mask, sampling_parameters); + auto generate_results = generate(input_ids, attention_mask, sampling_parameters); return detokenize(m_detokenizer, generate_results, 0); } @@ -428,4 +274,137 @@ class LLMPipeline { std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } + + GenerationResult greedy_search(ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig sampling_params) { + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + + GenerationResult results(batch_size); + + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + // std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + initialize_position_ids(position_ids, attention_mask); + + size_t initial_seq_len = input_ids.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + batch_size, 0); + + for (size_t i = 0; i < sampling_params.m_max_new_tokens; ++i) { + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + // m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + update_position_ids(position_ids, attention_mask); + + for (size_t batch = 0; batch < batch_size; ++batch) { + const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + results[batch].emplace_back(out_token); + + // todo: add exit criteria when pad or EOS is met + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(initial_seq_len + i); + } + } + return results; + } + + GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) { + ov::Shape prompts_shape = prompts.get_shape(); + size_t batch_size = prompts_shape[0]; + // todo: implement for batch > 1 + OPENVINO_ASSERT(batch_size == 1); + + // initialize inputs + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + auto initial_seq_len = prompts.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", prompts); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + 
m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + const int64_t* prompt_data = prompts.data(); + + // todo: remove this duplicatino and use the same SamplingParameters for both greedy and beam + Parameters parameters{std::vector{prompt_data, prompt_data + prompts.get_size()}}; + parameters.n_groups = sampling_params.m_num_groups; + parameters.diversity_penalty = sampling_params.m_diversity_penalty; + parameters.group_size = sampling_params.m_group_size; + + GroupBeamSearcher group_beam_searcher{parameters}; + std::vector next_tokens; + std::vector next_beams; + for (size_t length_count = 0; length_count < sampling_params.m_max_new_tokens; ++length_count) { + m_model_runner.infer(); + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask"); + ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; + attention_mask.set_shape(mask_shape); + std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); + + m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); + } + + std::vector beams; + for (const std::vector& group : finalize(std::move(group_beam_searcher))) { + for (const Beam& beam : group) { + beams.emplace_back(beam); + // results.emplace_back(beam.tokens); + } + } + + auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; + std::sort(beams.begin(), beams.end(), compare_scores); + + GenerationResult results; + for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { + results.emplace_back(beam->tokens); + } + return results; + } + + GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { + // todo: implement + GenerationResult results; + return results; + } + + + GenerationResult generate(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params) { + if (sampling_params.is_gready_sampling()) { + return greedy_search(prompts, attention_mask, sampling_params); + } else if (sampling_params.is_beam_search()) { + return beam_search(prompts, sampling_params); + } else { // if (sampling_params.is_multimomial()) { + return multinomial_sampling(prompts, sampling_params); + } +} }; From c1e0c9df7163b9fb0dcb74debcfe51e2208d7a57 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 10 Apr 2024 11:40:45 +0200 Subject: [PATCH 11/97] wip: enable calling tokenize/detokenize for LLMPipeline --- .../generate_pipeline/generate_pipeline.hpp | 377 ++++++++++-------- ...g_parameters.hpp => generation_config.hpp} | 2 + .../causal_lm/cpp/generate_pipeline/main.cpp | 37 +- 3 files changed, 233 insertions(+), 183 deletions(-) rename text_generation/causal_lm/cpp/generate_pipeline/{sampling_parameters.hpp => generation_config.hpp} (99%) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 82828179f9..a3d945d80f 100644 --- 
a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -5,8 +5,8 @@ #include #include -#include "sampling_parameters.hpp" -#include +#include "generation_config.hpp" +#include #include "group_beam_searcher.hpp" // using GenerationResult = ov::Tensor; @@ -74,6 +74,13 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti } } +ov::Tensor init_attention_mask(ov::Tensor& position_ids) { + auto shape = position_ids.get_shape(); + auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); + return attention_mask; +} + ov::Tensor extend_attention(ov::Tensor attention_mask) { auto shape = attention_mask.get_shape(); auto batch_size = shape[0]; @@ -89,190 +96,190 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } -std::pair tokenize(ov::InferRequest& tokenizer, std::string prompt) { - size_t batch_size = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); - tokenizer.infer(); - - vector> input_ids_vec; - input_ids_vec.reserve(1); - auto res_tensor = tokenizer.get_tensor("input_ids"); - auto res_shape = res_tensor.get_shape(); - - for (int i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - } - - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - auto size_ = tokenizer.get_input_tensor().get_shape(); - tokenizer.infer(); - pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")); - // fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - vector> input_ids_vec; - vector> atten_mask_vec; - - input_ids_vec.reserve(prompts.size()); - atten_mask_vec.reserve(prompts.size()); - auto res_tensor = tokenizer.get_tensor("input_ids"); - auto atten_tensor = tokenizer.get_tensor("attention_mask"); - auto res_shape = res_tensor.get_shape(); - - for (int i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - - int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; - atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); - } - - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector tokens) { - size_t batch_size = 1; - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -std::vector detokenize(ov::InferRequest& detokenizer, ov::Tensor tokens) { - detokenizer.set_input_tensor(tokens); - auto shape = tokens.get_shape(); - auto data = tokens.data(); - detokenizer.infer(); - auto res = detokenizer.get_output_tensor(); - - std::vector strings; - for (int i = 0; i < res.get_shape()[0]; ++i) { - strings.emplace_back(res.data()[i]); - } - 
return strings; -} - -std::vector detokenize(ov::InferRequest& detokenizer, - std::vector> lines, - int64_t pad_token_idx) { - // todo: implement calling detokenizer in a single batch - - std::vector strings; - for (auto& line: lines){ - ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; - detokenizer.set_input_tensor(tokens); - detokenizer.infer(); - auto res = detokenizer.get_output_tensor(); - auto res_str = res.data()[0]; - strings.emplace_back(res_str); - } - - return strings; -} // The following reasons require TextStreamer to keep a cache of previous tokens: // detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", // but detokenize(tokenize("prefix a")) == "prefix a" // 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; +// TODO: CHECK IF WE REALLY NEED TEXT STREAMER +// struct TextStreamer { +// ov::InferRequest detokenizer; +// std::vector token_cache; +// size_t print_len = 0; + +// void put(int64_t token) { +// token_cache.push_back(token); +// std::string text = detokenize(detokenizer, token_cache); +// if (!text.empty() && '\n' == text.back()) { +// // Flush the cache after the new line symbol +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; +// token_cache.clear(); +// print_len = 0; +// return; +// } +// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { +// // Don't print incomplete text +// return; +// } +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// print_len = text.size(); +// } + +// void end() { +// std::string text = detokenize(detokenizer, token_cache); +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; +// token_cache.clear(); +// print_len = 0; +// } +// }; class LLMPipeline { ov::InferRequest m_model_runner; ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; - std::string m_path; GenerationConfig m_sampling_parameters; public: - LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}) : m_path(path) { - if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) { - m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json"); + // TODO: add constructor for specifying manually tokenizer path + // dir path + // xml file path + // compiled model + // infer request + // ov::Model + + LLMPipeline( + std::string& model_path, + std::string& tokenizer_path, + std::string& detokenizer_path, + std::string device="CPU", + const ov::AnyMap& config={} + ) { + 
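// Each path may be a directory or an explicit .xml file; directories are expanded below to the default IR names (openvino_model.xml, openvino_tokenizer.xml, openvino_detokenizer.xml). // A minimal usage sketch (paths are illustrative, not part of this repo): LLMPipeline pipe("llm_dir", "tok_dir", "detok_dir"); std::cout << pipe.call("table is made of");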
ov::Core core; + + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + + std::string full_path = model_path; + if (!is_xml(full_path)) + full_path += "/openvino_model.xml"; + m_model_runner = core.compile_model(full_path, device, config).create_infer_request(); + + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + // tokenizer and detokenizer work on CPU only + full_path = tokenizer_path; + if (!is_xml(full_path)) + full_path += "/openvino_tokenizer.xml"; + m_tokenizer = core.compile_model(full_path, "CPU").create_infer_request(); + + full_path = detokenizer_path; + if (!is_xml(full_path)) + full_path += "/openvino_detokenizer.xml"; + m_detokenizer = core.compile_model(full_path, "CPU").create_infer_request(); + } + + LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}) { + if (std::filesystem::exists(path + "/generation_config.json")) { + m_sampling_parameters = GenerationConfig(path + "/generation_config.json"); } ov::Core core; - auto model_request = core.compile_model(m_path + "/openvino_model.xml", device, config).create_infer_request(); + auto model_request = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); m_model_runner = model_request; // tokenizer and detokenizer work on CPU only core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - m_tokenizer = core.compile_model(m_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - m_detokenizer = core.compile_model(m_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + m_tokenizer = core.compile_model(path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + m_detokenizer = core.compile_model(path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); } GenerationConfig generation_config() const { return m_sampling_parameters; } - std::string call(std::string text) { - auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - - auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); + std::pair tokenize(std::string prompt) { + size_t batch_size = 1; + m_tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + m_tokenizer.infer(); - return detokenize(m_detokenizer, generate_results, 0)[0]; - } - - std::string call(std::string text, GenerationConfig sampling_parameters) { - auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); - - auto generate_results = generate(input_ids, attention_mask, sampling_parameters); + vector> input_ids_vec; + input_ids_vec.reserve(1); + auto res_tensor = m_tokenizer.get_tensor("input_ids"); + auto res_shape = res_tensor.get_shape(); + + for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + } - return detokenize(m_detokenizer, generate_results, 0)[0]; + return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")}; } - std::vector call(std::vector text, GenerationConfig sampling_parameters) { - auto [input_ids, attention_mask] = tokenize(m_tokenizer, text); + std::pair tokenize(std::vector prompts) { + m_tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = m_tokenizer.get_input_tensor().get_shape(); + m_tokenizer.infer(); - auto generate_results = generate(input_ids, 
attention_mask, sampling_parameters); + pad_left(m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")); + // fix mask filled with '2' instead of '0' + ov::Tensor attention_mask = m_tokenizer.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + + vector> input_ids_vec; + vector> atten_mask_vec; + + input_ids_vec.reserve(prompts.size()); + atten_mask_vec.reserve(prompts.size()); + auto res_tensor = m_tokenizer.get_tensor("input_ids"); + auto atten_tensor = m_tokenizer.get_tensor("attention_mask"); + auto res_shape = res_tensor.get_shape(); + + for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + + int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; + atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); + } - return detokenize(m_detokenizer, generate_results, 0); + return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")}; } - std::string operator()(std::string text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); + std::string detokenize(std::vector tokens) { + size_t batch_size = 1; + m_detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + m_detokenizer.infer(); + return m_detokenizer.get_output_tensor().data()[0]; } - - std::vector operator()(std::vector text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); + + std::vector detokenize(ov::Tensor tokens) { + m_detokenizer.set_input_tensor(tokens); + auto shape = tokens.get_shape(); + auto data = tokens.data(); + m_detokenizer.infer(); + auto res = m_detokenizer.get_output_tensor(); + + std::vector strings; + for (int i = 0; i < res.get_shape()[0]; ++i) { + strings.emplace_back(res.data()[i]); + } + return strings; } - - std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); + + std::vector detokenize(GenerationResult lines) { + // todo: implement calling detokenizer in a single batch + + std::vector strings; + for (auto& line: lines){ + ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; + m_detokenizer.set_input_tensor(tokens); + m_detokenizer.infer(); + auto res = m_detokenizer.get_output_tensor(); + auto res_str = res.data()[0]; + strings.emplace_back(res_str); + } + + return strings; } GenerationResult greedy_search(ov::Tensor input_ids, @@ -397,14 +404,70 @@ class LLMPipeline { return results; } + std::string call(std::string text) { + auto [input_ids, attention_mask] = tokenize(text); - GenerationResult generate(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params) { - if (sampling_params.is_gready_sampling()) { - return greedy_search(prompts, attention_mask, sampling_params); - } else if (sampling_params.is_beam_search()) { - return beam_search(prompts, sampling_params); - } else { // if (sampling_params.is_multimomial()) { - return multinomial_sampling(prompts, sampling_params); + auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); + + return detokenize(generate_results)[0]; } -} + + std::string call(std::string text, GenerationConfig generation_config) { + auto [input_ids, attention_mask] = tokenize(text); + // to 
keep config specified during LLMPipeline creation need to get existing + // and modify only after that, e.g.: + // GenerationConfig config = pipe.generation_config(); + // config.do_sample(false).max_new_tokens(20); + auto generate_results = generate(input_ids, attention_mask, generation_config); + + return detokenize(generate_results)[0]; + } + + std::vector call(std::vector text, GenerationConfig sampling_parameters) { + auto [input_ids, attention_mask] = tokenize(text); + + auto generate_results = generate(input_ids, attention_mask, sampling_parameters); + + return detokenize(generate_results); + } + + std::string operator()(std::string text) { + return call(text); + } + + std::string operator()(std::string text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } + + std::vector operator()(std::vector text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } + + std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); + } + + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { + if (sampling_params.is_gready_sampling()) { + return greedy_search(input_ids, attention_mask, sampling_params); + } else if (sampling_params.is_beam_search()) { + return beam_search(input_ids, sampling_params); + } else { // if (sampling_params.is_multimomial()) { + return multinomial_sampling(input_ids, sampling_params); + } + } + + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask) { + return generate(input_ids, attention_mask, m_sampling_parameters); + } + + GenerationResult generate(ov::Tensor input_ids, GenerationConfig sampling_params) { + + return generate(input_ids, init_attention_mask(input_ids), sampling_params); + } + + GenerationResult generate(ov::Tensor input_ids) { + return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); + } + }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp similarity index 99% rename from text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 991334501e..72a45a84e4 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -15,6 +15,8 @@ class Sequence; // Similar to HuggingFace GenerationConfig // but has parameters that are not present in the original SamplingParameters for continous batching struct GenerationConfig { + // todo: add copy constructor + // Generic size_t m_max_new_tokens = 10; size_t m_max_length = 100; // max_new tokens should have priority over max_new_tokens diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index 70b3a343dc..a28fa02800 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -15,18 +15,18 @@ using namespace std; int main(int argc, char* argv[]) try { { - // // PIPELINE Ex.1 - // std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - // LLMPipeline pipe(model_path); - // std::cout << pipe.call("table is made of"); + // PIPELINE Ex.1 + std::string model_path = 
"text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + LLMPipeline pipe(model_path); + std::cout << pipe("table is made of"); } + cout << endl << "-------------END OF GENERATE ------" << endl; { // PIPELINE Ex.2 std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; LLMPipeline pipe(model_path); GenerationConfig config = pipe.generation_config(); - // batched inputs auto results = pipe({"table is made of", "Alan Turing was a", @@ -40,27 +40,12 @@ int main(int argc, char* argv[]) try { } } - // // GENERATE - // ov::Core core; - // std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - - // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted - // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml"); - // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request(); - - // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a"); - // GenerationConfig config = GenerationConfig::beam_search(); - // LLMModel engine(request); - - // GenerationResult generation_results = engine.generate(input_ids, config); - // std::cout << detokenize(detokenizer, generation_results[0]); - - // generation_results = engine.generate(input_ids, config.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)); - // std::cout << detokenize(detokenizer, generation_results[0]); - + // GENERATE + std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; + LLMPipeline pipe(model_path); + auto [input_ids, attention_mask] = pipe.tokenize("table is made of"); + auto res = pipe.generate(input_ids, attention_mask); + std::cout << pipe.detokenize(res)[0]; } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return EXIT_FAILURE; From 8d6635329d4b779ca63318f1bb6e178228c1142a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 11 Apr 2024 11:46:28 +0200 Subject: [PATCH 12/97] add callback to generate --- .../generate_pipeline/generate_pipeline.hpp | 72 ++++++------------- .../generate_pipeline/generation_config.hpp | 46 ++++++++---- .../causal_lm/cpp/generate_pipeline/main.cpp | 50 +++++++++++-- 3 files changed, 99 insertions(+), 69 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index a3d945d80f..66cb2f8eaa 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -9,7 +9,6 @@ #include #include "group_beam_searcher.hpp" -// using GenerationResult = ov::Tensor; using GenerationResult = std::vector>; using namespace std; @@ -51,7 +50,7 @@ void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_m for (size_t batch = 0; batch < batch_size; batch++) { int64_t* start = attention_mask.data() + batch * seq_length; - position_ids.data()[batch] = std::accumulate(start, start + seq_length - 1, 0); + position_ids.data()[batch] = 
std::accumulate(start, start + seq_length, 0); } } @@ -96,44 +95,6 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } - - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -// TODO: CHECK IF WE REALLY NEED TEXT STREAMER -// struct TextStreamer { -// ov::InferRequest detokenizer; -// std::vector token_cache; -// size_t print_len = 0; - -// void put(int64_t token) { -// token_cache.push_back(token); -// std::string text = detokenize(detokenizer, token_cache); -// if (!text.empty() && '\n' == text.back()) { -// // Flush the cache after the new line symbol -// std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; -// token_cache.clear(); -// print_len = 0; -// return; -// } -// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { -// // Don't print incomplete text -// return; -// } -// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; -// print_len = text.size(); -// } - -// void end() { -// std::string text = detokenize(detokenizer, token_cache); -// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; -// token_cache.clear(); -// print_len = 0; -// } -// }; - class LLMPipeline { ov::InferRequest m_model_runner; ov::InferRequest m_tokenizer; @@ -291,10 +252,10 @@ class LLMPipeline { GenerationResult results(batch_size); auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - // std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + // todo: make this work even if position_ids are not specified initialize_position_ids(position_ids, attention_mask); - size_t initial_seq_len = input_ids.get_shape()[1]; + size_t prompt_len = input_ids.get_shape()[1]; m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); @@ -304,7 +265,8 @@ class LLMPipeline { auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); - for (size_t i = 0; i < sampling_params.m_max_new_tokens; ++i) { + for (size_t i = 0; i < sampling_params.get_max_new_tokens(prompt_len); ++i) { + // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); ov::Shape logits_shape = logits.get_shape(); @@ -312,18 +274,26 @@ class LLMPipeline { m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - // m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - update_position_ids(position_ids, attention_mask); - + update_position_ids(position_ids, attention_mask); // todo: check why does not always work correctly + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; 
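// logits is laid out as [batch_size, seq_len, vocab_size]; the pointer above selects the last position of this batch row, and out_token is the greedy argmax over the vocabulary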
results[batch].emplace_back(out_token); - - // todo: add exit criteria when pad or EOS is met + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == sampling_params.m_eos_token_id); + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; - m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(initial_seq_len + i); + m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); } + sampling_params.m_callback(std::move(token_iter_results), *this); + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!sampling_params.m_ignore_eos && all_are_eos) + break; } return results; } @@ -339,7 +309,7 @@ class LLMPipeline { std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto initial_seq_len = prompts.get_shape()[1]; + auto prompt_len = prompts.get_shape()[1]; m_model_runner.set_tensor("input_ids", prompts); m_model_runner.set_tensor("attention_mask", attention_mask); @@ -360,7 +330,7 @@ class LLMPipeline { GroupBeamSearcher group_beam_searcher{parameters}; std::vector next_tokens; std::vector next_beams; - for (size_t length_count = 0; length_count < sampling_params.m_max_new_tokens; ++length_count) { + for (size_t length_count = 0; length_count < sampling_params.get_max_new_tokens(prompt_len); ++length_count) { m_model_runner.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); if (next_tokens.empty()) { break; } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 72a45a84e4..348559f3a0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -8,29 +8,33 @@ #include #include #include // used only for StopCriteria +#include // forward declaration class Sequence; +// forward declaration +class LLMPipeline; + // Similar to HuggingFace GenerationConfig -// but has parameters that are not present in the original SamplingParameters for continous batching struct GenerationConfig { // todo: add copy constructor // Generic - size_t m_max_new_tokens = 10; - size_t m_max_length = 100; // max_new tokens should have priority over max_new_tokens + size_t m_max_new_tokens = SIZE_MAX; + size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length bool m_ignore_eos = false; int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now - size_t m_num_return_sequences = 3; // Beam search specific size_t m_num_groups = 1; size_t m_group_size = 1; // beam_width float m_diversity_penalty = 1.0f; // 0.0 means no diversity - float m_repetition_penalty = 1.0f; - + size_t m_num_return_sequences = 3; // is used by beam search, in other case is equal to batch size StopCriteria stop_criteria = StopCriteria::heuristic; + + + float m_repetition_penalty = 1.0f; float m_length_penalty = 1.0f; size_t m_no_repeat_ngram_size = std::numeric_limits::max(); std::function early_finish = [](const Sequence&) {return false; }; @@ -43,9 +47,22 @@ struct GenerationConfig { // special tokens int64_t m_bos_token_id = 0; -
int64_t m_eos_token_id = 0; + int64_t m_eos_token_id = 0; // todo: do we need both m_eos_token and m_eos_token_id? int64_t m_pad_token_id = 0; + std::function&&, LLMPipeline&)> m_callback = [](std::vector&& tokens, LLMPipeline& pipe){ ;}; + + + size_t get_max_new_tokens(size_t prompt_length = 0) { + // max_new_tokens has priority over max_length, + // only if m_max_new_tokens was not specified use max_length + if (m_max_new_tokens != SIZE_MAX) { + return m_max_new_tokens; + } else { + return m_max_length - prompt_length; + } + } + GenerationConfig& max_new_tokens(size_t max_new_tokens) { this->m_max_new_tokens = max_new_tokens; return *this; @@ -132,6 +149,11 @@ struct GenerationConfig { return *this; } + GenerationConfig& set_callback(std::function&&, LLMPipeline&)> callback) { + this->m_callback = callback; + return *this; + } + GenerationConfig() = default; GenerationConfig(std::string json_path) { @@ -140,10 +162,12 @@ struct GenerationConfig { m_bos_token_id = data.value("bos_token_id", 0); m_eos_token_id = data.value("eos_token_id", 0); - m_max_length = data.value("max_length", 0); + m_pad_token_id = data.value("pad_token_id", 0); m_num_return_sequences = data.value("num_return_sequences", 1); - m_max_new_tokens = data.value("max_new_tokens", 100); + + m_max_new_tokens = data.value("max_new_tokens", SIZE_MAX); + m_max_length = data.value("max_length", SIZE_MAX); m_temperature = data.value("temperature", 0.0f); m_do_sample = data.value("do_sample", false); @@ -194,5 +218,3 @@ struct GenerationConfig { } }; - -enum class SamplingAlgorithm{greedy, multinomial, baeam_search}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp index a28fa02800..61aa4e274b 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include - #include "generate_pipeline.hpp" namespace { @@ -13,26 +12,65 @@ constexpr size_t BATCH_SIZE = 1; using namespace std; +struct TextStreamer { + LLMPipeline pipe; + std::vector token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = pipe.detokenize(token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + + void end() { + std::string text = pipe.detokenize(token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + int main(int argc, char* argv[]) try { { // PIPELINE Ex.1 std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); - std::cout << pipe("table is made of"); + LLMPipeline pipe(model_path, "CPU"); + GenerationConfig config = pipe.generation_config(); + + auto text_streamer = TextStreamer{pipe}; + auto print_text_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){ + text_streamer.put(tokens[0]); + }; + + pipe("table is made of", 
config.max_new_tokens(100).set_callback(print_text_callback)); + text_streamer.end(); + cout << endl << "------------- END OF GENERATE -------------" << endl; } - cout << endl << "-------------END OF GENERATE ------" << endl; { // PIPELINE Ex.2 std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); + LLMPipeline pipe(model_path, "CPU"); GenerationConfig config = pipe.generation_config(); // batched inputs auto results = pipe({"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?" - }, config.do_sample(false)); + }, config.do_sample(false).max_new_tokens(100)); for (const auto& res: results) { cout << res << endl; From fa12da7394486a9992f09a735f81bfa470054ee4 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 11 Apr 2024 15:18:15 +0200 Subject: [PATCH 13/97] cleanup generate_sample.cpp --- text_generation/causal_lm/cpp/CMakeLists.txt | 6 +- .../generate_pipeline/generate_pipeline.hpp | 22 +++- .../cpp/generate_pipeline/generate_sample.cpp | 113 ++++++++++++++++++ .../generate_pipeline/generation_config.hpp | 1 - .../causal_lm/cpp/generate_pipeline/main.cpp | 93 -------------- .../causal_lm/cpp/group_beam_searcher.hpp | 6 +- 6 files changed, 135 insertions(+), 106 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/main.cpp diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 7c75aad0af..26e99843d1 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -17,7 +17,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME beam_search_sample) +set(TARGET_NAME beam_search_causal_lm) add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") @@ -35,8 +35,8 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME generate_pipeline) -add_executable(${TARGET_NAME} generate_pipeline/main.cpp) +set(TARGET_NAME generate_sample) +add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 66cb2f8eaa..5ed6d1b65d 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -9,7 +9,7 @@ #include #include "group_beam_searcher.hpp" -using GenerationResult = std::vector>; +using GenerationResult = std::vector>>; using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) { @@ -124,7 +124,8 @@ class LLMPipeline { if (!is_xml(full_path)) full_path += "/openvino_model.xml"; 
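// create_infer_request() on the compiled stateful model returns a request whose internal states keep the KV cache between infer() calls, so later generation steps only need to feed the newly produced tokens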
m_model_runner = core.compile_model(full_path, device, config).create_infer_request(); - + + // todo: add loading EOS_TOKEN_ID from IR core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt // tokenizer and detokenizer work on CPU only full_path = tokenizer_path; @@ -205,6 +206,11 @@ class LLMPipeline { return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")}; } + + std::pair tokenize(std::initializer_list text) { + return tokenize(std::vector(text.begin(), text.end())); + } + std::string detokenize(std::vector tokens) { size_t batch_size = 1; @@ -231,7 +237,7 @@ class LLMPipeline { // todo: implement calling detokenizer in a single batch std::vector strings; - for (auto& line: lines){ + for (auto& [score, line]: lines){ ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; m_detokenizer.set_input_tensor(tokens); m_detokenizer.infer(); @@ -281,13 +287,14 @@ class LLMPipeline { for (size_t batch = 0; batch < batch_size; ++batch) { const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - results[batch].emplace_back(out_token); + results[batch].second.emplace_back(out_token); token_iter_results[batch] = out_token; eos_met[batch] != (out_token == sampling_params.m_eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); } + // place sampling_params.m_callback(std::move(token_iter_results), *this); // stop generation when EOS is met in all batches @@ -348,13 +355,16 @@ class LLMPipeline { m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); + + // place + sampling_params.m_callback(std::move(next_tokens), *this); + } std::vector beams; for (const std::vector& group : finalize(std::move(group_beam_searcher))) { for (const Beam& beam : group) { beams.emplace_back(beam); - // results.emplace_back(beam.tokens); } } @@ -363,7 +373,7 @@ class LLMPipeline { GenerationResult results; for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { - results.emplace_back(beam->tokens); + results.emplace_back(std::pair(beam->score, beam->tokens)); } return results; } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp new file mode 100644 index 0000000000..9af5e474a6 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "generate_pipeline.hpp" + + +// The following reasons require TextStreamer to keep a cache of previous tokens: +// detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", +// but detokenize(tokenize("prefix a")) == "prefix a" +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +struct TextStreamer { + LLMPipeline pipe; + std::vector token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = pipe.detokenize(token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + + void end() { + std::string text = pipe.detokenize(token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + +int main(int argc, char* argv[]) try { + if (argc < 2 || argc > 4) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); + + std::string prompt = "table is made of"; + std::string device = "CPU"; // can be replaced with GPU + + std::string model_path = argv[1]; + if (argc > 2) + prompt = argv[2]; + if (argc > 3) + device = argv[3]; + + // Example 1: TextStreaming example with greedy search + LLMPipeline pipe(model_path, device); + // Will try to load config from generation_config.json. + // but if not found default values for greedy search will be used + GenerationConfig config = pipe.generation_config(); + + auto text_streamer = TextStreamer{pipe}; + auto text_streamer_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){ + text_streamer.put(tokens[0]); + }; + + cout << "greedy generate streaming mode:" << endl; + config.max_new_tokens(20).set_callback(text_streamer_callback); + pipe(prompt, config); + text_streamer.end(); + + // Example 2: Grouped Beam Search decoding example + pipe = LLMPipeline(model_path, device); + config = pipe.generation_config(); + + // will return vector with num_return_sequences strings + auto num_return_sequences = 3; + config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); + + cout << endl << "grouped beam search generated candidates:" << endl; + auto generation_results = pipe({prompt}, config); + for (int i = 0; i < num_return_sequences; ++i) + cout << "candidate " << i << ": " << generation_results[i] << endl; + + // Example 3: Greedy Decoding with multiple batches + pipe = LLMPipeline(model_path, device); + config = pipe.generation_config(); + + cout << endl << "greedy decoding with multiple batches:" << endl; + std::vector prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"}; + auto results = pipe(prompts, config.max_new_tokens(20)); + for (int i = 0; i < prompts.size(); i++) + cout << prompts[i] << ": " << results[i] << endl; + + // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates + pipe = LLMPipeline(model_path); + auto [input_ids, attention_mask] = pipe.tokenize({prompt}); + config = GenerationConfig::beam_search(); + // config for grouped beam search + config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); + + cout << endl << "beam search with printing of all candidates:" << endl; + auto beams = 
pipe.generate(input_ids, attention_mask, config); + for (const auto& beam : beams) + std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; + +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 348559f3a0..ce250696e4 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -33,7 +33,6 @@ struct GenerationConfig { size_t m_num_return_sequences = 3; // is used by beam search, in other case is equal to batch size StopCriteria stop_criteria = StopCriteria::heuristic; - float m_repetition_penalty = 1.0f; float m_length_penalty = 1.0f; size_t m_no_repeat_ngram_size = std::numeric_limits::max(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp deleted file mode 100644 index 61aa4e274b..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "generate_pipeline.hpp" - -namespace { - -constexpr size_t BATCH_SIZE = 1; - -} // namespace - -using namespace std; - -struct TextStreamer { - LLMPipeline pipe; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = pipe.detokenize(token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = pipe.detokenize(token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; - -int main(int argc, char* argv[]) try { - { - // PIPELINE Ex.1 - std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path, "CPU"); - GenerationConfig config = pipe.generation_config(); - - auto text_streamer = TextStreamer{pipe}; - auto print_text_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){ - text_streamer.put(tokens[0]); - }; - - pipe("table is made of", config.max_new_tokens(100).set_callback(print_text_callback)); - text_streamer.end(); - cout << endl << "------------- END OF GENERATE -------------" << endl; - } - - { - // PIPELINE Ex.2 - std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path, "CPU"); - GenerationConfig config = pipe.generation_config(); - // batched inputs - auto results = pipe({"table is made of", - "Alan Turing was a", - "1 + 1 = ", - "Why is the Sun yellow?" 
- }, config.do_sample(false).max_new_tokens(100)); - - for (const auto& res: results) { - cout << res << endl; - cout << "-------------------" << endl; - } - } - - // GENERATE - std::string model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"; - LLMPipeline pipe(model_path); - auto [input_ids, attention_mask] = pipe.tokenize("table is made of"); - auto res = pipe.generate(input_ids, attention_mask); - std::cout << pipe.detokenize(res)[0]; -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return EXIT_FAILURE; -} catch (...) { - std::cerr << "Non-exception object thrown\n"; - return EXIT_FAILURE; -} diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp index b34cda05f5..6cc90386df 100644 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp @@ -113,9 +113,9 @@ struct Group { } min_heap.push_back(std::move(beam)); - std::push_heap(min_heap.begin(), min_heap.end(), greater); + std::push_heap(min_heap.begin(), min_heap.end(), ::greater); if (min_heap.size() > parameters.group_size) { - std::pop_heap(min_heap.begin(), min_heap.end(), greater); + std::pop_heap(min_heap.begin(), min_heap.end(), ::greater); min_heap.pop_back(); } } @@ -265,7 +265,7 @@ struct GroupBeamSearcher { throw std::runtime_error("No beams left to search"); } auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size); - std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); + std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { From 5ceb9d59df57d0326a53c2be8a65a3304e19d25e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 16 Apr 2024 14:09:20 +0200 Subject: [PATCH 14/97] add speculative decoding --- text_generation/causal_lm/cpp/CMakeLists.txt | 1 - .../generate_pipeline/generate_pipeline.hpp | 206 +++++++++++++++++- .../cpp/generate_pipeline/generate_sample.cpp | 12 + .../generate_pipeline/generation_config.hpp | 190 ++++++++++------ 4 files changed, 344 insertions(+), 65 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 26e99843d1..6db072680e 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -46,4 +46,3 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) - diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp index 5ed6d1b65d..6eace2467f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp @@ -95,11 +95,60 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { + // Copy elements from the old to a new tensor and return it. 
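// Only the first new_seq_len positions along the sequence axis are kept; everything after them is dropped from the cache.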
+ // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], + // If that's not the case for your model please implement your own trim method. + OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); + + auto old_tensor_data = tensor.data(); + auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; + size_t num_kv_heads = shape[1]; + size_t old_seq_len = shape[2]; + size_t head_size = shape[3]; + + OPENVINO_ASSERT(new_seq_len <= old_seq_len); + + // if new_seq_len equal to old one no need to copy tensor, return as is + if (old_seq_len == new_seq_len) + return tensor; + + if (seq_len_axis == 0) { + shape[0] = new_seq_len; + tensor.set_shape(shape); + return tensor; + } + + // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor + auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; + auto new_tensor_data = new_tensor.data(); + for (size_t batch = 0; batch < batch_size; ++batch){ + for (size_t i = 0; i < num_kv_heads; ++i) { + for (size_t j = 0; j < new_seq_len; ++j) { + auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; + auto src_ptr = old_tensor_data + num_kv_heads * old_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; + std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); + } + } + } + return new_tensor; +} + +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { + // trim kv_cache values up to the new_seq_len + for (auto& state: request.query_state()) { + ov::Tensor old_tensor = state.get_state(); + state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + } +} + class LLMPipeline { ov::InferRequest m_model_runner; ov::InferRequest m_tokenizer; ov::InferRequest m_detokenizer; GenerationConfig m_sampling_parameters; + std::string m_device; + ov::AnyMap m_config; public: // TODO: add constructor for specifying manually tokenizer path @@ -116,6 +165,8 @@ class LLMPipeline { std::string device="CPU", const ov::AnyMap& config={} ) { + m_device = device; + m_config = config; ov::Core core; auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; @@ -136,14 +187,15 @@ class LLMPipeline { full_path = detokenizer_path; if (!is_xml(full_path)) full_path += "/openvino_detokenizer.xml"; - m_detokenizer = core.compile_model(full_path, "CPU").create_infer_request(); + m_detokenizer = core.compile_model(full_path, "CPU").create_infer_request(); } LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}) { if (std::filesystem::exists(path + "/generation_config.json")) { m_sampling_parameters = GenerationConfig(path + "/generation_config.json"); } - + m_device = device; + ov::Core core; auto model_request = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); m_model_runner = model_request; // tokenizer and detokenizer work on CPU only @@ -378,6 +430,152 @@ class LLMPipeline { return results; } + /* Speculative decoding works the following way. The draft model predicts the next K + tokens one by one in an autoregressive manner, while the main model validates these + predictions and corrects them if necessary. We go through each predicted token, and + if a difference is detected between the draft and main model, we stop and keep the + last token predicted by the main model. 
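The accepted prefix itself is never recomputed; only the KV caches of both models are trimmed back to the last accepted position before the next round.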
Then the draft model gets the latest main + prediction and again tries to predict the next K tokens, repeating the cycle. + + This approach reduces the need for multiple infer requests to the main model, + enhancing performance. For instance, in more predictable parts of text generation, + the draft model can, in best-case scenarios, generate the next K tokens that exactly + match the target. In that case they are validated in a single inference request to + the main model (which is bigger, more accurate but slower) instead of running K + subsequent requests. + */ + GenerationResult speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { + auto batch_size = input_ids.get_shape()[0]; + OPENVINO_ASSERT(batch_size == 1); + auto draft_model = sampling_params.get_assistant_model(m_device, m_config); + auto main_model = m_model_runner; + + auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; + input_ids.copy_to(draft_input_ids); + auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; + std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); + + draft_model.set_tensor("input_ids", draft_input_ids); + draft_model.set_tensor("attention_mask", draft_attention_mask); + + ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); + draft_position_ids.set_shape(draft_input_ids.get_shape()); + std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); + uint64_t seq_len = draft_input_ids.get_shape()[1]; + + // Input tensors for the main model should not be mixed with draft. + // Do not feed the same draft_position_ids to the main, but copy input_ids from the draft_input_ids + // auto input_ids = main_model.get_tensor("input_ids"); + // input_ids.set_shape(draft_input_ids.get_shape()); + // draft_input_ids.copy_to(input_ids); + + // auto attention_mask = main_model.get_tensor("attention_mask"); + // attention_mask.set_shape(draft_input_ids.get_shape()); + // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + auto position_ids = main_model.get_tensor("position_ids"); + position_ids.set_shape(draft_input_ids.get_shape()); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + + // set beam_idx for stateful model: no beam search is used and batch_size = 1 + draft_model.get_tensor("beam_idx").set_shape({batch_size}); + draft_model.get_tensor("beam_idx").data()[0] = 0; + main_model.get_tensor("beam_idx").set_shape({batch_size}); + main_model.get_tensor("beam_idx").data()[0] = 0; + + main_model.set_tensor("input_ids", input_ids); + main_model.set_tensor("attention_mask", attention_mask); + main_model.set_tensor("position_ids", position_ids); + + // To collect the kv-cache for the prompt and to get the next token, run the very first infer request + draft_model.infer(); + main_model.infer(); + + size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); + OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for both models"); + + // logits shape is [batch_size, seq_len, vocab_size] + auto logits = main_model.get_tensor("logits"); + auto data_logits = logits.data() + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; + + // the first token which is fed to both draft and main networks on each iteration + auto first_token = out_token; + + GenerationResult results(batch_size); + 
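// the token produced by the prompt pass of the main model is committed immediately; every later token is appended only after the main model has confirmed it in the loop below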
results[0].second.emplace_back(out_token); + + // run K infer requests on draft model and get next K prediction tokens on each iteration + uint64_t K = sampling_params.m_num_assistant_tokens; + std::vector draft_tokens; + + // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. + draft_input_ids.set_shape({batch_size, 1}); + draft_position_ids.set_shape({batch_size, 1}); + + int max_sequence_length = sampling_params.m_max_new_tokens; + auto eos_token = sampling_params.m_eos_token_id; + + while (out_token != eos_token && seq_len < max_sequence_length) { + // infer the K next tokens with draft model + for (int i = 0; i < K; ++i) { + draft_input_ids.data()[0] = out_token; + draft_attention_mask.set_shape({batch_size, seq_len + i + 1}); + std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); + draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); + + draft_model.infer(); + + auto draft_logits = draft_model.get_tensor("logits").data(); + int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; + out_token = arg_max_token; + draft_tokens.emplace_back(arg_max_token); + } + + // For the main network, K tokens will be fed at once in a single infer request. + input_ids.set_shape({batch_size, K}); + // Set the first token for the main model to be the same as for the draft model. + input_ids.data()[0] = first_token; + for (int i = 0; i < K - 1; i++) + input_ids.data()[i + 1] = draft_tokens[i]; + + attention_mask.set_shape({batch_size, seq_len + K}); + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + position_ids.set_shape({batch_size, K}); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); + + main_model.infer(); + + data_logits = logits.data(); // [batch_size, K, vocab_size] + size_t disagree_idx = K - 1; + // Iterate through the predicted tokens from the main model and compare them with draft predictions. + // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. + // In the best-case scenario, all elements match, and K predicted tokens will be taken. + for (size_t i = 0; i < K; i++) { + auto start = data_logits + vocab_size * i; + auto stop = data_logits + vocab_size * (i + 1); + out_token = std::max_element(start, stop) - start; + results[0].second.emplace_back(out_token); + + disagree_idx = i; + if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) + break; + } + + // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. + // Increment the sequence length by the number of matched tokens, and + // trim the KV cache to match the new sequence length. 
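// disagree_idx + 1 tokens were accepted this round (the first mismatching position still keeps the main model's token), so the sequence length grows by that amount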
+ seq_len += disagree_idx + 1; + update_kv_cache(draft_model, sampling_params.m_seq_len_axis, seq_len); + update_kv_cache(main_model, sampling_params.m_seq_len_axis, seq_len); + + draft_tokens.clear(); + first_token = out_token; + } + + return results; + } + GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { // todo: implement GenerationResult results; @@ -432,8 +630,10 @@ class LLMPipeline { return greedy_search(input_ids, attention_mask, sampling_params); } else if (sampling_params.is_beam_search()) { return beam_search(input_ids, sampling_params); - } else { // if (sampling_params.is_multimomial()) { + } else if (sampling_params.is_multimomial()) { return multinomial_sampling(input_ids, sampling_params); + } else { // speculative + return speculative_sampling(input_ids, attention_mask, sampling_params); } } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 9af5e474a6..6c20bee63c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -104,6 +104,18 @@ int main(int argc, char* argv[]) try { for (const auto& beam : beams) std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; + { + // Example 5: Speculative sampling + std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; + pipe = LLMPipeline(model_path); + auto [input_ids, attention_mask] = pipe.tokenize({prompt}); + config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); + + auto results = pipe.generate(input_ids, attention_mask, config); + for (const auto& beam : results) + std::cout << pipe.detokenize(beam.second) << std::endl; + } + } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return EXIT_FAILURE; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index ce250696e4..9e97d91d5a 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -51,7 +51,6 @@ struct GenerationConfig { std::function&&, LLMPipeline&)> m_callback = [](std::vector&& tokens, LLMPipeline& pipe){ ;}; - size_t get_max_new_tokens(size_t prompt_length = 0) { // max_new_tokens has priority over max_length, // only if m_max_new_tokens was not specified use max_length @@ -63,95 +62,94 @@ struct GenerationConfig { } GenerationConfig& max_new_tokens(size_t max_new_tokens) { - this->m_max_new_tokens = max_new_tokens; - return *this; - } + m_max_new_tokens = max_new_tokens; + return *this; + } GenerationConfig& max_length(size_t max_length) { - this->m_max_length = max_length; - return *this; - } + m_max_length = max_length; + return *this; + } GenerationConfig& ignore_eos(bool ignore_eos) { - this->m_ignore_eos = ignore_eos; - return *this; - } + m_ignore_eos = ignore_eos; + return *this; + } GenerationConfig& eos_token(int64_t eos_token) { - this->m_eos_token = eos_token; - return *this; - } + m_eos_token = eos_token; + return *this; + } GenerationConfig& num_return_sequences(size_t num_return_sequences) { - this->m_num_return_sequences = num_return_sequences; - return *this; - } + m_num_return_sequences = num_return_sequences; + return *this; + } GenerationConfig& 
num_groups(size_t num_groups) { - this->m_num_groups = num_groups; - return *this; - } + m_num_groups = num_groups; + return *this; + } GenerationConfig& group_size(size_t group_size) { - this->m_group_size = group_size; - return *this; - } + m_group_size = group_size; + return *this; + } + GenerationConfig& diversity_penalty(float diversity_penalty) { - this->m_diversity_penalty = diversity_penalty; - return *this; - } + m_diversity_penalty = diversity_penalty; + return *this; + } GenerationConfig& length_penalty(float length_penalty) { - this->m_length_penalty = length_penalty; - return *this; - } + m_length_penalty = length_penalty; + return *this; + } + GenerationConfig& no_repeat_ngram_size(size_t no_repeat_ngram_size) { - this->m_no_repeat_ngram_size = no_repeat_ngram_size; - return *this; - } + m_no_repeat_ngram_size = no_repeat_ngram_size; + return *this; + } GenerationConfig& temperature(float temperature) { - this->m_temperature = temperature; - return *this; - } + m_temperature = temperature; + return *this; + } + GenerationConfig& top_k(size_t top_k) { - this->m_top_k = top_k; - return *this; - } + m_top_k = top_k; + return *this; + } GenerationConfig& top_p(size_t top_p) { - this->m_top_p = top_p; - return *this; - } + m_top_p = top_p; + return *this; + } + GenerationConfig& do_sample(bool do_sample) { - this->m_do_sample = do_sample; - return *this; - } + m_do_sample = do_sample; + return *this; + } GenerationConfig& repetition_penalty(float repetition_penalty) { - this->m_repetition_penalty = repetition_penalty; - return *this; - } + m_repetition_penalty = repetition_penalty; + return *this; + } GenerationConfig& bos_token_id(int64_t bos_token_id) { - this->m_bos_token_id = bos_token_id; - return *this; - } + m_bos_token_id = bos_token_id; + return *this; + } GenerationConfig& eos_token_id(int64_t eos_token_id) { - this->m_eos_token_id = eos_token_id; - return *this; - } + m_eos_token_id = eos_token_id; + return *this; + } GenerationConfig& pad_token_id(int64_t pad_token_id) { - this->m_pad_token_id = pad_token_id; - return *this; - } - - GenerationConfig& set_callback(std::function&&, LLMPipeline&)> callback) { - this->m_callback = callback; - return *this; - } + m_pad_token_id = pad_token_id; + return *this; + } GenerationConfig() = default; @@ -203,9 +201,16 @@ struct GenerationConfig { multimomial.m_do_sample = 20; return multimomial; } + + template + static GenerationConfig assistive_decoding(T& assistant_model) { + GenerationConfig assistive; + assistive.assistant_model(assistant_model); + return assistive; + } bool is_gready_sampling() const { - return !m_do_sample && !is_beam_search(); + return !m_do_sample && !is_beam_search() && !is_speculative(); } bool is_beam_search() const { @@ -215,5 +220,68 @@ struct GenerationConfig { bool is_multimomial() const { return m_do_sample; } + + // for Assistive/Speculative decoding + ov::InferRequest m_assistant_model; + size_t m_num_assistant_tokens = 5; + size_t m_seq_len_axis = 2; + private: + std::shared_ptr m_assistant_ov_model; + bool is_assistant_request_defined = false; + bool is_assistant_ov_defined = false; + + public: + GenerationConfig& assistant_model(const ov::InferRequest& assistant_model) { + m_assistant_model = assistant_model; + is_assistant_request_defined = true; + return *this; + } + + GenerationConfig& assistant_model(ov::CompiledModel& assistant_model) { + m_assistant_model = assistant_model.create_infer_request(); + is_assistant_request_defined = true; + return *this; + } + + GenerationConfig& 
assistant_model(const std::shared_ptr& assistant_model) { + m_assistant_ov_model = assistant_model; + is_assistant_ov_defined = true; + return *this; + } + + GenerationConfig& assistant_model(std::string assistant_model) { + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + if (!is_xml(assistant_model)) + assistant_model += "/openvino_model.xml"; + + m_assistant_ov_model = ov::Core().read_model(assistant_model); + is_assistant_ov_defined = true; + return *this; + } + + GenerationConfig& set_callback(std::function&&, LLMPipeline&)> callback) { + m_callback = callback; + return *this; + } + + ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { + if (is_assistant_request_defined) { + return m_assistant_model; + } else if (is_assistant_ov_defined) { + m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); + is_assistant_request_defined = true; + return m_assistant_model; + } else { + OPENVINO_THROW("assistant model is not specified"); + } + } + + GenerationConfig& num_assistant_tokens(int64_t num_assistant_tokens) { + m_num_assistant_tokens = num_assistant_tokens; + return *this; + } + bool is_speculative() const { + return is_assistant_ov_defined || is_assistant_request_defined; + } }; From a5083c7bc9945f7f83abfb0d2683722ef580199a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 17 Apr 2024 09:48:58 +0200 Subject: [PATCH 15/97] separate Tokenizer --- text_generation/causal_lm/cpp/CMakeLists.txt | 2 +- .../causal_lm/cpp/beam_search_causal_lm.cpp | 108 +-- .../generate_pipeline/generate_pipeline.hpp | 653 ------------------ .../cpp/generate_pipeline/generate_sample.cpp | 75 +- .../generate_pipeline/generation_config.hpp | 4 +- .../cpp/generate_pipeline/llm_pipeline.cpp | 533 ++++++++++++++ .../cpp/generate_pipeline/llm_pipeline.hpp | 82 +++ .../cpp/generate_pipeline/llm_tokenizer.cpp | 119 ++++ .../cpp/generate_pipeline/llm_tokenizer.hpp | 74 ++ 9 files changed, 904 insertions(+), 746 deletions(-) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 6db072680e..2c4870cc35 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -36,7 +36,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME generate_sample) -add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) +add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index eb90b17b5e..5d487c61ae 100644 --- 
a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -167,60 +167,60 @@ int main(int argc, char* argv[]) try { auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv)); // Initialize beam search - const int64_t* prompt_data = input_ids.data(); - std::vector> prompts; - prompts.reserve(input_ids.get_shape().at(0)); - for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { - size_t sequence_length = input_ids.get_shape().at(1); - size_t batch_offset = batch * sequence_length; - const int64_t* prompt_start = prompt_data + batch_offset; - prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - } - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; - - if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; - GroupBeamSearcher group_beam_searcher{parameters}; - - initialize_inputs(input_ids, attention_mask, lm); - - std::vector next_tokens; - std::vector next_beams; - - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - lm.infer(); - - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - set_attention_mask(lm.get_tensor("attention_mask"), next_beams); - set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - } - - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - std::cout << "Prompt:\n"; - for (const std::vector group : prompt_group) { - std::cout << "Group:\n"; - for (const Beam& beam : group) { - std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - } - } - } + // const int64_t* prompt_data = input_ids.data(); + // std::vector> prompts; + // prompts.reserve(input_ids.get_shape().at(0)); + // for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { + // size_t sequence_length = input_ids.get_shape().at(1); + // size_t batch_offset = batch * sequence_length; + // const int64_t* prompt_start = prompt_data + batch_offset; + // prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); + // } + + // // Get the runtime info from the tokenizer model that we read earlier + // auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model + // int64_t SPECIAL_EOS_TOKEN; + + // if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID + // SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); + + // } else { + // throw std::runtime_error("EOS token ID not found in model's runtime information."); + // } + + // Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; + // GroupBeamSearcher group_beam_searcher{parameters}; + + // initialize_inputs(input_ids, attention_mask, 
lm); + + // std::vector next_tokens; + // std::vector next_beams; + + // for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + // lm.infer(); + + // std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + // if (next_tokens.empty()) { + // break; + // } + // size_t batch_size = next_tokens.size(); + // // Set pointers + // lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + // lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // // Set auxiliary inputs + // set_attention_mask(lm.get_tensor("attention_mask"), next_beams); + // set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + // } + + // for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + // std::cout << "Prompt:\n"; + // for (const std::vector group : prompt_group) { + // std::cout << "Group:\n"; + // for (const Beam& beam : group) { + // std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; + // } + // } + // } // Model is stateful which means that context (kv-cache) which belongs to a particular // text sequence is accumulated inside the model during the generation loop above. // This context should be reset before processing the next text sequence. diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp deleted file mode 100644 index 6eace2467f..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp +++ /dev/null @@ -1,653 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include "generation_config.hpp" -#include -#include "group_beam_searcher.hpp" - -using GenerationResult = std::vector>>; -using namespace std; - -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2) { - const size_t batch_size = input_ids.get_shape().at(0); - const size_t sequence_length = input_ids.get_shape().at(1); - int64_t* inputs_data = input_ids.data(); - int64_t* attention_mask_data = attention_mask.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - - // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != pad_token) - continue; - - size_t pad_tokens_number = 0; - for (int i = sequence_length - 1; i >= 0; i--) { - const size_t token_offset = batch_offset + i; - - if (inputs_data[token_offset] == pad_token) - continue; - - if (pad_tokens_number == 0) - pad_tokens_number = sequence_length - i - 1; - - std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); - std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); - } - } - - return {input_ids, attention_mask}; -} - -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t seq_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * seq_length; - position_ids.data()[batch] = std::accumulate(start, start + seq_length, 0); - } -} - -void 
initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape()[0]; - const size_t seq_length = attention_mask.get_shape()[1]; - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - size_t sum = 0; - for (size_t i = 0; i < seq_length; i++) { - const size_t element_offset = batch * seq_length + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -ov::Tensor init_attention_mask(ov::Tensor& position_ids) { - auto shape = position_ids.get_shape(); - auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; - std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); - return attention_mask; -} - -ov::Tensor extend_attention(ov::Tensor attention_mask) { - auto shape = attention_mask.get_shape(); - auto batch_size = shape[0]; - auto seq_len = shape[1]; - - ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; - auto old_data = attention_mask.data(); - auto new_data = new_atten_mask.data(); - for (size_t batch = 0; batch < batch_size; ++batch) { - std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); - new_data[batch * (seq_len + 1) + seq_len] = 1; - } - return new_atten_mask; -} - -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { - // Copy elements from the old to a new tensor and return it. - // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], - // It that's not the case for your model please implement your own trim method. 
- OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); - - auto old_tensor_data = tensor.data(); - auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; - - OPENVINO_ASSERT(new_seq_len <= old_seq_len); - - // if new_seq_len equal to old one no need to copy tensor, return as is - if (old_seq_len == new_seq_len) - return tensor; - - if (seq_len_axis == 0) { - shape[0] = new_seq_len; - tensor.set_shape(shape); - } - - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < batch_size; ++batch){ - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } - return new_tensor; -} - -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { - // trim kv_cache values up to the new_seq_len - for (auto& state: request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } -} - -class LLMPipeline { - ov::InferRequest m_model_runner; - ov::InferRequest m_tokenizer; - ov::InferRequest m_detokenizer; - GenerationConfig m_sampling_parameters; - std::string m_device; - ov::AnyMap m_config; - -public: - // TODO: add constructor for specifying manually tokenizer path - // dir path - // xml file path - // compiled model - // infer request - // ov::Model - - LLMPipeline( - std::string& model_path, - std::string& tokenizer_path, - std::string& detokenizer_path, - std::string device="CPU", - const ov::AnyMap& config={} - ) { - m_device = device; - m_config = config; - ov::Core core; - - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - - std::string full_path = model_path; - if (!is_xml(full_path)) - full_path += "/openvino_model.xml"; - m_model_runner = core.compile_model(full_path, device, config).create_infer_request(); - - // todo: add loading EOS_TOKEN_ID from IR - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // tokenizer and detokenizer work on CPU only - full_path = tokenizer_path; - if (!is_xml(full_path)) - full_path += "/openvino_tokenizer.xml"; - m_tokenizer = core.compile_model(full_path, "CPU").create_infer_request(); - - full_path = detokenizer_path; - if (!is_xml(full_path)) - full_path += "/openvino_detokenizer.xml"; - m_detokenizer = core.compile_model(full_path, "CPU").create_infer_request(); - } - - LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}) { - if (std::filesystem::exists(path + "/generation_config.json")) { - m_sampling_parameters = GenerationConfig(path + "/generation_config.json"); - } - m_device = device; - - ov::Core core; - auto model_request = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); - m_model_runner = model_request; - - 
// tokenizer and detokenizer work on CPU only - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - m_tokenizer = core.compile_model(path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - m_detokenizer = core.compile_model(path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - } - - GenerationConfig generation_config() const { - return m_sampling_parameters; - } - - std::pair tokenize(std::string prompt) { - size_t batch_size = 1; - m_tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); - m_tokenizer.infer(); - - vector> input_ids_vec; - input_ids_vec.reserve(1); - auto res_tensor = m_tokenizer.get_tensor("input_ids"); - auto res_shape = res_tensor.get_shape(); - - for (int i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - } - - return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")}; - } - - std::pair tokenize(std::vector prompts) { - m_tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - auto size_ = m_tokenizer.get_input_tensor().get_shape(); - m_tokenizer.infer(); - - pad_left(m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")); - // fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = m_tokenizer.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - vector> input_ids_vec; - vector> atten_mask_vec; - - input_ids_vec.reserve(prompts.size()); - atten_mask_vec.reserve(prompts.size()); - auto res_tensor = m_tokenizer.get_tensor("input_ids"); - auto atten_tensor = m_tokenizer.get_tensor("attention_mask"); - auto res_shape = res_tensor.get_shape(); - - for (int i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - - int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; - atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); - } - - return {m_tokenizer.get_tensor("input_ids"), m_tokenizer.get_tensor("attention_mask")}; - } - - std::pair tokenize(std::initializer_list text) { - return tokenize(std::vector(text.begin(), text.end())); - } - - - std::string detokenize(std::vector tokens) { - size_t batch_size = 1; - m_detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); - m_detokenizer.infer(); - return m_detokenizer.get_output_tensor().data()[0]; - } - - std::vector detokenize(ov::Tensor tokens) { - m_detokenizer.set_input_tensor(tokens); - auto shape = tokens.get_shape(); - auto data = tokens.data(); - m_detokenizer.infer(); - auto res = m_detokenizer.get_output_tensor(); - - std::vector strings; - for (int i = 0; i < res.get_shape()[0]; ++i) { - strings.emplace_back(res.data()[i]); - } - return strings; - } - - std::vector detokenize(GenerationResult lines) { - // todo: implement calling detokenizer in a single batch - - std::vector strings; - for (auto& [score, line]: lines){ - ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; - m_detokenizer.set_input_tensor(tokens); - m_detokenizer.infer(); - auto res = m_detokenizer.get_output_tensor(); - auto res_str = res.data()[0]; - strings.emplace_back(res_str); - } 
- - return strings; - } - - GenerationResult greedy_search(ov::Tensor input_ids, - ov::Tensor attention_mask, - GenerationConfig sampling_params) { - ov::Shape prompts_shape = input_ids.get_shape(); - size_t batch_size = prompts_shape[0]; - - GenerationResult results(batch_size); - - auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - // todo: make this work even if position_ids are not specified - initialize_position_ids(position_ids, attention_mask); - - size_t prompt_len = input_ids.get_shape()[1]; - - m_model_runner.set_tensor("input_ids", input_ids); - m_model_runner.set_tensor("attention_mask", attention_mask); - m_model_runner.set_tensor("position_ids", position_ids); - - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - auto beam_data = m_model_runner.get_tensor("beam_idx").data(); - std::iota(beam_data, beam_data + batch_size, 0); - - for (size_t i = 0; i < sampling_params.get_max_new_tokens(prompt_len); ++i) { - // todo: consider replacing with start_async and run callback right after that - m_model_runner.infer(); - auto logits = m_model_runner.get_tensor("logits"); - ov::Shape logits_shape = logits.get_shape(); - size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; - - m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - update_position_ids(position_ids, attention_mask); // todo: check why does not always work correctly - - std::vector token_iter_results(batch_size); // results of a single infer request - std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector - for (size_t batch = 0; batch < batch_size; ++batch) { - const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - results[batch].second.emplace_back(out_token); - token_iter_results[batch] = out_token; - eos_met[batch] != (out_token == sampling_params.m_eos_token_id); - - m_model_runner.get_tensor("input_ids").data()[batch] = out_token; - m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); - } - // place - sampling_params.m_callback(std::move(token_iter_results), *this); - - // stop generation when EOS is met in all batches - bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); - if (!sampling_params.m_ignore_eos && all_are_eos) - break; - } - return results; - } - - GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) { - ov::Shape prompts_shape = prompts.get_shape(); - size_t batch_size = prompts_shape[0]; - // todo: implement for batch > 1 - OPENVINO_ASSERT(batch_size == 1); - - // initialize inputs - auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto prompt_len = prompts.get_shape()[1]; - - m_model_runner.set_tensor("input_ids", prompts); - m_model_runner.set_tensor("attention_mask", attention_mask); - m_model_runner.set_tensor("position_ids", position_ids); - - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - 
m_model_runner.get_tensor("beam_idx").data()[0] = 0; - - const int64_t* prompt_data = prompts.data(); - - // todo: remove this duplicatino and use the same SamplingParameters for both greedy and beam - Parameters parameters{std::vector{prompt_data, prompt_data + prompts.get_size()}}; - parameters.n_groups = sampling_params.m_num_groups; - parameters.diversity_penalty = sampling_params.m_diversity_penalty; - parameters.group_size = sampling_params.m_group_size; - - GroupBeamSearcher group_beam_searcher{parameters}; - std::vector next_tokens; - std::vector next_beams; - for (size_t length_count = 0; length_count < sampling_params.get_max_new_tokens(prompt_len); ++length_count) { - m_model_runner.infer(); - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask"); - ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; - attention_mask.set_shape(mask_shape); - std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); - - m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); - - // place - sampling_params.m_callback(std::move(next_tokens), *this); - - } - - std::vector beams; - for (const std::vector& group : finalize(std::move(group_beam_searcher))) { - for (const Beam& beam : group) { - beams.emplace_back(beam); - } - } - - auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; - std::sort(beams.begin(), beams.end(), compare_scores); - - GenerationResult results; - for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { - results.emplace_back(std::pair(beam->score, beam->tokens)); - } - return results; - } - - /* Speculative decoding works the following way. The draft model predicts the next K - tokens one by one in an autoregressive manner, while the main model validates these - predictions and corrects them if necessary. We go through each predicted token, and - if a difference is detected between the draft and main model, we stop and keep the - last token predicted by the main model. Then the draft model gets the latest main - prediction and again tries to predict the next K tokens, repeating the cycle. - - This approach reduces the need for multiple infer requests to the main model, - enhancing performance. For instance, in more predictable parts of text generation, - the draft model can, in best-case scenarios, generate the next K tokens that exactly - match the target. In tha caste the are validated in a single inference request to - the main model (which is bigger, more accurate but slower) instead of running K - subsequent requests. 
- */ - GenerationResult speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { - auto batch_size = input_ids.get_shape()[0]; - OPENVINO_ASSERT(batch_size == 1); - auto draft_model = sampling_params.get_assistant_model(m_device, m_config); - auto main_model = m_model_runner; - - auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - input_ids.copy_to(draft_input_ids); - auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - - draft_model.set_tensor("input_ids", draft_input_ids); - draft_model.set_tensor("attention_mask", draft_attention_mask); - - ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); - draft_position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); - uint64_t seq_len = draft_input_ids.get_shape()[1]; - - // Input tensors for the main model should not be mixed with draft. - // Do not feed the same draft_postion_ids to the main, but copy input_ids from the draft_input_ids - // auto input_ids = main_model.get_tensor("input_ids"); - // input_ids.set_shape(draft_input_ids.get_shape()); - // draft_input_ids.copy_to(input_ids); - - // auto attention_mask = main_model.get_tensor("attention_mask"); - // attention_mask.set_shape(draft_input_ids.get_shape()); - // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - auto position_ids = main_model.get_tensor("position_ids"); - position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - - // set beam_idx for stateful model: no beam search is used and batch_size = 1 - draft_model.get_tensor("beam_idx").set_shape({batch_size}); - draft_model.get_tensor("beam_idx").data()[0] = 0; - main_model.get_tensor("beam_idx").set_shape({batch_size}); - main_model.get_tensor("beam_idx").data()[0] = 0; - - main_model.set_tensor("input_ids", input_ids); - main_model.set_tensor("attention_mask", attention_mask); - main_model.set_tensor("position_ids", position_ids); - - // To coollect kv-cache for the and to get the next token run the very first infer request - draft_model.infer(); - main_model.infer(); - - size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); - OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for the both models"); - - // logits shape is [batch_size, seq_len, vocab_size] - auto logits = main_model.get_tensor("logits"); - auto data_logits = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - // the first token which is fed to both draft and main netwoks on each iteration - auto first_token = out_token; - - GenerationResult results(batch_size); - results[0].second.emplace_back(out_token); - - // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = sampling_params.m_num_assistant_tokens; - std::vector draft_tokens; - - // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. 
- draft_input_ids.set_shape({batch_size, 1}); - draft_position_ids.set_shape({batch_size, 1}); - - int max_sequence_length = sampling_params.m_max_new_tokens; - auto eos_token = sampling_params.m_eos_token_id; - - while (out_token != eos_token && seq_len < max_sequence_length) { - // infer the K next tokens with draft model - for (int i = 0; i < K; ++i) { - draft_input_ids.data()[0] = out_token; - draft_attention_mask.set_shape({batch_size, seq_len + i + 1}); - std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); - draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); - - draft_model.infer(); - - auto draft_logits = draft_model.get_tensor("logits").data(); - int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; - out_token = arg_max_token; - draft_tokens.emplace_back(arg_max_token); - } - - // For the main network, K tokens will be fed at once in a single infer request. - input_ids.set_shape({batch_size, K}); - // Set the first token for the main model to be the same as for the draft model. - input_ids.data()[0] = first_token; - for (int i = 0; i < K - 1; i++) - input_ids.data()[i + 1] = draft_tokens[i]; - - attention_mask.set_shape({batch_size, seq_len + K}); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - position_ids.set_shape({batch_size, K}); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); - - main_model.infer(); - - data_logits = logits.data(); // [batch_size, K, vocab_size] - size_t disagree_idx = K - 1; - // Iterate through the predicted tokens from the main model and compare them with draft predictions. - // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. - // In the best-case scenario, all elements match, and K predicted tokens will be taken. - for (size_t i = 0; i < K; i++) { - auto start = data_logits + vocab_size * i; - auto stop = data_logits + vocab_size * (i + 1); - out_token = std::max_element(start, stop) - start; - results[0].second.emplace_back(out_token); - - disagree_idx = i; - if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) - break; - } - - // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. - // Increment the sequence length by the number of matched tokens, and - // trim the KV cache to match the new sequence length. 
- seq_len += disagree_idx + 1; - update_kv_cache(draft_model, sampling_params.m_seq_len_axis, seq_len); - update_kv_cache(main_model, sampling_params.m_seq_len_axis, seq_len); - - draft_tokens.clear(); - first_token = out_token; - } - - return results; - } - - GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { - // todo: implement - GenerationResult results; - return results; - } - - std::string call(std::string text) { - auto [input_ids, attention_mask] = tokenize(text); - - auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); - - return detokenize(generate_results)[0]; - } - - std::string call(std::string text, GenerationConfig generation_config) { - auto [input_ids, attention_mask] = tokenize(text); - // to keep config specified during LLMPipeline creation need to get existing - // and modify only after that, e.g.: - // GenerationConfig config = pipe.generation_config(); - // config.do_sample(false).max_new_tokens(20); - auto generate_results = generate(input_ids, attention_mask, generation_config); - - return detokenize(generate_results)[0]; - } - - std::vector call(std::vector text, GenerationConfig sampling_parameters) { - auto [input_ids, attention_mask] = tokenize(text); - - auto generate_results = generate(input_ids, attention_mask, sampling_parameters); - - return detokenize(generate_results); - } - - std::string operator()(std::string text) { - return call(text); - } - - std::string operator()(std::string text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); - } - - std::vector operator()(std::vector text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); - } - - std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); - } - - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { - if (sampling_params.is_gready_sampling()) { - return greedy_search(input_ids, attention_mask, sampling_params); - } else if (sampling_params.is_beam_search()) { - return beam_search(input_ids, sampling_params); - } else if (sampling_params.is_multimomial()) { - return multinomial_sampling(input_ids, sampling_params); - } else { // speculative - return speculative_sampling(input_ids, attention_mask, sampling_params); - } - } - - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask) { - return generate(input_ids, attention_mask, m_sampling_parameters); - } - - GenerationResult generate(ov::Tensor input_ids, GenerationConfig sampling_params) { - - return generate(input_ids, init_attention_mask(input_ids), sampling_params); - } - - GenerationResult generate(ov::Tensor input_ids) { - return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); - } - -}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 6c20bee63c..9d0f0b91f8 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "generate_pipeline.hpp" +#include "llm_pipeline.hpp" // The following reasons require TextStreamer to keep a cache of previous tokens: @@ -10,19 +10,19 @@ // but detokenize(tokenize("prefix a")) == "prefix a" // 1 printable token may consist of 2 
token ids: detokenize(incomplete_token_idx) == "�" struct TextStreamer { - LLMPipeline pipe; + Tokenizer tokenizer; std::vector token_cache; size_t print_len = 0; void put(int64_t token) { token_cache.push_back(token); - std::string text = pipe.detokenize(token_cache); + std::string text = tokenizer.detokenize(token_cache); if (!text.empty() && '\n' == text.back()) { // Flush the cache after the new line symbol std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; token_cache.clear(); print_len = 0; - return; + return; } if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { // Don't print incomplete text @@ -33,7 +33,7 @@ struct TextStreamer { } void end() { - std::string text = pipe.detokenize(token_cache); + std::string text = tokenizer.detokenize(token_cache); std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -54,12 +54,13 @@ int main(int argc, char* argv[]) try { device = argv[3]; // Example 1: TextStreaming example with greedy search + LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. // but if not found default velues for gready search will be used GenerationConfig config = pipe.generation_config(); - auto text_streamer = TextStreamer{pipe}; + auto text_streamer = TextStreamer{pipe.get_tokenizer()}; auto text_streamer_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){ text_streamer.put(tokens[0]); }; @@ -69,18 +70,18 @@ int main(int argc, char* argv[]) try { pipe(prompt, config); text_streamer.end(); - // Example 2: Grouped Beam Search decoding example - pipe = LLMPipeline(model_path, device); - config = pipe.generation_config(); + // // Example 2: Grouped Beam Search decoding example + // pipe = LLMPipeline(model_path, device); + // config = pipe.generation_config(); - // will return vector with num_return_sequences strings - auto num_return_sequences = 3; - config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); + // // will return vector with num_return_sequences strings + // auto num_return_sequences = 3; + // config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); - cout << endl << "grouped beam search generated candidates:" << endl; - auto generation_results = pipe({prompt}, config); - for (int i = 0; i < num_return_sequences; ++i) - cout << "candidate " << i << ": " << generation_results[i] << endl; + // cout << endl << "grouped beam search generated candidates:" << endl; + // auto generation_results = pipe({prompt}, config); + // for (int i = 0; i < num_return_sequences; ++i) + // cout << "candidate " << i << ": " << generation_results[i] << endl; // Example 3: Greedy Decoding with multiple batch pipe = LLMPipeline(model_path, device); @@ -93,28 +94,30 @@ int main(int argc, char* argv[]) try { cout << prompts[i] << ": " << results[i] << endl; // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates - pipe = LLMPipeline(model_path); - auto [input_ids, attention_mask] = pipe.tokenize({prompt}); - config = GenerationConfig::beam_search(); - // config for grouped beam search - config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); + // pipe = LLMPipeline(model_path); + // auto [input_ids, attention_mask] = pipe.tokenize({prompt}); + // config = GenerationConfig::beam_search(); + // // config for grouped beam search + // 
config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); - cout << endl << "beam search with printing of all candidates:" << endl; - auto beams = pipe.generate(input_ids, attention_mask, config); - for (const auto& beam : beams) - std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; - - { - // Example 5: Speculative sampling - std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; - pipe = LLMPipeline(model_path); - auto [input_ids, attention_mask] = pipe.tokenize({prompt}); - config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); + // cout << endl << "beam search with printing of all candidates:" << endl; + // auto beams = pipe.generate(input_ids, attention_mask, config); + // for (const auto& beam : beams) + // std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; + + // { + // // Example 5: Speculative sampling + // std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; + // pipe = LLMPipeline(model_path); + // auto [input_ids, attention_mask] = pipe.tokenize({prompt}); + // // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); + // pipe.generation_config().assistant_model(assitive_model_path); - auto results = pipe.generate(input_ids, attention_mask, config); - for (const auto& beam : results) - std::cout << pipe.detokenize(beam.second) << std::endl; - } + // cout << endl << "Speculative sampling with TinyLlama assistance:" << endl; + // auto results = pipe.generate(input_ids, attention_mask, config); + // for (const auto& result : results) + // std::cout << pipe.detokenize(result.second) << std::endl; + // } } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 9e97d91d5a..084cdaf6a0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -7,7 +7,7 @@ #include #include #include -#include // used only for StopCriteria +// #include // used only for StopCriteria #include // forward declaration @@ -31,7 +31,7 @@ struct GenerationConfig { size_t m_group_size = 1; // beam_width float m_diversity_penalty = 1.0f; // 0.0 means no diversity size_t m_num_return_sequences = 3; // is used by beam search, in other case is equal to batch size - StopCriteria stop_criteria = StopCriteria::heuristic; + // StopCriteria stop_criteria = StopCriteria::heuristic; float m_repetition_penalty = 1.0f; float m_length_penalty = 1.0f; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp new file mode 100644 index 0000000000..b587eca80f --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -0,0 +1,533 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "generate_pipeline/llm_pipeline.hpp" +#include "group_beam_searcher.hpp" +#include + + +using GenerationResult = std::vector>>; +using namespace std; + +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { + const size_t batch_size = input_ids.get_shape().at(0); + const size_t 
sequence_length = input_ids.get_shape().at(1); + int64_t* inputs_data = input_ids.data(); + int64_t* attention_mask_data = attention_mask.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (inputs_data[batch_offset + sequence_length - 1] != pad_token) + continue; + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + if (inputs_data[token_offset] == pad_token) + continue; + + if (pad_tokens_number == 0) + pad_tokens_number = sequence_length - i - 1; + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t seq_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * seq_length; + position_ids.data()[batch] = std::accumulate(start, start + seq_length, 0); + } +} + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = 0; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +ov::Tensor init_attention_mask(ov::Tensor& position_ids) { + auto shape = position_ids.get_shape(); + auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); + return attention_mask; +} + +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { + // Copy elements from the old to a new tensor and return it. + // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], + // It that's not the case for your model please implement your own trim method. 
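+    // Illustrative shape example, assuming the common key/value layout [BATCH_SIZE, num_kv_heads, seq_len, head_size]:
+    // trimming a state of shape [1, 32, 128, 64] to new_seq_len = 100 keeps the first 100 positions along
+    // axis 2 and produces a tensor of shape [1, 32, 100, 64]. A hypothetical call, given an ov::VariableState
+    // obtained from request.query_state(), would look like:
+    //     ov::Tensor kv  = state.get_state();                                          // [1, 32, 128, 64]
+    //     ov::Tensor cut = trimm_tensor(kv, /*seq_len_axis*/ 2, /*new_seq_len*/ 100);  // [1, 32, 100, 64]
+    //     state.set_state(cut);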
+ OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); + + auto old_tensor_data = tensor.data(); + auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; + size_t num_kv_heads = shape[1]; + size_t old_seq_len = shape[2]; + size_t head_size = shape[3]; + + OPENVINO_ASSERT(new_seq_len <= old_seq_len); + + // if new_seq_len equal to old one no need to copy tensor, return as is + if (old_seq_len == new_seq_len) + return tensor; + + if (seq_len_axis == 0) { + shape[0] = new_seq_len; + tensor.set_shape(shape); + } + + // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor + auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; + auto new_tensor_data = new_tensor.data(); + for (size_t batch = 0; batch < batch_size; ++batch){ + for (size_t i = 0; i < num_kv_heads; ++i) { + for (size_t j = 0; j < new_seq_len; ++j) { + auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; + auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; + std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); + } + } + } + return new_tensor; +} + +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { + // trim kv_cache values up to the new_seq_len + for (auto& state: request.query_state()) { + ov::Tensor old_tensor = state.get_state(); + state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + } +} + +LLMPipeline::LLMPipeline( + std::string& model_path, + std::string& tokenizer_path, + std::string& detokenizer_path, + std::string device, + const ov::AnyMap& config +) { + m_device = device; + m_config = config; + ov::Core core; + + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + + std::string full_path = model_path; + if (!is_xml(full_path)) + full_path += "/openvino_model.xml"; + m_model_runner = core.compile_model(full_path, device, config).create_infer_request(); + + // todo: add loading Tokenizers from separate folders +} + +LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { + if (std::filesystem::exists(path + "/generation_config.json")) { + m_sampling_parameters = GenerationConfig(path + "/generation_config.json"); + } + m_device = device; + + ov::Core core; + auto model_request = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); + m_model_runner = model_request; + + m_tokenizer = Tokenizer(path); +} + +GenerationConfig LLMPipeline::generation_config() const { + return m_sampling_parameters; +} + + +GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig sampling_params) { + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + + GenerationResult results(batch_size); + + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + // todo: make this work even if position_ids are not specified + initialize_position_ids(position_ids, attention_mask); + + size_t prompt_len = input_ids.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + 
m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + batch_size, 0); + + for (size_t i = 0; i < sampling_params.get_max_new_tokens(prompt_len); ++i) { + // todo: consider replacing with start_async and run callback right after that + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + update_position_ids(position_ids, attention_mask); // todo: check why does not always work correctly + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < batch_size; ++batch) { + const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + results[batch].second.emplace_back(out_token); + token_iter_results[batch] = out_token; + eos_met[batch] != (out_token == sampling_params.m_eos_token_id); + + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); + } + // place + sampling_params.m_callback(std::move(token_iter_results), *this); + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!sampling_params.m_ignore_eos && all_are_eos) + break; + } + return results; +} + +GenerationResult LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { + ov::Shape prompts_shape = prompts.get_shape(); + size_t batch_size = prompts_shape[0]; + // todo: implement for batch > 1 + OPENVINO_ASSERT(batch_size == 1); + + // initialize inputs + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + auto prompt_len = prompts.get_shape()[1]; + + m_model_runner.set_tensor("input_ids", prompts); + m_model_runner.set_tensor("attention_mask", attention_mask); + m_model_runner.set_tensor("position_ids", position_ids); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + const int64_t* prompt_data = prompts.data(); + + // todo: remove this duplication and use the same SamplingParameters for both greedy and beam + Parameters parameters{{std::vector{prompt_data, prompt_data + prompts.get_size()}}}; + parameters.n_groups = sampling_params.m_num_groups; + parameters.diversity_penalty = sampling_params.m_diversity_penalty; + parameters.group_size = sampling_params.m_group_size; + + GroupBeamSearcher group_beam_searcher{parameters}; + std::vector next_tokens; + std::vector next_beams; + for (size_t length_count = 0; length_count < sampling_params.get_max_new_tokens(prompt_len); ++length_count) { 
+        m_model_runner.infer();
+        std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits"));
+        if (next_tokens.empty()) {
+            break;
+        }
+        size_t batch_size = next_tokens.size();
+        // Set pointers
+        m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
+        m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
+        // Set auxiliary inputs
+        ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask");
+        ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1};
+        attention_mask.set_shape(mask_shape);
+        std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1);
+
+        m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
+        std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1);
+
+        // place
+        sampling_params.m_callback(std::move(next_tokens), *this);
+
+    }
+
+    std::vector beams;
+    // for (const std::vector& group : finalize(std::move(group_beam_searcher))) {
+    //     for (const Beam& beam : group) {
+    //         beams.emplace_back(beam);
+    //     }
+    // }
+
+    auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); };
+    std::sort(beams.begin(), beams.end(), compare_scores);
+
+    GenerationResult results;
+    for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) {
+        results.emplace_back(std::pair(beam->score, beam->tokens));
+    }
+    return results;
+}
+
+/* Speculative decoding works the following way. The draft model predicts the next K
+tokens one by one in an autoregressive manner, while the main model validates these
+predictions and corrects them if necessary. We go through each predicted token, and
+if a difference is detected between the draft and main model, we stop and keep the
+last token predicted by the main model. Then the draft model gets the latest main
+prediction and again tries to predict the next K tokens, repeating the cycle.
+
+This approach reduces the need for multiple infer requests to the main model,
+enhancing performance. For instance, in more predictable parts of text generation,
+the draft model can, in best-case scenarios, generate the next K tokens that exactly
+match the target. In that case they are validated in a single inference request to
+the main model (which is bigger, more accurate, but slower) instead of running K
+subsequent requests.
+*/
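+// Illustrative walk-through of one cycle (the token IDs below are made up for the example, K = 4):
+//   draft model proposes    [ 17, 42,  8, 99 ]
+//   main model predicts     [ 17, 42,  8, 15 ]   <-- first mismatch at index 3
+// Tokens 17, 42 and 8 are accepted, the mismatching draft token 99 is replaced by the main
+// model's 15, both KV caches are trimmed to the new length, and the next draft round starts
+// again from token 15.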
+GenerationResult LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) {
+    auto batch_size = input_ids.get_shape()[0];
+    OPENVINO_ASSERT(batch_size == 1);
+    auto draft_model = sampling_params.get_assistant_model(m_device, m_config);
+    auto main_model = m_model_runner;
+
+    auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()};
+    input_ids.copy_to(draft_input_ids);
+    auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()};
+
+    draft_model.set_tensor("input_ids", draft_input_ids);
+    draft_model.set_tensor("attention_mask", draft_attention_mask);
+
+    ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids");
+    draft_position_ids.set_shape(draft_input_ids.get_shape());
+    std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0);
+    uint64_t seq_len = draft_input_ids.get_shape()[1];
+
+    // Input tensors for the main model should not be mixed with draft.
+    // Do not feed the same draft_position_ids to the main model; copy input_ids from draft_input_ids instead.
+    // auto input_ids = main_model.get_tensor("input_ids");
+    // input_ids.set_shape(draft_input_ids.get_shape());
+    // draft_input_ids.copy_to(input_ids);
+
+    // auto attention_mask = main_model.get_tensor("attention_mask");
+    // attention_mask.set_shape(draft_input_ids.get_shape());
+    // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1);
+
+    auto position_ids = main_model.get_tensor("position_ids");
+    position_ids.set_shape(draft_input_ids.get_shape());
+    std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0);
+
+    // set beam_idx for stateful model: no beam search is used and batch_size = 1
+    draft_model.get_tensor("beam_idx").set_shape({batch_size});
+    draft_model.get_tensor("beam_idx").data()[0] = 0;
+    main_model.get_tensor("beam_idx").set_shape({batch_size});
+    main_model.get_tensor("beam_idx").data()[0] = 0;
+
+    main_model.set_tensor("input_ids", input_ids);
+    main_model.set_tensor("attention_mask", attention_mask);
+    main_model.set_tensor("position_ids", position_ids);
+
+    // To collect the KV cache and to get the next token, run the very first infer request
+    draft_model.infer();
+    main_model.infer();
+
+    size_t vocab_size = draft_model.get_tensor("logits").get_shape().back();
+    OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for both models");
+
+    // logits shape is [batch_size, seq_len, vocab_size]
+    auto logits = main_model.get_tensor("logits");
+    auto data_logits = logits.data() + (seq_len - 1) * vocab_size;
+    int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits;
+
+    // the first token, which is fed to both the draft and main networks on each iteration
+    auto first_token = out_token;
+
+    GenerationResult results(batch_size);
+    results[0].second.emplace_back(out_token);
+
+    // run K infer requests on the draft model and get the next K prediction tokens on each iteration
+    uint64_t K = sampling_params.m_num_assistant_tokens;
+    std::vector draft_tokens;
+
+    // The draft model predicts tokens one by one in an auto-regressive manner, so draft_input_ids length should be 1.
+    draft_input_ids.set_shape({batch_size, 1});
+    draft_position_ids.set_shape({batch_size, 1});
+
+    int max_sequence_length = sampling_params.m_max_new_tokens;
+    auto eos_token = sampling_params.m_eos_token_id;
+
+    while (out_token != eos_token && seq_len < max_sequence_length) {
+        // infer the next K tokens with the draft model
+        for (int i = 0; i < K; ++i) {
+            draft_input_ids.data()[0] = out_token;
+            draft_attention_mask.set_shape({batch_size, seq_len + i + 1});
+            std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1);
+            draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1);
+
+            draft_model.infer();
+
+            auto draft_logits = draft_model.get_tensor("logits").data();
+            int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits;
+            out_token = arg_max_token;
+            draft_tokens.emplace_back(arg_max_token);
+        }
+
+        // For the main network, K tokens will be fed at once in a single infer request.
+        input_ids.set_shape({batch_size, K});
+        // Set the first token for the main model to be the same as for the draft model.
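+        // For example (hypothetical values): with K = 4, first_token = 15 and draft_tokens = {42, 8, 99, 7},
+        // the batch fed to the main model becomes input_ids = [15, 42, 8, 99]; the main model's argmax at
+        // position i is then compared against draft_tokens[i] in the loop below.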
+ input_ids.data()[0] = first_token; + for (int i = 0; i < K - 1; i++) + input_ids.data()[i + 1] = draft_tokens[i]; + + attention_mask.set_shape({batch_size, seq_len + K}); + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + position_ids.set_shape({batch_size, K}); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); + + main_model.infer(); + + data_logits = logits.data(); // [batch_size, K, vocab_size] + size_t disagree_idx = K - 1; + // Iterate through the predicted tokens from the main model and compare them with draft predictions. + // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. + // In the best-case scenario, all elements match, and K predicted tokens will be taken. + for (size_t i = 0; i < K; i++) { + auto start = data_logits + vocab_size * i; + auto stop = data_logits + vocab_size * (i + 1); + out_token = std::max_element(start, stop) - start; + results[0].second.emplace_back(out_token); + + disagree_idx = i; + if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) + break; + } + + // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. + // Increment the sequence length by the number of matched tokens, and + // trim the KV cache to match the new sequence length. + seq_len += disagree_idx + 1; + update_kv_cache(draft_model, sampling_params.m_seq_len_axis, seq_len); + update_kv_cache(main_model, sampling_params.m_seq_len_axis, seq_len); + + draft_tokens.clear(); + first_token = out_token; + } + + return results; +} + +GenerationResult LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { + // todo: implement + GenerationResult results; + return results; +} + +std::string LLMPipeline::call(std::string text) { + auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + + auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); + + return m_tokenizer.detokenize(generate_results)[0]; +} + +std::string LLMPipeline::call(std::string text, GenerationConfig generation_config) { + auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + // to keep config specified during LLMPipeline creation need to get existing + // and modify only after that, e.g.: + // GenerationConfig config = pipe.generation_config(); + // config.do_sample(false).max_new_tokens(20); + auto generate_results = generate(input_ids, attention_mask, generation_config); + + return m_tokenizer.detokenize(generate_results)[0]; +} + +std::vector LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { + auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + + auto generate_results = generate(input_ids, attention_mask, sampling_parameters); + + return m_tokenizer.detokenize(generate_results); +} + +std::string LLMPipeline::operator()(std::string text) { + return call(text); +} + +std::string LLMPipeline::operator()(std::string text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); +} + +std::vector LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); +} + +std::vector LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { + return call(text, sampling_parameters); +} + +GenerationResult LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { + if 
(sampling_params.is_gready_sampling()) { + return greedy_search(input_ids, attention_mask, sampling_params); + } else if (sampling_params.is_beam_search()) { + return beam_search(input_ids, attention_mask, sampling_params); + } else if (sampling_params.is_multimomial()) { + return multinomial_sampling(input_ids, sampling_params); + } else { // speculative + return speculative_sampling(input_ids, attention_mask, sampling_params); + } +} + +GenerationResult LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { + return generate(input_ids, attention_mask, m_sampling_parameters); +} + +GenerationResult LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { + + return generate(input_ids, init_attention_mask(input_ids), sampling_params); +} + +GenerationResult LLMPipeline::generate(ov::Tensor input_ids) { + return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); +} + +Tokenizer LLMPipeline::get_tokenizer() { + return m_tokenizer; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp new file mode 100644 index 0000000000..d48973752a --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -0,0 +1,82 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include "generate_pipeline/generation_config.hpp" +#include "generate_pipeline/llm_tokenizer.hpp" +#include + + +using GenerationResult = std::vector>>; +using namespace std; + +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); +ov::Tensor init_attention_mask(ov::Tensor& position_ids); +ov::Tensor extend_attention(ov::Tensor attention_mask); +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); + +class Tokenizer; // forward declaration + +class LLMPipeline { +public: + ov::InferRequest m_model_runner; + Tokenizer m_tokenizer; + GenerationConfig m_sampling_parameters; + std::string m_device; + ov::AnyMap m_config; + + // TODO: add constructor for specifying manually tokenizer path + // dir path + // xml file path + // compiled model + // infer request + // ov::Model + + LLMPipeline( + std::string& model_path, + std::string& tokenizer_path, + std::string& detokenizer_path, + std::string device="CPU", + const ov::AnyMap& config={} + ); + + LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); + GenerationConfig generation_config() const; + + GenerationResult greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + GenerationResult beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); + + GenerationResult speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); + + std::string call(std::string text); + + std::string call(std::string text, GenerationConfig generation_config); + + std::vector call(std::vector text, GenerationConfig sampling_parameters); + + std::string operator()(std::string text); + + std::string operator()(std::string text, GenerationConfig 
sampling_parameters); + + std::vector operator()(std::vector text, GenerationConfig sampling_parameters); + + std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters); + + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask); + + GenerationResult generate(ov::Tensor input_ids, GenerationConfig sampling_params); + + GenerationResult generate(ov::Tensor input_ids); + + Tokenizer get_tokenizer(); +}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp new file mode 100644 index 0000000000..fc39916f0a --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "generate_pipeline/llm_tokenizer.hpp" +#include + + +Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device): m_device(device) { + ov::Core core; + + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + + if (is_xml(tokenizers_path)) + OPENVINO_THROW("tokenizers_path should be a path to a dir not to xml file"); + + // todo: add loading EOS_TOKEN_ID from IR + // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + core.add_extension(OPENVINO_TOKENIZERS_PATH); + // tokenizer and detokenizer work on CPU only + + m_tokenize_request = core.compile_model(tokenizers_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); + m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); +} + +// Tokenizer::Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU") { + +// } + +std::pair Tokenizer::tokenize(std::string prompt) { + size_t batch_size = 1; + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + m_tokenize_request.infer(); + + vector> input_ids_vec; + input_ids_vec.reserve(1); + auto res_tensor = m_tokenize_request.get_tensor("input_ids"); + auto res_shape = res_tensor.get_shape(); + + for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + } + + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; +} + +std::pair Tokenizer::tokenize(std::vector prompts) { + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = m_tokenize_request.get_input_tensor().get_shape(); + m_tokenize_request.infer(); + + pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")); + // todo: fix mask filled with '2' instead of '0' + ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + + vector> input_ids_vec; + vector> atten_mask_vec; + + input_ids_vec.reserve(prompts.size()); + atten_mask_vec.reserve(prompts.size()); + auto res_tensor = m_tokenize_request.get_tensor("input_ids"); + auto atten_tensor = m_tokenize_request.get_tensor("attention_mask"); + auto res_shape = res_tensor.get_shape(); + + 
for (int i = 0; i < res_shape[0]; ++i) { + int64_t* start = res_tensor.data() + i * res_shape[1]; + input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); + + int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; + atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); + } + + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; +} + +std::pair Tokenizer::tokenize(std::initializer_list text) { + return tokenize(std::vector(text.begin(), text.end())); +} + + +std::string Tokenizer::detokenize(std::vector tokens) { + size_t batch_size = 1; + m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + m_detokenizer_request.infer(); + return m_detokenizer_request.get_output_tensor().data()[0]; +} + +std::vector Tokenizer::detokenize(ov::Tensor tokens) { + m_detokenizer_request.set_input_tensor(tokens); + auto shape = tokens.get_shape(); + auto data = tokens.data(); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + + std::vector strings; + for (int i = 0; i < res.get_shape()[0]; ++i) { + strings.emplace_back(res.data()[i]); + } + return strings; +} + +std::vector Tokenizer::detokenize(GenerationResult lines) { + // todo: implement calling detokenizer in a single batch + + std::vector strings; + for (auto& [score, line]: lines){ + ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; + m_detokenizer_request.set_input_tensor(tokens); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + auto res_str = res.data()[0]; + strings.emplace_back(res_str); + } + + return strings; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp new file mode 100644 index 0000000000..54c4243f64 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + + +using GenerationResult = std::vector>>; +using namespace std; + +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); + + +class Tokenizer { + ov::InferRequest m_tokenize_request; + ov::InferRequest m_detokenizer_request; + std::string m_device; + +public: + Tokenizer() = default; + Tokenizer(std::string& tokenizers_path, std::string device="CPU"); + + // Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU"); + + std::pair tokenize(std::string prompt); + + std::pair tokenize(std::vector prompts); + + std::pair tokenize(std::initializer_list text); + + std::string detokenize(std::vector tokens); + + std::vector detokenize(ov::Tensor tokens); + + std::vector detokenize(GenerationResult lines); +}; + + +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +// class TextStreamer { +// LLMPipeline pipe; +// std::vector token_cache; +// size_t print_len = 0; + +// // TextStreamer(Tokenizer) + +// void put(int64_t token) { +// token_cache.push_back(token); +// std::string text = pipe.detokenize(token_cache); +// if (!text.empty() && '\n' == text.back()) { +// // Flush the cache after the new line symbol +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; +// token_cache.clear(); +// print_len = 
0; +// return; +// } +// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { +// // Don't print incomplete text +// return; +// } +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; +// print_len = text.size(); +// } + +// void end() { +// std::string text = pipe.detokenize(token_cache); +// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; +// token_cache.clear(); +// print_len = 0; +// } +// }; \ No newline at end of file From 7692160434756b3794fb6499f9255510d4297e6f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Apr 2024 15:07:31 +0200 Subject: [PATCH 16/97] wip --- text_generation/causal_lm/cpp/CMakeLists.txt | 12 ++ .../cpp/generate_pipeline/chat_sample.cpp | 66 +++++++++++ .../generate_pipeline/generation_config.hpp | 7 ++ .../cpp/generate_pipeline/llm_pipeline.cpp | 110 +++++++++++++++--- .../cpp/generate_pipeline/llm_pipeline.hpp | 9 +- .../cpp/generate_pipeline/llm_tokenizer.cpp | 39 +++++++ .../cpp/generate_pipeline/llm_tokenizer.hpp | 47 +++----- .../cpp/generate_pipeline/text_streamer.cpp | 45 +++++++ .../cpp/generate_pipeline/text_streamer.hpp | 29 +++++ 9 files changed, 313 insertions(+), 51 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 2c4870cc35..1a4ce68856 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -46,3 +46,15 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) + +set(TARGET_NAME chat_sample) +add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) + +target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp new file mode 100644 index 0000000000..710e0e5305 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llm_pipeline.hpp" + + +std::string generate_chat_prompt(const std::string& input) { + std::stringstream result_prompt; + // Gemma-7b-it + // result_prompt << "user\n" << input << "\nmodel"; + + // TinyLlama + result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; + return result_prompt.str(); +} + +int main(int argc, char* argv[]) try { + // if (2 >= argc && argc <= 4) + // throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); + + 
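+    // Example invocation (the model directory below is a placeholder, not shipped with the sample):
+    //   ./chat_sample ./TinyLlama-1.1B-Chat-v1.0/ "Why is the sky blue?" CPU
+    // argv[1] is the model directory, argv[2] an optional prompt, argv[3] an optional device.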
std::string prompt = "table is made of"; + std::string device = "CPU"; // can be replaced with GPU + + std::string model_path = argv[1]; + if (argc > 2) + prompt = argv[2]; + if (argc > 3) + device = argv[3]; + + LLMPipeline pipe(model_path, device); + GenerationConfig config = pipe.generation_config(); + config.do_reset_state(false); + config.max_new_tokens(200); + config.eos_token_id(2); + pipe.set_streamer_callback([](std::string word) { std::cout << word; }); + + std::string accumulated_str = ""; + + std::cout << "Type keyword \"Stop!\" to stop the chat. \n"; + size_t max_len = 10000; + for (size_t i = 0; i < max_len; i++) { + std::cout << "question:\n"; + std::getline(std::cin, prompt); + + if (!prompt.compare("Stop!")) + break; + + prompt = generate_chat_prompt(prompt); + accumulated_str += prompt; + // string prefix = (i != 0) ? "" : ""; + string prefix = (i != 0) ? "" : ""; + bool first_time = (i != 0) ? false : true; + + // auto answer_str = pipe.call(prompt, config.do_reset_state(false), first_time); + auto answer_str = pipe(accumulated_str, config.do_reset_state(true)); + accumulated_str += answer_str; + } + +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 084cdaf6a0..60dcbd0108 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -9,6 +9,7 @@ #include // #include // used only for StopCriteria #include +#include "llm_tokenizer.hpp" // forward declaration class Sequence; @@ -25,6 +26,7 @@ struct GenerationConfig { size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length bool m_ignore_eos = false; int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now + bool m_do_reset_state = true; // Beam search specific size_t m_num_groups = 1; @@ -151,6 +153,11 @@ struct GenerationConfig { return *this; } + GenerationConfig& do_reset_state(bool do_reset_state) { + m_do_reset_state = do_reset_state; + return *this; + } + GenerationConfig() = default; GenerationConfig(std::string json_path) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index b587eca80f..aa27eebf09 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -52,7 +52,7 @@ void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_m } } -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0) { const size_t batch_size = attention_mask.get_shape()[0]; const size_t seq_length = attention_mask.get_shape()[1]; @@ -60,7 +60,7 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti int64_t* position_ids_data = position_ids.data(); for (size_t batch = 0; batch < batch_size; batch++) { - size_t sum = 0; + size_t sum = start_pos; for (size_t i = 0; i < seq_length; i++) { const size_t element_offset = batch * seq_length + i; position_ids_data[element_offset] = sum; @@ -178,6 
+178,15 @@ GenerationConfig LLMPipeline::generation_config() const { return m_sampling_parameters; } +void print_tensor(const ov::Tensor& tensor) { + std::vector res; + + auto t_shape = tensor.get_shape()[1]; + for (size_t i = 0; i < t_shape; ++i) { + res.emplace_back(tensor.data()[i]); + } + cout << ""; +} GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, @@ -186,21 +195,58 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t batch_size = prompts_shape[0]; GenerationResult results(batch_size); + + if (!sampling_params.m_do_reset_state && kv_cache_len > 0) { + // m_attentions_mask_cache extent with attention_mask; + + size_t new_prompt_len = attention_mask.get_shape()[1]; + size_t context_len = m_attentions_mask_cache.get_shape()[1]; + ov::Tensor new_attention_mask = ov::Tensor{ov::element::i64, {1, context_len + new_prompt_len}}; + // print_tensor(m_attentions_mask_cache); + // print_tensor(attention_mask); + + for (size_t i = 0; i < context_len; ++i) { + auto r = m_attentions_mask_cache.data()[i]; + new_attention_mask.data()[i] = m_attentions_mask_cache.data()[i]; + } + for (size_t i = context_len; i < context_len + new_prompt_len; ++i) { + auto r = attention_mask.data()[i]; + new_attention_mask.data()[i] = attention_mask.data()[i - context_len]; + } + // attention_mask = new_attention_mask; + // } + m_model_runner.set_tensor("attention_mask", new_attention_mask); + } else { + m_model_runner.set_tensor("attention_mask", attention_mask); + } + + // print_tensor(attention_mask); + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; // todo: make this work even if position_ids are not specified - initialize_position_ids(position_ids, attention_mask); + initialize_position_ids(position_ids, attention_mask, kv_cache_len); + // initialize_position_ids(position_ids, attention_mask, 0); size_t prompt_len = input_ids.get_shape()[1]; + auto atten_shape = attention_mask.get_shape(); + auto pos_shape = position_ids.get_shape(); + auto input_ids_shape = input_ids.get_shape(); + m_model_runner.set_tensor("input_ids", input_ids); - m_model_runner.set_tensor("attention_mask", attention_mask); + // print_tensor(m_model_runner.get_tensor("input_ids")); + // m_model_runner.set_tensor("attention_mask", attention_mask); m_model_runner.set_tensor("position_ids", position_ids); m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); + print_tensor(m_model_runner.get_tensor("input_ids")); + print_tensor(m_model_runner.get_tensor("attention_mask")); + print_tensor(m_model_runner.get_tensor("position_ids")); + for (size_t i = 0; i < sampling_params.get_max_new_tokens(prompt_len); ++i) { // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); @@ -209,8 +255,10 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - update_position_ids(position_ids, attention_mask); // todo: check why does not always work correctly + + update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); // todo: check why does not always work correctly 
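+        // Note: update_position_ids() writes the per-row sum of the attention mask into position_ids,
+        // e.g. an attention_mask row of [1, 1, 1, 1, 1] yields a position id of 5 for that batch entry.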
std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector @@ -219,13 +267,19 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; results[batch].second.emplace_back(out_token); token_iter_results[batch] = out_token; - eos_met[batch] != (out_token == sampling_params.m_eos_token_id); + eos_met[batch] = (out_token == sampling_params.m_eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); + + kv_cache_len = prompt_len + i; } // place - sampling_params.m_callback(std::move(token_iter_results), *this); + // sampling_params.m_callback(std::move(token_iter_results), *this); + + if (is_streamer_set) { + m_streamer.put(token_iter_results[0]); + } // stop generation when EOS is met in all batches bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); @@ -286,8 +340,11 @@ GenerationResult LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attenti m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); - // place - sampling_params.m_callback(std::move(next_tokens), *this); + // sampling_params.m_callback(std::move(next_tokens), *this); + // m_callback(std::move(next_tokens); + if (is_streamer_set) { + m_streamer.put(next_tokens[0]); + } } @@ -468,8 +525,14 @@ std::string LLMPipeline::call(std::string text) { return m_tokenizer.detokenize(generate_results)[0]; } -std::string LLMPipeline::call(std::string text, GenerationConfig generation_config) { +std::string LLMPipeline::call(std::string text, GenerationConfig generation_config, bool first_time) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + if (generation_config.m_do_reset_state == false && !first_time) { + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + + // std::replace(inputs_data, inputs_data + input_ids.get_shape()[1], 1, 2); + } // to keep config specified during LLMPipeline creation need to get existing // and modify only after that, e.g.: // GenerationConfig config = pipe.generation_config(); @@ -503,15 +566,20 @@ std::vector LLMPipeline::operator()(std::initializer_list callback) { + is_streamer_set = true; + m_callback = callback; + m_streamer = TextCoutStreamer(m_tokenizer); +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index d48973752a..ef7ba2f3d0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -29,6 +29,11 @@ class LLMPipeline { GenerationConfig m_sampling_parameters; std::string m_device; ov::AnyMap m_config; + size_t kv_cache_len = 0; + ov::Tensor m_attentions_mask_cache; + std::function m_callback = [](std::string ){ ;}; + TextCoutStreamer m_streamer; + bool is_streamer_set = false; // TODO: add constructor for specifying manually tokenizer path // dir path @@ -58,7 +63,7 @@ class LLMPipeline { std::string call(std::string text); - std::string call(std::string text, GenerationConfig generation_config); + std::string call(std::string text, GenerationConfig generation_config, bool first_time = false); 
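+    // A minimal usage sketch of the streaming, multi-turn flow (mirrors how chat_sample.cpp drives
+    // this API; the prompt string is only an example):
+    //   LLMPipeline pipe(model_path, "CPU");
+    //   pipe.set_streamer_callback([](std::string word) { std::cout << word; });
+    //   GenerationConfig config = pipe.generation_config();
+    //   std::string reply = pipe.call("1+1=", config.do_reset_state(false), /*first_time=*/true);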
std::vector call(std::vector text, GenerationConfig sampling_parameters); @@ -79,4 +84,6 @@ class LLMPipeline { GenerationResult generate(ov::Tensor input_ids); Tokenizer get_tokenizer(); + + void set_streamer_callback(std::function callback); }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp index fc39916f0a..eb79c0a304 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp @@ -117,3 +117,42 @@ std::vector Tokenizer::detokenize(GenerationResult lines) { return strings; } + +TextCoutStreamer::TextCoutStreamer(const Tokenizer& tokenizer) { + this->tokenizer = tokenizer; +} + +void TextCoutStreamer::put(int64_t token) { + // do not print anything and flush cache if EOS token is met + if (token == tokenizer.m_eos_token) { + end(); + return ; + } + + token_cache.push_back(token); + std::string text = tokenizer.detokenize(token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); +} + +void TextCoutStreamer::end() { + std::string text = tokenizer.detokenize(token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; +} + +void TextCoutStreamer::set_tokenizer(Tokenizer tokenizer) { + this->tokenizer = tokenizer; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp index 54c4243f64..6e0ac75f4f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp @@ -20,6 +20,8 @@ class Tokenizer { std::string m_device; public: + int64_t m_eos_token = 2; + Tokenizer() = default; Tokenizer(std::string& tokenizers_path, std::string device="CPU"); @@ -39,36 +41,17 @@ class Tokenizer { }; -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -// class TextStreamer { -// LLMPipeline pipe; -// std::vector token_cache; -// size_t print_len = 0; - -// // TextStreamer(Tokenizer) - -// void put(int64_t token) { -// token_cache.push_back(token); -// std::string text = pipe.detokenize(token_cache); -// if (!text.empty() && '\n' == text.back()) { -// // Flush the cache after the new line symbol -// std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; -// token_cache.clear(); -// print_len = 0; -// return; -// } -// if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { -// // Don't print incomplete text -// return; -// } -// std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; -// print_len = text.size(); -// } +class TextCoutStreamer { + Tokenizer tokenizer; + std::vector token_cache; + size_t print_len = 0; + std::function m_callback = [](std::string words){ ;}; + +public: + void put(int64_t token); -// void end() { -// std::string text = pipe.detokenize(token_cache); -// std::cout << std::string_view{text.data() + 
print_len, text.size() - print_len} << '\n'; -// token_cache.clear(); -// print_len = 0; -// } -// }; \ No newline at end of file + void end(); + TextCoutStreamer(const Tokenizer& tokenizer); + TextCoutStreamer() = default; + void set_tokenizer(Tokenizer tokenizer); +}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp new file mode 100644 index 0000000000..ac12a05eb5 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp @@ -0,0 +1,45 @@ +#include "text_streamer.hpp" + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer) { + this->tokenizer = tokenizer; +} + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { + this->tokenizer = tokenizer; + this->m_callback = callback; +} + +void TextCallbackStreamer::put(int64_t token) { + // do not print anything and flush cache if EOS token is met + if (token == tokenizer.m_eos_token) { + end(); + return ; + } + + token_cache.push_back(token); + std::string text = tokenizer.detokenize(token_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); +} + +void TextCallbackStreamer::end() { + std::string text = tokenizer.detokenize(token_cache); + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; +} + +void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { + this->tokenizer = tokenizer; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp new file mode 100644 index 0000000000..1927e5c0c7 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "llm_tokenizer.hpp" + +class StreamerBase { +public: + virtual void put(int64_t token) = 0; + + virtual void end() = 0; +}; + +class TextCallbackStreamer: public StreamerBase { + Tokenizer tokenizer; + std::vector token_cache; + size_t print_len = 0; + std::function m_callback = [](std::string words){ ;}; + +public: + TextCallbackStreamer() = default; + TextCallbackStreamer(const Tokenizer& tokenizer); + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); + void set_tokenizer(Tokenizer tokenizer); + void set_callback(std::function callback); + + void put(int64_t token) override; + void end() override; +}; From 3776433597231207fde70580103c33e46bbf7b8d Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Apr 2024 20:48:12 +0200 Subject: [PATCH 17/97] use text in streamer instead of raw tokens --- text_generation/causal_lm/cpp/CMakeLists.txt | 18 +++ .../cpp/generate_pipeline/chat_sample.cpp | 78 ++++++++--- .../generate_pipeline/continuation_sample.cpp | 47 +++++++ .../generate_pipeline/generation_config.hpp | 6 +- .../cpp/generate_pipeline/llm_pipeline.cpp | 125 ++++++++++++------ .../cpp/generate_pipeline/llm_pipeline.hpp | 9 +- 
.../cpp/generate_pipeline/llm_tokenizer.cpp | 41 +++--- .../cpp/generate_pipeline/llm_tokenizer.hpp | 25 ++-- .../causal_lm/cpp/greedy_causal_lm.cpp | 103 +++++++++++++-- thirdparty/Jinja2Cpp | 1 + 10 files changed, 343 insertions(+), 110 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp create mode 160000 thirdparty/Jinja2Cpp diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 1a4ce68856..ff98abf634 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,9 +4,16 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) +set(JINJA2CPP_DEPS_MODE internal) + + add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") +# add_subdirectory(../../../thirdparty/inja/ "${CMAKE_CURRENT_BINARY_DIR}/inja/") +# include_directories(../../../thirdparty/inja/include/inja) +# add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") +# include_directories(../../../thirdparty/inja/include/Jinja2Cpp) set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) @@ -56,5 +63,16 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) +# target_link_libraries(${TARGET_NAME} PRIVATE Jinja2Cpp) + +set(TARGET_NAME continuation_sample) +add_executable(${TARGET_NAME} generate_pipeline/continuation_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 710e0e5305..0d03808a43 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -5,13 +5,18 @@ #include "llm_pipeline.hpp" -std::string generate_chat_prompt(const std::string& input) { +std::string generate_chat_prompt(const std::string& input, bool first_iter = false) { std::stringstream result_prompt; + string prefix = (first_iter) ? 
"" : "<\n>"; + // Gemma-7b-it // result_prompt << "user\n" << input << "\nmodel"; // TinyLlama result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; + + // LLama-2-7b + // result_prompt << "[INST] " << input << " [/INST]"; return result_prompt.str(); } @@ -29,32 +34,48 @@ int main(int argc, char* argv[]) try { device = argv[3]; LLMPipeline pipe(model_path, device); + GenerationConfig config = pipe.generation_config(); - config.do_reset_state(false); - config.max_new_tokens(200); + config.reset_state(false); + config.max_new_tokens(2000000); config.eos_token_id(2); pipe.set_streamer_callback([](std::string word) { std::cout << word; }); - std::string accumulated_str = ""; + vector questions = { + "1+1=", + "what was the previous answer?", + "Why is the sky blue?", + "4+10=", + "Who was Alan Turing?", + "But why did he killed himself?", + // "4+10=", + // "sum up all the numeric answers in the current chat session" + // "Why is the sky blue?", + // "Please repeat all the questions I asked you.", + "Can you briefly summarize what I asked you about during this session?", + }; - std::cout << "Type keyword \"Stop!\" to stop the chat. \n"; - size_t max_len = 10000; - for (size_t i = 0; i < max_len; i++) { + std::string accumulated_str = ""; + for (size_t i = 0; i < questions.size(); i++) { + prompt = questions[i]; + + bool first_iter = (i == 0) ? true : false; + bool last_iter = (i == questions.size() - 1) ? true : false; + std::cout << "question:\n"; - std::getline(std::cin, prompt); - - if (!prompt.compare("Stop!")) - break; - - prompt = generate_chat_prompt(prompt); + // std::getline(std::cin, prompt); + cout << prompt << endl; + prompt = generate_chat_prompt(prompt, first_iter); accumulated_str += prompt; - // string prefix = (i != 0) ? "" : ""; - string prefix = (i != 0) ? "" : ""; - bool first_time = (i != 0) ? false : true; - - // auto answer_str = pipe.call(prompt, config.do_reset_state(false), first_time); - auto answer_str = pipe(accumulated_str, config.do_reset_state(true)); + + std::string prefix = (first_iter) ? "" : ""; + auto answer_str = pipe.call(prefix + prompt, config.reset_state(false), first_iter); + // auto answer_str = pipe(accumulated_str, config.reset_state(true)); accumulated_str += answer_str; + cout << "\n----------\n"; + + // if (last_iter) + // cout << accumulated_str; } } catch (const std::exception& error) { @@ -64,3 +85,22 @@ int main(int argc, char* argv[]) try { std::cerr << "Non-exception object thrown\n"; return EXIT_FAILURE; } + +// using namespace inja; +// #include +// using namespace nlohmann; +// #include +// string my_template = "{% for message in messages %}{% if message.role == 'user' %}{{ ' ' }}{% endif %}{{ message.content }}{% endfor %}"; + +// nlohmann::json data; +// data["messages"] = { +// {{"role", "system"}, {"content", "You are a friendly chatbot who always responds in the style of a pirate"}}, +// {{"role", "user"}, {"content", "1+1="}}, +// }; +// data["eos_token"] = ""; + +// cout << data.dump() << endl; +// auto res = inja::render(my_template, data); +// json data; +// data["messages"] = {"Jeff", "Tom", "Patrick"}; +// auto res = render("{% for message in messages %}{{message}}{% endfor %}", data); // Turn up the music! 
\ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp new file mode 100644 index 0000000000..8ef5123d0e --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + std::string prompt = "table is made of"; + std::string device = "CPU"; + + std::string model_path = argv[1]; + if (argc > 2) + prompt = argv[2]; + if (argc > 3) + device = argv[3]; + vector all_results; + + LLMPipeline pipe(model_path, device); + GenerationConfig config = pipe.generation_config(); + Tokenizer tokenizer = pipe.get_tokenizer(); + config.eos_token_id(2); + + ov::Tensor input_ids, attention_mask; + std::tie(input_ids, attention_mask) = tokenizer.tokenize(prompt); + // max_new_tokens should be 15 for reproducer case + auto result = pipe.generate(input_ids, attention_mask, config.reset_state(false).max_new_tokens(55), true)[0].second; + all_results.insert(all_results.end(), result.begin(), result.end()); + + string text = tokenizer.detokenize(result); + cout << text << endl; + + auto new_input_ids = ov::Tensor{ov::element::i64, {1, 1}}; + auto new_attention_mask = ov::Tensor{ov::element::i64, {1, 1}}; + auto data = new_attention_mask.data(); + data[0] = 1; + data = new_input_ids.data(); + data[0] = result.back(); + auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.reset_state(false).max_new_tokens(1000), true)[0].second; + all_results.insert(all_results.end(), new_result.begin(), new_result.end()); + cout << tokenizer.detokenize(all_results); +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) 
{ + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 60dcbd0108..507210c33c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -26,7 +26,7 @@ struct GenerationConfig { size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length bool m_ignore_eos = false; int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now - bool m_do_reset_state = true; + bool m_reset_state = true; // Beam search specific size_t m_num_groups = 1; @@ -153,8 +153,8 @@ struct GenerationConfig { return *this; } - GenerationConfig& do_reset_state(bool do_reset_state) { - m_do_reset_state = do_reset_state; + GenerationConfig& reset_state(bool do_reset_state) { + m_reset_state = do_reset_state; return *this; } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index aa27eebf09..6d16a756c2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -11,8 +11,8 @@ using GenerationResult = std::vector>>; using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { - const size_t batch_size = input_ids.get_shape().at(0); - const size_t sequence_length = input_ids.get_shape().at(1); + const size_t batch_size = input_ids.get_shape()[0]; + const size_t sequence_length = input_ids.get_shape()[1]; int64_t* inputs_data = input_ids.data(); int64_t* attention_mask_data = attention_mask.data(); @@ -42,13 +42,13 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& } void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t seq_length = attention_mask.get_shape().at(1); + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t atten_length = attention_mask.get_shape()[1]; position_ids.set_shape({batch_size, 1}); for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * seq_length; - position_ids.data()[batch] = std::accumulate(start, start + seq_length, 0); + int64_t* start = attention_mask.data() + batch * atten_length; + position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); } } @@ -162,9 +162,18 @@ LLMPipeline::LLMPipeline( } LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { - if (std::filesystem::exists(path + "/generation_config.json")) { - m_sampling_parameters = GenerationConfig(path + "/generation_config.json"); + std::string tokenizer_config_fname = "tokenizer_config.json"; + std::string generation_config_fname = "generation_config.json"; + + if (std::filesystem::exists(path + "/" + generation_config_fname)) { + m_sampling_parameters = GenerationConfig(path + "/" + generation_config_fname); } + if (std::filesystem::exists(path + "/" + tokenizer_config_fname)) { + std::ifstream f(path + "/" + tokenizer_config_fname); + nlohmann::json data = nlohmann::json::parse(f); + m_chat_template = data.value("chat_template", ""); + } + m_device = device; ov::Core core; @@ -181,11 +190,16 @@ GenerationConfig 
LLMPipeline::generation_config() const { void print_tensor(const ov::Tensor& tensor) { std::vector res; - auto t_shape = tensor.get_shape()[1]; - for (size_t i = 0; i < t_shape; ++i) { - res.emplace_back(tensor.data()[i]); - } - cout << ""; + auto t_shape = tensor.get_shape(); + // cout << "["; + // for (size_t i = 0; i < t_shape[1]; ++i) { + // if (tensor.get_element_type() == ov::element::i64) { + // res.emplace_back(tensor.data()[i]); + // cout << tensor.data()[i] << " "; + // } + // } + // cout << "]" << endl; + // cout << "---------" << endl; } GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, @@ -193,10 +207,16 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, GenerationConfig sampling_params) { ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; + size_t prompt_len = prompts_shape[1]; + kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + initialize_position_ids(position_ids, attention_mask, kv_cache_len); + GenerationResult results(batch_size); - if (!sampling_params.m_do_reset_state && kv_cache_len > 0) { + if (!sampling_params.m_reset_state && kv_cache_len > 0) { // m_attentions_mask_cache extent with attention_mask; size_t new_prompt_len = attention_mask.get_shape()[1]; @@ -217,18 +237,11 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, // } m_model_runner.set_tensor("attention_mask", new_attention_mask); } else { + // kv_cache_len = prompt_len; m_model_runner.set_tensor("attention_mask", attention_mask); } - // print_tensor(attention_mask); - - - auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; // todo: make this work even if position_ids are not specified - initialize_position_ids(position_ids, attention_mask, kv_cache_len); - // initialize_position_ids(position_ids, attention_mask, 0); - - size_t prompt_len = input_ids.get_shape()[1]; auto atten_shape = attention_mask.get_shape(); auto pos_shape = position_ids.get_shape(); @@ -243,11 +256,13 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); - print_tensor(m_model_runner.get_tensor("input_ids")); - print_tensor(m_model_runner.get_tensor("attention_mask")); - print_tensor(m_model_runner.get_tensor("position_ids")); - - for (size_t i = 0; i < sampling_params.get_max_new_tokens(prompt_len); ++i) { + size_t max_tokens = sampling_params.get_max_new_tokens(prompt_len); + for (size_t i = 0; i < max_tokens; ++i) { + // print_tensor(m_model_runner.query_state()[0].get_state()); + // print_tensor(m_model_runner.get_tensor("attention_mask")); + // print_tensor(m_model_runner.get_tensor("position_ids")); + print_tensor(m_model_runner.get_tensor("input_ids")); + // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); @@ -255,10 +270,13 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); - m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - + + m_attentions_mask_cache = ov::Tensor{attention_mask.get_element_type(), 
m_model_runner.get_tensor("attention_mask").get_shape()}; + m_model_runner.get_tensor("attention_mask").copy_to(m_attentions_mask_cache); + // m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); + update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); // todo: check why does not always work correctly + m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector @@ -270,21 +288,20 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, eos_met[batch] = (out_token == sampling_params.m_eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; - m_model_runner.get_tensor("position_ids").data()[batch] = int64_t(prompt_len + i); - - kv_cache_len = prompt_len + i; } // place // sampling_params.m_callback(std::move(token_iter_results), *this); if (is_streamer_set) { - m_streamer.put(token_iter_results[0]); + m_streamer_callback(m_streamer.put(token_iter_results[0])); } // stop generation when EOS is met in all batches bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); if (!sampling_params.m_ignore_eos && all_are_eos) break; + // if (i != sampling_params.get_max_new_tokens(prompt_len) - 1) + // kv_cache_len += 1; } return results; } @@ -338,7 +355,7 @@ GenerationResult LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attenti std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape.at(1) - 1); + std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape[1] - 1); // sampling_params.m_callback(std::move(next_tokens), *this); // m_callback(std::move(next_tokens); @@ -527,17 +544,39 @@ std::string LLMPipeline::call(std::string text) { std::string LLMPipeline::call(std::string text, GenerationConfig generation_config, bool first_time) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); - if (generation_config.m_do_reset_state == false && !first_time) { - auto size = input_ids.get_shape(); - int64_t* inputs_data = input_ids.data(); - // std::replace(inputs_data, inputs_data + input_ids.get_shape()[1], 1, 2); + // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. 
+ // Need to remove both of that tokens manually to get exact token by token alignment with HF + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 + tmp_ids.erase(tmp_ids.begin()); + + auto attention_mask_data = attention_mask.data(); + std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + + std::vector prefixes_to_exclude = {"", ""}; // todo: for TinyLlama, need to get them form generation_config + auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; + if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { + tmp_ids.erase(tmp_ids.begin()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); } + + input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); + for (size_t i = 0; i < tmp_ids.size(); i++) + input_ids.data()[i] = tmp_ids.data()[i]; + attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); + for (size_t i = 0; i < tmp_attn_mask.size(); i++) + attention_mask.data()[i] = tmp_attn_mask.data()[i]; + // to keep config specified during LLMPipeline creation need to get existing // and modify only after that, e.g.: // GenerationConfig config = pipe.generation_config(); // config.do_sample(false).max_new_tokens(20); - auto generate_results = generate(input_ids, attention_mask, generation_config); + // print_tensor(input_ids); + auto generate_results = generate(input_ids, attention_mask, generation_config, first_time); return m_tokenizer.detokenize(generate_results)[0]; } @@ -566,8 +605,8 @@ std::vector LLMPipeline::operator()(std::initializer_list callback) { is_streamer_set = true; - m_callback = callback; + m_streamer_callback = callback; m_streamer = TextCoutStreamer(m_tokenizer); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index ef7ba2f3d0..25a786c1be 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -31,9 +31,8 @@ class LLMPipeline { ov::AnyMap m_config; size_t kv_cache_len = 0; ov::Tensor m_attentions_mask_cache; - std::function m_callback = [](std::string ){ ;}; - TextCoutStreamer m_streamer; bool is_streamer_set = false; + std::string m_chat_template = ""; // TODO: add constructor for specifying manually tokenizer path // dir path @@ -75,7 +74,7 @@ class LLMPipeline { std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters); - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params, bool first = true); GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask); @@ -86,4 +85,8 @@ class LLMPipeline { Tokenizer get_tokenizer(); void set_streamer_callback(std::function callback); +private: + TextCoutStreamer m_streamer; + std::function m_streamer_callback = [](std::string ){ ;}; + }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp index eb79c0a304..b02855f218 100644 --- 
a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp @@ -118,41 +118,46 @@ std::vector Tokenizer::detokenize(GenerationResult lines) { return strings; } -TextCoutStreamer::TextCoutStreamer(const Tokenizer& tokenizer) { - this->tokenizer = tokenizer; +TextCoutStreamer::TextCoutStreamer(const Tokenizer& tokenizer, bool print_eos_token) { + m_tokenizer = tokenizer; + m_print_eos_token = print_eos_token; } -void TextCoutStreamer::put(int64_t token) { +std::string TextCoutStreamer::put(int64_t token) { + std::stringstream res; + // do not print anything and flush cache if EOS token is met - if (token == tokenizer.m_eos_token) { - end(); - return ; + if (token == m_tokenizer.m_eos_token) { + return end(); } - token_cache.push_back(token); - std::string text = tokenizer.detokenize(token_cache); + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.detokenize(m_tokens_cache); if (!text.empty() && '\n' == text.back()) { // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); + res << std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); print_len = 0; - return; + return res.str(); } if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { // Don't print incomplete text - return; + return res.str(); } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; print_len = text.size(); + return res.str(); } -void TextCoutStreamer::end() { - std::string text = tokenizer.detokenize(token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); +std::string TextCoutStreamer::end() { + std::stringstream res; + std::string text = m_tokenizer.detokenize(m_tokens_cache); + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + m_tokens_cache.clear(); print_len = 0; + return res.str(); } void TextCoutStreamer::set_tokenizer(Tokenizer tokenizer) { - this->tokenizer = tokenizer; + this->m_tokenizer = tokenizer; } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp index 6e0ac75f4f..b570ae5ec1 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp @@ -15,10 +15,6 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& class Tokenizer { - ov::InferRequest m_tokenize_request; - ov::InferRequest m_detokenizer_request; - std::string m_device; - public: int64_t m_eos_token = 2; @@ -38,20 +34,25 @@ class Tokenizer { std::vector detokenize(ov::Tensor tokens); std::vector detokenize(GenerationResult lines); +private: + ov::InferRequest m_tokenize_request; + ov::InferRequest m_detokenizer_request; + std::string m_device; }; class TextCoutStreamer { - Tokenizer tokenizer; - std::vector token_cache; - size_t print_len = 0; - std::function m_callback = [](std::string words){ ;}; - public: - void put(int64_t token); + std::string put(int64_t token); - void end(); - TextCoutStreamer(const Tokenizer& tokenizer); + std::string end(); + TextCoutStreamer(const Tokenizer& tokenizer, bool m_print_eos_token = false); TextCoutStreamer() = default; void 
set_tokenizer(Tokenizer tokenizer); +private: + bool m_print_eos_token = false; + Tokenizer m_tokenizer; + std::vector m_tokens_cache; + size_t print_len = 0; + std::function m_callback = [](std::string words){ ;}; }; diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp index d75d32d0e0..db7e71323b 100644 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp @@ -54,6 +54,30 @@ struct TextStreamer { }; } +void print_tensor(const ov::Tensor& tensor) { + std::vector res; + + auto t_shape = tensor.get_shape(); + for (size_t i = 0; i < t_shape[1]; ++i) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data()[i]); + } + } + std::cout << ""; +} + +template +void copy_partially(const ov::Tensor& src, const ov::Tensor& trg, int src_offset, int trg_offset, size_t size) { + T* src_data = src.data(); + T* dst_data = trg.data(); + OPENVINO_ASSERT(src_offset + size <= src.get_shape()[1]); + OPENVINO_ASSERT(trg_offset + size <= trg.get_shape()[1]); + + for (size_t i = 0; i < size; i++) { + dst_data[trg_offset + i] = src_data[src_offset + i]; + } +} + int main(int argc, char* argv[]) try { if (argc != 3) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); @@ -73,25 +97,78 @@ int main(int argc, char* argv[]) try { ov::InferRequest lm = core.compile_model( std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); auto seq_len = input_ids.get_size(); - - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); + + ov::Tensor position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + seq_len, 0); + + constexpr size_t BATCH_SIZE = 1; // Input values are persistent between inference calls. 
// That allows to set values, which aren't going to change, only once lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); lm.get_tensor("beam_idx").data()[0] = 0; + + // splitting the first inference + auto shape = input_ids.get_shape(); + size_t split_pos = shape[1] * 0.95; + int position = 4; + if (position != -1) + split_pos = position; + + ov::Shape split_1_shape = {1, split_pos}; + ov::Shape split_2_shape = {1, shape[1] - split_pos}; + + ov::Tensor input_ids_1 = ov::Tensor{ov::element::i64, split_1_shape}; + ov::Tensor input_ids_2 = ov::Tensor{ov::element::i64, split_2_shape}; + + ov::Tensor position_ids_1 = ov::Tensor{ov::element::i64, split_1_shape}; + ov::Tensor position_ids_2 = ov::Tensor{ov::element::i64, split_2_shape}; + + ov::Tensor attention_mask_1 = ov::Tensor{ov::element::i64, split_1_shape}; + ov::Tensor attention_mask_2 = ov::Tensor{ov::element::i64, split_2_shape}; + + copy_partially(input_ids, input_ids_1, 0, 0, split_pos); + copy_partially(input_ids, input_ids_2, split_pos, 0, split_2_shape[1]); + copy_partially(attention_mask, attention_mask_1, 0, 0, split_pos); + copy_partially(attention_mask, attention_mask_2, split_pos, 0, split_2_shape[1]); + copy_partially(position_ids, position_ids_1, 0, 0, split_pos); + copy_partially(position_ids, position_ids_2, split_pos, 0, split_2_shape[1]); + + print_tensor(input_ids); + print_tensor(input_ids_1); + print_tensor(input_ids_2); + + print_tensor(position_ids); + print_tensor(position_ids_1); + print_tensor(position_ids_2); + + // 2 part inference + lm.set_tensor("input_ids", input_ids_1); + lm.set_tensor("attention_mask", attention_mask_1); + lm.set_tensor("position_ids", position_ids_1); + lm.infer(); + + lm.set_tensor("input_ids", input_ids_2); + lm.set_tensor("attention_mask", attention_mask); + lm.set_tensor("position_ids", position_ids_2); lm.infer(); + auto shift = input_ids_2.get_shape()[1] - 1; + + // single inference + // lm.set_tensor("input_ids", input_ids); + // lm.set_tensor("attention_mask", attention_mask); + // lm.set_tensor("position_ids", position_ids); + // lm.infer(); + // auto shift = seq_len - 1; + + seq_len = lm.get_tensor("attention_mask").get_shape()[1]; size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data() + (seq_len - 1) * vocab_size; + float* logits = lm.get_tensor("logits").data() + shift * vocab_size; int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - position_ids.set_shape({BATCH_SIZE, 1}); + lm.get_tensor("position_ids").set_shape({BATCH_SIZE, 1}); TextStreamer text_streamer{std::move(detokenizer)}; // Get the runtime info from the tokenizer model that we read earlier @@ -103,17 +180,19 @@ int main(int argc, char* argv[]) try { } else { throw std::runtime_error("EOS token ID not found in model's runtime information."); } - - int max_sequence_length = 100; + + int max_sequence_length = 1315; while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { ++seq_len; lm.get_tensor("input_ids").data()[0] = out_token; lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, seq_len}); std::fill_n(lm.get_tensor("attention_mask").data(), seq_len, 1); - position_ids.data()[0] = int64_t(seq_len - 1); - lm.start_async(); + lm.get_tensor("position_ids").data()[0] = int64_t(seq_len - 1); + + lm.infer(); text_streamer.put(out_token); - lm.wait(); + // std::cout << out_token << " "; + logits = lm.get_tensor("logits").data(); out_token = std::max_element(logits, 
logits + vocab_size) - logits; } diff --git a/thirdparty/Jinja2Cpp b/thirdparty/Jinja2Cpp new file mode 160000 index 0000000000..a853f8e9f7 --- /dev/null +++ b/thirdparty/Jinja2Cpp @@ -0,0 +1 @@ +Subproject commit a853f8e9f784de53b11973a47af0b20b0167f6f3 From 964a5e8bf367de23dfcd88aebdda4f19597bf7ab Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Apr 2024 23:17:13 +0200 Subject: [PATCH 18/97] add apply_chat_template --- text_generation/causal_lm/cpp/CMakeLists.txt | 16 +++++++++++++- .../cpp/generate_pipeline/chat_sample.cpp | 7 +++++-- .../generate_pipeline/generation_config.hpp | 2 +- .../cpp/generate_pipeline/llm_pipeline.cpp | 21 ++++++++++++++++++- .../cpp/generate_pipeline/llm_pipeline.hpp | 2 ++ 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index ff98abf634..5f342d3fc4 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -15,8 +15,13 @@ add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR} # add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") # include_directories(../../../thirdparty/inja/include/Jinja2Cpp) +# todo: remove hardcodes +include_directories($ENV{HOME}/opt/jinja2cpp/include) + set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -26,6 +31,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME beam_search_causal_lm) add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -35,6 +42,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME speculative_decoding_lm) add_executable(${TARGET_NAME} speculative_decoding_lm.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -44,6 +53,8 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME generate_sample) add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -56,6 +67,8 @@ target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) set(TARGET_NAME chat_sample) 
add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -63,10 +76,11 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -# target_link_libraries(${TARGET_NAME} PRIVATE Jinja2Cpp) set(TARGET_NAME continuation_sample) add_executable(${TARGET_NAME} generate_pipeline/continuation_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) +# todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 0d03808a43..03a720f7ae 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -5,7 +5,10 @@ #include "llm_pipeline.hpp" -std::string generate_chat_prompt(const std::string& input, bool first_iter = false) { +std::string generate_chat_prompt(const LLMPipeline& pipe, std::string& input, bool first_iter = false, bool use_chat_template = true) { + if (use_chat_template) + return pipe.apply_chat_template(input); + std::stringstream result_prompt; string prefix = (first_iter) ? "" : "<\n>"; @@ -65,7 +68,7 @@ int main(int argc, char* argv[]) try { std::cout << "question:\n"; // std::getline(std::cin, prompt); cout << prompt << endl; - prompt = generate_chat_prompt(prompt, first_iter); + prompt = generate_chat_prompt(pipe, prompt, first_iter); accumulated_str += prompt; std::string prefix = (first_iter) ? "" : ""; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 507210c33c..9be7c0586c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -48,7 +48,7 @@ struct GenerationConfig { // special tokens int64_t m_bos_token_id = 0; - int64_t m_eos_token_id = 0; // todo: do we need both m_eos_token and m_eos_token_id? + int64_t m_eos_token_id = 2; // todo: do we need both m_eos_token and m_eos_token_id? 
int64_t m_pad_token_id = 0; std::function&&, LLMPipeline&)> m_callback = [](std::vector&& tokens, LLMPipeline& pipe){ ;}; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 6d16a756c2..3227b87f65 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -5,7 +5,8 @@ #include "generate_pipeline/llm_pipeline.hpp" #include "group_beam_searcher.hpp" #include - +#include +#include using GenerationResult = std::vector>>; using namespace std; @@ -172,6 +173,7 @@ LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap std::ifstream f(path + "/" + tokenizer_config_fname); nlohmann::json data = nlohmann::json::parse(f); m_chat_template = data.value("chat_template", ""); + } m_device = device; @@ -639,6 +641,23 @@ Tokenizer LLMPipeline::get_tokenizer() { return m_tokenizer; } +std::string LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); + + jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + jinja2::ValuesMap params = { + {"messages", jinja2::ValuesList({message})}, + {"eos_token", ""}, // todo: load from config + {"add_generation_prompt", true}, + }; + + return tpl.RenderAsString(params).value(); +} + void LLMPipeline::set_streamer_callback(std::function callback) { is_streamer_set = true; m_streamer_callback = callback; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index 25a786c1be..fe1e343e03 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -84,6 +84,8 @@ class LLMPipeline { Tokenizer get_tokenizer(); + std::string apply_chat_template(std::string prompt, std::string role = "user") const; + void set_streamer_callback(std::function callback); private: TextCoutStreamer m_streamer; From d3f633919d3cace50d656f8c7ff98fd45f7e9a08 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 24 Apr 2024 09:06:39 +0200 Subject: [PATCH 19/97] add start/stop conversation --- .../cpp/generate_pipeline/chat_sample.cpp | 29 ++------- .../generate_pipeline/continuation_sample.cpp | 5 +- .../generate_pipeline/generation_config.hpp | 6 -- .../cpp/generate_pipeline/llm_pipeline.cpp | 62 ++++++++----------- .../cpp/generate_pipeline/llm_pipeline.hpp | 6 +- 5 files changed, 39 insertions(+), 69 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 03a720f7ae..dc438e174a 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -39,10 +39,9 @@ int main(int argc, char* argv[]) try { LLMPipeline pipe(model_path, device); GenerationConfig config = pipe.generation_config(); - config.reset_state(false); config.max_new_tokens(2000000); config.eos_token_id(2); - pipe.set_streamer_callback([](std::string word) { std::cout << word; }); + pipe.set_streamer_callback([](std::string word) { std::cout << word << std::flush; }); vector questions = { "1+1=", @@ -51,6 +50,7 @@ int main(int argc, char* argv[]) try { "4+10=", 
"Who was Alan Turing?", "But why did he killed himself?", + // "What is Intel OpenVINO?", // "4+10=", // "sum up all the numeric answers in the current chat session" // "Why is the sky blue?", @@ -59,6 +59,7 @@ int main(int argc, char* argv[]) try { }; std::string accumulated_str = ""; + // pipe.start_conversation(); for (size_t i = 0; i < questions.size(); i++) { prompt = questions[i]; @@ -72,14 +73,15 @@ int main(int argc, char* argv[]) try { accumulated_str += prompt; std::string prefix = (first_iter) ? "" : ""; - auto answer_str = pipe.call(prefix + prompt, config.reset_state(false), first_iter); - // auto answer_str = pipe(accumulated_str, config.reset_state(true)); + // auto answer_str = pipe.call(prefix + prompt, config, first_iter); + auto answer_str = pipe(accumulated_str, config); accumulated_str += answer_str; cout << "\n----------\n"; // if (last_iter) // cout << accumulated_str; } + pipe.stop_conversation(); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; @@ -88,22 +90,3 @@ int main(int argc, char* argv[]) try { std::cerr << "Non-exception object thrown\n"; return EXIT_FAILURE; } - -// using namespace inja; -// #include -// using namespace nlohmann; -// #include -// string my_template = "{% for message in messages %}{% if message.role == 'user' %}{{ ' ' }}{% endif %}{{ message.content }}{% endfor %}"; - -// nlohmann::json data; -// data["messages"] = { -// {{"role", "system"}, {"content", "You are a friendly chatbot who always responds in the style of a pirate"}}, -// {{"role", "user"}, {"content", "1+1="}}, -// }; -// data["eos_token"] = ""; - -// cout << data.dump() << endl; -// auto res = inja::render(my_template, data); -// json data; -// data["messages"] = {"Jeff", "Tom", "Patrick"}; -// auto res = render("{% for message in messages %}{{message}}{% endfor %}", data); // Turn up the music! 
\ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp index 8ef5123d0e..c8d31862d1 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp @@ -16,6 +16,7 @@ int main(int argc, char* argv[]) try { vector all_results; LLMPipeline pipe(model_path, device); + pipe.start_conversation(); GenerationConfig config = pipe.generation_config(); Tokenizer tokenizer = pipe.get_tokenizer(); config.eos_token_id(2); @@ -23,7 +24,7 @@ int main(int argc, char* argv[]) try { ov::Tensor input_ids, attention_mask; std::tie(input_ids, attention_mask) = tokenizer.tokenize(prompt); // max_new_tokens should be 15 for reproducer case - auto result = pipe.generate(input_ids, attention_mask, config.reset_state(false).max_new_tokens(55), true)[0].second; + auto result = pipe.generate(input_ids, attention_mask, config.max_new_tokens(55), true)[0].second; all_results.insert(all_results.end(), result.begin(), result.end()); string text = tokenizer.detokenize(result); @@ -35,7 +36,7 @@ int main(int argc, char* argv[]) try { data[0] = 1; data = new_input_ids.data(); data[0] = result.back(); - auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.reset_state(false).max_new_tokens(1000), true)[0].second; + auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.max_new_tokens(1000), true)[0].second; all_results.insert(all_results.end(), new_result.begin(), new_result.end()); cout << tokenizer.detokenize(all_results); } catch (const std::exception& error) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 9be7c0586c..b4f53f4665 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -26,7 +26,6 @@ struct GenerationConfig { size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length bool m_ignore_eos = false; int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now - bool m_reset_state = true; // Beam search specific size_t m_num_groups = 1; @@ -153,11 +152,6 @@ struct GenerationConfig { return *this; } - GenerationConfig& reset_state(bool do_reset_state) { - m_reset_state = do_reset_state; - return *this; - } - GenerationConfig() = default; GenerationConfig(std::string json_path) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 3227b87f65..a496bcb44a 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -211,14 +211,14 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t batch_size = prompts_shape[0]; size_t prompt_len = prompts_shape[1]; - kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; initialize_position_ids(position_ids, attention_mask, kv_cache_len); GenerationResult results(batch_size); - if (!sampling_params.m_reset_state && kv_cache_len > 0) { + if 
(is_chat_conversation && kv_cache_len > 0) { // m_attentions_mask_cache extent with attention_mask; size_t new_prompt_len = attention_mask.get_shape()[1]; @@ -239,7 +239,6 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, // } m_model_runner.set_tensor("attention_mask", new_attention_mask); } else { - // kv_cache_len = prompt_len; m_model_runner.set_tensor("attention_mask", attention_mask); } @@ -277,8 +276,8 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, m_model_runner.get_tensor("attention_mask").copy_to(m_attentions_mask_cache); // m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); - update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); // todo: check why does not always work correctly m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); // todo: check why does not always work correctly std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector @@ -547,31 +546,7 @@ std::string LLMPipeline::call(std::string text) { std::string LLMPipeline::call(std::string text, GenerationConfig generation_config, bool first_time) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); - // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", - // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. - // Need to remove both of that tokens manually to get exact token by token alignment with HF - auto size = input_ids.get_shape(); - int64_t* inputs_data = input_ids.data(); - std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 - tmp_ids.erase(tmp_ids.begin()); - - auto attention_mask_data = attention_mask.data(); - std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); - tmp_attn_mask.erase(tmp_attn_mask.begin()); - - std::vector prefixes_to_exclude = {"", ""}; // todo: for TinyLlama, need to get them form generation_config - auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; - if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { - tmp_ids.erase(tmp_ids.begin()); - tmp_attn_mask.erase(tmp_attn_mask.begin()); - } - input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); - for (size_t i = 0; i < tmp_ids.size(); i++) - input_ids.data()[i] = tmp_ids.data()[i]; - attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); - for (size_t i = 0; i < tmp_attn_mask.size(); i++) - attention_mask.data()[i] = tmp_attn_mask.data()[i]; // to keep config specified during LLMPipeline creation need to get existing // and modify only after that, e.g.: @@ -608,20 +583,22 @@ std::vector LLMPipeline::operator()(std::initializer_list callba m_streamer_callback = callback; m_streamer = TextCoutStreamer(m_tokenizer); } + +void LLMPipeline::start_conversation() { + is_chat_conversation = true; +} + +void LLMPipeline::stop_conversation() { + is_chat_conversation = false; + reset_state(); +} + +void LLMPipeline::reset_state() { + m_model_runner.reset_state(); +} diff --git 
a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index fe1e343e03..c98df7770c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -29,7 +29,6 @@ class LLMPipeline { GenerationConfig m_sampling_parameters; std::string m_device; ov::AnyMap m_config; - size_t kv_cache_len = 0; ov::Tensor m_attentions_mask_cache; bool is_streamer_set = false; std::string m_chat_template = ""; @@ -87,8 +86,11 @@ class LLMPipeline { std::string apply_chat_template(std::string prompt, std::string role = "user") const; void set_streamer_callback(std::function callback); + void start_conversation(); + void stop_conversation(); + void reset_state(); private: TextCoutStreamer m_streamer; std::function m_streamer_callback = [](std::string ){ ;}; - + bool is_chat_conversation = false; }; From e57aa4c4f5368114006355012dc0f04c86f8ce95 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Apr 2024 11:30:24 +0200 Subject: [PATCH 20/97] fix difference between accumulating conversation as text and keeping history in KV cache --- .../cpp/generate_pipeline/chat_sample.cpp | 6 ++--- .../cpp/generate_pipeline/llm_pipeline.cpp | 26 ++++++++++++++++++- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index dc438e174a..97cab7d46b 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) try { }; std::string accumulated_str = ""; - // pipe.start_conversation(); + pipe.start_conversation(); for (size_t i = 0; i < questions.size(); i++) { prompt = questions[i]; @@ -73,8 +73,8 @@ int main(int argc, char* argv[]) try { accumulated_str += prompt; std::string prefix = (first_iter) ? 
"" : ""; - // auto answer_str = pipe.call(prefix + prompt, config, first_iter); - auto answer_str = pipe(accumulated_str, config); + auto answer_str = pipe.call(prefix + prompt, config, first_iter); + // auto answer_str = pipe(accumulated_str, config); accumulated_str += answer_str; cout << "\n----------\n"; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index a496bcb44a..82cecc2671 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -276,8 +276,8 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, m_model_runner.get_tensor("attention_mask").copy_to(m_attentions_mask_cache); // m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); + update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); // todo: check why does not always work correctly std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector @@ -546,7 +546,31 @@ std::string LLMPipeline::call(std::string text) { std::string LLMPipeline::call(std::string text, GenerationConfig generation_config, bool first_time) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. 
+ // Need to remove both of that tokens manually to get exact token by token alignment with HF + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 + tmp_ids.erase(tmp_ids.begin()); + + auto attention_mask_data = attention_mask.data(); + std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + + std::vector prefixes_to_exclude = {"", ""}; // todo: for TinyLlama, need to get them form generation_config + auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; + if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { + tmp_ids.erase(tmp_ids.begin()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + } + input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); + for (size_t i = 0; i < tmp_ids.size(); i++) + input_ids.data()[i] = tmp_ids.data()[i]; + attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); + for (size_t i = 0; i < tmp_attn_mask.size(); i++) + attention_mask.data()[i] = tmp_attn_mask.data()[i]; // to keep config specified during LLMPipeline creation need to get existing // and modify only after that, e.g.: From d0c134161f39d6448a1ac0a4cf6418a2097941cd Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Apr 2024 12:45:17 +0200 Subject: [PATCH 21/97] cleanup --- .../cpp/generate_pipeline/chat_sample.cpp | 15 ++--- .../generate_pipeline/continuation_sample.cpp | 4 +- .../generate_pipeline/generation_config.hpp | 7 ++- .../cpp/generate_pipeline/llm_pipeline.cpp | 56 +++++++++---------- .../cpp/generate_pipeline/llm_pipeline.hpp | 7 +-- 5 files changed, 38 insertions(+), 51 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 97cab7d46b..c50bff845b 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -23,10 +23,7 @@ std::string generate_chat_prompt(const LLMPipeline& pipe, std::string& input, bo return result_prompt.str(); } -int main(int argc, char* argv[]) try { - // if (2 >= argc && argc <= 4) - // throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); - +int main(int argc, char* argv[]) try { std::string prompt = "table is made of"; std::string device = "CPU"; // can be replaced with GPU @@ -39,9 +36,8 @@ int main(int argc, char* argv[]) try { LLMPipeline pipe(model_path, device); GenerationConfig config = pipe.generation_config(); - config.max_new_tokens(2000000); - config.eos_token_id(2); - pipe.set_streamer_callback([](std::string word) { std::cout << word << std::flush; }); + config.max_new_tokens(10000); + pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); vector questions = { "1+1=", @@ -73,13 +69,10 @@ int main(int argc, char* argv[]) try { accumulated_str += prompt; std::string prefix = (first_iter) ? 
"" : ""; - auto answer_str = pipe.call(prefix + prompt, config, first_iter); + auto answer_str = pipe.call(prefix + prompt, config); // auto answer_str = pipe(accumulated_str, config); accumulated_str += answer_str; cout << "\n----------\n"; - - // if (last_iter) - // cout << accumulated_str; } pipe.stop_conversation(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp index c8d31862d1..53a3d60e79 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp @@ -24,7 +24,7 @@ int main(int argc, char* argv[]) try { ov::Tensor input_ids, attention_mask; std::tie(input_ids, attention_mask) = tokenizer.tokenize(prompt); // max_new_tokens should be 15 for reproducer case - auto result = pipe.generate(input_ids, attention_mask, config.max_new_tokens(55), true)[0].second; + auto result = pipe.generate(input_ids, attention_mask, config.max_new_tokens(55))[0].second; all_results.insert(all_results.end(), result.begin(), result.end()); string text = tokenizer.detokenize(result); @@ -36,7 +36,7 @@ int main(int argc, char* argv[]) try { data[0] = 1; data = new_input_ids.data(); data[0] = result.back(); - auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.max_new_tokens(1000), true)[0].second; + auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.max_new_tokens(1000))[0].second; all_results.insert(all_results.end(), new_result.begin(), new_result.end()); cout << tokenizer.detokenize(all_results); } catch (const std::exception& error) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index b4f53f4665..92ff075da3 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -25,7 +25,7 @@ struct GenerationConfig { size_t m_max_new_tokens = SIZE_MAX; size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length bool m_ignore_eos = false; - int64_t m_eos_token = 2; // There's no way to extract special token values from the tokenizer for now + std::string m_eos_token = ""; // Beam search specific size_t m_num_groups = 1; @@ -47,7 +47,7 @@ struct GenerationConfig { // special tokens int64_t m_bos_token_id = 0; - int64_t m_eos_token_id = 2; // todo: do we need both m_eos_token and m_eos_token_id? 
+ int64_t m_eos_token_id = 2; // todo: check form where it's better to extract rt_info or tokenizer_config.json int64_t m_pad_token_id = 0; std::function&&, LLMPipeline&)> m_callback = [](std::vector&& tokens, LLMPipeline& pipe){ ;}; @@ -77,7 +77,7 @@ struct GenerationConfig { return *this; } - GenerationConfig& eos_token(int64_t eos_token) { + GenerationConfig& eos_token(std::string eos_token) { m_eos_token = eos_token; return *this; } @@ -160,6 +160,7 @@ struct GenerationConfig { m_bos_token_id = data.value("bos_token_id", 0); m_eos_token_id = data.value("eos_token_id", 0); + m_eos_token = data.value("eos_token", ""); m_pad_token_id = data.value("pad_token_id", 0); m_num_return_sequences = data.value("num_return_sequences", 1); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 82cecc2671..1c587753b7 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -193,15 +193,15 @@ void print_tensor(const ov::Tensor& tensor) { std::vector res; auto t_shape = tensor.get_shape(); - // cout << "["; - // for (size_t i = 0; i < t_shape[1]; ++i) { - // if (tensor.get_element_type() == ov::element::i64) { - // res.emplace_back(tensor.data()[i]); - // cout << tensor.data()[i] << " "; - // } - // } - // cout << "]" << endl; - // cout << "---------" << endl; + cout << "["; + for (size_t i = 0; i < t_shape[1]; ++i) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data()[i]); + cout << tensor.data()[i] << " "; + } + } + cout << "]" << endl; + cout << "---------" << endl; } GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, @@ -212,7 +212,8 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t prompt_len = prompts_shape[1]; auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; - + + // todo: make this work even if position_ids are not specified auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; initialize_position_ids(position_ids, attention_mask, kv_cache_len); @@ -224,8 +225,6 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t new_prompt_len = attention_mask.get_shape()[1]; size_t context_len = m_attentions_mask_cache.get_shape()[1]; ov::Tensor new_attention_mask = ov::Tensor{ov::element::i64, {1, context_len + new_prompt_len}}; - // print_tensor(m_attentions_mask_cache); - // print_tensor(attention_mask); for (size_t i = 0; i < context_len; ++i) { auto r = m_attentions_mask_cache.data()[i]; @@ -235,22 +234,17 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, auto r = attention_mask.data()[i]; new_attention_mask.data()[i] = attention_mask.data()[i - context_len]; } - // attention_mask = new_attention_mask; - // } m_model_runner.set_tensor("attention_mask", new_attention_mask); } else { m_model_runner.set_tensor("attention_mask", attention_mask); } - // todo: make this work even if position_ids are not specified auto atten_shape = attention_mask.get_shape(); auto pos_shape = position_ids.get_shape(); auto input_ids_shape = input_ids.get_shape(); m_model_runner.set_tensor("input_ids", input_ids); - // print_tensor(m_model_runner.get_tensor("input_ids")); - // m_model_runner.set_tensor("attention_mask", attention_mask); m_model_runner.set_tensor("position_ids", position_ids); m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); @@ -259,10 +253,6 
@@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, size_t max_tokens = sampling_params.get_max_new_tokens(prompt_len); for (size_t i = 0; i < max_tokens; ++i) { - // print_tensor(m_model_runner.query_state()[0].get_state()); - // print_tensor(m_model_runner.get_tensor("attention_mask")); - // print_tensor(m_model_runner.get_tensor("position_ids")); - print_tensor(m_model_runner.get_tensor("input_ids")); // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); @@ -536,14 +526,19 @@ GenerationResult LLMPipeline::multinomial_sampling(ov::Tensor prompts, Generatio } std::string LLMPipeline::call(std::string text) { - auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); - - auto generate_results = generate(input_ids, attention_mask, m_sampling_parameters); - - return m_tokenizer.detokenize(generate_results)[0]; + return call(text, m_sampling_parameters); } -std::string LLMPipeline::call(std::string text, GenerationConfig generation_config, bool first_time) { +std::string LLMPipeline::call(std::string text, GenerationConfig generation_config) { + if (is_chat_conversation) { + text = apply_chat_template(text); + } + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + + // if (is_chat_conversation && kv_cache_len > 0) { + // text += generation_config.m_eos_token; + // } + auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", @@ -576,8 +571,7 @@ std::string LLMPipeline::call(std::string text, GenerationConfig generation_conf // and modify only after that, e.g.: // GenerationConfig config = pipe.generation_config(); // config.do_sample(false).max_new_tokens(20); - // print_tensor(input_ids); - auto generate_results = generate(input_ids, attention_mask, generation_config, first_time); + auto generate_results = generate(input_ids, attention_mask, generation_config); return m_tokenizer.detokenize(generate_results)[0]; } @@ -606,7 +600,7 @@ std::vector LLMPipeline::operator()(std::initializer_list callback) { +void LLMPipeline::set_streamer(std::function callback) { is_streamer_set = true; m_streamer_callback = callback; m_streamer = TextCoutStreamer(m_tokenizer); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index c98df7770c..9825c8d7e2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -9,7 +9,6 @@ #include "generate_pipeline/llm_tokenizer.hpp" #include - using GenerationResult = std::vector>>; using namespace std; @@ -61,7 +60,7 @@ class LLMPipeline { std::string call(std::string text); - std::string call(std::string text, GenerationConfig generation_config, bool first_time = false); + std::string call(std::string text, GenerationConfig generation_config); std::vector call(std::vector text, GenerationConfig sampling_parameters); @@ -73,7 +72,7 @@ class LLMPipeline { std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters); - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params, bool first = true); + GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask); @@ 
-85,7 +84,7 @@ class LLMPipeline { std::string apply_chat_template(std::string prompt, std::string role = "user") const; - void set_streamer_callback(std::function callback); + void set_streamer(std::function callback); void start_conversation(); void stop_conversation(); void reset_state(); From 8dcea1fbfd03bf6281de1aaa52df39ffe1622efb Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Apr 2024 13:51:47 +0200 Subject: [PATCH 22/97] add Jinja2cpp submodule --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index 97bc043641..937468fb64 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "thirdparty/nlohmann_json"] path = thirdparty/nlohmann_json url = https://github.com/nlohmann/json.git +[submodule "thirdparty/Jinja2Cpp"] + path = thirdparty/Jinja2Cpp + url = https://github.com/jinja2cpp/Jinja2Cpp From 754a4627777d26be44f0f14f255bf852d0e6d8a2 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 2 May 2024 14:41:34 +0200 Subject: [PATCH 23/97] add ov namespace --- .../cpp/generate_pipeline/chat_sample.cpp | 70 ++++++++----------- .../generate_pipeline/continuation_sample.cpp | 48 ------------- .../cpp/generate_pipeline/generate_sample.cpp | 8 +-- .../generate_pipeline/generation_config.hpp | 7 +- .../cpp/generate_pipeline/llm_pipeline.cpp | 56 ++++++++------- .../cpp/generate_pipeline/llm_pipeline.hpp | 4 ++ 6 files changed, 70 insertions(+), 123 deletions(-) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index c50bff845b..1c133097e6 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -5,73 +5,59 @@ #include "llm_pipeline.hpp" -std::string generate_chat_prompt(const LLMPipeline& pipe, std::string& input, bool first_iter = false, bool use_chat_template = true) { +std::string generate_chat_prompt(const ov::LLMPipeline& pipe, std::string& input, bool use_chat_template = true) { if (use_chat_template) return pipe.apply_chat_template(input); std::stringstream result_prompt; - string prefix = (first_iter) ? 
"" : "<\n>"; - - // Gemma-7b-it - // result_prompt << "user\n" << input << "\nmodel"; + // result_prompt << "user\n" << input << "\nmodel"; // Gemma-7b-it + // result_prompt << "[INST] " << input << " [/INST]"; // LLama-2-7b - // TinyLlama - result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; - - // LLama-2-7b - // result_prompt << "[INST] " << input << " [/INST]"; + result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; // TinyLlama return result_prompt.str(); } -int main(int argc, char* argv[]) try { +std::vector questions = { + "1+1=", + "what was the previous answer?", + "Why is the sky blue?", + "4+10=", + // "Who was Alan Turing?", + // "But why did he killed himself?", + "What is Intel OpenVINO?", + // "4+10=", + // "sum up all the numeric answers in the current chat session" + // "Why is the sky blue?", + // "Please repeat all the questions I asked you.", + "Can you briefly summarize what I asked you about during this session?", +}; + +int main(int argc, char* argv[]) try { std::string prompt = "table is made of"; std::string device = "CPU"; // can be replaced with GPU std::string model_path = argv[1]; - if (argc > 2) - prompt = argv[2]; - if (argc > 3) - device = argv[3]; - - LLMPipeline pipe(model_path, device); + ov::LLMPipeline pipe(model_path, device); GenerationConfig config = pipe.generation_config(); config.max_new_tokens(10000); pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); - - vector questions = { - "1+1=", - "what was the previous answer?", - "Why is the sky blue?", - "4+10=", - "Who was Alan Turing?", - "But why did he killed himself?", - // "What is Intel OpenVINO?", - // "4+10=", - // "sum up all the numeric answers in the current chat session" - // "Why is the sky blue?", - // "Please repeat all the questions I asked you.", - "Can you briefly summarize what I asked you about during this session?", - }; std::string accumulated_str = ""; - pipe.start_conversation(); + // pipe.start_conversation(); for (size_t i = 0; i < questions.size(); i++) { prompt = questions[i]; - bool first_iter = (i == 0) ? true : false; - bool last_iter = (i == questions.size() - 1) ? true : false; - std::cout << "question:\n"; - // std::getline(std::cin, prompt); cout << prompt << endl; - prompt = generate_chat_prompt(pipe, prompt, first_iter); - accumulated_str += prompt; + // std::getline(std::cin, prompt); + + // auto answer_str = pipe.call(prompt, config); - std::string prefix = (first_iter) ? 
"" : ""; - auto answer_str = pipe.call(prefix + prompt, config); - // auto answer_str = pipe(accumulated_str, config); + accumulated_str += pipe.apply_chat_template(prompt); + auto answer_str = pipe(accumulated_str, config); accumulated_str += answer_str; + cout << "\n----------\n"; } pipe.stop_conversation(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp deleted file mode 100644 index 53a3d60e79..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/continuation_sample.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "llm_pipeline.hpp" - -int main(int argc, char* argv[]) try { - std::string prompt = "table is made of"; - std::string device = "CPU"; - - std::string model_path = argv[1]; - if (argc > 2) - prompt = argv[2]; - if (argc > 3) - device = argv[3]; - vector all_results; - - LLMPipeline pipe(model_path, device); - pipe.start_conversation(); - GenerationConfig config = pipe.generation_config(); - Tokenizer tokenizer = pipe.get_tokenizer(); - config.eos_token_id(2); - - ov::Tensor input_ids, attention_mask; - std::tie(input_ids, attention_mask) = tokenizer.tokenize(prompt); - // max_new_tokens should be 15 for reproducer case - auto result = pipe.generate(input_ids, attention_mask, config.max_new_tokens(55))[0].second; - all_results.insert(all_results.end(), result.begin(), result.end()); - - string text = tokenizer.detokenize(result); - cout << text << endl; - - auto new_input_ids = ov::Tensor{ov::element::i64, {1, 1}}; - auto new_attention_mask = ov::Tensor{ov::element::i64, {1, 1}}; - auto data = new_attention_mask.data(); - data[0] = 1; - data = new_input_ids.data(); - data[0] = result.back(); - auto new_result = pipe.generate(new_input_ids, new_attention_mask, config.max_new_tokens(1000))[0].second; - all_results.insert(all_results.end(), new_result.begin(), new_result.end()); - cout << tokenizer.detokenize(all_results); -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return EXIT_FAILURE; -} catch (...) { - std::cerr << "Non-exception object thrown\n"; - return EXIT_FAILURE; -} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 9d0f0b91f8..4b6a23f087 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -55,18 +55,18 @@ int main(int argc, char* argv[]) try { // Example 1: TextStreaming example with greedy search - LLMPipeline pipe(model_path, device); + ov::LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. 
// but if not found default velues for gready search will be used GenerationConfig config = pipe.generation_config(); auto text_streamer = TextStreamer{pipe.get_tokenizer()}; - auto text_streamer_callback = [&text_streamer](std::vector&& tokens, LLMPipeline& pipe){ + auto text_streamer_callback = [&text_streamer](std::vector&& tokens, ov::LLMPipeline& pipe){ text_streamer.put(tokens[0]); }; cout << "greedy generate streaming mode:" << endl; - config.max_new_tokens(20).set_callback(text_streamer_callback); + config.max_new_tokens(20).set_streamer(text_streamer_callback); pipe(prompt, config); text_streamer.end(); @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) try { // cout << "candidate " << i << ": " << generation_results[i] << endl; // Example 3: Greedy Decoding with multiple batch - pipe = LLMPipeline(model_path, device); + pipe = ov::LLMPipeline(model_path, device); config = pipe.generation_config(); cout << endl << "greedy decoding with multiple batches:" << endl; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 92ff075da3..bde8cec3a1 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -14,9 +14,12 @@ // forward declaration class Sequence; +namespace ov { // forward declaration class LLMPipeline; +} + // Similar to HuggingFace GenerationConfig struct GenerationConfig { // todo: add copy constructor @@ -50,7 +53,7 @@ struct GenerationConfig { int64_t m_eos_token_id = 2; // todo: check form where it's better to extract rt_info or tokenizer_config.json int64_t m_pad_token_id = 0; - std::function&&, LLMPipeline&)> m_callback = [](std::vector&& tokens, LLMPipeline& pipe){ ;}; + std::function&&, ov::LLMPipeline&)> m_callback = [](std::vector&& tokens, ov::LLMPipeline& pipe){ ;}; size_t get_max_new_tokens(size_t prompt_length = 0) { // max_new_tokens has priority over max_length, @@ -261,7 +264,7 @@ struct GenerationConfig { return *this; } - GenerationConfig& set_callback(std::function&&, LLMPipeline&)> callback) { + GenerationConfig& set_streamer(std::function&&, ov::LLMPipeline&)> callback) { m_callback = callback; return *this; } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 1c587753b7..9916a089a0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -141,7 +141,7 @@ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t n } } -LLMPipeline::LLMPipeline( +ov::LLMPipeline::LLMPipeline( std::string& model_path, std::string& tokenizer_path, std::string& detokenizer_path, @@ -162,7 +162,7 @@ LLMPipeline::LLMPipeline( // todo: add loading Tokenizers from separate folders } -LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { +ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { std::string tokenizer_config_fname = "tokenizer_config.json"; std::string generation_config_fname = "generation_config.json"; @@ -185,7 +185,7 @@ LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap m_tokenizer = Tokenizer(path); } -GenerationConfig LLMPipeline::generation_config() const { +GenerationConfig ov::LLMPipeline::generation_config() const { return 
m_sampling_parameters; } @@ -204,7 +204,7 @@ void print_tensor(const ov::Tensor& tensor) { cout << "---------" << endl; } -GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, +GenerationResult ov::LLMPipeline::greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = input_ids.get_shape(); @@ -297,7 +297,7 @@ GenerationResult LLMPipeline::greedy_search(ov::Tensor input_ids, return results; } -GenerationResult LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { +GenerationResult ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; // todo: implement for batch > 1 @@ -387,7 +387,7 @@ match the target. In tha caste the are validated in a single inference request t the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. */ -GenerationResult LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { +GenerationResult ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { auto batch_size = input_ids.get_shape()[0]; OPENVINO_ASSERT(batch_size == 1); auto draft_model = sampling_params.get_assistant_model(m_device, m_config); @@ -519,25 +519,27 @@ GenerationResult LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Ten return results; } -GenerationResult LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { +GenerationResult ov::LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { // todo: implement GenerationResult results; return results; } -std::string LLMPipeline::call(std::string text) { +std::string ov::LLMPipeline::call(std::string text) { return call(text, m_sampling_parameters); } -std::string LLMPipeline::call(std::string text, GenerationConfig generation_config) { +std::string ov::LLMPipeline::call(std::string text, GenerationConfig generation_config) { if (is_chat_conversation) { text = apply_chat_template(text); } auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; - // if (is_chat_conversation && kv_cache_len > 0) { - // text += generation_config.m_eos_token; - // } + // previous prompt generation in chat dialog stops with the end of sentence token, + // need to append this token to the current prompt + if (is_chat_conversation && kv_cache_len > 0) { + text = generation_config.m_eos_token + text; + } auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); @@ -576,7 +578,7 @@ std::string LLMPipeline::call(std::string text, GenerationConfig generation_conf return m_tokenizer.detokenize(generate_results)[0]; } -std::vector LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { +std::vector ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); auto generate_results = generate(input_ids, attention_mask, sampling_parameters); @@ -584,23 +586,23 @@ std::vector LLMPipeline::call(std::vector text, Genera return m_tokenizer.detokenize(generate_results); } -std::string LLMPipeline::operator()(std::string text) { +std::string ov::LLMPipeline::operator()(std::string text) { return call(text); } -std::string 
LLMPipeline::operator()(std::string text, GenerationConfig sampling_parameters) { +std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -std::vector LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { +std::vector ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -std::vector LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { +std::vector ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -GenerationResult LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { +GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { GenerationResult result; if (generation_config.is_gready_sampling()) { @@ -619,24 +621,24 @@ GenerationResult LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attentio return result; } -GenerationResult LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { +GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { return generate(input_ids, attention_mask, m_sampling_parameters); } -GenerationResult LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { +GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { return generate(input_ids, init_attention_mask(input_ids), sampling_params); } -GenerationResult LLMPipeline::generate(ov::Tensor input_ids) { +GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids) { return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); } -Tokenizer LLMPipeline::get_tokenizer() { +Tokenizer ov::LLMPipeline::get_tokenizer() { return m_tokenizer; } -std::string LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { +std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; env.GetSettings().trimBlocks = true; @@ -653,21 +655,21 @@ std::string LLMPipeline::apply_chat_template(std::string prompt, std::string rol return tpl.RenderAsString(params).value(); } -void LLMPipeline::set_streamer(std::function callback) { +void ov::LLMPipeline::set_streamer(std::function callback) { is_streamer_set = true; m_streamer_callback = callback; m_streamer = TextCoutStreamer(m_tokenizer); } -void LLMPipeline::start_conversation() { +void ov::LLMPipeline::start_conversation() { is_chat_conversation = true; } -void LLMPipeline::stop_conversation() { +void ov::LLMPipeline::stop_conversation() { is_chat_conversation = false; reset_state(); } -void LLMPipeline::reset_state() { +void ov::LLMPipeline::reset_state() { m_model_runner.reset_state(); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index 9825c8d7e2..151bc1ff01 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -21,6 +21,8 @@ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t n class Tokenizer; // forward declaration +namespace ov { + class LLMPipeline { public: 
ov::InferRequest m_model_runner; @@ -93,3 +95,5 @@ class LLMPipeline { std::function m_streamer_callback = [](std::string ){ ;}; bool is_chat_conversation = false; }; + +} // namespace ov \ No newline at end of file From 9b19c6f92c5100d47ef4e301e0518bd408b6604f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 2 May 2024 23:50:30 +0200 Subject: [PATCH 24/97] return scores for batched outputs --- .../cpp/generate_pipeline/chat_sample.cpp | 5 +- .../cpp/generate_pipeline/generate_sample.cpp | 48 +++--- .../generate_pipeline/generation_config.hpp | 2 +- .../cpp/generate_pipeline/llm_pipeline.cpp | 145 ++++++++++++++---- .../cpp/generate_pipeline/llm_pipeline.hpp | 81 ++++++++-- .../cpp/generate_pipeline/llm_tokenizer.cpp | 10 +- .../cpp/generate_pipeline/llm_tokenizer.hpp | 3 +- 7 files changed, 220 insertions(+), 74 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 1c133097e6..ca2d0e62a5 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -12,8 +12,8 @@ std::string generate_chat_prompt(const ov::LLMPipeline& pipe, std::string& input std::stringstream result_prompt; // result_prompt << "user\n" << input << "\nmodel"; // Gemma-7b-it // result_prompt << "[INST] " << input << " [/INST]"; // LLama-2-7b - result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; // TinyLlama + return result_prompt.str(); } @@ -38,8 +38,9 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, device); - + GenerationConfig config = pipe.generation_config(); + config.max_new_tokens(10000); pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 4b6a23f087..222abc2de5 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -91,33 +91,39 @@ int main(int argc, char* argv[]) try { std::vector prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"}; auto results = pipe(prompts, config.max_new_tokens(20)); for (int i = 0; i < prompts.size(); i++) - cout << prompts[i] << ": " << results[i] << endl; + cout << prompts[i] << ": " << results.texts[i] << endl; // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates - // pipe = LLMPipeline(model_path); - // auto [input_ids, attention_mask] = pipe.tokenize({prompt}); - // config = GenerationConfig::beam_search(); - // // config for grouped beam search - // config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); + pipe = ov::LLMPipeline(model_path); + auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); + config = GenerationConfig::beam_search(); + // config for grouped beam search + config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); - // cout << endl << "beam search with printing of all candidates:" << endl; - // auto beams = pipe.generate(input_ids, attention_mask, config); - // for (const auto& beam : beams) + cout << endl << "beam search with printing of all candidates:" << endl; + auto beams = pipe.generate(input_ids, attention_mask, config); + for (size_t i = 0; i < beams.scores.size(); 
i++) { + std::cout << beams.scores[i] << ": " << pipe.get_tokenizer().detokenize(beams.tokens[i]) << std::endl; + } + + // for (const auto& beam : beams.second) // std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; - // { - // // Example 5: Speculative sampling - // std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; - // pipe = LLMPipeline(model_path); - // auto [input_ids, attention_mask] = pipe.tokenize({prompt}); - // // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); - // pipe.generation_config().assistant_model(assitive_model_path); + { + // Example 5: Speculative sampling + std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; + pipe = ov::LLMPipeline(model_path); + auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); + // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); + pipe.generation_config().assistant_model(assitive_model_path); - // cout << endl << "Speculative sampling with TinyLlama assistance:" << endl; - // auto results = pipe.generate(input_ids, attention_mask, config); - // for (const auto& result : results) - // std::cout << pipe.detokenize(result.second) << std::endl; - // } + cout << endl << "Speculative sampling with TinyLlama assistance:" << endl; + auto results = pipe.generate(input_ids, attention_mask, config); + for (size_t i = 0; i < beams.scores.size(); i++) { + for (const auto& result : results) + std::cout << pipe.get_tokenizer().detokenize(result.tokens) << std::endl; + } + } } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index bde8cec3a1..79433501ed 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -46,7 +46,7 @@ struct GenerationConfig { float m_temperature = 0.0f; // by default we use greedy sampling int m_top_k = -1; // maybe to assign vocab_size ? 
float m_top_p = 1.0f; // by default convsider all tokens - bool m_do_sample; + bool m_do_sample = false; // special tokens int64_t m_bos_token_id = 0; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 9916a089a0..2c07c9216f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -8,7 +8,6 @@ #include #include -using GenerationResult = std::vector>>; using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { @@ -140,7 +139,77 @@ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t n state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); } } - + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + float log_sum = std::log( + std::accumulate(logits_data, logits_data + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + return {out_token, log_sum}; +} + +template +bool ov::ResultsIterator::operator!=(const ResultsIterator& other) const { + return index != other.index; +} + +template +ItemType ov::ResultsIterator::operator*() const { + return ItemType{results[index]}; +} + +template +ov::ResultsIterator& ov::ResultsIterator::operator++() { + ++index; + return *this; +} + + +template class ov::ResultsIterator; +template class ov::ResultsIterator; + +ov::TokensScorePair ov::GenerationResults::operator[](size_t index) const { + if (index >= tokens.size() || index >= scores.size()) { + OPENVINO_THROW("Index out of range"); + } + return TokensScorePair{tokens[index], scores[index]}; +} + +ov::ResultsIterator ov::GenerationResults::begin() const { + return ov::ResultsIterator(*this, 0); +} + +ov::ResultsIterator ov::GenerationResults::end() const { + return ResultsIterator(*this, tokens.size()); +} + +ov::TextScorePair ov::PipelineResults::operator[](size_t index) const { + if (index >= texts.size() || index >= scores.size()) { + OPENVINO_THROW("Index out of range"); + } + return TextScorePair{texts[index], scores[index]}; +} + +ov::ResultsIterator ov::PipelineResults::begin() const { + return ov::ResultsIterator(*this, 0); +} + +ov::ResultsIterator ov::PipelineResults::end() const { + return ov::ResultsIterator(*this, texts.size()); +} + ov::LLMPipeline::LLMPipeline( std::string& model_path, std::string& tokenizer_path, @@ -204,7 +273,7 @@ void print_tensor(const ov::Tensor& tensor) { cout << "---------" << endl; } -GenerationResult ov::LLMPipeline::greedy_search(ov::Tensor input_ids, +ov::GenerationResults ov::LLMPipeline::greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = input_ids.get_shape(); @@ -217,7 +286,10 @@ GenerationResult ov::LLMPipeline::greedy_search(ov::Tensor input_ids, auto position_ids = 
ov::Tensor{ov::element::i64, input_ids.get_shape()}; initialize_position_ids(position_ids, attention_mask, kv_cache_len); - GenerationResult results(batch_size); + ov::GenerationResults results; + results.scores.resize(batch_size); + results.tokens.resize(batch_size); + std::fill(results.scores.begin(), results.scores.end(), 0); if (is_chat_conversation && kv_cache_len > 0) { // m_attentions_mask_cache extent with attention_mask; @@ -272,9 +344,16 @@ GenerationResult ov::LLMPipeline::greedy_search(ov::Tensor input_ids, std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { - const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - results[batch].second.emplace_back(out_token); + // const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; + // int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + // results.tokens[batch].emplace_back(out_token); + // results.scores[batch] += logits_data[out_token]; + + auto res = softmax(logits, batch); + auto out_token = res.first; + results.tokens[batch].emplace_back(res.first); + results.scores[batch] += res.second; + token_iter_results[batch] = out_token; eos_met[batch] = (out_token == sampling_params.m_eos_token_id); @@ -297,7 +376,7 @@ GenerationResult ov::LLMPipeline::greedy_search(ov::Tensor input_ids, return results; } -GenerationResult ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { +ov::GenerationResults ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; // todo: implement for batch > 1 @@ -366,9 +445,11 @@ GenerationResult ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor att auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); - GenerationResult results; + ov::GenerationResults results; for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { - results.emplace_back(std::pair(beam->score, beam->tokens)); + // todo: convert to string + results.scores.emplace_back(beam->score); + results.tokens.emplace_back(beam->tokens); } return results; } @@ -387,7 +468,7 @@ match the target. In tha caste the are validated in a single inference request t the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. 
*/ -GenerationResult ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { +ov::GenerationResults ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { auto batch_size = input_ids.get_shape()[0]; OPENVINO_ASSERT(batch_size == 1); auto draft_model = sampling_params.get_assistant_model(m_device, m_config); @@ -444,8 +525,10 @@ GenerationResult ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov: // the first token which is fed to both draft and main netwoks on each iteration auto first_token = out_token; - GenerationResult results(batch_size); - results[0].second.emplace_back(out_token); + ov::GenerationResults results; + results.tokens.resize(batch_size); + + results.tokens[0].emplace_back(out_token); // run K infer requests on draft model and get next K prediction tokens on each iteration uint64_t K = sampling_params.m_num_assistant_tokens; @@ -498,7 +581,11 @@ GenerationResult ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov: auto start = data_logits + vocab_size * i; auto stop = data_logits + vocab_size * (i + 1); out_token = std::max_element(start, stop) - start; - results[0].second.emplace_back(out_token); + results.tokens[0].emplace_back(out_token); + + if (is_streamer_set) { + m_streamer_callback(m_streamer.put(out_token)); + } disagree_idx = i; if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) @@ -519,9 +606,9 @@ GenerationResult ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov: return results; } -GenerationResult ov::LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { +ov::GenerationResults ov::LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { // todo: implement - GenerationResult results; + ov::GenerationResults results; return results; } @@ -569,21 +656,16 @@ std::string ov::LLMPipeline::call(std::string text, GenerationConfig generation_ for (size_t i = 0; i < tmp_attn_mask.size(); i++) attention_mask.data()[i] = tmp_attn_mask.data()[i]; - // to keep config specified during LLMPipeline creation need to get existing - // and modify only after that, e.g.: - // GenerationConfig config = pipe.generation_config(); - // config.do_sample(false).max_new_tokens(20); auto generate_results = generate(input_ids, attention_mask, generation_config); - - return m_tokenizer.detokenize(generate_results)[0]; + return m_tokenizer.detokenize(generate_results.tokens)[0]; } -std::vector ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { +ov::PipelineResults ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); auto generate_results = generate(input_ids, attention_mask, sampling_parameters); - return m_tokenizer.detokenize(generate_results); + return {m_tokenizer.detokenize(generate_results.tokens), generate_results.scores}; } std::string ov::LLMPipeline::operator()(std::string text) { @@ -594,16 +676,16 @@ std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig sampl return call(text, sampling_parameters); } -std::vector ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { +ov::PipelineResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } 
-std::vector ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { +ov::PipelineResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { - GenerationResult result; +ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { + ov::GenerationResults result; if (generation_config.is_gready_sampling()) { result = greedy_search(input_ids, attention_mask, generation_config); @@ -621,16 +703,16 @@ GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor atte return result; } -GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { +ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { return generate(input_ids, attention_mask, m_sampling_parameters); } -GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { +ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { return generate(input_ids, init_attention_mask(input_ids), sampling_params); } -GenerationResult ov::LLMPipeline::generate(ov::Tensor input_ids) { +ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids) { return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); } @@ -648,6 +730,7 @@ std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; jinja2::ValuesMap params = { {"messages", jinja2::ValuesList({message})}, + {"bos_token", ""}, {"eos_token", ""}, // todo: load from config {"add_generation_prompt", true}, }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index 151bc1ff01..4bc735f0c7 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -9,7 +9,6 @@ #include "generate_pipeline/llm_tokenizer.hpp" #include -using GenerationResult = std::vector>>; using namespace std; void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); @@ -19,10 +18,66 @@ ov::Tensor extend_attention(ov::Tensor attention_mask); ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); + class Tokenizer; // forward declaration namespace ov { +template +class ResultsIterator { + public: + ResultsIterator(const T& results, size_t index) : results(results), index(index) {} + + bool operator!=(const ResultsIterator& other) const; + + ItemType operator*() const; + + ResultsIterator& operator++(); + + private: + const T& results; + size_t index; +}; + +class TextScorePair { +public: + std::string text; + float score; +}; + +class TokensScorePair { +public: + std::vector tokens; + float score; +}; + +class GenerationResults { +public: + std::vector> tokens; + std::vector scores; + + TokensScorePair operator[](size_t index) const; + + ResultsIterator begin() const; + + ResultsIterator end() const; +}; + +class 
PipelineResults { +public: + std::vector texts; + std::vector scores; + + TextScorePair operator[](size_t index) const; + + ResultsIterator begin() const; + + ResultsIterator end() const; +}; + + + class LLMPipeline { public: ov::InferRequest m_model_runner; @@ -52,35 +107,35 @@ class LLMPipeline { LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); GenerationConfig generation_config() const; - GenerationResult greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + GenerationResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - GenerationResult beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); + GenerationResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); - GenerationResult speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + GenerationResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); + GenerationResults multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); std::string call(std::string text); std::string call(std::string text, GenerationConfig generation_config); - std::vector call(std::vector text, GenerationConfig sampling_parameters); + PipelineResults call(std::vector text, GenerationConfig sampling_parameters); std::string operator()(std::string text); std::string operator()(std::string text, GenerationConfig sampling_parameters); - std::vector operator()(std::vector text, GenerationConfig sampling_parameters); + PipelineResults operator()(std::vector text, GenerationConfig sampling_parameters); - std::vector operator()(std::initializer_list text, GenerationConfig sampling_parameters); + PipelineResults operator()(std::initializer_list text, GenerationConfig sampling_parameters); - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + GenerationResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - GenerationResult generate(ov::Tensor input_ids, ov::Tensor attention_mask); + GenerationResults generate(ov::Tensor input_ids, ov::Tensor attention_mask); - GenerationResult generate(ov::Tensor input_ids, GenerationConfig sampling_params); + GenerationResults generate(ov::Tensor input_ids, GenerationConfig sampling_params); - GenerationResult generate(ov::Tensor input_ids); + GenerationResults generate(ov::Tensor input_ids); Tokenizer get_tokenizer(); @@ -96,4 +151,6 @@ class LLMPipeline { bool is_chat_conversation = false; }; + + } // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp index b02855f218..9443de22ac 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp @@ -102,20 +102,20 @@ std::vector Tokenizer::detokenize(ov::Tensor tokens) { return strings; } -std::vector Tokenizer::detokenize(GenerationResult lines) { +std::vector Tokenizer::detokenize(std::vector> lines) { // todo: implement calling detokenizer in a single batch - std::vector strings; - for (auto& [score, line]: lines){ + std::vector 
results; + for (auto& line: lines){ ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; m_detokenizer_request.set_input_tensor(tokens); m_detokenizer_request.infer(); auto res = m_detokenizer_request.get_output_tensor(); auto res_str = res.data()[0]; - strings.emplace_back(res_str); + results.emplace_back(res_str); } - return strings; + return results; } TextCoutStreamer::TextCoutStreamer(const Tokenizer& tokenizer, bool print_eos_token) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp index b570ae5ec1..219554a57e 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp @@ -8,7 +8,6 @@ #include -using GenerationResult = std::vector>>; using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); @@ -33,7 +32,7 @@ class Tokenizer { std::vector detokenize(ov::Tensor tokens); - std::vector detokenize(GenerationResult lines); + std::vector detokenize(std::vector> lines); private: ov::InferRequest m_tokenize_request; ov::InferRequest m_detokenizer_request; From 9bf6caa2a59734abdaeb30c6866955bce187a34a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 3 May 2024 10:49:25 +0200 Subject: [PATCH 25/97] add AnyMap --- .../generate_pipeline/generation_config.hpp | 228 ++++++++++++------ 1 file changed, 156 insertions(+), 72 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 79433501ed..8c89bb65c8 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -20,38 +20,79 @@ class LLMPipeline; } -// Similar to HuggingFace GenerationConfig -struct GenerationConfig { - // todo: add copy constructor +namespace { + +// TODO: LEAVE ONLY ONE PLACE FOR DEFAULT VALUES +static const ov::AnyMap default_generation_config_map = { + // Generic + {"max_new_tokens", SIZE_MAX}, + {"max_length", SIZE_MAX}, + {"m_ignore_eos", false}, + {"m_bos_token", ""}, + {"m_eos_token", ""}, + + // Beam search specific + {"m_num_groups", 1}, + {"m_group_size", 1}, + {"m_diversity_penalty", 1.0f}, // 0.0 means no diversity + {"m_num_return_sequences", 1}, // is used by beam search, in other case is equal to batch size + // {"stop_criteria", StopCriteria::heuristic}, // todo: align with the latest beam searcher + + {"m_repetition_penalty", 1.0f}, + {"m_length_penalty", 1.0f}, + {"m_no_repeat_ngram_size", std::numeric_limits::max()}, + {"early_finish", [](const Sequence&) {return false; }}, + + // Multinomial + {"m_temperature", 0.0f}, + {"m_top_k", -1}, + {"m_top_p", 1.0f}, + {"m_do_sample", false}, + // special tokens + {"m_bos_token_id", 0}, + {"m_eos_token_id", 2}, // todo: check form where it's better to extract from rt_info or from tokenizer_config.json + {"m_pad_token_id", 0}, + + // assistive decoding + {"m_assistant_model", ov::InferRequest()}, + {"m_num_assistant_tokens", 5}, + {"m_seq_len_axis", 2}, +}; + +} + +// Similar to HuggingFace GenerationConfig +class GenerationConfig { +public: // Generic - size_t m_max_new_tokens = SIZE_MAX; - size_t m_max_length = SIZE_MAX; // m_max_new_tokens should have priority over m_max_length - bool m_ignore_eos = false; - std::string m_eos_token = ""; + size_t m_max_new_tokens; + size_t 
m_max_length; + bool m_ignore_eos; + std::string m_eos_token; // Beam search specific - size_t m_num_groups = 1; - size_t m_group_size = 1; // beam_width - float m_diversity_penalty = 1.0f; // 0.0 means no diversity - size_t m_num_return_sequences = 3; // is used by beam search, in other case is equal to batch size + size_t m_num_groups; + size_t m_group_size; + float m_diversity_penalty; + size_t m_num_return_sequences; // StopCriteria stop_criteria = StopCriteria::heuristic; - float m_repetition_penalty = 1.0f; - float m_length_penalty = 1.0f; - size_t m_no_repeat_ngram_size = std::numeric_limits::max(); + float m_repetition_penalty; + float m_length_penalty; + size_t m_no_repeat_ngram_size; std::function early_finish = [](const Sequence&) {return false; }; // Multinomial - float m_temperature = 0.0f; // by default we use greedy sampling - int m_top_k = -1; // maybe to assign vocab_size ? - float m_top_p = 1.0f; // by default convsider all tokens - bool m_do_sample = false; + float m_temperature; + int m_top_k; + float m_top_p; + bool m_do_sample; // special tokens - int64_t m_bos_token_id = 0; - int64_t m_eos_token_id = 2; // todo: check form where it's better to extract rt_info or tokenizer_config.json - int64_t m_pad_token_id = 0; + int64_t m_bos_token_id; + int64_t m_eos_token_id; + int64_t m_pad_token_id; std::function&&, ov::LLMPipeline&)> m_callback = [](std::vector&& tokens, ov::LLMPipeline& pipe){ ;}; @@ -66,6 +107,8 @@ struct GenerationConfig { } GenerationConfig& max_new_tokens(size_t max_new_tokens) { + const auto& r = ::default_generation_config_map.find("sdf") != ::default_generation_config_map.end(); + m_max_new_tokens = max_new_tokens; return *this; } @@ -182,6 +225,7 @@ struct GenerationConfig { m_group_size = num_beams / m_num_groups; } + static GenerationConfig greedy() { GenerationConfig greedy_params; greedy_params.m_temperature = 0.0f; @@ -226,67 +270,107 @@ struct GenerationConfig { return m_do_sample; } - // for Assistive/Speculative decoding - ov::InferRequest m_assistant_model; - size_t m_num_assistant_tokens = 5; - size_t m_seq_len_axis = 2; - private: - std::shared_ptr m_assistant_ov_model; - bool is_assistant_request_defined = false; - bool is_assistant_ov_defined = false; - - public: - GenerationConfig& assistant_model(const ov::InferRequest& assistant_model) { - m_assistant_model = assistant_model; - is_assistant_request_defined = true; - return *this; - } + // for speculative decoding + GenerationConfig& assistant_model(const ov::InferRequest& assistant_model) { + m_assistant_model = assistant_model; + is_assistant_request_defined = true; + return *this; + } - GenerationConfig& assistant_model(ov::CompiledModel& assistant_model) { - m_assistant_model = assistant_model.create_infer_request(); - is_assistant_request_defined = true; - return *this; - } + GenerationConfig& assistant_model(ov::CompiledModel& assistant_model) { + m_assistant_model = assistant_model.create_infer_request(); + is_assistant_request_defined = true; + return *this; + } - GenerationConfig& assistant_model(const std::shared_ptr& assistant_model) { - m_assistant_ov_model = assistant_model; - is_assistant_ov_defined = true; - return *this; - } + GenerationConfig& assistant_model(const std::shared_ptr& assistant_model) { + m_assistant_ov_model = assistant_model; + is_assistant_ov_defined = true; + return *this; + } - GenerationConfig& assistant_model(std::string assistant_model) { - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - if 
(!is_xml(assistant_model)) - assistant_model += "/openvino_model.xml"; + GenerationConfig& assistant_model(std::string assistant_model) { + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + if (!is_xml(assistant_model)) + assistant_model += "/openvino_model.xml"; - m_assistant_ov_model = ov::Core().read_model(assistant_model); - is_assistant_ov_defined = true; - return *this; - } + m_assistant_ov_model = ov::Core().read_model(assistant_model); + is_assistant_ov_defined = true; + return *this; + } - GenerationConfig& set_streamer(std::function&&, ov::LLMPipeline&)> callback) { - m_callback = callback; - return *this; - } + GenerationConfig& set_streamer(std::function&&, ov::LLMPipeline&)> callback) { + m_callback = callback; + return *this; + } - ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { - if (is_assistant_request_defined) { - return m_assistant_model; - } else if (is_assistant_ov_defined) { - m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); - is_assistant_request_defined = true; - return m_assistant_model; - } else { - OPENVINO_THROW("assistant model is not specified"); - } + ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { + if (is_assistant_request_defined) { + return m_assistant_model; + } else if (is_assistant_ov_defined) { + m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); + is_assistant_request_defined = true; + return m_assistant_model; + } else { + OPENVINO_THROW("assistant model is not specified"); } - - GenerationConfig& num_assistant_tokens(int64_t num_assistant_tokens) { - m_num_assistant_tokens = num_assistant_tokens; - return *this; } + GenerationConfig& num_assistant_tokens(int64_t num_assistant_tokens) { + m_num_assistant_tokens = num_assistant_tokens; + return *this; + } + bool is_speculative() const { return is_assistant_ov_defined || is_assistant_request_defined; } + + // for Assistive/Speculative decoding + ov::InferRequest m_assistant_model; + size_t m_num_assistant_tokens = 5; + size_t m_seq_len_axis = 2; +private: + std::shared_ptr m_assistant_ov_model; + bool is_assistant_request_defined = false; + bool is_assistant_ov_defined = false; + + static GenerationConfig anymap_to_generation_config(const ov::AnyMap& genereation_config_map = {}) { + // need to load default values and update only those keys that are specified in genereation_config_map + auto tmp_map = default_generation_config_map; + + for (auto it = genereation_config_map.begin(); it != genereation_config_map.end(); ++it) { + tmp_map[it->first] = it->second; + } + + GenerationConfig config; + + // general arguments + config.m_max_new_tokens = tmp_map.at("m_max_new_tokens").as(); + config.m_max_length = tmp_map.at("m_max_length").as(); + config.m_ignore_eos = tmp_map.at("m_ignore_eos").as(); + config.m_eos_token = tmp_map.at("m_eos_token").as(); + + // Beam search specific + config.m_num_groups = tmp_map.at("m_num_groups").as(); + config.m_group_size = tmp_map.at("m_group_size").as(); + config.m_diversity_penalty = tmp_map.at("m_diversity_penalty").as(); + config.m_num_return_sequences = tmp_map.at("m_num_return_sequences").as(); + + config.m_repetition_penalty = tmp_map.at("m_repetition_penalty").as(); + config.m_length_penalty = tmp_map.at("m_length_penalty").as(); + config.m_no_repeat_ngram_size = tmp_map.at("m_no_repeat_ngram_size").as(); + 
config.early_finish = tmp_map.at("early_finish").as>(); + + // Multinomial + config.m_temperature = tmp_map.at("m_temperature").as(); + config.m_top_k = tmp_map.at("m_top_k").as(); + config.m_top_p = tmp_map.at("m_top_p").as(); + config.m_do_sample = tmp_map.at("m_do_sample").as(); + + // special tokens + config.m_bos_token_id = tmp_map.at("m_bos_token_id").as(); + config.m_eos_token_id = tmp_map.at("m_eos_token_id").as(); + config.m_pad_token_id = tmp_map.at("m_pad_token_id").as(); + return config; + } }; From 63d8f6d92bdee4f4e8252e77c40ff8cbebe57802 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 3 May 2024 11:56:26 +0200 Subject: [PATCH 26/97] cleanup --- text_generation/causal_lm/cpp/CMakeLists.txt | 29 +---- .../causal_lm/cpp/beam_search_causal_lm.cpp | 108 +++++++++--------- .../generate_pipeline/generation_config.hpp | 3 + .../cpp/generate_pipeline/llm_pipeline.cpp | 1 - .../cpp/generate_pipeline/llm_pipeline.hpp | 12 +- .../cpp/generate_pipeline/llm_tokenizer.cpp | 17 --- .../causal_lm/cpp/greedy_causal_lm.cpp | 103 ++--------------- 7 files changed, 81 insertions(+), 192 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 09b76869cb..33d4fd6643 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -9,15 +9,12 @@ set(JINJA2CPP_DEPS_MODE internal) add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") -# add_subdirectory(../../../thirdparty/inja/ "${CMAKE_CURRENT_BINARY_DIR}/inja/") -# include_directories(../../../thirdparty/inja/include/inja) +# todo: remove hardcodes and make submodule work +include_directories($ENV{HOME}/opt/jinja2cpp/include) # add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") # include_directories(../../../thirdparty/inja/include/Jinja2Cpp) -# todo: remove hardcodes -include_directories($ENV{HOME}/opt/jinja2cpp/include) - set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) # todo: remove hardcode @@ -63,8 +60,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) - set(TARGET_NAME chat_sample) add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) # todo: remove hardcode @@ -77,24 +72,12 @@ target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME continuation_sample) -add_executable(${TARGET_NAME} generate_pipeline/continuation_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) -# todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") + +set(TARGET_NAME prompt_lookup_decoding_lm) +add_executable(${TARGET_NAME} prompt_lookup_decoding_lm.cpp) target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(${TARGET_NAME} 
PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -target_link_libraries(${TARGET_NAME} PRIVATE stdc++fs) - -add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(prompt_lookup_decoding_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 9f725a2d8f..110ac47178 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -167,60 +167,60 @@ int main(int argc, char* argv[]) try { auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv)); // Initialize beam search - // const int64_t* prompt_data = input_ids.data(); - // std::vector> prompts; - // prompts.reserve(input_ids.get_shape().at(0)); - // for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { - // size_t sequence_length = input_ids.get_shape().at(1); - // size_t batch_offset = batch * sequence_length; - // const int64_t* prompt_start = prompt_data + batch_offset; - // prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - // } - - // // Get the runtime info from the tokenizer model that we read earlier - // auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model - // int64_t SPECIAL_EOS_TOKEN; - - // if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID - // SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - - // } else { - // throw std::runtime_error("EOS token ID not found in model's runtime information."); - // } - - // Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; - // GroupBeamSearcher group_beam_searcher{parameters}; - - // initialize_inputs(input_ids, attention_mask, lm); - - // std::vector next_tokens; - // std::vector next_beams; - - // for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - // lm.infer(); - - // std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - // if (next_tokens.empty()) { - // break; - // } - // size_t batch_size = next_tokens.size(); - // // Set pointers - // lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - // lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // // Set auxiliary inputs - // set_attention_mask(lm.get_tensor("attention_mask"), next_beams); - // set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - // } - - // for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - // std::cout << "Prompt:\n"; - // for (const std::vector group : prompt_group) { - // std::cout << "Group:\n"; - // for (const Beam& beam : group) { 
- // std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - // } - // } - // } + const int64_t* prompt_data = input_ids.data(); + std::vector> prompts; + prompts.reserve(input_ids.get_shape().at(0)); + for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { + size_t sequence_length = input_ids.get_shape().at(1); + size_t batch_offset = batch * sequence_length; + const int64_t* prompt_start = prompt_data + batch_offset; + prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); + } + + // Get the runtime info from the tokenizer model that we read earlier + auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model + int64_t SPECIAL_EOS_TOKEN; + + if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID + SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); + + } else { + throw std::runtime_error("EOS token ID not found in model's runtime information."); + } + + Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; + GroupBeamSearcher group_beam_searcher{parameters}; + + initialize_inputs(input_ids, attention_mask, lm); + + std::vector next_tokens; + std::vector next_beams; + + for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + lm.infer(); + + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + set_attention_mask(lm.get_tensor("attention_mask"), next_beams); + set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + } + + for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + std::cout << "Prompt:\n"; + for (const std::vector group : prompt_group) { + std::cout << "Group:\n"; + for (const Beam& beam : group) { + std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; + } + } + } // Model is stateful which means that context (kv-cache) which belongs to a particular // text sequence is accumulated inside the model during the generation loop above. // This context should be reset before processing the next text sequence. 
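For reference, the following minimal sketch pulls together how the interfaces touched by the patches above (ov::LLMPipeline, GenerationConfig, Tokenizer, and the GenerationResults/PipelineResults containers) are meant to be used once the generate_pipeline targets are built. It is an illustrative sample in the spirit of generate_sample.cpp and chat_sample.cpp, not a file added by this series; the model directory argument, the argc guard, and the prompt strings are placeholders, and error handling is omitted.

// Minimal usage sketch (illustrative only; assumes a model directory containing
// openvino_model.xml plus the converted tokenizer/detokenizer models, as the
// ov::LLMPipeline constructor expects, and the include paths set up by CMakeLists.txt).
#include "llm_pipeline.hpp"

#include <iostream>
#include <string>
#include <vector>

int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <MODEL_DIR>\n";
        return 1;
    }
    std::string model_path = argv[1];  // named variable: the constructor takes std::string&
    ov::LLMPipeline pipe(model_path, "CPU");
    GenerationConfig config = pipe.generation_config();

    // Greedy generation with each decoded word streamed to stdout as it is produced.
    pipe.set_streamer([](std::string word) { std::cout << word << std::flush; });
    pipe("table is made of", config.max_new_tokens(20));
    std::cout << "\n";

    // Batched call: PipelineResults keeps detokenized texts and per-sequence scores aligned.
    std::vector<std::string> prompts = {"1 + 1 = ", "Why is the Sun yellow?"};
    auto results = pipe(prompts, config.max_new_tokens(20));
    for (size_t i = 0; i < prompts.size(); ++i)
        std::cout << results.scores[i] << ": " << results.texts[i] << "\n";

    // Grouped beam search through the lower-level generate() call, as in generate_sample.cpp:
    // tokenize a single prompt manually, then print every returned candidate with its score.
    config = GenerationConfig::beam_search();
    config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);
    std::string beam_prompt = "table is made of";
    auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize(beam_prompt);
    auto beams = pipe.generate(input_ids, attention_mask, config);
    for (size_t i = 0; i < beams.scores.size(); ++i)
        std::cout << beams.scores[i] << ": "
                  << pipe.get_tokenizer().detokenize(beams.tokens[i]) << "\n";
    return 0;
}
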
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 8c89bb65c8..5325bde5be 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -202,6 +202,8 @@ class GenerationConfig { GenerationConfig(std::string json_path) { std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + nlohmann::json data = nlohmann::json::parse(f); m_bos_token_id = data.value("bos_token_id", 0); @@ -223,6 +225,7 @@ class GenerationConfig { m_diversity_penalty = data.value("diversity_penalty", 1.0f); int num_beams = data.value("num_beams", 1); m_group_size = num_beams / m_num_groups; + OPENVINO_ASSERT(num_beams % m_num_groups == 0, "number of beams should be divisible by number of groups"); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index 2c07c9216f..bbe81dffde 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -242,7 +242,6 @@ ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::An std::ifstream f(path + "/" + tokenizer_config_fname); nlohmann::json data = nlohmann::json::parse(f); m_chat_template = data.value("chat_template", ""); - } m_device = device; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index 4bc735f0c7..29c7dd264d 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -115,12 +115,6 @@ class LLMPipeline { GenerationResults multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); - std::string call(std::string text); - - std::string call(std::string text, GenerationConfig generation_config); - - PipelineResults call(std::vector text, GenerationConfig sampling_parameters); - std::string operator()(std::string text); std::string operator()(std::string text, GenerationConfig sampling_parameters); @@ -149,6 +143,12 @@ class LLMPipeline { TextCoutStreamer m_streamer; std::function m_streamer_callback = [](std::string ){ ;}; bool is_chat_conversation = false; + + std::string call(std::string text); + + std::string call(std::string text, GenerationConfig generation_config); + + PipelineResults call(std::vector text, GenerationConfig sampling_parameters); }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp index 9443de22ac..8e0263a895 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp @@ -56,23 +56,6 @@ std::pair Tokenizer::tokenize(std::vector p int64_t* attention_mask_data = attention_mask.data(); std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - vector> input_ids_vec; - vector> atten_mask_vec; - - input_ids_vec.reserve(prompts.size()); - atten_mask_vec.reserve(prompts.size()); - auto res_tensor = m_tokenize_request.get_tensor("input_ids"); - auto atten_tensor = m_tokenize_request.get_tensor("attention_mask"); - auto res_shape = res_tensor.get_shape(); - - for (int 
i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - - int64_t* atten_start = atten_tensor.data() + i * res_shape[1]; - atten_mask_vec.emplace_back(std::vector(atten_start, atten_start + res_shape[1])); - } - return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp index db7e71323b..d75d32d0e0 100644 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp @@ -54,30 +54,6 @@ struct TextStreamer { }; } -void print_tensor(const ov::Tensor& tensor) { - std::vector res; - - auto t_shape = tensor.get_shape(); - for (size_t i = 0; i < t_shape[1]; ++i) { - if (tensor.get_element_type() == ov::element::i64) { - res.emplace_back(tensor.data()[i]); - } - } - std::cout << ""; -} - -template -void copy_partially(const ov::Tensor& src, const ov::Tensor& trg, int src_offset, int trg_offset, size_t size) { - T* src_data = src.data(); - T* dst_data = trg.data(); - OPENVINO_ASSERT(src_offset + size <= src.get_shape()[1]); - OPENVINO_ASSERT(trg_offset + size <= trg.get_shape()[1]); - - for (size_t i = 0; i < size; i++) { - dst_data[trg_offset + i] = src_data[src_offset + i]; - } -} - int main(int argc, char* argv[]) try { if (argc != 3) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); @@ -97,78 +73,25 @@ int main(int argc, char* argv[]) try { ov::InferRequest lm = core.compile_model( std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); auto seq_len = input_ids.get_size(); - - ov::Tensor position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + + // Initialize inputs + lm.set_tensor("input_ids", input_ids); + lm.set_tensor("attention_mask", attention_mask); + ov::Tensor position_ids = lm.get_tensor("position_ids"); position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + seq_len, 0); - - constexpr size_t BATCH_SIZE = 1; // Input values are persistent between inference calls. 
// That allows to set values, which aren't going to change, only once lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); lm.get_tensor("beam_idx").data()[0] = 0; - - // splitting the first inference - auto shape = input_ids.get_shape(); - size_t split_pos = shape[1] * 0.95; - int position = 4; - if (position != -1) - split_pos = position; - - ov::Shape split_1_shape = {1, split_pos}; - ov::Shape split_2_shape = {1, shape[1] - split_pos}; - - ov::Tensor input_ids_1 = ov::Tensor{ov::element::i64, split_1_shape}; - ov::Tensor input_ids_2 = ov::Tensor{ov::element::i64, split_2_shape}; - - ov::Tensor position_ids_1 = ov::Tensor{ov::element::i64, split_1_shape}; - ov::Tensor position_ids_2 = ov::Tensor{ov::element::i64, split_2_shape}; - - ov::Tensor attention_mask_1 = ov::Tensor{ov::element::i64, split_1_shape}; - ov::Tensor attention_mask_2 = ov::Tensor{ov::element::i64, split_2_shape}; - - copy_partially(input_ids, input_ids_1, 0, 0, split_pos); - copy_partially(input_ids, input_ids_2, split_pos, 0, split_2_shape[1]); - copy_partially(attention_mask, attention_mask_1, 0, 0, split_pos); - copy_partially(attention_mask, attention_mask_2, split_pos, 0, split_2_shape[1]); - copy_partially(position_ids, position_ids_1, 0, 0, split_pos); - copy_partially(position_ids, position_ids_2, split_pos, 0, split_2_shape[1]); - - print_tensor(input_ids); - print_tensor(input_ids_1); - print_tensor(input_ids_2); - - print_tensor(position_ids); - print_tensor(position_ids_1); - print_tensor(position_ids_2); - - // 2 part inference - lm.set_tensor("input_ids", input_ids_1); - lm.set_tensor("attention_mask", attention_mask_1); - lm.set_tensor("position_ids", position_ids_1); - lm.infer(); - - lm.set_tensor("input_ids", input_ids_2); - lm.set_tensor("attention_mask", attention_mask); - lm.set_tensor("position_ids", position_ids_2); lm.infer(); - auto shift = input_ids_2.get_shape()[1] - 1; - - // single inference - // lm.set_tensor("input_ids", input_ids); - // lm.set_tensor("attention_mask", attention_mask); - // lm.set_tensor("position_ids", position_ids); - // lm.infer(); - // auto shift = seq_len - 1; - - seq_len = lm.get_tensor("attention_mask").get_shape()[1]; size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data() + shift * vocab_size; + float* logits = lm.get_tensor("logits").data() + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - lm.get_tensor("position_ids").set_shape({BATCH_SIZE, 1}); + position_ids.set_shape({BATCH_SIZE, 1}); TextStreamer text_streamer{std::move(detokenizer)}; // Get the runtime info from the tokenizer model that we read earlier @@ -180,19 +103,17 @@ int main(int argc, char* argv[]) try { } else { throw std::runtime_error("EOS token ID not found in model's runtime information."); } - - int max_sequence_length = 1315; + + int max_sequence_length = 100; while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { ++seq_len; lm.get_tensor("input_ids").data()[0] = out_token; lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, seq_len}); std::fill_n(lm.get_tensor("attention_mask").data(), seq_len, 1); - lm.get_tensor("position_ids").data()[0] = int64_t(seq_len - 1); - - lm.infer(); + position_ids.data()[0] = int64_t(seq_len - 1); + lm.start_async(); text_streamer.put(out_token); - // std::cout << out_token << " "; - + lm.wait(); logits = lm.get_tensor("logits").data(); out_token = std::max_element(logits, 
logits + vocab_size) - logits; } From a8337601ea930c33d79c40af0d133639d818e5d6 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 6 May 2024 08:31:19 +0200 Subject: [PATCH 27/97] before moving to pimpl --- .../cpp/generate_pipeline/chat_sample.cpp | 2 +- .../cpp/generate_pipeline/generate_sample.cpp | 24 ++++++------ .../generate_pipeline/generation_config.hpp | 10 ++--- .../cpp/generate_pipeline/llm_pipeline.cpp | 37 +++++++++++++------ .../cpp/generate_pipeline/llm_pipeline.hpp | 18 ++++----- 5 files changed, 53 insertions(+), 38 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index ca2d0e62a5..075af1a9d0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -61,7 +61,7 @@ int main(int argc, char* argv[]) try { cout << "\n----------\n"; } - pipe.stop_conversation(); + pipe.finish_chat(); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 222abc2de5..94fd3411ff 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -70,18 +70,18 @@ int main(int argc, char* argv[]) try { pipe(prompt, config); text_streamer.end(); - // // Example 2: Grouped Beam Search decoding example - // pipe = LLMPipeline(model_path, device); - // config = pipe.generation_config(); + // Example 2: Grouped Beam Search decoding example + pipe = ov::LLMPipeline(model_path, device); + config = pipe.generation_config(); - // // will return vector with num_return_sequences strings - // auto num_return_sequences = 3; - // config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); + // will return vector with num_return_sequences strings + auto num_return_sequences = 3; + config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); - // cout << endl << "grouped beam search generated candidates:" << endl; - // auto generation_results = pipe({prompt}, config); - // for (int i = 0; i < num_return_sequences; ++i) - // cout << "candidate " << i << ": " << generation_results[i] << endl; + cout << endl << "grouped beam search generated candidates:" << endl; + auto generation_results = pipe({prompt}, config); + for (int i = 0; i < num_return_sequences; ++i) + cout << generation_results[i].score << ": " << generation_results[i].text << endl; // Example 3: Greedy Decoding with multiple batch pipe = ov::LLMPipeline(model_path, device); @@ -90,8 +90,8 @@ int main(int argc, char* argv[]) try { cout << endl << "greedy decoding with multiple batches:" << endl; std::vector prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"}; auto results = pipe(prompts, config.max_new_tokens(20)); - for (int i = 0; i < prompts.size(); i++) - cout << prompts[i] << ": " << results.texts[i] << endl; + for (const auto& res: results) + std::cout << res.text << std::endl; // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates pipe = ov::LLMPipeline(model_path); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp index 
5325bde5be..5be46b6be7 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp @@ -332,11 +332,7 @@ class GenerationConfig { ov::InferRequest m_assistant_model; size_t m_num_assistant_tokens = 5; size_t m_seq_len_axis = 2; -private: - std::shared_ptr m_assistant_ov_model; - bool is_assistant_request_defined = false; - bool is_assistant_ov_defined = false; - + static GenerationConfig anymap_to_generation_config(const ov::AnyMap& genereation_config_map = {}) { // need to load default values and update only those keys that are specified in genereation_config_map auto tmp_map = default_generation_config_map; @@ -376,4 +372,8 @@ class GenerationConfig { config.m_pad_token_id = tmp_map.at("m_pad_token_id").as(); return config; } +private: + std::shared_ptr m_assistant_ov_model; + bool is_assistant_request_defined = false; + bool is_assistant_ov_defined = false; }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp index bbe81dffde..66d72bb426 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp @@ -215,10 +215,10 @@ ov::LLMPipeline::LLMPipeline( std::string& tokenizer_path, std::string& detokenizer_path, std::string device, - const ov::AnyMap& config + const ov::AnyMap& plugin_config ) { m_device = device; - m_config = config; + m_plugin_config = plugin_config; ov::Core core; auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; @@ -226,7 +226,7 @@ ov::LLMPipeline::LLMPipeline( std::string full_path = model_path; if (!is_xml(full_path)) full_path += "/openvino_model.xml"; - m_model_runner = core.compile_model(full_path, device, config).create_infer_request(); + m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); // todo: add loading Tokenizers from separate folders } @@ -435,11 +435,13 @@ ov::GenerationResults ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tenso } std::vector beams; - // for (const std::vector& group : finalize(std::move(group_beam_searcher))) { - // for (const Beam& beam : group) { - // beams.emplace_back(beam); - // } - // } + for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + for (const std::vector group : prompt_group) { + for (const Beam& beam : group) { + beams.emplace_back(beam); + } + } + } auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); @@ -470,7 +472,7 @@ subsequent requests. 
ov::GenerationResults ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { auto batch_size = input_ids.get_shape()[0]; OPENVINO_ASSERT(batch_size == 1); - auto draft_model = sampling_params.get_assistant_model(m_device, m_config); + auto draft_model = sampling_params.get_assistant_model(m_device, m_plugin_config); auto main_model = m_model_runner; auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; @@ -743,11 +745,16 @@ void ov::LLMPipeline::set_streamer(std::function callback) { m_streamer = TextCoutStreamer(m_tokenizer); } -void ov::LLMPipeline::start_conversation() { +void ov::LLMPipeline::set_streamer() { + is_streamer_set = false; + m_streamer_callback = [](std::string){ ;}; +} + +void ov::LLMPipeline::start_chat() { is_chat_conversation = true; } -void ov::LLMPipeline::stop_conversation() { +void ov::LLMPipeline::finish_chat() { is_chat_conversation = false; reset_state(); } @@ -755,3 +762,11 @@ void ov::LLMPipeline::stop_conversation() { void ov::LLMPipeline::reset_state() { m_model_runner.reset_state(); } + +void ov::LLMPipeline::set_default_config(const GenerationConfig& generation_config) { + m_sampling_parameters = generation_config; +} + +void ov::LLMPipeline::set_default_config(const AnyMap& generation_config_map) { + m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp index 29c7dd264d..2a4a718150 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp @@ -77,14 +77,13 @@ class PipelineResults { }; - class LLMPipeline { public: ov::InferRequest m_model_runner; Tokenizer m_tokenizer; GenerationConfig m_sampling_parameters; std::string m_device; - ov::AnyMap m_config; + ov::AnyMap m_plugin_config; ov::Tensor m_attentions_mask_cache; bool is_streamer_set = false; std::string m_chat_template = ""; @@ -101,10 +100,11 @@ class LLMPipeline { std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU", - const ov::AnyMap& config={} + const ov::AnyMap& plugin_config={} ); LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); + GenerationConfig generation_config() const; GenerationResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); @@ -136,21 +136,21 @@ class LLMPipeline { std::string apply_chat_template(std::string prompt, std::string role = "user") const; void set_streamer(std::function callback); - void start_conversation(); - void stop_conversation(); + void set_streamer(); + void start_chat(); + void finish_chat(); void reset_state(); + void set_default_config(const GenerationConfig& generation_config); + void set_default_config(const AnyMap& generation_config_map); private: TextCoutStreamer m_streamer; std::function m_streamer_callback = [](std::string ){ ;}; bool is_chat_conversation = false; std::string call(std::string text); - std::string call(std::string text, GenerationConfig generation_config); - PipelineResults call(std::vector text, GenerationConfig sampling_parameters); -}; - +}; } // namespace ov \ No newline at end of file From 16816547d27978c0edaf301a78388602d9953a9f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 6 May 2024 11:49:23 +0200 Subject: [PATCH 28/97] move to 
separate include & src --- text_generation/causal_lm/cpp/CMakeLists.txt | 46 +++++++++---------- .../cpp/generate_pipeline/generate_sample.cpp | 2 +- .../{ => include}/generation_config.hpp | 0 .../{ => include}/llm_pipeline.hpp | 13 +----- .../{ => include}/llm_tokenizer.hpp | 1 - .../{ => include}/text_streamer.hpp | 0 .../{ => src}/llm_pipeline.cpp | 12 ++++- .../{ => src}/llm_tokenizer.cpp | 2 +- .../{ => src}/text_streamer.cpp | 0 9 files changed, 37 insertions(+), 39 deletions(-) rename text_generation/causal_lm/cpp/generate_pipeline/{ => include}/generation_config.hpp (100%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => include}/llm_pipeline.hpp (86%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => include}/llm_tokenizer.hpp (99%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => include}/text_streamer.hpp (100%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => src}/llm_pipeline.cpp (98%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => src}/llm_tokenizer.cpp (99%) rename text_generation/causal_lm/cpp/generate_pipeline/{ => src}/text_streamer.cpp (100%) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 33d4fd6643..c1a3e046c7 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -18,7 +18,6 @@ include_directories($ENV{HOME}/opt/jinja2cpp/include) set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) # todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -28,8 +27,6 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME beam_search_causal_lm) add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) -# todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -39,8 +36,6 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) set(TARGET_NAME speculative_decoding_lm) add_executable(${TARGET_NAME} speculative_decoding_lm.cpp) -# todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -48,36 +43,39 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME generate_sample) -add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) -# todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set(TARGET_NAME 
prompt_lookup_decoding_lm) +add_executable(${TARGET_NAME} prompt_lookup_decoding_lm.cpp) target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(${TARGET_NAME} PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME chat_sample) -add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp generate_pipeline/llm_tokenizer.cpp generate_pipeline/llm_pipeline.cpp) -# todo: remove hardcode -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) +# Generate Pipeline library +set(TARGET_NAME generate_pipeline_lib) +file(GLOB SOURCE_FILES "generate_pipeline/src/*.cpp") +add_library(${TARGET_NAME} ${SOURCE_FILES}) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/generate_pipeline/include) find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -target_link_libraries(${TARGET_NAME} PRIVATE nlohmann_json::nlohmann_json) +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) +target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) +set(TARGET_NAME generate_sample) +add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -set(TARGET_NAME prompt_lookup_decoding_lm) -add_executable(${TARGET_NAME} prompt_lookup_decoding_lm.cpp) -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(${TARGET_NAME} PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) +set(TARGET_NAME chat_sample) +add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) +target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 94fd3411ff..54306e38b2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include +// 
#include #include "llm_pipeline.hpp" diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/generation_config.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp similarity index 86% rename from text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 2a4a718150..6fb16bee13 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -5,21 +5,12 @@ #include #include -#include "generate_pipeline/generation_config.hpp" -#include "generate_pipeline/llm_tokenizer.hpp" +#include "generation_config.hpp" +#include "llm_tokenizer.hpp" #include using namespace std; -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); -ov::Tensor init_attention_mask(ov::Tensor& position_ids); -ov::Tensor extend_attention(ov::Tensor attention_mask); -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); - -std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); - class Tokenizer; // forward declaration namespace ov { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp similarity index 99% rename from text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp index 219554a57e..47bdab61b5 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp @@ -7,7 +7,6 @@ #include #include - using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/text_streamer.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/text_streamer.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/include/text_streamer.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp similarity index 98% rename from text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp rename to text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 66d72bb426..6995d79fb4 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -2,12 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "generate_pipeline/llm_pipeline.hpp" +#include "llm_pipeline.hpp" #include "group_beam_searcher.hpp" #include #include #include +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& 
attention_mask); +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); +ov::Tensor init_attention_mask(ov::Tensor& position_ids); +ov::Tensor extend_attention(ov::Tensor attention_mask); +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); + + using namespace std; std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp similarity index 99% rename from text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp rename to text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp index 8e0263a895..48f25812f4 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "generate_pipeline/llm_tokenizer.hpp" +#include "llm_tokenizer.hpp" #include diff --git a/text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/text_streamer.cpp rename to text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp From 9fe73c69e6513346379231e078529fdb840e0bef Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 6 May 2024 14:44:55 +0200 Subject: [PATCH 29/97] pimpl implementation --- .../cpp/generate_pipeline/chat_sample.cpp | 7 +- .../cpp/generate_pipeline/generate_sample.cpp | 101 ++++----- .../include/generation_config.hpp | 72 ++---- .../include/llm_pipeline.hpp | 100 ++------- .../generate_pipeline/src/llm_pipeline.cpp | 209 +++++++++++------- 5 files changed, 223 insertions(+), 266 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 075af1a9d0..f39d55820f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -45,7 +45,7 @@ int main(int argc, char* argv[]) try { pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); std::string accumulated_str = ""; - // pipe.start_conversation(); + pipe.start_chat(); for (size_t i = 0; i < questions.size(); i++) { prompt = questions[i]; @@ -53,10 +53,7 @@ int main(int argc, char* argv[]) try { cout << prompt << endl; // std::getline(std::cin, prompt); - // auto answer_str = pipe.call(prompt, config); - - accumulated_str += pipe.apply_chat_template(prompt); - auto answer_str = pipe(accumulated_str, config); + auto answer_str = pipe(prompt, config); accumulated_str += answer_str; cout << "\n----------\n"; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 54306e38b2..3ec36ee7d1 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -66,64 +66,65 @@ int main(int argc, char* argv[]) try { }; cout << "greedy generate streaming mode:" << endl; - 
config.max_new_tokens(20).set_streamer(text_streamer_callback); + config.max_new_tokens(20); + config.set_streamer(text_streamer_callback); pipe(prompt, config); text_streamer.end(); // Example 2: Grouped Beam Search decoding example - pipe = ov::LLMPipeline(model_path, device); - config = pipe.generation_config(); + // pipe = ov::LLMPipeline(model_path, device); + // config = pipe.generation_config(); - // will return vector with num_return_sequences strings - auto num_return_sequences = 3; - config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); + // // will return vector with num_return_sequences strings + // auto num_return_sequences = 3; + // config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); - cout << endl << "grouped beam search generated candidates:" << endl; - auto generation_results = pipe({prompt}, config); - for (int i = 0; i < num_return_sequences; ++i) - cout << generation_results[i].score << ": " << generation_results[i].text << endl; - - // Example 3: Greedy Decoding with multiple batch - pipe = ov::LLMPipeline(model_path, device); - config = pipe.generation_config(); - - cout << endl << "greedy decoding with multiple batches:" << endl; - std::vector prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"}; - auto results = pipe(prompts, config.max_new_tokens(20)); - for (const auto& res: results) - std::cout << res.text << std::endl; - - // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates - pipe = ov::LLMPipeline(model_path); - auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); - config = GenerationConfig::beam_search(); - // config for grouped beam search - config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); + // cout << endl << "grouped beam search generated candidates:" << endl; + // auto generation_results = pipe({prompt}, config); + // for (int i = 0; i < num_return_sequences; ++i) + // cout << generation_results[i].score << ": " << generation_results[i].text << endl; + + // // Example 3: Greedy Decoding with multiple batch + // pipe = ov::LLMPipeline(model_path, device); + // config = pipe.generation_config(); + + // cout << endl << "greedy decoding with multiple batches:" << endl; + // std::vector prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"}; + // auto results = pipe(prompts, config.max_new_tokens(20)); + // for (const auto& res: results) + // std::cout << res.text << std::endl; + + // // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates + // pipe = ov::LLMPipeline(model_path); + // auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); + // config = GenerationConfig::beam_search(); + // // config for grouped beam search + // config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15); - cout << endl << "beam search with printing of all candidates:" << endl; - auto beams = pipe.generate(input_ids, attention_mask, config); - for (size_t i = 0; i < beams.scores.size(); i++) { - std::cout << beams.scores[i] << ": " << pipe.get_tokenizer().detokenize(beams.tokens[i]) << std::endl; - } - - // for (const auto& beam : beams.second) - // std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; - - { - // Example 5: Speculative sampling - std::string assitive_model_path = 
"text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; - pipe = ov::LLMPipeline(model_path); - auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); - // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); - pipe.generation_config().assistant_model(assitive_model_path); + // cout << endl << "beam search with printing of all candidates:" << endl; + // auto beams = pipe.generate(input_ids, attention_mask, config); + // for (size_t i = 0; i < beams.scores.size(); i++) { + // std::cout << beams.scores[i] << ": " << pipe.get_tokenizer().detokenize(beams.tokens[i]) << std::endl; + // } + + // // for (const auto& beam : beams.second) + // // std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl; + + // { + // // Example 5: Speculative sampling + // std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16"; + // pipe = ov::LLMPipeline(model_path); + // auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt}); + // // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20); + // pipe.generation_config().assistant_model(assitive_model_path); - cout << endl << "Speculative sampling with TinyLlama assistance:" << endl; - auto results = pipe.generate(input_ids, attention_mask, config); - for (size_t i = 0; i < beams.scores.size(); i++) { - for (const auto& result : results) - std::cout << pipe.get_tokenizer().detokenize(result.tokens) << std::endl; - } - } + // cout << endl << "Speculative sampling with TinyLlama assistance:" << endl; + // auto results = pipe.generate(input_ids, attention_mask, config); + // for (size_t i = 0; i < beams.scores.size(); i++) { + // for (const auto& result : results) + // std::cout << pipe.get_tokenizer().detokenize(result.tokens) << std::endl; + // } + // } } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp index 5be46b6be7..d302b5dc40 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp @@ -106,96 +106,78 @@ class GenerationConfig { } } - GenerationConfig& max_new_tokens(size_t max_new_tokens) { + void max_new_tokens(size_t max_new_tokens) { const auto& r = ::default_generation_config_map.find("sdf") != ::default_generation_config_map.end(); m_max_new_tokens = max_new_tokens; - return *this; } - GenerationConfig& max_length(size_t max_length) { + void max_length(size_t max_length) { m_max_length = max_length; - return *this; } - GenerationConfig& ignore_eos(bool ignore_eos) { + void ignore_eos(bool ignore_eos) { m_ignore_eos = ignore_eos; - return *this; } - GenerationConfig& eos_token(std::string eos_token) { + void eos_token(std::string eos_token) { m_eos_token = eos_token; - return *this; } - GenerationConfig& num_return_sequences(size_t num_return_sequences) { + void num_return_sequences(size_t num_return_sequences) { m_num_return_sequences = num_return_sequences; - return *this; } - GenerationConfig& num_groups(size_t num_groups) { + void num_groups(size_t num_groups) { m_num_groups = num_groups; - return *this; } - GenerationConfig& group_size(size_t group_size) { + void group_size(size_t group_size) { m_group_size = 
group_size; - return *this; } - GenerationConfig& diversity_penalty(float diversity_penalty) { + void diversity_penalty(float diversity_penalty) { m_diversity_penalty = diversity_penalty; - return *this; } - GenerationConfig& length_penalty(float length_penalty) { + void length_penalty(float length_penalty) { m_length_penalty = length_penalty; - return *this; } - GenerationConfig& no_repeat_ngram_size(size_t no_repeat_ngram_size) { + void no_repeat_ngram_size(size_t no_repeat_ngram_size) { m_no_repeat_ngram_size = no_repeat_ngram_size; - return *this; } - GenerationConfig& temperature(float temperature) { + void temperature(float temperature) { m_temperature = temperature; - return *this; } - GenerationConfig& top_k(size_t top_k) { + void top_k(size_t top_k) { m_top_k = top_k; - return *this; } - GenerationConfig& top_p(size_t top_p) { + void top_p(size_t top_p) { m_top_p = top_p; - return *this; } - GenerationConfig& do_sample(bool do_sample) { + void do_sample(bool do_sample) { m_do_sample = do_sample; - return *this; } - GenerationConfig& repetition_penalty(float repetition_penalty) { + void repetition_penalty(float repetition_penalty) { m_repetition_penalty = repetition_penalty; - return *this; } - GenerationConfig& bos_token_id(int64_t bos_token_id) { + void bos_token_id(int64_t bos_token_id) { m_bos_token_id = bos_token_id; - return *this; } - GenerationConfig& eos_token_id(int64_t eos_token_id) { + void eos_token_id(int64_t eos_token_id) { m_eos_token_id = eos_token_id; - return *this; } - GenerationConfig& pad_token_id(int64_t pad_token_id) { + void pad_token_id(int64_t pad_token_id) { m_pad_token_id = pad_token_id; - return *this; } GenerationConfig() = default; @@ -274,37 +256,32 @@ class GenerationConfig { } // for speculative decoding - GenerationConfig& assistant_model(const ov::InferRequest& assistant_model) { + void assistant_model(const ov::InferRequest& assistant_model) { m_assistant_model = assistant_model; is_assistant_request_defined = true; - return *this; } - GenerationConfig& assistant_model(ov::CompiledModel& assistant_model) { + void assistant_model(ov::CompiledModel& assistant_model) { m_assistant_model = assistant_model.create_infer_request(); is_assistant_request_defined = true; - return *this; } - GenerationConfig& assistant_model(const std::shared_ptr& assistant_model) { + void assistant_model(const std::shared_ptr& assistant_model) { m_assistant_ov_model = assistant_model; is_assistant_ov_defined = true; - return *this; } - GenerationConfig& assistant_model(std::string assistant_model) { + void assistant_model(std::string assistant_model) { auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; if (!is_xml(assistant_model)) assistant_model += "/openvino_model.xml"; m_assistant_ov_model = ov::Core().read_model(assistant_model); is_assistant_ov_defined = true; - return *this; } - GenerationConfig& set_streamer(std::function&&, ov::LLMPipeline&)> callback) { + void set_streamer(std::function&&, ov::LLMPipeline&)> callback) { m_callback = callback; - return *this; } ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { @@ -319,9 +296,8 @@ class GenerationConfig { } } - GenerationConfig& num_assistant_tokens(int64_t num_assistant_tokens) { + void num_assistant_tokens(int64_t num_assistant_tokens) { m_num_assistant_tokens = num_assistant_tokens; - return *this; } bool is_speculative() const { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp 
b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 6fb16bee13..a25d93b5e9 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#pragma once +// #pragma once #include #include @@ -15,112 +15,49 @@ class Tokenizer; // forward declaration namespace ov { -template -class ResultsIterator { - public: - ResultsIterator(const T& results, size_t index) : results(results), index(index) {} - - bool operator!=(const ResultsIterator& other) const; - - ItemType operator*() const; - - ResultsIterator& operator++(); - - private: - const T& results; - size_t index; -}; - -class TextScorePair { -public: - std::string text; - float score; -}; - -class TokensScorePair { -public: - std::vector tokens; - float score; -}; - -class GenerationResults { +class EncodedResults { public: std::vector> tokens; std::vector scores; - - TokensScorePair operator[](size_t index) const; - - ResultsIterator begin() const; - - ResultsIterator end() const; }; -class PipelineResults { +class DecodedResults { public: std::vector texts; std::vector scores; - - TextScorePair operator[](size_t index) const; - - ResultsIterator begin() const; - - ResultsIterator end() const; }; - class LLMPipeline { public: - ov::InferRequest m_model_runner; - Tokenizer m_tokenizer; - GenerationConfig m_sampling_parameters; - std::string m_device; - ov::AnyMap m_plugin_config; - ov::Tensor m_attentions_mask_cache; - bool is_streamer_set = false; - std::string m_chat_template = ""; - - // TODO: add constructor for specifying manually tokenizer path - // dir path - // xml file path - // compiled model - // infer request - // ov::Model - LLMPipeline( std::string& model_path, - std::string& tokenizer_path, + std::string& tokenizer_path, // todo: make available also specifying ov::Model, ov::CompiledModel, etc. 
tokenizers std::string& detokenizer_path, std::string device="CPU", const ov::AnyMap& plugin_config={} ); - - LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); + + LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); + + ~LLMPipeline(); // Declare the destructor GenerationConfig generation_config() const; - GenerationResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - - GenerationResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); - - GenerationResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - - GenerationResults multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); - std::string operator()(std::string text); std::string operator()(std::string text, GenerationConfig sampling_parameters); - PipelineResults operator()(std::vector text, GenerationConfig sampling_parameters); + DecodedResults operator()(std::vector text, GenerationConfig sampling_parameters); - PipelineResults operator()(std::initializer_list text, GenerationConfig sampling_parameters); + DecodedResults operator()(std::initializer_list text, GenerationConfig sampling_parameters); - GenerationResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); - GenerationResults generate(ov::Tensor input_ids, ov::Tensor attention_mask); + EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask); - GenerationResults generate(ov::Tensor input_ids, GenerationConfig sampling_params); + EncodedResults generate(ov::Tensor input_ids, GenerationConfig sampling_params); - GenerationResults generate(ov::Tensor input_ids); + EncodedResults generate(ov::Tensor input_ids); Tokenizer get_tokenizer(); @@ -133,15 +70,14 @@ class LLMPipeline { void reset_state(); void set_default_config(const GenerationConfig& generation_config); void set_default_config(const AnyMap& generation_config_map); + private: - TextCoutStreamer m_streamer; - std::function m_streamer_callback = [](std::string ){ ;}; - bool is_chat_conversation = false; + class LLMPipelineImpl; + std::unique_ptr m_pimpl; std::string call(std::string text); std::string call(std::string text, GenerationConfig generation_config); - PipelineResults call(std::vector text, GenerationConfig sampling_parameters); - + DecodedResults call(std::vector text, GenerationConfig sampling_parameters); }; } // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 6995d79fb4..0695865ca6 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "generation_config.hpp" void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); @@ -17,6 +18,61 @@ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t n std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); +namespace ov { + +class LLMPipeline::LLMPipelineImpl { +public: + ov::InferRequest 
m_model_runner; + Tokenizer m_tokenizer; + GenerationConfig m_sampling_parameters; + std::string m_device; + ov::AnyMap m_plugin_config; + ov::Tensor m_attentions_mask_cache; + bool is_streamer_set = false; + std::string m_chat_template = ""; + + // TODO: add constructor for specifying manually tokenizer path + // dir path + // xml file path + // compiled model + // infer request + // ov::Model + + LLMPipelineImpl( + std::string& model_path, + std::string& tokenizer_path, + std::string& detokenizer_path, + std::string device="CPU", + const ov::AnyMap& plugin_config={} + ); + + LLMPipelineImpl(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); + + GenerationConfig generation_config() const; + + EncodedResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + EncodedResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); + + EncodedResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); + + EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + + std::string apply_chat_template(std::string prompt, std::string role = "user") const; + + TextCoutStreamer m_streamer; + std::function m_streamer_callback = [](std::string ){ ;}; + bool is_chat_conversation = false; + + std::string call(std::string text); + std::string call(std::string text, GenerationConfig generation_config); + DecodedResults call(std::vector text, GenerationConfig sampling_parameters); + +}; + +} // namespace ov using namespace std; @@ -170,57 +226,17 @@ std::pair softmax(const ov::Tensor& logits, const size_t batch_i return {out_token, log_sum}; } -template -bool ov::ResultsIterator::operator!=(const ResultsIterator& other) const { - return index != other.index; -} - -template -ItemType ov::ResultsIterator::operator*() const { - return ItemType{results[index]}; -} - -template -ov::ResultsIterator& ov::ResultsIterator::operator++() { - ++index; - return *this; -} - - -template class ov::ResultsIterator; -template class ov::ResultsIterator; - -ov::TokensScorePair ov::GenerationResults::operator[](size_t index) const { - if (index >= tokens.size() || index >= scores.size()) { - OPENVINO_THROW("Index out of range"); - } - return TokensScorePair{tokens[index], scores[index]}; -} - -ov::ResultsIterator ov::GenerationResults::begin() const { - return ov::ResultsIterator(*this, 0); -} - -ov::ResultsIterator ov::GenerationResults::end() const { - return ResultsIterator(*this, tokens.size()); -} - -ov::TextScorePair ov::PipelineResults::operator[](size_t index) const { - if (index >= texts.size() || index >= scores.size()) { - OPENVINO_THROW("Index out of range"); - } - return TextScorePair{texts[index], scores[index]}; -} - -ov::ResultsIterator ov::PipelineResults::begin() const { - return ov::ResultsIterator(*this, 0); -} - -ov::ResultsIterator ov::PipelineResults::end() const { - return ov::ResultsIterator(*this, texts.size()); +ov::LLMPipeline::LLMPipeline( + std::string& model_path, + std::string& tokenizer_path, + std::string& detokenizer_path, + std::string device, + const ov::AnyMap& plugin_config +) { + m_pimpl = make_unique(model_path, tokenizer_path, detokenizer_path, device, plugin_config); } -ov::LLMPipeline::LLMPipeline( +ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( std::string& model_path, 
std::string& tokenizer_path, std::string& detokenizer_path, @@ -242,6 +258,10 @@ ov::LLMPipeline::LLMPipeline( } ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { + m_pimpl = make_unique(path, device, config); +} + +ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config) { std::string tokenizer_config_fname = "tokenizer_config.json"; std::string generation_config_fname = "generation_config.json"; @@ -263,10 +283,14 @@ ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::An m_tokenizer = Tokenizer(path); } -GenerationConfig ov::LLMPipeline::generation_config() const { +GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { return m_sampling_parameters; } +GenerationConfig ov::LLMPipeline::generation_config() const { + return m_pimpl->generation_config(); +} + void print_tensor(const ov::Tensor& tensor) { std::vector res; @@ -282,7 +306,7 @@ void print_tensor(const ov::Tensor& tensor) { cout << "---------" << endl; } -ov::GenerationResults ov::LLMPipeline::greedy_search(ov::Tensor input_ids, +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = input_ids.get_shape(); @@ -295,7 +319,7 @@ ov::GenerationResults ov::LLMPipeline::greedy_search(ov::Tensor input_ids, auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; initialize_position_ids(position_ids, attention_mask, kv_cache_len); - ov::GenerationResults results; + ov::EncodedResults results; results.scores.resize(batch_size); results.tokens.resize(batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); @@ -385,7 +409,7 @@ ov::GenerationResults ov::LLMPipeline::greedy_search(ov::Tensor input_ids, return results; } -ov::GenerationResults ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; // todo: implement for batch > 1 @@ -456,7 +480,7 @@ ov::GenerationResults ov::LLMPipeline::beam_search(ov::Tensor prompts, ov::Tenso auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); - ov::GenerationResults results; + ov::EncodedResults results; for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { // todo: convert to string results.scores.emplace_back(beam->score); @@ -479,7 +503,7 @@ match the target. In tha caste the are validated in a single inference request t the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. 
*/ -ov::GenerationResults ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { auto batch_size = input_ids.get_shape()[0]; OPENVINO_ASSERT(batch_size == 1); auto draft_model = sampling_params.get_assistant_model(m_device, m_plugin_config); @@ -536,7 +560,7 @@ ov::GenerationResults ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids // the first token which is fed to both draft and main netwoks on each iteration auto first_token = out_token; - ov::GenerationResults results; + ov::EncodedResults results; results.tokens.resize(batch_size); results.tokens[0].emplace_back(out_token); @@ -617,17 +641,25 @@ ov::GenerationResults ov::LLMPipeline::speculative_sampling(ov::Tensor input_ids return results; } -ov::GenerationResults ov::LLMPipeline::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { // todo: implement - ov::GenerationResults results; + ov::EncodedResults results; return results; } -std::string ov::LLMPipeline::call(std::string text) { +std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text) { return call(text, m_sampling_parameters); } +std::string ov::LLMPipeline::call(std::string text) { + return m_pimpl->call(text); +} + std::string ov::LLMPipeline::call(std::string text, GenerationConfig generation_config) { + return m_pimpl->call(text, generation_config); +} + +std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationConfig generation_config) { if (is_chat_conversation) { text = apply_chat_template(text); } @@ -671,7 +703,11 @@ std::string ov::LLMPipeline::call(std::string text, GenerationConfig generation_ return m_tokenizer.detokenize(generate_results.tokens)[0]; } -ov::PipelineResults ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { +ov::DecodedResults ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { + return m_pimpl->call(text, sampling_parameters); +} + +ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::call(std::vector text, GenerationConfig sampling_parameters) { auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); auto generate_results = generate(input_ids, attention_mask, sampling_parameters); @@ -687,16 +723,20 @@ std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig sampl return call(text, sampling_parameters); } -ov::PipelineResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { +ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -ov::PipelineResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { +ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { return call(text, sampling_parameters); } -ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { - ov::GenerationResults result; +ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { + return 
m_pimpl->generate(input_ids, attention_mask, generation_config); +} + +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { + ov::EncodedResults result; if (generation_config.is_gready_sampling()) { result = greedy_search(input_ids, attention_mask, generation_config); @@ -709,29 +749,34 @@ ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor } if (!is_chat_conversation) - reset_state(); + // reset_state(); todo: implement in m_mimpl + m_model_runner.reset_state(); return result; } -ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { - return generate(input_ids, attention_mask, m_sampling_parameters); +ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { + return generate(input_ids, attention_mask, m_pimpl->m_sampling_parameters); } -ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { return generate(input_ids, init_attention_mask(input_ids), sampling_params); } -ov::GenerationResults ov::LLMPipeline::generate(ov::Tensor input_ids) { - return generate(input_ids, init_attention_mask(input_ids), m_sampling_parameters); +ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { + return generate(input_ids, init_attention_mask(input_ids), m_pimpl->m_sampling_parameters); } Tokenizer ov::LLMPipeline::get_tokenizer() { - return m_tokenizer; + return m_pimpl->m_tokenizer; } std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { + return m_pimpl->apply_chat_template(prompt, role); +} + +std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; env.GetSettings().trimBlocks = true; @@ -750,33 +795,35 @@ std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string } void ov::LLMPipeline::set_streamer(std::function callback) { - is_streamer_set = true; - m_streamer_callback = callback; - m_streamer = TextCoutStreamer(m_tokenizer); + m_pimpl->is_streamer_set = true; + m_pimpl->m_streamer_callback = callback; + m_pimpl->m_streamer = TextCoutStreamer(m_pimpl->m_tokenizer); } void ov::LLMPipeline::set_streamer() { - is_streamer_set = false; - m_streamer_callback = [](std::string){ ;}; + m_pimpl->is_streamer_set = false; + m_pimpl->m_streamer_callback = [](std::string){ ;}; } void ov::LLMPipeline::start_chat() { - is_chat_conversation = true; + m_pimpl->is_chat_conversation = true; } void ov::LLMPipeline::finish_chat() { - is_chat_conversation = false; + m_pimpl->is_chat_conversation = false; reset_state(); } void ov::LLMPipeline::reset_state() { - m_model_runner.reset_state(); + m_pimpl->m_model_runner.reset_state(); } void ov::LLMPipeline::set_default_config(const GenerationConfig& generation_config) { - m_sampling_parameters = generation_config; + m_pimpl->m_sampling_parameters = generation_config; } void ov::LLMPipeline::set_default_config(const AnyMap& generation_config_map) { - m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); + m_pimpl->m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); } + +ov::LLMPipeline::~LLMPipeline() = default; From 
053708fb9f7e4ae89464927b4b9d5ef2bdee9cde Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 6 May 2024 16:48:01 +0200 Subject: [PATCH 30/97] temporary disable jinja2cpp --- text_generation/causal_lm/cpp/CMakeLists.txt | 4 +-- .../generate_pipeline/src/llm_pipeline.cpp | 35 +++++++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index c1a3e046c7..1d9cbd66be 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINAR add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") # todo: remove hardcodes and make submodule work -include_directories($ENV{HOME}/opt/jinja2cpp/include) +# include_directories($ENV{HOME}/opt/jinja2cpp/include) # add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") # include_directories(../../../thirdparty/inja/include/Jinja2Cpp) @@ -62,7 +62,7 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode +# target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 0695865ca6..62384fdc49 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -5,8 +5,8 @@ #include "llm_pipeline.hpp" #include "group_beam_searcher.hpp" #include -#include -#include +// #include +// #include #include "generation_config.hpp" void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); @@ -777,21 +777,26 @@ std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string } std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { - jinja2::TemplateEnv env; - env.GetSettings().lstripBlocks = true; - env.GetSettings().trimBlocks = true; - jinja2::Template tpl(&env); - tpl.Load(m_chat_template); + // jinja2::TemplateEnv env; + // env.GetSettings().lstripBlocks = true; + // env.GetSettings().trimBlocks = true; + // jinja2::Template tpl(&env); + // tpl.Load(m_chat_template); - jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; - jinja2::ValuesMap params = { - {"messages", jinja2::ValuesList({message})}, - {"bos_token", ""}, - {"eos_token", ""}, // todo: load from config - {"add_generation_prompt", true}, - }; + // jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + // jinja2::ValuesMap params = { + // {"messages", jinja2::ValuesList({message})}, + // {"bos_token", ""}, + // {"eos_token", ""}, // todo: load from config + // {"add_generation_prompt", true}, + // }; - return tpl.RenderAsString(params).value(); + // return tpl.RenderAsString(params).value(); 
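    // With Jinja2Cpp temporarily disabled above, the fallback added just below hardcodes a
    // TinyLlama-style chat template. For illustration only, a prompt such as
    // "Why is the sky blue?" would be wrapped as:
    //   <|user|>
    //   Why is the sky blue?
    //   <|assistant|>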
+ + std::stringstream result_prompt; + result_prompt << "<|user|>\n" << prompt << "\n<|assistant|>\n"; // hardcode template for TinyLlama + + return result_prompt.str(); } void ov::LLMPipeline::set_streamer(std::function callback) { From bd6849ae3ee7993e68b8ebe3853c37c39756a7e1 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 7 May 2024 12:43:18 +0200 Subject: [PATCH 31/97] add python api draft, hide implementations from user & refactor implementation --- text_generation/causal_lm/cpp/CMakeLists.txt | 22 +- .../cpp/generate_pipeline/chat_sample.cpp | 4 +- .../cpp/generate_pipeline/generate_sample.cpp | 12 +- .../include/generation_config.hpp | 343 +-------------- .../include/llm_pipeline.hpp | 12 +- .../include/llm_tokenizer.hpp | 44 +- .../include/streamer_base.hpp | 16 + .../generate_pipeline/python/CMakeLists.txt_ | 3 + .../python/py_generate_pipeline.cpp | 27 ++ .../cpp/generate_pipeline/src/beam_search.cpp | 89 ++++ .../src/generation_config.cpp | 396 ++++++++++++++++++ .../src/generation_config_helper.hpp | 78 ++++ .../generate_pipeline/src/llm_pipeline.cpp | 185 +++----- .../generate_pipeline/src/llm_tokenizer.cpp | 109 ++--- .../src/text_callback_streamer.cpp | 77 ++++ .../text_callback_streamer.hpp} | 35 +- .../generate_pipeline/src/text_streamer.cpp | 45 -- 17 files changed, 889 insertions(+), 608 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp rename text_generation/causal_lm/cpp/generate_pipeline/{include/text_streamer.hpp => src/text_callback_streamer.hpp} (52%) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 1d9cbd66be..431649e7c1 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) set(JINJA2CPP_DEPS_MODE internal) - +set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") @@ -55,14 +55,14 @@ set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) # Generate Pipeline library set(TARGET_NAME generate_pipeline_lib) file(GLOB SOURCE_FILES "generate_pipeline/src/*.cpp") -add_library(${TARGET_NAME} ${SOURCE_FILES}) +add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/generate_pipeline/include) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) target_compile_definitions(${TARGET_NAME} PRIVATE 
OPENVINO_TOKENIZERS_PATH=\"$\") -# target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode +target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) @@ -79,3 +79,19 @@ target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) + +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG v2.12.0 +) + +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_generate_pipeline generate_pipeline/python/py_generate_pipeline.cpp) +target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index f39d55820f..d5ae403fdc 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -39,9 +39,9 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, device); - GenerationConfig config = pipe.generation_config(); + ov::GenerationConfig config = pipe.generation_config(); - config.max_new_tokens(10000); + config.max_new_tokens = 10000; pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); std::string accumulated_str = ""; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 3ec36ee7d1..bfe1668dfa 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -10,13 +10,13 @@ // but detokenize(tokenize("prefix a")) == "prefix a" // 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" struct TextStreamer { - Tokenizer tokenizer; + ov::Tokenizer tokenizer; std::vector token_cache; size_t print_len = 0; void put(int64_t token) { token_cache.push_back(token); - std::string text = tokenizer.detokenize(token_cache); + std::string text = tokenizer.decode(token_cache); if (!text.empty() && '\n' == text.back()) { // Flush the cache after the new line symbol std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; @@ -33,7 +33,7 @@ struct TextStreamer { } void end() { - std::string text = tokenizer.detokenize(token_cache); + std::string text = tokenizer.decode(token_cache); std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) try { ov::LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. 
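    // For illustration only (values invented), a generation_config.json that the
    // GenerationConfig(std::string json_path) constructor added later in this patch
    // can parse might look like:
    //   { "bos_token_id": 1, "eos_token_id": 2, "pad_token_id": 0,
    //     "max_new_tokens": 256, "num_beams": 1, "do_sample": false,
    //     "temperature": 0.0, "top_p": 1.0 }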
// but if not found default velues for gready search will be used - GenerationConfig config = pipe.generation_config(); + ov::GenerationConfig config = pipe.generation_config(); auto text_streamer = TextStreamer{pipe.get_tokenizer()}; auto text_streamer_callback = [&text_streamer](std::vector&& tokens, ov::LLMPipeline& pipe){ @@ -66,8 +66,8 @@ int main(int argc, char* argv[]) try { }; cout << "greedy generate streaming mode:" << endl; - config.max_new_tokens(20); - config.set_streamer(text_streamer_callback); + config.max_new_tokens = 20; + // config.m_set_streamer(text_streamer_callback); pipe(prompt, config); text_streamer.end(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp index d302b5dc40..4526e6cbba 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp @@ -10,346 +10,49 @@ // #include // used only for StopCriteria #include #include "llm_tokenizer.hpp" +#include // forward declaration class Sequence; namespace ov { -// forward declaration -class LLMPipeline; - -} - -namespace { - -// TODO: LEAVE ONLY ONE PLACE FOR DEFAULT VALUES -static const ov::AnyMap default_generation_config_map = { - // Generic - {"max_new_tokens", SIZE_MAX}, - {"max_length", SIZE_MAX}, - {"m_ignore_eos", false}, - {"m_bos_token", ""}, - {"m_eos_token", ""}, - - // Beam search specific - {"m_num_groups", 1}, - {"m_group_size", 1}, - {"m_diversity_penalty", 1.0f}, // 0.0 means no diversity - {"m_num_return_sequences", 1}, // is used by beam search, in other case is equal to batch size - // {"stop_criteria", StopCriteria::heuristic}, // todo: align with the latest beam searcher - - {"m_repetition_penalty", 1.0f}, - {"m_length_penalty", 1.0f}, - {"m_no_repeat_ngram_size", std::numeric_limits::max()}, - {"early_finish", [](const Sequence&) {return false; }}, - - // Multinomial - {"m_temperature", 0.0f}, - {"m_top_k", -1}, - {"m_top_p", 1.0f}, - {"m_do_sample", false}, - - // special tokens - {"m_bos_token_id", 0}, - {"m_eos_token_id", 2}, // todo: check form where it's better to extract from rt_info or from tokenizer_config.json - {"m_pad_token_id", 0}, - - // assistive decoding - {"m_assistant_model", ov::InferRequest()}, - {"m_num_assistant_tokens", 5}, - {"m_seq_len_axis", 2}, -}; - -} // Similar to HuggingFace GenerationConfig class GenerationConfig { -public: +public: // Generic - size_t m_max_new_tokens; - size_t m_max_length; - bool m_ignore_eos; - std::string m_eos_token; + size_t max_new_tokens; + size_t max_length; + bool ignore_eos; + std::string eos_token; // Beam search specific - size_t m_num_groups; - size_t m_group_size; - float m_diversity_penalty; + size_t num_groups; + size_t group_size; + float diversity_penalty; size_t m_num_return_sequences; // StopCriteria stop_criteria = StopCriteria::heuristic; - float m_repetition_penalty; - float m_length_penalty; - size_t m_no_repeat_ngram_size; + float repetition_penalty; + float length_penalty; + size_t no_repeat_ngram_size; std::function early_finish = [](const Sequence&) {return false; }; // Multinomial - float m_temperature; - int m_top_k; - float m_top_p; - bool m_do_sample; + float temperature; + int top_k; + float top_p; + bool do_sample; + std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, // special tokens - int64_t m_bos_token_id; - int64_t m_eos_token_id; - int64_t 
m_pad_token_id; - - std::function&&, ov::LLMPipeline&)> m_callback = [](std::vector&& tokens, ov::LLMPipeline& pipe){ ;}; - - size_t get_max_new_tokens(size_t prompt_length = 0) { - // max_new_tokens has priority over max_length, - // only if m_max_new_tokens was not specified use max_length - if (m_max_new_tokens != SIZE_MAX) { - return m_max_new_tokens; - } else { - return m_max_length - prompt_length; - } - } - - void max_new_tokens(size_t max_new_tokens) { - const auto& r = ::default_generation_config_map.find("sdf") != ::default_generation_config_map.end(); - - m_max_new_tokens = max_new_tokens; - } - - void max_length(size_t max_length) { - m_max_length = max_length; - } - - void ignore_eos(bool ignore_eos) { - m_ignore_eos = ignore_eos; - } - - void eos_token(std::string eos_token) { - m_eos_token = eos_token; - } - - void num_return_sequences(size_t num_return_sequences) { - m_num_return_sequences = num_return_sequences; - } - - void num_groups(size_t num_groups) { - m_num_groups = num_groups; - } - - void group_size(size_t group_size) { - m_group_size = group_size; - } - - void diversity_penalty(float diversity_penalty) { - m_diversity_penalty = diversity_penalty; - } - - void length_penalty(float length_penalty) { - m_length_penalty = length_penalty; - } - - void no_repeat_ngram_size(size_t no_repeat_ngram_size) { - m_no_repeat_ngram_size = no_repeat_ngram_size; - } - - void temperature(float temperature) { - m_temperature = temperature; - } - - void top_k(size_t top_k) { - m_top_k = top_k; - } - - void top_p(size_t top_p) { - m_top_p = top_p; - } - - void do_sample(bool do_sample) { - m_do_sample = do_sample; - } - - void repetition_penalty(float repetition_penalty) { - m_repetition_penalty = repetition_penalty; - } - - void bos_token_id(int64_t bos_token_id) { - m_bos_token_id = bos_token_id; - } - - void eos_token_id(int64_t eos_token_id) { - m_eos_token_id = eos_token_id; - } - - void pad_token_id(int64_t pad_token_id) { - m_pad_token_id = pad_token_id; - } + int64_t bos_token_id; + int64_t eos_token_id; + int64_t pad_token_id; GenerationConfig() = default; - GenerationConfig(std::string json_path) { - std::ifstream f(json_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); - - nlohmann::json data = nlohmann::json::parse(f); - - m_bos_token_id = data.value("bos_token_id", 0); - m_eos_token_id = data.value("eos_token_id", 0); - m_eos_token = data.value("eos_token", ""); - - m_pad_token_id = data.value("pad_token_id", 0); - m_num_return_sequences = data.value("num_return_sequences", 1); - - m_max_new_tokens = data.value("max_new_tokens", SIZE_MAX); - m_max_length = data.value("max_length", SIZE_MAX); - - m_temperature = data.value("temperature", 0.0f); - m_do_sample = data.value("do_sample", false); - m_top_p = data.value("top_p", 0.0f); - - // beam_search_params - m_num_groups = data.value("num_beam_groups", 1); - m_diversity_penalty = data.value("diversity_penalty", 1.0f); - int num_beams = data.value("num_beams", 1); - m_group_size = num_beams / m_num_groups; - OPENVINO_ASSERT(num_beams % m_num_groups == 0, "number of beams should be divisible by number of groups"); - } - - - static GenerationConfig greedy() { - GenerationConfig greedy_params; - greedy_params.m_temperature = 0.0f; - greedy_params.m_ignore_eos = true; - return greedy_params; - } - - static GenerationConfig beam_search() { - GenerationConfig beam_search; - beam_search.m_num_groups = 3; - beam_search.m_group_size = 5; - beam_search.m_max_new_tokens = 10; - 
beam_search.m_diversity_penalty = 2.0f; - return beam_search; - } - - static GenerationConfig multimomial() { - GenerationConfig multimomial; - multimomial.m_temperature = 0.8f; - multimomial.m_top_p = 0.8; - multimomial.m_top_k = 20; - multimomial.m_do_sample = 20; - return multimomial; - } - - template - static GenerationConfig assistive_decoding(T& assistant_model) { - GenerationConfig assistive; - assistive.assistant_model(assistant_model); - return assistive; - } - - bool is_gready_sampling() const { - return !m_do_sample && !is_beam_search() && !is_speculative(); - } - - bool is_beam_search() const { - return m_num_groups * m_group_size > 1; - } - - bool is_multimomial() const { - return m_do_sample; - } - - // for speculative decoding - void assistant_model(const ov::InferRequest& assistant_model) { - m_assistant_model = assistant_model; - is_assistant_request_defined = true; - } - - void assistant_model(ov::CompiledModel& assistant_model) { - m_assistant_model = assistant_model.create_infer_request(); - is_assistant_request_defined = true; - } - - void assistant_model(const std::shared_ptr& assistant_model) { - m_assistant_ov_model = assistant_model; - is_assistant_ov_defined = true; - } - - void assistant_model(std::string assistant_model) { - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - if (!is_xml(assistant_model)) - assistant_model += "/openvino_model.xml"; - - m_assistant_ov_model = ov::Core().read_model(assistant_model); - is_assistant_ov_defined = true; - } - - void set_streamer(std::function&&, ov::LLMPipeline&)> callback) { - m_callback = callback; - } - - ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { - if (is_assistant_request_defined) { - return m_assistant_model; - } else if (is_assistant_ov_defined) { - m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); - is_assistant_request_defined = true; - return m_assistant_model; - } else { - OPENVINO_THROW("assistant model is not specified"); - } - } - - void num_assistant_tokens(int64_t num_assistant_tokens) { - m_num_assistant_tokens = num_assistant_tokens; - } - - bool is_speculative() const { - return is_assistant_ov_defined || is_assistant_request_defined; - } - - // for Assistive/Speculative decoding - ov::InferRequest m_assistant_model; - size_t m_num_assistant_tokens = 5; - size_t m_seq_len_axis = 2; - - static GenerationConfig anymap_to_generation_config(const ov::AnyMap& genereation_config_map = {}) { - // need to load default values and update only those keys that are specified in genereation_config_map - auto tmp_map = default_generation_config_map; - - for (auto it = genereation_config_map.begin(); it != genereation_config_map.end(); ++it) { - tmp_map[it->first] = it->second; - } - - GenerationConfig config; - - // general arguments - config.m_max_new_tokens = tmp_map.at("m_max_new_tokens").as(); - config.m_max_length = tmp_map.at("m_max_length").as(); - config.m_ignore_eos = tmp_map.at("m_ignore_eos").as(); - config.m_eos_token = tmp_map.at("m_eos_token").as(); - - // Beam search specific - config.m_num_groups = tmp_map.at("m_num_groups").as(); - config.m_group_size = tmp_map.at("m_group_size").as(); - config.m_diversity_penalty = tmp_map.at("m_diversity_penalty").as(); - config.m_num_return_sequences = tmp_map.at("m_num_return_sequences").as(); - - config.m_repetition_penalty = tmp_map.at("m_repetition_penalty").as(); - config.m_length_penalty = 
tmp_map.at("m_length_penalty").as(); - config.m_no_repeat_ngram_size = tmp_map.at("m_no_repeat_ngram_size").as(); - config.early_finish = tmp_map.at("early_finish").as>(); - - // Multinomial - config.m_temperature = tmp_map.at("m_temperature").as(); - config.m_top_k = tmp_map.at("m_top_k").as(); - config.m_top_p = tmp_map.at("m_top_p").as(); - config.m_do_sample = tmp_map.at("m_do_sample").as(); - - // special tokens - config.m_bos_token_id = tmp_map.at("m_bos_token_id").as(); - config.m_eos_token_id = tmp_map.at("m_eos_token_id").as(); - config.m_pad_token_id = tmp_map.at("m_pad_token_id").as(); - return config; - } -private: - std::shared_ptr m_assistant_ov_model; - bool is_assistant_request_defined = false; - bool is_assistant_ov_defined = false; + GenerationConfig(std::string json_path); }; + +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index a25d93b5e9..7655607e3f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -7,6 +7,7 @@ #include #include "generation_config.hpp" #include "llm_tokenizer.hpp" +#include "streamer_base.hpp" #include using namespace std; @@ -31,7 +32,7 @@ class LLMPipeline { public: LLMPipeline( std::string& model_path, - std::string& tokenizer_path, // todo: make available also specifying ov::Model, ov::CompiledModel, etc. tokenizers + std::string& tokenizer_path, // todo: make possible to specify tokenizers with ov::Model, ov::CompiledModel, etc. std::string& detokenizer_path, std::string device="CPU", const ov::AnyMap& plugin_config={} @@ -39,7 +40,7 @@ class LLMPipeline { LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); - ~LLMPipeline(); // Declare the destructor + ~LLMPipeline(); GenerationConfig generation_config() const; @@ -59,17 +60,18 @@ class LLMPipeline { EncodedResults generate(ov::Tensor input_ids); - Tokenizer get_tokenizer(); + ov::Tokenizer get_tokenizer(); std::string apply_chat_template(std::string prompt, std::string role = "user") const; void set_streamer(std::function callback); + void set_streamer(std::shared_ptr streamer); void set_streamer(); void start_chat(); void finish_chat(); void reset_state(); void set_default_config(const GenerationConfig& generation_config); - void set_default_config(const AnyMap& generation_config_map); + // void set_default_config(const AnyMap& generation_config_map); private: class LLMPipelineImpl; @@ -80,4 +82,4 @@ class LLMPipeline { DecodedResults call(std::vector text, GenerationConfig sampling_parameters); }; -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp index 47bdab61b5..a324573251 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp @@ -7,50 +7,32 @@ #include #include -using namespace std; - -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); - - +namespace ov { + class Tokenizer { public: - int64_t m_eos_token = 2; + int64_t m_eos_token = 2; // todo: read from rt_info Tokenizer() = default; + ~Tokenizer(); Tokenizer(std::string& tokenizers_path, std::string device="CPU"); // 
Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU"); - std::pair tokenize(std::string prompt); + std::pair encode(std::string prompt); - std::pair tokenize(std::vector prompts); + std::pair encode(std::vector prompts); - std::pair tokenize(std::initializer_list text); + std::pair encode(std::initializer_list text); - std::string detokenize(std::vector tokens); + std::string decode(std::vector tokens); - std::vector detokenize(ov::Tensor tokens); + std::vector decode(ov::Tensor tokens); - std::vector detokenize(std::vector> lines); + std::vector decode(std::vector> lines); private: - ov::InferRequest m_tokenize_request; - ov::InferRequest m_detokenizer_request; - std::string m_device; + class TokenizerImpl; + std::shared_ptr m_pimpl; }; - -class TextCoutStreamer { -public: - std::string put(int64_t token); - - std::string end(); - TextCoutStreamer(const Tokenizer& tokenizer, bool m_print_eos_token = false); - TextCoutStreamer() = default; - void set_tokenizer(Tokenizer tokenizer); -private: - bool m_print_eos_token = false; - Tokenizer m_tokenizer; - std::vector m_tokens_cache; - size_t print_len = 0; - std::function m_callback = [](std::string words){ ;}; -}; +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp new file mode 100644 index 0000000000..dd1ce71b08 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp @@ -0,0 +1,16 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "llm_tokenizer.hpp" + +namespace ov { + +class StreamerBase { +public: + virtual void put(int64_t token) = 0; + + virtual void end() = 0; +}; + +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ new file mode 100644 index 0000000000..88b5a71df7 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ @@ -0,0 +1,3 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp new file mode 100644 index 0000000000..2d636f8a86 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp @@ -0,0 +1,27 @@ +#include +#include +#include +#include "llm_pipeline.hpp" + +namespace py = pybind11; +using namespace ov; + +PYBIND11_MODULE(py_generate_pipeline, m) { + m.doc() = "Pybind11 binding for LLM Pipeline"; + + py::class_(m, "LLMPipeline") + .def(py::init(), + py::arg("model_path"), py::arg("tokenizer_path"), py::arg("detokenizer_path"), + py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) + .def(py::init(), + py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) + .def("__call__", (std::string (LLMPipeline::*)(std::string)) &LLMPipeline::operator()) + .def("__call__", (std::string (LLMPipeline::*)(std::string, GenerationConfig)) &LLMPipeline::operator()) + .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor, GenerationConfig)) &LLMPipeline::generate) + .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor)) &LLMPipeline::generate) + // Bind other methods similarly + 
.def("get_tokenizer", &LLMPipeline::get_tokenizer) + .def("apply_chat_template", &LLMPipeline::apply_chat_template); + + +} diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp new file mode 100644 index 0000000000..19851acebb --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "generation_config_helper.hpp" +#include "llm_pipeline.hpp" +#include "group_beam_searcher.hpp" + +namespace ov { + +EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { + GenerationConfigHelper config_helper = sampling_params; + + ov::Shape prompts_shape = prompts.get_shape(); + size_t batch_size = prompts_shape[0]; + // todo: implement for batch > 1 + OPENVINO_ASSERT(batch_size == 1); + + // initialize inputs + auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + auto prompt_len = prompts.get_shape()[1]; + + model_runner.set_tensor("input_ids", prompts); + model_runner.set_tensor("attention_mask", attention_mask); + model_runner.set_tensor("position_ids", position_ids); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + model_runner.get_tensor("beam_idx").set_shape({batch_size}); + model_runner.get_tensor("beam_idx").data()[0] = 0; + + const int64_t* prompt_data = prompts.data(); + + // todo: remove this duplication and use the same SamplingParameters for both greedy and beam + Parameters parameters{{std::vector{prompt_data, prompt_data + prompts.get_size()}}}; + parameters.n_groups = sampling_params.num_groups; + parameters.diversity_penalty = sampling_params.diversity_penalty; + parameters.group_size = sampling_params.group_size; + + GroupBeamSearcher group_beam_searcher{parameters}; + std::vector next_tokens; + std::vector next_beams; + for (size_t length_count = 0; length_count < config_helper.get_max_new_tokens(prompt_len); ++length_count) { + model_runner.infer(); + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(model_runner.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + ov::Tensor attention_mask = model_runner.get_tensor("attention_mask"); + ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; + attention_mask.set_shape(mask_shape); + std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); + + model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + std::fill_n(model_runner.get_tensor("position_ids").data(), batch_size, mask_shape[1] - 1); + + // todo: pass streamer here + // m_streamer.put(token_iter_results[0]); + + } + + std::vector beams; + for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + for (const std::vector group : prompt_group) { + for (const Beam& beam : group) { + 
beams.emplace_back(beam); + } + } + } + + auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; + std::sort(beams.begin(), beams.end(), compare_scores); + + ov::EncodedResults results; + for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { + // todo: convert to string + results.scores.emplace_back(beam->score); + results.tokens.emplace_back(beam->tokens); + } + return results; +} + +} // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp new file mode 100644 index 0000000000..1f64c0061d --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp @@ -0,0 +1,396 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +// #include +#include +// #include // used only for StopCriteria +#include +// #include "llm_tokenizer.hpp" +#include "generation_config.hpp" +#include "generation_config_helper.hpp" + +namespace ov { + +GenerationConfig::GenerationConfig(std::string json_path) { + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + + nlohmann::json data = nlohmann::json::parse(f); + + bos_token_id = data.value("bos_token_id", 0); + eos_token_id = data.value("eos_token_id", 0); + eos_token = data.value("eos_token", ""); + + pad_token_id = data.value("pad_token_id", 0); + m_num_return_sequences = data.value("num_return_sequences", 1); + + max_new_tokens = data.value("max_new_tokens", SIZE_MAX); + max_length = data.value("max_length", SIZE_MAX); + + temperature = data.value("temperature", 0.0f); + do_sample = data.value("do_sample", false); + top_p = data.value("top_p", 0.0f); + + // beam_search_params + num_groups = data.value("num_beam_groups", 1); + diversity_penalty = data.value("diversity_penalty", 1.0f); + int num_beams = data.value("num_beams", 1); + group_size = num_beams / num_groups; + OPENVINO_ASSERT(num_beams % num_groups == 0, "number of beams should be divisible by number of groups"); +} + + +size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { + // max_new_tokens has priority over max_length, + // only if max_new_tokens was not specified use max_length + if (config.max_new_tokens != SIZE_MAX) { + return config.max_new_tokens; + } else { + return config.max_length - prompt_length; + } +} + +bool GenerationConfigHelper::is_greedy_sampling() const { + return !config.do_sample && !is_beam_search() && !is_speculative(); +} + +bool GenerationConfigHelper::is_beam_search() const { + return config.num_groups * config.group_size > 1; +} + +bool GenerationConfigHelper::is_multimomial() const { + return config.do_sample; +} + +bool GenerationConfigHelper::is_speculative() const { + return is_assistant_ov_defined || is_assistant_request_defined; +} + +ov::InferRequest GenerationConfigHelper::get_assistant_model(std::string device, const ov::AnyMap& config) { + if (is_assistant_request_defined) { + return assistant_model; + } else if (is_assistant_ov_defined) { + assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); + is_assistant_request_defined = true; + return assistant_model; + } else { + OPENVINO_THROW("assistant model is not specified"); + } +} + +} // namespace ov + + + +// // forward declaration +// class Sequence; + +// // 
forward declaration +// namespace ov { +// class LLMPipeline; +// } + +// namespace { + +// // TODO: LEAVE ONLY ONE PLACE FOR DEFAULT VALUES +// static const ov::AnyMap default_generation_config_map = { +// // Generic +// {"max_new_tokens", SIZE_MAX}, +// {"max_length", SIZE_MAX}, +// {"m_ignore_eos", false}, +// {"m_bos_token", ""}, +// {"m_eos_token", ""}, + +// // Beam search specific +// {"m_num_groups", 1}, +// {"m_group_size", 1}, +// {"m_diversity_penalty", 1.0f}, // 0.0 means no diversity +// {"m_num_return_sequences", 1}, // is used by beam search, in other case is equal to batch size +// // {"stop_criteria", StopCriteria::heuristic}, // todo: align with the latest beam searcher + +// {"m_repetition_penalty", 1.0f}, +// {"m_length_penalty", 1.0f}, +// {"m_no_repeat_ngram_size", std::numeric_limits::max()}, +// {"early_finish", [](const Sequence&) {return false; }}, + +// // Multinomial +// {"m_temperature", 0.0f}, +// {"m_top_k", -1}, +// {"m_top_p", 1.0f}, +// {"m_do_sample", false}, + +// // special tokens +// {"m_bos_token_id", 0}, +// {"m_eos_token_id", 2}, // todo: check form where it's better to extract from rt_info or from tokenizer_config.json +// {"m_pad_token_id", 0}, + +// // assistive decoding +// {"m_assistant_model", ov::InferRequest()}, +// {"m_num_assistant_tokens", 5}, +// {"m_seq_len_axis", 2}, +// }; + +// } + +// namespace ov { +// size_t get_max_new_tokens(size_t prompt_length = 0) { +// // max_new_tokens has priority over max_length, +// // only if m_max_new_tokens was not specified use max_length +// if (m_max_new_tokens != SIZE_MAX) { +// return m_max_new_tokens; +// } else { +// return m_max_length - prompt_length; +// } +// } + +// void max_new_tokens(size_t max_new_tokens) { +// const auto& r = ::default_generation_config_map.find("sdf") != ::default_generation_config_map.end(); + +// m_max_new_tokens = max_new_tokens; +// } + +// void max_length(size_t max_length) { +// m_max_length = max_length; +// } + +// void ignore_eos(bool ignore_eos) { +// m_ignore_eos = ignore_eos; +// } + +// void eos_token(std::string eos_token) { +// m_eos_token = eos_token; +// } + +// void num_return_sequences(size_t num_return_sequences) { +// m_num_return_sequences = num_return_sequences; +// } + +// void num_groups(size_t num_groups) { +// m_num_groups = num_groups; +// } + +// void group_size(size_t group_size) { +// m_group_size = group_size; +// } + +// void diversity_penalty(float diversity_penalty) { +// m_diversity_penalty = diversity_penalty; +// } + +// void length_penalty(float length_penalty) { +// m_length_penalty = length_penalty; +// } + +// void no_repeat_ngram_size(size_t no_repeat_ngram_size) { +// m_no_repeat_ngram_size = no_repeat_ngram_size; +// } + +// void temperature(float temperature) { +// m_temperature = temperature; +// } + +// void top_k(size_t top_k) { +// m_top_k = top_k; +// } + +// void top_p(size_t top_p) { +// m_top_p = top_p; +// } + +// void do_sample(bool do_sample) { +// m_do_sample = do_sample; +// } + +// void repetition_penalty(float repetition_penalty) { +// m_repetition_penalty = repetition_penalty; +// } + +// void bos_token_id(int64_t bos_token_id) { +// m_bos_token_id = bos_token_id; +// } + +// void eos_token_id(int64_t eos_token_id) { +// m_eos_token_id = eos_token_id; +// } + +// void pad_token_id(int64_t pad_token_id) { +// m_pad_token_id = pad_token_id; +// } + +// GenerationConfig() = default; + +// GenerationConfig(std::string json_path) { +// std::ifstream f(json_path); +// OPENVINO_ASSERT(f.is_open(), "Failed to open 
'" + json_path + "' with generation config"); + +// nlohmann::json data = nlohmann::json::parse(f); + +// m_bos_token_id = data.value("bos_token_id", 0); +// m_eos_token_id = data.value("eos_token_id", 0); +// m_eos_token = data.value("eos_token", ""); + +// m_pad_token_id = data.value("pad_token_id", 0); +// m_num_return_sequences = data.value("num_return_sequences", 1); + +// m_max_new_tokens = data.value("max_new_tokens", SIZE_MAX); +// m_max_length = data.value("max_length", SIZE_MAX); + +// m_temperature = data.value("temperature", 0.0f); +// m_do_sample = data.value("do_sample", false); +// m_top_p = data.value("top_p", 0.0f); + +// // beam_search_params +// m_num_groups = data.value("num_beam_groups", 1); +// m_diversity_penalty = data.value("diversity_penalty", 1.0f); +// int num_beams = data.value("num_beams", 1); +// m_group_size = num_beams / m_num_groups; +// OPENVINO_ASSERT(num_beams % m_num_groups == 0, "number of beams should be divisible by number of groups"); +// } + + +// static GenerationConfig greedy() { +// GenerationConfig greedy_params; +// greedy_params.m_temperature = 0.0f; +// greedy_params.m_ignore_eos = true; +// return greedy_params; +// } + +// static GenerationConfig beam_search() { +// GenerationConfig beam_search; +// beam_search.m_num_groups = 3; +// beam_search.m_group_size = 5; +// beam_search.m_max_new_tokens = 10; +// beam_search.m_diversity_penalty = 2.0f; +// return beam_search; +// } + +// static GenerationConfig multimomial() { +// GenerationConfig multimomial; +// multimomial.m_temperature = 0.8f; +// multimomial.m_top_p = 0.8; +// multimomial.m_top_k = 20; +// multimomial.m_do_sample = 20; +// return multimomial; +// } + +// template +// static GenerationConfig assistive_decoding(T& assistant_model) { +// GenerationConfig assistive; +// assistive.assistant_model(assistant_model); +// return assistive; +// } + +// bool is_greedy_sampling() const { +// return !m_do_sample && !is_beam_search() && !is_speculative(); +// } + +// bool is_beam_search() const { +// return m_num_groups * m_group_size > 1; +// } + +// bool is_multimomial() const { +// return m_do_sample; +// } + +// // for speculative decoding +// void assistant_model(const ov::InferRequest& assistant_model) { +// m_assistant_model = assistant_model; +// is_assistant_request_defined = true; +// } + +// void assistant_model(ov::CompiledModel& assistant_model) { +// m_assistant_model = assistant_model.create_infer_request(); +// is_assistant_request_defined = true; +// } + +// void assistant_model(const std::shared_ptr& assistant_model) { +// m_assistant_ov_model = assistant_model; +// is_assistant_ov_defined = true; +// } + +// void assistant_model(std::string assistant_model) { +// auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; +// if (!is_xml(assistant_model)) +// assistant_model += "/openvino_model.xml"; + +// m_assistant_ov_model = ov::Core().read_model(assistant_model); +// is_assistant_ov_defined = true; +// } + +// void set_streamer(std::function&&, ov::LLMPipeline&)> callback) { +// m_callback = callback; +// } + +// ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { +// if (is_assistant_request_defined) { +// return m_assistant_model; +// } else if (is_assistant_ov_defined) { +// m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); +// is_assistant_request_defined = true; +// return m_assistant_model; +// } else { +// 
OPENVINO_THROW("assistant model is not specified"); +// } +// } + +// void num_assistant_tokens(int64_t num_assistant_tokens) { +// m_num_assistant_tokens = num_assistant_tokens; +// } + +// bool is_speculative() const { +// return is_assistant_ov_defined || is_assistant_request_defined; +// } + +// // for Assistive/Speculative decoding +// ov::InferRequest m_assistant_model; +// size_t m_num_assistant_tokens = 5; +// size_t m_seq_len_axis = 2; + +// static GenerationConfig anymap_to_generation_config(const ov::AnyMap& genereation_config_map = {}) { +// // need to load default values and update only those keys that are specified in genereation_config_map +// auto tmp_map = default_generation_config_map; + +// for (auto it = genereation_config_map.begin(); it != genereation_config_map.end(); ++it) { +// tmp_map[it->first] = it->second; +// } + +// GenerationConfig config; + +// // general arguments +// config.m_max_new_tokens = tmp_map.at("m_max_new_tokens").as(); +// config.m_max_length = tmp_map.at("m_max_length").as(); +// config.m_ignore_eos = tmp_map.at("m_ignore_eos").as(); +// config.m_eos_token = tmp_map.at("m_eos_token").as(); + +// // Beam search specific +// config.m_num_groups = tmp_map.at("m_num_groups").as(); +// config.m_group_size = tmp_map.at("m_group_size").as(); +// config.m_diversity_penalty = tmp_map.at("m_diversity_penalty").as(); +// config.m_num_return_sequences = tmp_map.at("m_num_return_sequences").as(); + +// config.m_repetition_penalty = tmp_map.at("m_repetition_penalty").as(); +// config.m_length_penalty = tmp_map.at("m_length_penalty").as(); +// config.m_no_repeat_ngram_size = tmp_map.at("m_no_repeat_ngram_size").as(); +// config.early_finish = tmp_map.at("early_finish").as>(); + +// // Multinomial +// config.m_temperature = tmp_map.at("m_temperature").as(); +// config.m_top_k = tmp_map.at("m_top_k").as(); +// config.m_top_p = tmp_map.at("m_top_p").as(); +// config.m_do_sample = tmp_map.at("m_do_sample").as(); + +// // special tokens +// config.m_bos_token_id = tmp_map.at("m_bos_token_id").as(); +// config.m_eos_token_id = tmp_map.at("m_eos_token_id").as(); +// config.m_pad_token_id = tmp_map.at("m_pad_token_id").as(); +// return config; +// } +// } + +// private: +// std::shared_ptr m_assistant_ov_model; +// bool is_assistant_request_defined = false; +// bool is_assistant_ov_defined = false; +// }; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp new file mode 100644 index 0000000000..c433594c2c --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "generation_config.hpp" + +namespace ov { + + +class GenerationConfigHelper { +public: + GenerationConfig config; + + GenerationConfigHelper() = default; + + GenerationConfigHelper(const GenerationConfig& config): config(config) {}; + + size_t get_max_new_tokens(size_t prompt_length = 0); + + // template + // static GenerationConfig assistive_decoding(T& assistant_model) { + // GenerationConfig assistive; + // assistive.assistant_model(assistant_model); + // return assistive; + // } + + bool is_greedy_sampling() const; + + bool is_beam_search() const; + + bool is_multimomial() const; + + bool is_speculative() const; + + + // // for speculative decoding + // void set_assistant_model(const ov::InferRequest& 
assistant_model) { + // this->assistant_model = assistant_model; + // is_assistant_request_defined = true; + // } + + // void set_assistant_model(ov::CompiledModel& assistant_model) { + // this->assistant_model = assistant_model.create_infer_request(); + // is_assistant_request_defined = true; + // } + + // void set_assistant_model(const std::shared_ptr& assistant_model) { + // m_assistant_ov_model = assistant_model; + // is_assistant_ov_defined = true; + // } + + // void set_assistant_model(std::string assistant_model) { + // auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + // if (!is_xml(assistant_model)) + // assistant_model += "/openvino_model.xml"; + + // m_assistant_ov_model = ov::Core().read_model(assistant_model); + // is_assistant_ov_defined = true; + // } + + ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}); + + // void set_num_assistant_tokens(int64_t num_assistant_tokens) { + // this->num_assistant_tokens = num_assistant_tokens; + // } + + // for Assistive/Speculative decoding + ov::InferRequest assistant_model; + size_t num_assistant_tokens = 5; + size_t seq_len_axis = 2; +private: + + std::shared_ptr m_assistant_ov_model; + bool is_assistant_request_defined = false; + bool is_assistant_ov_defined = false; +}; + +} // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 62384fdc49..5b6892c3d2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -3,11 +3,13 @@ #include #include "llm_pipeline.hpp" -#include "group_beam_searcher.hpp" #include +#include "generation_config_helper.hpp" +#include "text_callback_streamer.hpp" + // #include // #include -#include "generation_config.hpp" +// #include "generation_config.hpp" void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); @@ -20,6 +22,8 @@ std::pair softmax(const ov::Tensor& logits, const size_t batch_i namespace ov { +ov::EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); + class LLMPipeline::LLMPipelineImpl { public: ov::InferRequest m_model_runner; @@ -28,7 +32,6 @@ class LLMPipeline::LLMPipelineImpl { std::string m_device; ov::AnyMap m_plugin_config; ov::Tensor m_attentions_mask_cache; - bool is_streamer_set = false; std::string m_chat_template = ""; // TODO: add constructor for specifying manually tokenizer path @@ -50,20 +53,19 @@ class LLMPipeline::LLMPipelineImpl { GenerationConfig generation_config() const; - EncodedResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + EncodedResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); - EncodedResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params); + // EncodedResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig generation_config); - EncodedResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + EncodedResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, 
GenerationConfig generation_config); - EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params); + EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); std::string apply_chat_template(std::string prompt, std::string role = "user") const; - TextCoutStreamer m_streamer; - std::function m_streamer_callback = [](std::string ){ ;}; + std::shared_ptr m_streamer; bool is_chat_conversation = false; std::string call(std::string text); @@ -283,11 +285,11 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string m_tokenizer = Tokenizer(path); } -GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { +ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { return m_sampling_parameters; } -GenerationConfig ov::LLMPipeline::generation_config() const { +ov::GenerationConfig ov::LLMPipeline::generation_config() const { return m_pimpl->generation_config(); } @@ -308,7 +310,9 @@ void print_tensor(const ov::Tensor& tensor) { ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, - GenerationConfig sampling_params) { + GenerationConfig generation_config) { + + GenerationConfigHelper config_helper = generation_config; ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; size_t prompt_len = prompts_shape[1]; @@ -356,7 +360,7 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor in auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); - size_t max_tokens = sampling_params.get_max_new_tokens(prompt_len); + size_t max_tokens = config_helper.get_max_new_tokens(prompt_len); for (size_t i = 0; i < max_tokens; ++i) { // todo: consider replacing with start_async and run callback right after that @@ -388,20 +392,18 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor in results.scores[batch] += res.second; token_iter_results[batch] = out_token; - eos_met[batch] = (out_token == sampling_params.m_eos_token_id); + eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } // place // sampling_params.m_callback(std::move(token_iter_results), *this); - if (is_streamer_set) { - m_streamer_callback(m_streamer.put(token_iter_results[0])); - } - + m_streamer->put(token_iter_results[0]); + // stop generation when EOS is met in all batches bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); - if (!sampling_params.m_ignore_eos && all_are_eos) + if (!generation_config.ignore_eos && all_are_eos) break; // if (i != sampling_params.get_max_new_tokens(prompt_len) - 1) // kv_cache_len += 1; @@ -409,85 +411,8 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor in return results; } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { - ov::Shape prompts_shape = prompts.get_shape(); - size_t batch_size = prompts_shape[0]; - // todo: implement for batch > 1 - OPENVINO_ASSERT(batch_size == 1); - - // initialize inputs - auto 
attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto prompt_len = prompts.get_shape()[1]; - - m_model_runner.set_tensor("input_ids", prompts); - m_model_runner.set_tensor("attention_mask", attention_mask); - m_model_runner.set_tensor("position_ids", position_ids); - - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - m_model_runner.get_tensor("beam_idx").data()[0] = 0; - - const int64_t* prompt_data = prompts.data(); - - // todo: remove this duplication and use the same SamplingParameters for both greedy and beam - Parameters parameters{{std::vector{prompt_data, prompt_data + prompts.get_size()}}}; - parameters.n_groups = sampling_params.m_num_groups; - parameters.diversity_penalty = sampling_params.m_diversity_penalty; - parameters.group_size = sampling_params.m_group_size; - - GroupBeamSearcher group_beam_searcher{parameters}; - std::vector next_tokens; - std::vector next_beams; - for (size_t length_count = 0; length_count < sampling_params.get_max_new_tokens(prompt_len); ++length_count) { - m_model_runner.infer(); - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(m_model_runner.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - m_model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - m_model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - ov::Tensor attention_mask = m_model_runner.get_tensor("attention_mask"); - ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; - attention_mask.set_shape(mask_shape); - std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); - - m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(m_model_runner.get_tensor("position_ids").data(), batch_size, mask_shape[1] - 1); - - // sampling_params.m_callback(std::move(next_tokens), *this); - // m_callback(std::move(next_tokens); - if (is_streamer_set) { - m_streamer.put(next_tokens[0]); - } - - } - - std::vector beams; - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - for (const std::vector group : prompt_group) { - for (const Beam& beam : group) { - beams.emplace_back(beam); - } - } - } - - auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; - std::sort(beams.begin(), beams.end(), compare_scores); - - ov::EncodedResults results; - for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { - // todo: convert to string - results.scores.emplace_back(beam->score); - results.tokens.emplace_back(beam->tokens); - } - return results; -} +// ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { +// } /* Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these @@ -503,10 +428,12 @@ match the target. 
In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. */ -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { + GenerationConfigHelper config_helper = generation_config; + auto batch_size = input_ids.get_shape()[0]; OPENVINO_ASSERT(batch_size == 1); - auto draft_model = sampling_params.get_assistant_model(m_device, m_plugin_config); + auto draft_model = config_helper.get_assistant_model(m_device, m_plugin_config); auto main_model = m_model_runner; auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; @@ -566,15 +493,15 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Te results.tokens[0].emplace_back(out_token); // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = sampling_params.m_num_assistant_tokens; + uint64_t K = config_helper.num_assistant_tokens; std::vector draft_tokens; // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. draft_input_ids.set_shape({batch_size, 1}); draft_position_ids.set_shape({batch_size, 1}); - int max_sequence_length = sampling_params.m_max_new_tokens; - auto eos_token = sampling_params.m_eos_token_id; + int max_sequence_length = generation_config.max_new_tokens; + auto eos_token = generation_config.eos_token_id; while (out_token != eos_token && seq_len < max_sequence_length) { // infer the K next tokens with draft model @@ -618,9 +545,7 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Te out_token = std::max_element(start, stop) - start; results.tokens[0].emplace_back(out_token); - if (is_streamer_set) { - m_streamer_callback(m_streamer.put(out_token)); - } + m_streamer->put(out_token); disagree_idx = i; if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) @@ -631,8 +556,8 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Te // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
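 // As a rough sketch (not the actual helper): update_kv_cache() crops every KV-cache
 // state tensor along seq_len_axis so that only the first seq_len positions are kept,
 // e.g. a state of shape [batch, heads, old_len, head_size] becomes
 // [batch, heads, seq_len, head_size] when seq_len_axis == 2. The statements below
 // advance seq_len past the tokens accepted in this round and apply that trim to
 // both the draft and the main model.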
seq_len += disagree_idx + 1; - update_kv_cache(draft_model, sampling_params.m_seq_len_axis, seq_len); - update_kv_cache(main_model, sampling_params.m_seq_len_axis, seq_len); + update_kv_cache(draft_model, config_helper.seq_len_axis, seq_len); + update_kv_cache(main_model, config_helper.seq_len_axis, seq_len); draft_tokens.clear(); first_token = out_token; @@ -641,7 +566,7 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Te return results; } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config) { // todo: implement ov::EncodedResults results; return results; @@ -668,10 +593,10 @@ std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationC // previous prompt generation in chat dialog stops with the end of sentence token, // need to append this token to the current prompt if (is_chat_conversation && kv_cache_len > 0) { - text = generation_config.m_eos_token + text; + text = generation_config.eos_token + text; } - auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + auto [input_ids, attention_mask] = m_tokenizer.encode(text); // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. @@ -700,7 +625,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationC attention_mask.data()[i] = tmp_attn_mask.data()[i]; auto generate_results = generate(input_ids, attention_mask, generation_config); - return m_tokenizer.detokenize(generate_results.tokens)[0]; + return m_tokenizer.decode(generate_results.tokens)[0]; } ov::DecodedResults ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { @@ -708,11 +633,11 @@ ov::DecodedResults ov::LLMPipeline::call(std::vector text, Generati } ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::call(std::vector text, GenerationConfig sampling_parameters) { - auto [input_ids, attention_mask] = m_tokenizer.tokenize(text); + auto [input_ids, attention_mask] = m_tokenizer.encode(text); auto generate_results = generate(input_ids, attention_mask, sampling_parameters); - return {m_tokenizer.detokenize(generate_results.tokens), generate_results.scores}; + return {m_tokenizer.decode(generate_results.tokens), generate_results.scores}; } std::string ov::LLMPipeline::operator()(std::string text) { @@ -737,12 +662,14 @@ ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { ov::EncodedResults result; + GenerationConfigHelper config_helper = generation_config; - if (generation_config.is_gready_sampling()) { + if (config_helper.is_greedy_sampling()) { result = greedy_search(input_ids, attention_mask, generation_config); - } else if (generation_config.is_beam_search()) { - result = beam_search(input_ids, attention_mask, generation_config); - } else if (generation_config.is_multimomial()) { + } else if (config_helper.is_beam_search()) { + result = beam_search(m_model_runner, input_ids, attention_mask, generation_config); + + } else if (config_helper.is_multimomial()) { result = 
multinomial_sampling(input_ids, generation_config); } else { // speculative result = speculative_sampling(input_ids, attention_mask, generation_config); @@ -768,7 +695,7 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { return generate(input_ids, init_attention_mask(input_ids), m_pimpl->m_sampling_parameters); } -Tokenizer ov::LLMPipeline::get_tokenizer() { +ov::Tokenizer ov::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } @@ -800,14 +727,16 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr } void ov::LLMPipeline::set_streamer(std::function callback) { - m_pimpl->is_streamer_set = true; - m_pimpl->m_streamer_callback = callback; - m_pimpl->m_streamer = TextCoutStreamer(m_pimpl->m_tokenizer); + m_pimpl->m_streamer = std::make_shared(m_pimpl->m_tokenizer, callback); + // m_pimpl->m_streamer->set_callback(callback); +} + +void ov::LLMPipeline::set_streamer(std::shared_ptr streamer) { + m_pimpl->m_streamer = streamer; } void ov::LLMPipeline::set_streamer() { - m_pimpl->is_streamer_set = false; - m_pimpl->m_streamer_callback = [](std::string){ ;}; + // m_pimpl->m_streamer->set_callback(); } void ov::LLMPipeline::start_chat() { @@ -827,8 +756,8 @@ void ov::LLMPipeline::set_default_config(const GenerationConfig& generation_conf m_pimpl->m_sampling_parameters = generation_config; } -void ov::LLMPipeline::set_default_config(const AnyMap& generation_config_map) { - m_pimpl->m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); -} +// void ov::LLMPipeline::set_default_config(const AnyMap& generation_config_map) { +// m_pimpl->m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); +// } ov::LLMPipeline::~LLMPipeline() = default; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp index 48f25812f4..d0d9c9894c 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp @@ -5,8 +5,31 @@ #include "llm_tokenizer.hpp" #include +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); -Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device): m_device(device) { +namespace ov { + +class Tokenizer::TokenizerImpl { +public: + ov::InferRequest m_tokenize_request; + ov::InferRequest m_detokenizer_request; + std::string m_device; + + TokenizerImpl() = default; + TokenizerImpl(std::string& tokenizers_path, std::string device); + + std::pair encode(std::string prompt); + + std::pair encode(std::vector prompts); + + std::string decode(std::vector tokens); + + std::vector decode(ov::Tensor tokens); + + std::vector decode(std::vector> lines); +}; + +Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizers_path, std::string device): m_device(device) { ov::Core core; auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; @@ -23,16 +46,20 @@ Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device): m_device m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); } -// Tokenizer::Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU") { +Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device) { + m_pimpl = std::make_shared(tokenizers_path, 
device); +} -// } +std::pair Tokenizer::encode(std::string prompt) { + return m_pimpl->encode(prompt); +} -std::pair Tokenizer::tokenize(std::string prompt) { +std::pair Tokenizer::TokenizerImpl::encode(std::string prompt) { size_t batch_size = 1; m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); m_tokenize_request.infer(); - vector> input_ids_vec; + std::vector> input_ids_vec; input_ids_vec.reserve(1); auto res_tensor = m_tokenize_request.get_tensor("input_ids"); auto res_shape = res_tensor.get_shape(); @@ -45,7 +72,11 @@ std::pair Tokenizer::tokenize(std::string prompt) { return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } -std::pair Tokenizer::tokenize(std::vector prompts) { +std::pair Tokenizer::encode(std::vector prompts) { + return m_pimpl->encode(prompts); +} + +std::pair Tokenizer::TokenizerImpl::encode(std::vector prompts) { m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = m_tokenize_request.get_input_tensor().get_shape(); m_tokenize_request.infer(); @@ -59,19 +90,27 @@ std::pair Tokenizer::tokenize(std::vector p return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } -std::pair Tokenizer::tokenize(std::initializer_list text) { - return tokenize(std::vector(text.begin(), text.end())); +std::pair Tokenizer::encode(std::initializer_list text) { + return encode(std::vector(text.begin(), text.end())); } -std::string Tokenizer::detokenize(std::vector tokens) { +std::string Tokenizer::decode(std::vector tokens) { + return m_pimpl->decode(tokens); +} + +std::string Tokenizer::TokenizerImpl::decode(std::vector tokens) { size_t batch_size = 1; m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); m_detokenizer_request.infer(); return m_detokenizer_request.get_output_tensor().data()[0]; } -std::vector Tokenizer::detokenize(ov::Tensor tokens) { +std::vector Tokenizer::decode(ov::Tensor tokens) { + return m_pimpl->decode(tokens); +} + +std::vector Tokenizer::TokenizerImpl::decode(ov::Tensor tokens) { m_detokenizer_request.set_input_tensor(tokens); auto shape = tokens.get_shape(); auto data = tokens.data(); @@ -85,7 +124,11 @@ std::vector Tokenizer::detokenize(ov::Tensor tokens) { return strings; } -std::vector Tokenizer::detokenize(std::vector> lines) { +std::vector Tokenizer::decode(std::vector> lines) { + return m_pimpl->decode(lines); +} + +std::vector Tokenizer::TokenizerImpl::decode(std::vector> lines) { // todo: implement calling detokenizer in a single batch std::vector results; @@ -101,46 +144,6 @@ std::vector Tokenizer::detokenize(std::vector> return results; } -TextCoutStreamer::TextCoutStreamer(const Tokenizer& tokenizer, bool print_eos_token) { - m_tokenizer = tokenizer; - m_print_eos_token = print_eos_token; -} - -std::string TextCoutStreamer::put(int64_t token) { - std::stringstream res; - - // do not print anything and flush cache if EOS token is met - if (token == m_tokenizer.m_eos_token) { - return end(); - } - - m_tokens_cache.push_back(token); - std::string text = m_tokenizer.detokenize(m_tokens_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - res << std::string_view{text.data() + print_len, text.size() - print_len}; - m_tokens_cache.clear(); - print_len = 0; - return res.str(); - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - 
// Don't print incomplete text - return res.str(); - } - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - return res.str(); -} - -std::string TextCoutStreamer::end() { - std::stringstream res; - std::string text = m_tokenizer.detokenize(m_tokens_cache); - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - m_tokens_cache.clear(); - print_len = 0; - return res.str(); -} +Tokenizer::~Tokenizer() = default; -void TextCoutStreamer::set_tokenizer(Tokenizer tokenizer) { - this->m_tokenizer = tokenizer; -} +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp new file mode 100644 index 0000000000..15be012661 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp @@ -0,0 +1,77 @@ +#include "text_callback_streamer.hpp" + +namespace ov { + + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { + m_tokenizer = tokenizer; + m_print_eos_token = print_eos_token; + m_callback = callback; + m_enabled = true; +} + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token) { + m_tokenizer = tokenizer; + m_print_eos_token = print_eos_token; +} + +void TextCallbackStreamer::put(int64_t token) { + std::stringstream res; + + // do not print anything and flush cache if EOS token is met + if (token == m_tokenizer.m_eos_token) { + end(); + return; + } + + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + res << std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_text(res.str()); + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + on_finalized_text(res.str()); + return; + } + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + on_finalized_text(res.str()); + return; +} + +void TextCallbackStreamer::end() { + std::stringstream res; + std::string text = m_tokenizer.decode(m_tokens_cache); + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_text(res.str()); +} + +void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { + this->m_tokenizer = tokenizer; +} + +void TextCallbackStreamer::set_callback(std::function callback) { + m_callback = callback; + m_enabled = true; +} + +void TextCallbackStreamer::set_callback() { + m_callback = [](std::string words){ ;}; + m_enabled = false; +} + +void TextCallbackStreamer::on_finalized_text(const std::string& subword) { + if (m_enabled) { + m_callback(subword); + } +} + +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/text_streamer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp similarity index 52% rename from text_generation/causal_lm/cpp/generate_pipeline/include/text_streamer.hpp rename to text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp index 1927e5c0c7..6eeada6d35 100644 --- 
a/text_generation/causal_lm/cpp/generate_pipeline/include/text_streamer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp @@ -2,28 +2,33 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include "streamer_base.hpp" #include "llm_tokenizer.hpp" -class StreamerBase { -public: - virtual void put(int64_t token) = 0; - - virtual void end() = 0; -}; +namespace ov { class TextCallbackStreamer: public StreamerBase { - Tokenizer tokenizer; - std::vector token_cache; - size_t print_len = 0; - std::function m_callback = [](std::string words){ ;}; - public: + void put(int64_t token) override; + void end() override; + + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token = false); + TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token = false); TextCallbackStreamer() = default; - TextCallbackStreamer(const Tokenizer& tokenizer); - TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback); + ~TextCallbackStreamer() = default; + void set_tokenizer(Tokenizer tokenizer); void set_callback(std::function callback); + void set_callback(); - void put(int64_t token) override; - void end() override; + std::function m_callback = [](std::string words){ ;}; + bool m_enabled = false; +private: + bool m_print_eos_token = false; + Tokenizer m_tokenizer; + std::vector m_tokens_cache; + size_t print_len = 0; + void on_finalized_text(const std::string& subword); }; + +} // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp deleted file mode 100644 index ac12a05eb5..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/text_streamer.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "text_streamer.hpp" - -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer) { - this->tokenizer = tokenizer; -} - -TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback) { - this->tokenizer = tokenizer; - this->m_callback = callback; -} - -void TextCallbackStreamer::put(int64_t token) { - // do not print anything and flush cache if EOS token is met - if (token == tokenizer.m_eos_token) { - end(); - return ; - } - - token_cache.push_back(token); - std::string text = tokenizer.detokenize(token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); -} - -void TextCallbackStreamer::end() { - std::string text = tokenizer.detokenize(token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; -} - -void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { - this->tokenizer = tokenizer; -} From 62c471e9ec0659215fcfacbf8267043b3679e3e7 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 7 May 2024 16:14:13 +0200 Subject: [PATCH 32/97] extract decoding methods to separate files --- .../src/assistive_decoding.cpp | 211 +++++++++ ...am_search.cpp => beam_search_decoding.cpp} | 0 .../generate_pipeline/src/greedy_decoding.cpp | 178 ++++++++ 
.../generate_pipeline/src/llm_pipeline.cpp | 426 +----------------- .../cpp/generate_pipeline/src/utils.cpp | 51 +++ .../cpp/generate_pipeline/src/utils.hpp | 18 + 6 files changed, 475 insertions(+), 409 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp rename text_generation/causal_lm/cpp/generate_pipeline/src/{beam_search.cpp => beam_search_decoding.cpp} (100%) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp new file mode 100644 index 0000000000..3e893e053c --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp @@ -0,0 +1,211 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "generation_config_helper.hpp" +#include "llm_pipeline.hpp" + +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); + +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { + // Copy elements from the old to a new tensor and return it. + // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], + // It that's not the case for your model please implement your own trim method. + OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); + + auto old_tensor_data = tensor.data(); + auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; + size_t num_kv_heads = shape[1]; + size_t old_seq_len = shape[2]; + size_t head_size = shape[3]; + + OPENVINO_ASSERT(new_seq_len <= old_seq_len); + + // if new_seq_len equal to old one no need to copy tensor, return as is + if (old_seq_len == new_seq_len) + return tensor; + + if (seq_len_axis == 0) { + shape[0] = new_seq_len; + tensor.set_shape(shape); + } + + // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor + auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; + auto new_tensor_data = new_tensor.data(); + for (size_t batch = 0; batch < batch_size; ++batch){ + for (size_t i = 0; i < num_kv_heads; ++i) { + for (size_t j = 0; j < new_seq_len; ++j) { + auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; + auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; + std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); + } + } + } + return new_tensor; +} + +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { + // trim kv_cache values up to the new_seq_len + for (auto& state: request.query_state()) { + ov::Tensor old_tensor = state.get_state(); + state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + } +} + +/* Speculative decoding works the following way. 
The draft model predicts the next K
+tokens one by one in an autoregressive manner, while the main model validates these
+predictions and corrects them if necessary. We go through each predicted token, and
+if a difference is detected between the draft and main model, we stop and keep the
+last token predicted by the main model. Then the draft model gets the latest main
+prediction and again tries to predict the next K tokens, repeating the cycle.
+
+This approach reduces the need for multiple infer requests to the main model,
+enhancing performance. For instance, in more predictable parts of text generation,
+the draft model can, in best-case scenarios, generate the next K tokens that exactly
+match the target. In that case they are validated in a single inference request to
+the main model (which is bigger, more accurate but slower) instead of running K
+subsequent requests.
+*/
+
+namespace ov {
+ov::EncodedResults assistive_decoding(ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, ov::GenerationConfig generation_config) {
+    ov::GenerationConfigHelper config_helper = generation_config;
+
+    auto batch_size = input_ids.get_shape()[0];
+    OPENVINO_ASSERT(batch_size == 1);
+    auto draft_model = config_helper.get_assistant_model();  // todo: add config getting m_device, m_plugin_config
+    auto main_model = m_model_runner;
+
+    auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()};
+    input_ids.copy_to(draft_input_ids);
+    auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()};
+
+    draft_model.set_tensor("input_ids", draft_input_ids);
+    draft_model.set_tensor("attention_mask", draft_attention_mask);
+
+    ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids");
+    draft_position_ids.set_shape(draft_input_ids.get_shape());
+    std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0);
+    uint64_t seq_len = draft_input_ids.get_shape()[1];
+
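/* A minimal, self-contained sketch of the propose/verify loop that assistive_decoding()
   implements on real tensors. The draft and main models are replaced here by stand-in
   greedy next-token callbacks, and every name in this sketch is an assumption for
   illustration only; unlike the sketch, the code in this file verifies the K proposed
   tokens with a single batched infer() on the main model rather than K separate calls. */
#include <cstdint>
#include <functional>
#include <vector>

using NextTokenFn = std::function<int64_t(const std::vector<int64_t>&)>;

inline std::vector<int64_t> toy_assisted_decoding(const NextTokenFn& draft_next,
                                                  const NextTokenFn& main_next,
                                                  std::vector<int64_t> context,
                                                  size_t K,
                                                  size_t max_new_tokens,
                                                  int64_t eos_token) {
    std::vector<int64_t> generated;
    while (generated.size() < max_new_tokens) {
        // 1. The draft model proposes K tokens autoregressively.
        std::vector<int64_t> draft_context = context;
        std::vector<int64_t> proposed;
        for (size_t i = 0; i < K; ++i) {
            int64_t token = draft_next(draft_context);
            proposed.push_back(token);
            draft_context.push_back(token);
        }
        // 2. The main model re-derives each position; matching tokens are accepted,
        //    and proposing restarts from the first disagreement (or stops at EOS).
        for (size_t i = 0; i < K && generated.size() < max_new_tokens; ++i) {
            int64_t verified = main_next(context);
            generated.push_back(verified);
            context.push_back(verified);
            if (verified == eos_token)
                return generated;
            if (verified != proposed[i])
                break;
        }
    }
    return generated;
}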
+    // Input tensors for the main model should not be mixed with draft.
+    // Do not feed the same draft_position_ids to the main, but copy input_ids from the draft_input_ids
+    // auto input_ids = main_model.get_tensor("input_ids");
+    // input_ids.set_shape(draft_input_ids.get_shape());
+    // draft_input_ids.copy_to(input_ids);
+
+    // auto attention_mask = main_model.get_tensor("attention_mask");
+    // attention_mask.set_shape(draft_input_ids.get_shape());
+    // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1);
+
+    auto position_ids = main_model.get_tensor("position_ids");
+    position_ids.set_shape(draft_input_ids.get_shape());
+    std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0);
+
+    // set beam_idx for stateful model: no beam search is used and batch_size = 1
+    draft_model.get_tensor("beam_idx").set_shape({batch_size});
+    draft_model.get_tensor("beam_idx").data()[0] = 0;
+    main_model.get_tensor("beam_idx").set_shape({batch_size});
+    main_model.get_tensor("beam_idx").data()[0] = 0;
+
+    main_model.set_tensor("input_ids", input_ids);
+    main_model.set_tensor("attention_mask", attention_mask);
+    main_model.set_tensor("position_ids", position_ids);
+
+    // To collect the kv-cache and to get the next token, run the very first infer request
+    draft_model.infer();
+    main_model.infer();
+
+    size_t vocab_size = draft_model.get_tensor("logits").get_shape().back();
+    OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for both models");
+
+    // logits shape is [batch_size, seq_len, vocab_size]
+    auto logits = main_model.get_tensor("logits");
+    auto data_logits = logits.data() + (seq_len - 1) * vocab_size;
+    int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits;
+
+    // the first token which is fed to both the draft and main networks on each iteration
+    auto first_token = out_token;
+
+    ov::EncodedResults results;
+    results.tokens.resize(batch_size);
+
+    results.tokens[0].emplace_back(out_token);
+
+    // run K infer requests on the draft model and get the next K prediction tokens on each iteration
+    uint64_t K = config_helper.num_assistant_tokens;
+    std::vector draft_tokens;
+
+    // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1.
+    draft_input_ids.set_shape({batch_size, 1});
+    draft_position_ids.set_shape({batch_size, 1});
+
+    int max_sequence_length = generation_config.max_new_tokens;
+    auto eos_token = generation_config.eos_token_id;
+
+    while (out_token != eos_token && seq_len < max_sequence_length) {
+        // infer the K next tokens with the draft model
+        for (int i = 0; i < K; ++i) {
+            draft_input_ids.data()[0] = out_token;
+            draft_attention_mask.set_shape({batch_size, seq_len + i + 1});
+            std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1);
+            draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1);
+
+            draft_model.infer();
+
+            auto draft_logits = draft_model.get_tensor("logits").data();
+            int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits;
+            out_token = arg_max_token;
+            draft_tokens.emplace_back(arg_max_token);
+        }
+
+        // For the main network, K tokens will be fed at once in a single infer request.
+        input_ids.set_shape({batch_size, K});
+        // Set the first token for the main model to be the same as for the draft model.
+ input_ids.data()[0] = first_token; + for (int i = 0; i < K - 1; i++) + input_ids.data()[i + 1] = draft_tokens[i]; + + attention_mask.set_shape({batch_size, seq_len + K}); + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + position_ids.set_shape({batch_size, K}); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); + + main_model.infer(); + + data_logits = logits.data(); // [batch_size, K, vocab_size] + size_t disagree_idx = K - 1; + // Iterate through the predicted tokens from the main model and compare them with draft predictions. + // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. + // In the best-case scenario, all elements match, and K predicted tokens will be taken. + for (size_t i = 0; i < K; i++) { + auto start = data_logits + vocab_size * i; + auto stop = data_logits + vocab_size * (i + 1); + out_token = std::max_element(start, stop) - start; + results.tokens[0].emplace_back(out_token); + + // m_streamer->put(out_token); + + disagree_idx = i; + if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) + break; + } + + // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. + // Increment the sequence length by the number of matched tokens, and + // trim the KV cache to match the new sequence length. + seq_len += disagree_idx + 1; + update_kv_cache(draft_model, config_helper.seq_len_axis, seq_len); + update_kv_cache(main_model, config_helper.seq_len_axis, seq_len); + + draft_tokens.clear(); + first_token = out_token; + } + + return results; +} + +} // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search_decoding.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/beam_search.cpp rename to text_generation/causal_lm/cpp/generate_pipeline/src/beam_search_decoding.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp new file mode 100644 index 0000000000..a9d750bfb8 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp @@ -0,0 +1,178 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "generation_config_helper.hpp" +#include "llm_pipeline.hpp" +#include "utils.hpp" + +namespace { + +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); +ov::Tensor extend_attention(ov::Tensor attention_mask); + +void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t atten_length = attention_mask.get_shape()[1]; + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * atten_length; + position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); + } +} + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* 
attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = start_pos; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + +} + +namespace ov { + +ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, + ov::Tensor input_ids, ov::Tensor attention_mask, ov::GenerationConfig generation_config, + std::shared_ptr streamer, bool is_chat_conversation) { + + ov::GenerationConfigHelper config_helper = generation_config; + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + size_t prompt_len = prompts_shape[1]; + + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + + // todo: make this work even if position_ids are not specified + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + initialize_position_ids(position_ids, attention_mask, kv_cache_len); + + ov::EncodedResults results; + results.scores.resize(batch_size); + results.tokens.resize(batch_size); + std::fill(results.scores.begin(), results.scores.end(), 0); + + if (is_chat_conversation && kv_cache_len > 0) { + // m_attentions_mask_cache extent with attention_mask; + auto attentions_mask_history = m_model_runner.get_tensor("attention_mask"); + // print_tensor(m_attentions_mask_cache); + + size_t new_prompt_len = attention_mask.get_shape()[1]; + size_t context_len = attentions_mask_history.get_shape()[1]; + ov::Tensor new_attention_mask = ov::Tensor{ov::element::i64, {1, context_len + new_prompt_len}}; + + for (size_t i = 0; i < context_len; ++i) { + auto r = attentions_mask_history.data()[i]; + new_attention_mask.data()[i] = attentions_mask_history.data()[i]; + } + for (size_t i = context_len; i < context_len + new_prompt_len; ++i) { + auto r = attention_mask.data()[i]; + new_attention_mask.data()[i] = attention_mask.data()[i - context_len]; + } + m_model_runner.set_tensor("attention_mask", new_attention_mask); + } else { + m_model_runner.set_tensor("attention_mask", attention_mask); + } + + auto atten_shape = attention_mask.get_shape(); + auto pos_shape = position_ids.get_shape(); + auto input_ids_shape = input_ids.get_shape(); + + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("position_ids", position_ids); + + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + batch_size, 0); + + size_t max_tokens = config_helper.get_max_new_tokens(prompt_len); + + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + 
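/* A sketch of what the generate_utils::softmax() call used below returns for one batch
   row: the argmax token id together with a numerically stable log-sum-exp of the logits.
   It restates, on a plain std::vector<float>, the implementation added in src/utils.cpp
   by this patch; the helper name is an assumption for illustration only. */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

inline std::pair<int64_t, float> argmax_and_log_sum_exp(const std::vector<float>& logits_row) {
    // Greedy choice: the token with the largest logit.
    auto max_it = std::max_element(logits_row.begin(), logits_row.end());
    int64_t out_token = std::distance(logits_row.begin(), max_it);
    float max_logit = *max_it;
    // log(sum(exp(logit - max_logit))): subtracting the max avoids overflow.
    float log_sum = std::log(std::accumulate(
        logits_row.begin(), logits_row.end(), 0.0f,
        [max_logit](float acc, float v) { return acc + std::exp(v - max_logit); }));
    return {out_token, log_sum};
}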
m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < batch_size; ++batch) { + auto res = generate_utils::softmax(logits, batch); + auto out_token = res.first; + results.tokens[batch].emplace_back(res.first); + results.scores[batch] += res.second; + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + if (streamer) + streamer->put(token_iter_results[0]); + + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + return results; + + for (size_t i = 0; i < max_tokens; ++i) { + update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + + // todo: consider replacing with start_async and run callback right after that + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < batch_size; ++batch) { + + auto res = ov::generate_utils::softmax(logits, batch); + auto out_token = res.first; + results.tokens[batch].emplace_back(res.first); + results.scores[batch] += res.second; + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + if (streamer) + streamer->put(token_iter_results[0]); + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + break; + } + return results; +} + +} \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 5b6892c3d2..734a617ef0 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -6,24 +6,30 @@ #include #include "generation_config_helper.hpp" #include "text_callback_streamer.hpp" +#include "utils.hpp" // #include // #include // #include "generation_config.hpp" -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); -ov::Tensor init_attention_mask(ov::Tensor& position_ids); -ov::Tensor extend_attention(ov::Tensor attention_mask); -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); namespace ov { +ov::EncodedResults assistive_decoding(ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig 
generation_config); ov::EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); +ov::EncodedResults greedy_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attentin_mask, + GenerationConfig sampling_params, + std::shared_ptr streamer, + bool is_chat_conversation = false +); + + class LLMPipeline::LLMPipelineImpl { public: ov::InferRequest m_model_runner; @@ -53,12 +59,6 @@ class LLMPipeline::LLMPipelineImpl { GenerationConfig generation_config() const; - EncodedResults greedy_search(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); - - // EncodedResults beam_search(ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig generation_config); - - EncodedResults speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); - EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); @@ -109,125 +109,6 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& return {input_ids, attention_mask}; } -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape()[0]; - const size_t atten_length = attention_mask.get_shape()[1]; - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * atten_length; - position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); - } -} - -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0) { - const size_t batch_size = attention_mask.get_shape()[0]; - const size_t seq_length = attention_mask.get_shape()[1]; - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - size_t sum = start_pos; - for (size_t i = 0; i < seq_length; i++) { - const size_t element_offset = batch * seq_length + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -ov::Tensor init_attention_mask(ov::Tensor& position_ids) { - auto shape = position_ids.get_shape(); - auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; - std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); - return attention_mask; -} - -ov::Tensor extend_attention(ov::Tensor attention_mask) { - auto shape = attention_mask.get_shape(); - auto batch_size = shape[0]; - auto seq_len = shape[1]; - - ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; - auto old_data = attention_mask.data(); - auto new_data = new_atten_mask.data(); - for (size_t batch = 0; batch < batch_size; ++batch) { - std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); - new_data[batch * (seq_len + 1) + seq_len] = 1; - } - return new_atten_mask; -} - -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { - // Copy elements from the old to a new tensor and return it. 
- // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], - // It that's not the case for your model please implement your own trim method. - OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); - - auto old_tensor_data = tensor.data(); - auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; - - OPENVINO_ASSERT(new_seq_len <= old_seq_len); - - // if new_seq_len equal to old one no need to copy tensor, return as is - if (old_seq_len == new_seq_len) - return tensor; - - if (seq_len_axis == 0) { - shape[0] = new_seq_len; - tensor.set_shape(shape); - } - - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < batch_size; ++batch){ - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } - return new_tensor; -} - -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { - // trim kv_cache values up to the new_seq_len - for (auto& state: request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } -} - -std::pair softmax(const ov::Tensor& logits, const size_t batch_idx) { - if (logits.get_shape()[0] <= batch_idx) { - OPENVINO_THROW("logits batch size doesn't match the number of beams"); - } - - size_t vocab_size = logits.get_shape().back(); - size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; - size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; - const float* logits_data = logits.data() + batch_offset + sequence_offset; - - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - float max_logit = logits_data[out_token]; - - float log_sum = std::log( - std::accumulate(logits_data, logits_data + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_logit); - })); - return {out_token, log_sum}; -} - ov::LLMPipeline::LLMPipeline( std::string& model_path, std::string& tokenizer_path, @@ -293,279 +174,6 @@ ov::GenerationConfig ov::LLMPipeline::generation_config() const { return m_pimpl->generation_config(); } -void print_tensor(const ov::Tensor& tensor) { - std::vector res; - - auto t_shape = tensor.get_shape(); - cout << "["; - for (size_t i = 0; i < t_shape[1]; ++i) { - if (tensor.get_element_type() == ov::element::i64) { - res.emplace_back(tensor.data()[i]); - cout << tensor.data()[i] << " "; - } - } - cout << "]" << endl; - cout << "---------" << endl; -} - -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::greedy_search(ov::Tensor input_ids, - ov::Tensor attention_mask, - GenerationConfig generation_config) { - - GenerationConfigHelper config_helper = generation_config; - ov::Shape prompts_shape = input_ids.get_shape(); - size_t batch_size 
= prompts_shape[0]; - size_t prompt_len = prompts_shape[1]; - - auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; - - // todo: make this work even if position_ids are not specified - auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - initialize_position_ids(position_ids, attention_mask, kv_cache_len); - - ov::EncodedResults results; - results.scores.resize(batch_size); - results.tokens.resize(batch_size); - std::fill(results.scores.begin(), results.scores.end(), 0); - - if (is_chat_conversation && kv_cache_len > 0) { - // m_attentions_mask_cache extent with attention_mask; - - size_t new_prompt_len = attention_mask.get_shape()[1]; - size_t context_len = m_attentions_mask_cache.get_shape()[1]; - ov::Tensor new_attention_mask = ov::Tensor{ov::element::i64, {1, context_len + new_prompt_len}}; - - for (size_t i = 0; i < context_len; ++i) { - auto r = m_attentions_mask_cache.data()[i]; - new_attention_mask.data()[i] = m_attentions_mask_cache.data()[i]; - } - for (size_t i = context_len; i < context_len + new_prompt_len; ++i) { - auto r = attention_mask.data()[i]; - new_attention_mask.data()[i] = attention_mask.data()[i - context_len]; - } - m_model_runner.set_tensor("attention_mask", new_attention_mask); - } else { - m_model_runner.set_tensor("attention_mask", attention_mask); - } - - - auto atten_shape = attention_mask.get_shape(); - auto pos_shape = position_ids.get_shape(); - auto input_ids_shape = input_ids.get_shape(); - - m_model_runner.set_tensor("input_ids", input_ids); - m_model_runner.set_tensor("position_ids", position_ids); - - m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); - auto beam_data = m_model_runner.get_tensor("beam_idx").data(); - std::iota(beam_data, beam_data + batch_size, 0); - - size_t max_tokens = config_helper.get_max_new_tokens(prompt_len); - for (size_t i = 0; i < max_tokens; ++i) { - - // todo: consider replacing with start_async and run callback right after that - m_model_runner.infer(); - auto logits = m_model_runner.get_tensor("logits"); - ov::Shape logits_shape = logits.get_shape(); - size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; - - m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - - m_attentions_mask_cache = ov::Tensor{attention_mask.get_element_type(), m_model_runner.get_tensor("attention_mask").get_shape()}; - m_model_runner.get_tensor("attention_mask").copy_to(m_attentions_mask_cache); - // m_attentions_mask_cache = m_model_runner.get_tensor("attention_mask"); - - update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); - m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); - - std::vector token_iter_results(batch_size); // results of a single infer request - std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector - for (size_t batch = 0; batch < batch_size; ++batch) { - // const float * logits_data = logits.data() + seq_len * vocab_size * batch + (seq_len - 1) * vocab_size; - // int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - // results.tokens[batch].emplace_back(out_token); - // results.scores[batch] += logits_data[out_token]; - - auto res = softmax(logits, batch); - auto out_token = res.first; - results.tokens[batch].emplace_back(res.first); - results.scores[batch] += res.second; - - token_iter_results[batch] = out_token; - eos_met[batch] = (out_token == generation_config.eos_token_id); - - 
m_model_runner.get_tensor("input_ids").data()[batch] = out_token; - } - // place - // sampling_params.m_callback(std::move(token_iter_results), *this); - - m_streamer->put(token_iter_results[0]); - - // stop generation when EOS is met in all batches - bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); - if (!generation_config.ignore_eos && all_are_eos) - break; - // if (i != sampling_params.get_max_new_tokens(prompt_len) - 1) - // kv_cache_len += 1; - } - return results; -} - -// ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::beam_search(ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { -// } - -/* Speculative decoding works the following way. The draft model predicts the next K -tokens one by one in an autoregressive manner, while the main model validates these -predictions and corrects them if necessary. We go through each predicted token, and -if a difference is detected between the draft and main model, we stop and keep the -last token predicted by the main model. Then the draft model gets the latest main -prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, -enhancing performance. For instance, in more predictable parts of text generation, -the draft model can, in best-case scenarios, generate the next K tokens that exactly -match the target. In tha caste the are validated in a single inference request to -the main model (which is bigger, more accurate but slower) instead of running K -subsequent requests. -*/ -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::speculative_sampling(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { - GenerationConfigHelper config_helper = generation_config; - - auto batch_size = input_ids.get_shape()[0]; - OPENVINO_ASSERT(batch_size == 1); - auto draft_model = config_helper.get_assistant_model(m_device, m_plugin_config); - auto main_model = m_model_runner; - - auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - input_ids.copy_to(draft_input_ids); - auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - - draft_model.set_tensor("input_ids", draft_input_ids); - draft_model.set_tensor("attention_mask", draft_attention_mask); - - ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); - draft_position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); - uint64_t seq_len = draft_input_ids.get_shape()[1]; - - // Input tensors for the main model should not be mixed with draft. 
- // Do not feed the same draft_postion_ids to the main, but copy input_ids from the draft_input_ids - // auto input_ids = main_model.get_tensor("input_ids"); - // input_ids.set_shape(draft_input_ids.get_shape()); - // draft_input_ids.copy_to(input_ids); - - // auto attention_mask = main_model.get_tensor("attention_mask"); - // attention_mask.set_shape(draft_input_ids.get_shape()); - // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - auto position_ids = main_model.get_tensor("position_ids"); - position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - - // set beam_idx for stateful model: no beam search is used and batch_size = 1 - draft_model.get_tensor("beam_idx").set_shape({batch_size}); - draft_model.get_tensor("beam_idx").data()[0] = 0; - main_model.get_tensor("beam_idx").set_shape({batch_size}); - main_model.get_tensor("beam_idx").data()[0] = 0; - - main_model.set_tensor("input_ids", input_ids); - main_model.set_tensor("attention_mask", attention_mask); - main_model.set_tensor("position_ids", position_ids); - - // To coollect kv-cache for the and to get the next token run the very first infer request - draft_model.infer(); - main_model.infer(); - - size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); - OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for the both models"); - - // logits shape is [batch_size, seq_len, vocab_size] - auto logits = main_model.get_tensor("logits"); - auto data_logits = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - // the first token which is fed to both draft and main netwoks on each iteration - auto first_token = out_token; - - ov::EncodedResults results; - results.tokens.resize(batch_size); - - results.tokens[0].emplace_back(out_token); - - // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = config_helper.num_assistant_tokens; - std::vector draft_tokens; - - // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. - draft_input_ids.set_shape({batch_size, 1}); - draft_position_ids.set_shape({batch_size, 1}); - - int max_sequence_length = generation_config.max_new_tokens; - auto eos_token = generation_config.eos_token_id; - - while (out_token != eos_token && seq_len < max_sequence_length) { - // infer the K next tokens with draft model - for (int i = 0; i < K; ++i) { - draft_input_ids.data()[0] = out_token; - draft_attention_mask.set_shape({batch_size, seq_len + i + 1}); - std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); - draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); - - draft_model.infer(); - - auto draft_logits = draft_model.get_tensor("logits").data(); - int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; - out_token = arg_max_token; - draft_tokens.emplace_back(arg_max_token); - } - - // For the main network, K tokens will be fed at once in a single infer request. - input_ids.set_shape({batch_size, K}); - // Set the first token for the main model to be the same as for the draft model. 
- input_ids.data()[0] = first_token; - for (int i = 0; i < K - 1; i++) - input_ids.data()[i + 1] = draft_tokens[i]; - - attention_mask.set_shape({batch_size, seq_len + K}); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - position_ids.set_shape({batch_size, K}); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); - - main_model.infer(); - - data_logits = logits.data(); // [batch_size, K, vocab_size] - size_t disagree_idx = K - 1; - // Iterate through the predicted tokens from the main model and compare them with draft predictions. - // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. - // In the best-case scenario, all elements match, and K predicted tokens will be taken. - for (size_t i = 0; i < K; i++) { - auto start = data_logits + vocab_size * i; - auto stop = data_logits + vocab_size * (i + 1); - out_token = std::max_element(start, stop) - start; - results.tokens[0].emplace_back(out_token); - - m_streamer->put(out_token); - - disagree_idx = i; - if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) - break; - } - - // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. - // Increment the sequence length by the number of matched tokens, and - // trim the KV cache to match the new sequence length. - seq_len += disagree_idx + 1; - update_kv_cache(draft_model, config_helper.seq_len_axis, seq_len); - update_kv_cache(main_model, config_helper.seq_len_axis, seq_len); - - draft_tokens.clear(); - first_token = out_token; - } - - return results; -} - ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config) { // todo: implement ov::EncodedResults results; @@ -665,14 +273,14 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_i GenerationConfigHelper config_helper = generation_config; if (config_helper.is_greedy_sampling()) { - result = greedy_search(input_ids, attention_mask, generation_config); + result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask, generation_config, m_streamer, is_chat_conversation); } else if (config_helper.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask, generation_config); } else if (config_helper.is_multimomial()) { result = multinomial_sampling(input_ids, generation_config); - } else { // speculative - result = speculative_sampling(input_ids, attention_mask, generation_config); + } else { + result = ov::assistive_decoding(m_model_runner, input_ids, attention_mask, generation_config); } if (!is_chat_conversation) @@ -688,11 +296,11 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor at ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { - return generate(input_ids, init_attention_mask(input_ids), sampling_params); + return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), sampling_params); } ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { - return generate(input_ids, init_attention_mask(input_ids), m_pimpl->m_sampling_parameters); + return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), m_pimpl->m_sampling_parameters); } ov::Tokenizer ov::LLMPipeline::get_tokenizer() { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp 
b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp new file mode 100644 index 0000000000..2b30b75838 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "utils.hpp" + +namespace ov { +namespace generate_utils { + +Tensor init_attention_mask(Tensor& position_ids) { + auto shape = position_ids.get_shape(); + auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); + return attention_mask; +} + +void print_tensor(const ov::Tensor& tensor) { + std::vector res; + + auto t_shape = tensor.get_shape(); + std::cout << "["; + for (size_t i = 0; i < t_shape[1]; ++i) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data()[i]); + std::cout << tensor.data()[i] << " "; + } + } + std::cout << "]" << std::endl; +} + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + float log_sum = std::log( + std::accumulate(logits_data, logits_data + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + return {out_token, log_sum}; +} + +} // namespace generate_utils +} // namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp new file mode 100644 index 0000000000..0c0eef3228 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +namespace ov { +namespace generate_utils { + +Tensor init_attention_mask(Tensor& position_ids); + +void print_tensor(const ov::Tensor& tensor); + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); + +} // namespace generate_utils +} // namespace ov \ No newline at end of file From f1d54f412bf8b798686e795233e14882c7a8bbfb Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 7 May 2024 22:42:15 +0200 Subject: [PATCH 33/97] extended python api, added python api test --- .../cpp/generate_pipeline/chat_sample.cpp | 2 +- .../cpp/generate_pipeline/generate_sample.cpp | 2 +- .../include/generation_config.hpp | 8 +- .../include/llm_pipeline.hpp | 6 +- .../generate_pipeline/python/CMakeLists.txt_ | 3 - .../python/py_generate_pipeline.cpp | 84 ++++++++++++++++++- .../generate_pipeline/src/greedy_decoding.cpp | 2 +- .../generate_pipeline/src/llm_pipeline.cpp | 14 +--- .../causal_lm/generate_tests/test_greedy.py | 29 +++++++ 9 files changed, 126 insertions(+), 24 deletions(-) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ create mode 100644 text_generation/causal_lm/generate_tests/test_greedy.py diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp 
b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index d5ae403fdc..43f762f681 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, device); - ov::GenerationConfig config = pipe.generation_config(); + ov::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 10000; pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index bfe1668dfa..77a3cd41ce 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) try { ov::LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. // but if not found default velues for gready search will be used - ov::GenerationConfig config = pipe.generation_config(); + ov::GenerationConfig config = pipe.get_generation_config(); auto text_streamer = TextStreamer{pipe.get_tokenizer()}; auto text_streamer_callback = [&text_streamer](std::vector&& tokens, ov::LLMPipeline& pipe){ diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp index 4526e6cbba..d838c7e6a2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp @@ -40,16 +40,18 @@ class GenerationConfig { // Multinomial float temperature; - int top_k; float top_p; + size_t top_k; bool do_sample; - std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, // special tokens int64_t bos_token_id; int64_t eos_token_id; int64_t pad_token_id; - + + // speculative sampling + std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, + GenerationConfig() = default; GenerationConfig(std::string json_path); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 7655607e3f..9d6a237b18 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -42,7 +42,7 @@ class LLMPipeline { ~LLMPipeline(); - GenerationConfig generation_config() const; + GenerationConfig get_generation_config() const; std::string operator()(std::string text); @@ -70,8 +70,8 @@ class LLMPipeline { void start_chat(); void finish_chat(); void reset_state(); - void set_default_config(const GenerationConfig& generation_config); - // void set_default_config(const AnyMap& generation_config_map); + + void set_generation_config(const GenerationConfig& generation_config); private: class LLMPipelineImpl; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ deleted file mode 100644 index 88b5a71df7..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt_ +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# 
SPDX-License-Identifier: Apache-2.0 - diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp index 2d636f8a86..ff4b400e63 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include #include #include @@ -6,22 +9,99 @@ namespace py = pybind11; using namespace ov; +std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { + // Create a new GenerationConfig instance and initialize from kwargs + ov::GenerationConfig config = pipeline.get_generation_config(); + if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); + if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); + if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); + if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); + if (kwargs.contains("num_groups")) config.num_groups = kwargs["num_groups"].cast(); + if (kwargs.contains("group_size")) config.group_size = kwargs["group_size"].cast(); + if (kwargs.contains("diversity_penalty")) config.diversity_penalty = kwargs["diversity_penalty"].cast(); + if (kwargs.contains("repetition_penalty")) config.repetition_penalty = kwargs["repetition_penalty"].cast(); + if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); + + if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); + if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); + if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); + if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); + if (kwargs.contains("do_sample")) config.do_sample = kwargs["do_sample"].cast(); + if (kwargs.contains("bos_token_id")) config.bos_token_id = kwargs["bos_token_id"].cast(); + if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); + if (kwargs.contains("pad_token_id")) config.pad_token_id = kwargs["pad_token_id"].cast(); + if (kwargs.contains("draft_model")) config.draft_model = kwargs["draft_model"].cast>(); + + // Call the LLMPipeline with the constructed GenerationConfig + return pipeline(text, config); +} + PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; + py::class_(m, "LLMPipeline") .def(py::init(), py::arg("model_path"), py::arg("tokenizer_path"), py::arg("detokenizer_path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) - .def("__call__", (std::string (LLMPipeline::*)(std::string)) &LLMPipeline::operator()) - .def("__call__", (std::string (LLMPipeline::*)(std::string, GenerationConfig)) &LLMPipeline::operator()) + .def("__call__", py::overload_cast(&ov::LLMPipeline::operator()), "Process single text input") + .def("__call__", py::overload_cast(&ov::LLMPipeline::operator()), "Process text input with specific generation config") + .def("__call__", py::overload_cast, ov::GenerationConfig>(&ov::LLMPipeline::operator()), "Process multiple text inputs with generation config") + 
.def("__call__", &call_with_config) .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor, GenerationConfig)) &LLMPipeline::generate) .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor)) &LLMPipeline::generate) // Bind other methods similarly .def("get_tokenizer", &LLMPipeline::get_tokenizer) + .def("start_chat", &ov::LLMPipeline::start_chat) + .def("finish_chat", &ov::LLMPipeline::finish_chat) + .def("reset_state", &ov::LLMPipeline::reset_state) + .def("get_generation_config", &ov::LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::LLMPipeline::set_generation_config) .def("apply_chat_template", &LLMPipeline::apply_chat_template); + // Binding for Tokenizer + py::class_(m, "Tokenizer") + .def(py::init<>()) + .def(py::init(), py::arg("tokenizers_path"), py::arg("device") = "CPU") + .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") + .def("encode", py::overload_cast>(&ov::Tokenizer::encode), "Encode multiple prompts") + .def("decode", py::overload_cast>(&ov::Tokenizer::decode), "Decode a list of tokens") + .def("decode", py::overload_cast(&ov::Tokenizer::decode), "Decode a tensor of tokens") + .def("decode", py::overload_cast>>(&ov::Tokenizer::decode), "Decode multiple lines of tokens"); + + py::class_(m, "GenerationConfig") + .def(py::init<>()) + .def(py::init()) + .def_readwrite("max_new_tokens", &ov::GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &ov::GenerationConfig::max_length) + .def_readwrite("ignore_eos", &ov::GenerationConfig::ignore_eos) + .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) + .def_readwrite("num_groups", &ov::GenerationConfig::num_groups) + .def_readwrite("group_size", &ov::GenerationConfig::group_size) + .def_readwrite("diversity_penalty", &ov::GenerationConfig::diversity_penalty) + .def_readwrite("repetition_penalty", &ov::GenerationConfig::repetition_penalty) + .def_readwrite("length_penalty", &ov::GenerationConfig::length_penalty) + .def_readwrite("no_repeat_ngram_size", &ov::GenerationConfig::no_repeat_ngram_size) + .def_readwrite("temperature", &ov::GenerationConfig::temperature) + .def_readwrite("top_k", &ov::GenerationConfig::top_k) + .def_readwrite("top_p", &ov::GenerationConfig::top_p) + .def_readwrite("do_sample", &ov::GenerationConfig::do_sample) + .def_readwrite("bos_token_id", &ov::GenerationConfig::bos_token_id) + .def_readwrite("eos_token_id", &ov::GenerationConfig::eos_token_id) + .def_readwrite("pad_token_id", &ov::GenerationConfig::pad_token_id) + .def_readwrite("draft_model", &ov::GenerationConfig::draft_model); + + + py::class_(m, "DecodedResults") + .def(py::init<>()) + .def_readwrite("texts", &ov::DecodedResults::texts) + .def_readwrite("scores", &ov::DecodedResults::scores); + + py::class_(m, "EncodedResults") + .def(py::init<>()) + .def_readwrite("tokens", &ov::EncodedResults::tokens) + .def_readwrite("scores", &ov::EncodedResults::scores); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp index a9d750bfb8..d3ff0108ba 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp @@ -140,7 +140,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, if (!generation_config.ignore_eos && all_are_eos) return results; - for (size_t i = 0; i < max_tokens; ++i) { + 
for (size_t i = 0; i < max_tokens - 1; ++i) { update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 734a617ef0..a390d7f29f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -13,8 +13,6 @@ // #include "generation_config.hpp" -std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); - namespace ov { ov::EncodedResults assistive_decoding(ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); @@ -170,7 +168,7 @@ ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const return m_sampling_parameters; } -ov::GenerationConfig ov::LLMPipeline::generation_config() const { +ov::GenerationConfig ov::LLMPipeline::get_generation_config() const { return m_pimpl->generation_config(); } @@ -312,6 +310,7 @@ std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string } std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { + // todo: temporary disable for easier and faster build // jinja2::TemplateEnv env; // env.GetSettings().lstripBlocks = true; // env.GetSettings().trimBlocks = true; @@ -336,7 +335,6 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr void ov::LLMPipeline::set_streamer(std::function callback) { m_pimpl->m_streamer = std::make_shared(m_pimpl->m_tokenizer, callback); - // m_pimpl->m_streamer->set_callback(callback); } void ov::LLMPipeline::set_streamer(std::shared_ptr streamer) { @@ -344,7 +342,7 @@ void ov::LLMPipeline::set_streamer(std::shared_ptr streamer) { } void ov::LLMPipeline::set_streamer() { - // m_pimpl->m_streamer->set_callback(); + m_pimpl->m_streamer = nullptr; } void ov::LLMPipeline::start_chat() { @@ -360,12 +358,8 @@ void ov::LLMPipeline::reset_state() { m_pimpl->m_model_runner.reset_state(); } -void ov::LLMPipeline::set_default_config(const GenerationConfig& generation_config) { +void ov::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { m_pimpl->m_sampling_parameters = generation_config; } -// void ov::LLMPipeline::set_default_config(const AnyMap& generation_config_map) { -// m_pimpl->m_sampling_parameters = GenerationConfig::anymap_to_generation_config(generation_config_map); -// } - ov::LLMPipeline::~LLMPipeline() = default; diff --git a/text_generation/causal_lm/generate_tests/test_greedy.py b/text_generation/causal_lm/generate_tests/test_greedy.py new file mode 100644 index 0000000000..fda0b36d41 --- /dev/null +++ b/text_generation/causal_lm/generate_tests/test_greedy.py @@ -0,0 +1,29 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +def test_tiny_llama(): + from transformers import AutoTokenizer, AutoModelForCausalLM + + tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + + max_new_tokens = 500 + prompt = 'table is made of' + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) + hf_encoded_output = model.generate(encoded_prompt, 
max_new_tokens=max_new_tokens, do_sample=False) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) + print(f'hf_output: {hf_output}') + + import sys + sys.path.append('build-Debug/') + import py_generate_pipeline as genai + + pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') + ov_output = pipe(prompt, max_new_tokens=max_new_tokens) + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + +if __name__ == '__main__': + test_tiny_llama() From 3c82e1121113ccc342b6058176e9af1ce7d57b78 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 8 May 2024 10:03:17 +0200 Subject: [PATCH 34/97] remove call method --- .../cpp/generate_pipeline/chat_sample.cpp | 1 - .../include/generation_config.hpp | 26 ++++------ .../include/llm_pipeline.hpp | 40 ++++++--------- .../include/llm_tokenizer.hpp | 12 ++--- .../src/generation_config.cpp | 2 +- .../generate_pipeline/src/llm_pipeline.cpp | 50 +++++++++++-------- .../generate_pipeline/src/llm_tokenizer.cpp | 21 +++++++- .../src/text_callback_streamer.cpp | 1 - .../src/text_callback_streamer.hpp | 1 + .../cpp/generate_pipeline/src/utils.hpp | 2 + 10 files changed, 84 insertions(+), 72 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 43f762f681..ec5d23a894 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -4,7 +4,6 @@ #include #include "llm_pipeline.hpp" - std::string generate_chat_prompt(const ov::LLMPipeline& pipe, std::string& input, bool use_chat_template = true) { if (use_chat_template) return pipe.apply_chat_template(input); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp index d838c7e6a2..b3d6accceb 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp @@ -5,9 +5,6 @@ #include #include -#include -#include -// #include // used only for StopCriteria #include #include "llm_tokenizer.hpp" #include @@ -17,44 +14,43 @@ class Sequence; namespace ov { -// Similar to HuggingFace GenerationConfig class GenerationConfig { public: + GenerationConfig() = default; + GenerationConfig(std::string json_path); + // Generic size_t max_new_tokens; size_t max_length; bool ignore_eos; - std::string eos_token; // Beam search specific size_t num_groups; size_t group_size; float diversity_penalty; - size_t m_num_return_sequences; - // StopCriteria stop_criteria = StopCriteria::heuristic; - - float repetition_penalty; float length_penalty; + size_t m_num_return_sequences; size_t no_repeat_ngram_size; - std::function early_finish = [](const Sequence&) {return false; }; - + std::variant early_stopping; + // Multinomial float temperature; float top_p; size_t top_k; bool do_sample; + float repetition_penalty; // special tokens int64_t bos_token_id; int64_t eos_token_id; int64_t pad_token_id; + // used for chat scenario + std::string eos_token; + std::string bos_token; + // speculative sampling std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, - - GenerationConfig() = default; - - GenerationConfig(std::string json_path); }; } // namespace ov diff --git 
a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 9d6a237b18..43893937a2 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -41,45 +41,37 @@ class LLMPipeline { LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); ~LLMPipeline(); - - GenerationConfig get_generation_config() const; - - std::string operator()(std::string text); - - std::string operator()(std::string text, GenerationConfig sampling_parameters); - - DecodedResults operator()(std::vector text, GenerationConfig sampling_parameters); - - DecodedResults operator()(std::initializer_list text, GenerationConfig sampling_parameters); - - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig sampling_params); + EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask); - - EncodedResults generate(ov::Tensor input_ids, GenerationConfig sampling_params); - EncodedResults generate(ov::Tensor input_ids); - + EncodedResults generate(ov::Tensor input_ids, GenerationConfig generation_config); + ov::Tokenizer get_tokenizer(); - std::string apply_chat_template(std::string prompt, std::string role = "user") const; + std::string generate(std::string text); + std::string generate(std::string text, GenerationConfig generation_config); + DecodedResults generate(std::vector text, GenerationConfig generation_config); + + std::string operator()(std::string text); + std::string operator()(std::string text, GenerationConfig generation_config); + DecodedResults operator()(std::vector text, GenerationConfig generation_config); + DecodedResults operator()(std::initializer_list text, GenerationConfig generation_config); + + GenerationConfig get_generation_config() const; + void set_generation_config(const GenerationConfig& generation_config); void set_streamer(std::function callback); void set_streamer(std::shared_ptr streamer); void set_streamer(); + void start_chat(); void finish_chat(); void reset_state(); - - void set_generation_config(const GenerationConfig& generation_config); - + std::string apply_chat_template(std::string prompt, std::string role = "user") const; private: class LLMPipelineImpl; std::unique_ptr m_pimpl; - - std::string call(std::string text); - std::string call(std::string text, GenerationConfig generation_config); - DecodedResults call(std::vector text, GenerationConfig sampling_parameters); }; } // namespace ov diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp index a324573251..2033e3053e 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp @@ -11,25 +11,21 @@ namespace ov { class Tokenizer { public: - int64_t m_eos_token = 2; // todo: read from rt_info - Tokenizer() = default; ~Tokenizer(); Tokenizer(std::string& tokenizers_path, std::string device="CPU"); - - // Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU"); + Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU"); std::pair encode(std::string prompt); 
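// (Illustrative usage sketch for this interface; `models_path`, `pipe` and `config` are placeholders
// introduced here, not declarations from this header.)
//
//     ov::Tokenizer tok(models_path);                           // loads openvino_tokenizer/detokenizer IRs
//     auto [input_ids, attention_mask] = tok.encode("1+1=");    // pair of ov::Tensor
//     ov::EncodedResults out = pipe.generate(input_ids, attention_mask, config);
//     std::string answer = tok.decode(out.tokens[0]);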
- std::pair encode(std::vector prompts); - std::pair encode(std::initializer_list text); std::string decode(std::vector tokens); - std::vector decode(ov::Tensor tokens); - std::vector decode(std::vector> lines); + + int64_t m_eos_token = 2; // todo: read from rt_info + int64_t m_bos_token = 1; // todo: read from rt_info private: class TokenizerImpl; std::shared_ptr m_pimpl; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp index 1f64c0061d..224fac3237 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp @@ -3,7 +3,7 @@ #include #include -// #include +#include #include // #include // used only for StopCriteria #include diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index a390d7f29f..dae07fb6fc 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -4,9 +4,11 @@ #include #include "llm_pipeline.hpp" #include +#include #include "generation_config_helper.hpp" #include "text_callback_streamer.hpp" #include "utils.hpp" +#include // #include // #include @@ -32,7 +34,7 @@ class LLMPipeline::LLMPipelineImpl { public: ov::InferRequest m_model_runner; Tokenizer m_tokenizer; - GenerationConfig m_sampling_parameters; + GenerationConfig m_generation_config; std::string m_device; ov::AnyMap m_plugin_config; ov::Tensor m_attentions_mask_cache; @@ -68,7 +70,7 @@ class LLMPipeline::LLMPipelineImpl { std::string call(std::string text); std::string call(std::string text, GenerationConfig generation_config); - DecodedResults call(std::vector text, GenerationConfig sampling_parameters); + DecodedResults generate(std::vector text, GenerationConfig generation_config); }; @@ -147,7 +149,7 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string std::string generation_config_fname = "generation_config.json"; if (std::filesystem::exists(path + "/" + generation_config_fname)) { - m_sampling_parameters = GenerationConfig(path + "/" + generation_config_fname); + m_generation_config = GenerationConfig(path + "/" + generation_config_fname); } if (std::filesystem::exists(path + "/" + tokenizer_config_fname)) { std::ifstream f(path + "/" + tokenizer_config_fname); @@ -165,7 +167,7 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string } ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { - return m_sampling_parameters; + return m_generation_config; } ov::GenerationConfig ov::LLMPipeline::get_generation_config() const { @@ -179,14 +181,14 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Te } std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text) { - return call(text, m_sampling_parameters); + return call(text, m_generation_config); } -std::string ov::LLMPipeline::call(std::string text) { +std::string ov::LLMPipeline::generate(std::string text) { return m_pimpl->call(text); } -std::string ov::LLMPipeline::call(std::string text, GenerationConfig generation_config) { +std::string ov::LLMPipeline::generate(std::string text, GenerationConfig generation_config) { return m_pimpl->call(text, generation_config); } @@ -234,32 +236,36 @@ std::string 
ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationC return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::call(std::vector text, GenerationConfig sampling_parameters) { - return m_pimpl->call(text, sampling_parameters); +ov::DecodedResults ov::LLMPipeline::generate(std::vector text, GenerationConfig generation_config) { + return m_pimpl->generate(text, generation_config); } -ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::call(std::vector text, GenerationConfig sampling_parameters) { +ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector text, GenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(text); - auto generate_results = generate(input_ids, attention_mask, sampling_parameters); + auto generate_results = generate(input_ids, attention_mask, generation_config); return {m_tokenizer.decode(generate_results.tokens), generate_results.scores}; } std::string ov::LLMPipeline::operator()(std::string text) { - return call(text); + return generate(text); } -std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); +std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig generation_config) { + return generate(text, generation_config); } -ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); +// std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig generation_config, std::function streamer) { +// return ""; +// } + +ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig generation_config) { + return generate(text, generation_config); } -ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig sampling_parameters) { - return call(text, sampling_parameters); +ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig generation_config) { + return generate(text, generation_config); } ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { @@ -289,7 +295,7 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_i } ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { - return generate(input_ids, attention_mask, m_pimpl->m_sampling_parameters); + return generate(input_ids, attention_mask, m_pimpl->m_generation_config); } ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { @@ -298,7 +304,7 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationCon } ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { - return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), m_pimpl->m_sampling_parameters); + return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), m_pimpl->m_generation_config); } ov::Tokenizer ov::LLMPipeline::get_tokenizer() { @@ -329,6 +335,8 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr std::stringstream result_prompt; result_prompt << "<|user|>\n" << prompt << "\n<|assistant|>\n"; // hardcode template for TinyLlama + // result_prompt << "user\n" << prompt << "\nmodel"; // Gemma-7b-it + // result_prompt << "[INST] " << input << " [/INST]"; // 
LLama-2-7b return result_prompt.str(); } @@ -359,7 +367,7 @@ void ov::LLMPipeline::reset_state() { } void ov::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { - m_pimpl->m_sampling_parameters = generation_config; + m_pimpl->m_generation_config = generation_config; } ov::LLMPipeline::~LLMPipeline() = default; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp index d0d9c9894c..d9facf508f 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp @@ -17,7 +17,7 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; TokenizerImpl(std::string& tokenizers_path, std::string device); - + TokenizerImpl(std::string& tokenizer_path, std::string& detokenizer_path, std::string device); std::pair encode(std::string prompt); std::pair encode(std::vector prompts); @@ -44,6 +44,25 @@ Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizers_path, std::strin m_tokenize_request = core.compile_model(tokenizers_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + // todo: read eos, bos here +} + +Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizer_path, std::string& detokenizer_path, std::string device): m_device(device) { + ov::Core core; + + auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; + if (!is_xml(tokenizer_path)) + OPENVINO_THROW("tokenizers_path should be a path to a xml file"); + if (!is_xml(detokenizer_path)) + OPENVINO_THROW("detokenizer_path should be a path to a xml file"); + + // todo: add loading EOS_TOKEN_ID from IR + // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + core.add_extension(OPENVINO_TOKENIZERS_PATH); + // tokenizer and detokenizer work on CPU only + + m_tokenize_request = core.compile_model(tokenizer_path, "CPU").create_infer_request(); + m_detokenizer_request = core.compile_model(detokenizer_path, "CPU").create_infer_request(); } Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device) { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp index 15be012661..53830c7b90 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp @@ -17,7 +17,6 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin void TextCallbackStreamer::put(int64_t token) { std::stringstream res; - // do not print anything and flush cache if EOS token is met if (token == m_tokenizer.m_eos_token) { end(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp index 6eeada6d35..9f0bab68dd 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp @@ -23,6 +23,7 @@ class TextCallbackStreamer: public StreamerBase { std::function m_callback = [](std::string words){ ;}; bool m_enabled = false; + int64_t m_eos_token; private: bool m_print_eos_token = 
false; Tokenizer m_tokenizer; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp index 0c0eef3228..3ef4c8e106 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp @@ -14,5 +14,7 @@ void print_tensor(const ov::Tensor& tensor); std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); +enum class StopCriteria { early, heuristic, never }; + } // namespace generate_utils } // namespace ov \ No newline at end of file From 5543cee68d8289ae30cdd8f1fe28637dfc5720db Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 6 May 2024 18:22:39 +0400 Subject: [PATCH 35/97] init --- .../generate_pipeline/python/CMakeLists.txt | 22 +++++ .../cpp/generate_pipeline/python/__init__.py | 0 .../generate_pipeline/python/pyproject.toml | 97 +++++++++++++++++++ .../cpp/generate_pipeline/python/python.cpp | 26 +++++ 4 files changed, 145 insertions(+) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt new file mode 100644 index 0000000000..b16294c320 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt @@ -0,0 +1,22 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.15) +project(py_continuous_batching) + +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG v2.12.0 +) + +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_continuous_batching python.cpp) +# TODO: how to link with tokenizers \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml b/text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml new file mode 100644 index 0000000000..33b4edda5c --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml @@ -0,0 +1,97 @@ +[project] +name = "py_continuous_batching" +version = "2024.2.0.0" +description = "Convert tokenizers into OpenVINO models" +requires-python = ">=3.8" +readme = {file = "../../../../../README.md", content-type="text/markdown"} +license = {text = "OSI Approved :: Apache Software License"} + +authors = [ + { name = "OpenVINO Developers", email = "openvino@intel.com" }, +] + +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + # support of nightly openvino packages with dev suffix + "openvino~=2024.1.0.0" +] + +[project.optional-dependencies] +transformers = [ + 
"transformers[sentencepiece] >= 4.36.0", + "tiktoken" +] +# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests +torch = [ + 'torch' +] +dev = [ + "ruff", + "bandit", + "pytest", + "pytest_harvest", + "pandas", + "openvino_tokenizers[transformers, torch]" +] +benchmark = [ + "pandas", + "seaborn", + "tqdm", + "openvino_tokenizers[transformers]" +] +# don't include fuzzing to avoid windows CI issues +fuzzing = [ + "atheris", + "openvino_tokenizers[transformers]" +] +all = [ + "openvino_tokenizers[dev, transformers]" +] + + +[tool.ruff] +line-length = 119 + +[tool.ruff.lint] +ignore = ["C901", "E501", "E741", "W605"] +select = ["C", "E", "F", "I", "W"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] +"openvino_tokenizers/hf_parser.py" = ["F821"] + +[tool.ruff.lint.isort] +lines-after-imports = 2 + +[tool.scikit-build] +cmake.build-type = "Release" +#cmake.args = [ +# "-DCMAKE_INSTALL_BINDIR=lib", +# "-DCMAKE_INSTALL_LIBDIR=lib" +#] +cmake.targets = ["py_continuous_batching"] +wheel.build-tag = "000" +#wheel.packages = ["py_continuous_batching"] # my python files +wheel.install-dir = "py_continuous_batching" +wheel.py-api = "py3" +# TODO: how to get files from top folders +#wheel.license-files = ["../../../../../LICENSE", "../../../../../third-party-programs.txt", "../../../../../SECURITY.md"] +sdist.exclude = ["dist", "tests", "examples", "python/tests"] +sdist.cmake = true + +[[tool.scikit-build.generate]] +path = "__version__.py" +template = ''' +__version__ = "${version}" +''' + +[build-system] +requires = ["scikit-build-core~=0.8.0"] +build-backend = "scikit_build_core.build" \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp new file mode 100644 index 0000000000..7485a503ec --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "pybind11/pybind11.h" +#include + +struct GenerationConfig { + bool do_sample; + +}; + +namespace py = pybind11; + + +PYBIND11_MODULE(py_continuous_batching, m) { + py::class_(m, "GenerationConfig") + .def(py::init<>()) + .def_readwrite("do_sample", &GenerationConfig::do_sample); + +} From abb8835aacc005af866c70b583019742ea3329a9 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 7 May 2024 18:19:46 +0400 Subject: [PATCH 36/97] add_subdirectory --- text_generation/causal_lm/cpp/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 1d9cbd66be..aad9a1e9b7 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -79,3 +79,5 @@ target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) + +add_subdirectory(generate_pipeline/python-bindings) From 0998abc151a9b40f0989d27e00a32367c6d32365 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Wed, 8 May 2024 12:36:07 +0400 Subject: [PATCH 37/97] add files --- .../python/pyproject.toml => pyproject.toml | 30 ++++---- .../python-bindings/CMakeLists.txt | 20 ++++++ 
.../python-bindings/openvino/__init__.py | 70 +++++++++++++++++++ .../openvino_genai_bindings.cpp | 26 +++++++ third-party-programs.txt | 1 + 5 files changed, 133 insertions(+), 14 deletions(-) rename text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml => pyproject.toml (67%) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp create mode 100644 third-party-programs.txt diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml b/pyproject.toml similarity index 67% rename from text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml rename to pyproject.toml index 33b4edda5c..71c85319bd 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [project] -name = "py_continuous_batching" +name = "openvino.genai" version = "2024.2.0.0" -description = "Convert tokenizers into OpenVINO models" +description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" -readme = {file = "../../../../../README.md", content-type="text/markdown"} +readme = {file = "text_generation/causal_lm/cpp/README.md", content-type="text/markdown"} license = {text = "OSI Approved :: Apache Software License"} authors = [ @@ -19,11 +19,11 @@ classifiers = [ ] dependencies = [ - # support of nightly openvino packages with dev suffix - "openvino~=2024.1.0.0" + "openvino_tokenizers~=2024.1.0.0" ] [project.optional-dependencies] +# TODO: do I need to propagate all this to openvino_tokenizers transformers = [ "transformers[sentencepiece] >= 4.36.0", "tiktoken" @@ -71,27 +71,29 @@ select = ["C", "E", "F", "I", "W"] lines-after-imports = 2 [tool.scikit-build] +install.components = ["openvino_genai_bindings_install_target"] +cmake.source-dir = "text_generation/causal_lm/cpp" cmake.build-type = "Release" -#cmake.args = [ +cmake.args = [ + "-DBUILD_SHARED_LIBS=NO" # "-DCMAKE_INSTALL_BINDIR=lib", # "-DCMAKE_INSTALL_LIBDIR=lib" -#] -cmake.targets = ["py_continuous_batching"] +] +cmake.targets = ["openvino_genai_bindings"] wheel.build-tag = "000" -#wheel.packages = ["py_continuous_batching"] # my python files -wheel.install-dir = "py_continuous_batching" +wheel.packages = ["text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino"] +wheel.install-dir = "openvino/genai" wheel.py-api = "py3" -# TODO: how to get files from top folders -#wheel.license-files = ["../../../../../LICENSE", "../../../../../third-party-programs.txt", "../../../../../SECURITY.md"] +wheel.license-files = ["LICENSE", "SECURITY.md"] # TODO: Do we need third-party-programs.txt like openvino_tokenizers? 
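# Illustrative note: with scikit-build-core as the build backend, `pip install .` from the repository
# root is expected to configure the CMake project under cmake.source-dir, build the
# openvino_genai_bindings target and package it under openvino/genai inside the wheel;
# `pip wheel .` would produce a distributable wheel the same way.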
sdist.exclude = ["dist", "tests", "examples", "python/tests"] sdist.cmake = true [[tool.scikit-build.generate]] -path = "__version__.py" +path = "openvino/genai/__version__.py" template = ''' __version__ = "${version}" ''' [build-system] requires = ["scikit-build-core~=0.8.0"] -build-backend = "scikit_build_core.build" \ No newline at end of file +build-backend = "scikit_build_core.build" diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt new file mode 100644 index 0000000000..41dabe43b5 --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG v2.12.0 +) + +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(openvino_genai_bindings openvino_genai_bindings.cpp) +target_link_libraries(openvino_genai_bindings PRIVATE generate_pipeline_lib) +# TODO: how to link with tokenizers and openvino +install(TARGETS openvino_genai_bindings LIBRARY DESTINATION . COMPONENT openvino_genai_bindings_install_target) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py new file mode 100644 index 0000000000..24a0ee92ec --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) + +# Required for Windows OS platforms +# Note: always top-level +try: + from openvino.utils import _add_openvino_libs_to_search_path + _add_openvino_libs_to_search_path() +except ImportError: + pass + +# # +# # OpenVINO API +# # This __init__.py forces checking of runtime modules to propagate errors. +# # It is not compared with init files from openvino-dev package. 
+# # +# Import all public modules +from openvino import runtime as runtime +from openvino import frontend as frontend +from openvino import helpers as helpers +from openvino import preprocess as preprocess +from openvino import utils as utils +from openvino import properties as properties + +# Import most important classes and functions from openvino.runtime +from openvino.runtime import Model +from openvino.runtime import Core +from openvino.runtime import CompiledModel +from openvino.runtime import InferRequest +from openvino.runtime import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + +from openvino.runtime import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import tensor_from_file +from openvino.runtime import save_model +from openvino.runtime import layout_helpers + +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor + +# libva related: +from openvino._pyopenvino import VAContext +from openvino._pyopenvino import VASurfaceTensor + +# Set version for openvino package +from openvino.runtime import get_version +__version__ = get_version() + +# Tools +try: + # Model Conversion API - ovc should reside in the main namespace + from openvino.tools.ovc import convert_model +except ImportError: + pass diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp new file mode 100644 index 0000000000..7485a503ec --- /dev/null +++ b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "pybind11/pybind11.h" +#include + +struct GenerationConfig { + bool do_sample; + +}; + +namespace py = pybind11; + + +PYBIND11_MODULE(py_continuous_batching, m) { + py::class_(m, "GenerationConfig") + .def(py::init<>()) + .def_readwrite("do_sample", &GenerationConfig::do_sample); + +} diff --git a/third-party-programs.txt b/third-party-programs.txt new file mode 100644 index 0000000000..60d40abdd0 --- /dev/null +++ b/third-party-programs.txt @@ -0,0 +1 @@ +TODO: do I need it? 
\ No newline at end of file From 15492c4a0e73b230908a3a337146b2f7f848a109 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Wed, 8 May 2024 13:37:20 +0400 Subject: [PATCH 38/97] add __init__.py --- .../generate_pipeline/python-bindings/openvino/genai/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 005d3fb765af9ec8bb5abc6d34c6041477203132 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 8 May 2024 11:52:40 +0200 Subject: [PATCH 39/97] removed set_streamer --- .../cpp/generate_pipeline/chat_sample.cpp | 32 ++------ .../include/llm_pipeline.hpp | 15 ++-- .../src/generation_config.cpp | 2 +- .../src/generation_config_helper.hpp | 2 +- .../generate_pipeline/src/llm_pipeline.cpp | 80 +++++++++++++------ .../causal_lm/generate_tests/test_greedy.py | 2 +- 6 files changed, 76 insertions(+), 57 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index ec5d23a894..95e7ddb341 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -4,61 +4,41 @@ #include #include "llm_pipeline.hpp" -std::string generate_chat_prompt(const ov::LLMPipeline& pipe, std::string& input, bool use_chat_template = true) { - if (use_chat_template) - return pipe.apply_chat_template(input); - - std::stringstream result_prompt; - // result_prompt << "user\n" << input << "\nmodel"; // Gemma-7b-it - // result_prompt << "[INST] " << input << " [/INST]"; // LLama-2-7b - result_prompt << "<|user|>\n" << input << "\n<|assistant|>\n"; // TinyLlama - - return result_prompt.str(); -} std::vector questions = { "1+1=", "what was the previous answer?", "Why is the sky blue?", "4+10=", - // "Who was Alan Turing?", - // "But why did he killed himself?", "What is Intel OpenVINO?", - // "4+10=", - // "sum up all the numeric answers in the current chat session" - // "Why is the sky blue?", - // "Please repeat all the questions I asked you.", "Can you briefly summarize what I asked you about during this session?", }; int main(int argc, char* argv[]) try { - std::string prompt = "table is made of"; - std::string device = "CPU"; // can be replaced with GPU + std::string prompt; + std::string accumulated_str = ""; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, device); + ov::LLMPipeline pipe(model_path, "CPU"); ov::GenerationConfig config = pipe.get_generation_config(); - config.max_new_tokens = 10000; - pipe.set_streamer([](std::string word) { std::cout << word << std::flush; }); + auto streamer = [](std::string word) { std::cout << word << std::flush; }; - std::string accumulated_str = ""; pipe.start_chat(); for (size_t i = 0; i < questions.size(); i++) { + // std::getline(std::cin, prompt); prompt = questions[i]; std::cout << "question:\n"; cout << prompt << endl; - // std::getline(std::cin, prompt); - auto answer_str = pipe(prompt, config); + auto answer_str = pipe(prompt, config, streamer); accumulated_str += answer_str; cout << "\n----------\n"; } pipe.finish_chat(); - } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return 
EXIT_FAILURE; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 43893937a2..3dd799b478 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -16,6 +16,9 @@ class Tokenizer; // forward declaration namespace ov { + +using StreamerVariant = std::variant, std::shared_ptr>; + class EncodedResults { public: std::vector> tokens; @@ -57,14 +60,16 @@ class LLMPipeline { std::string operator()(std::string text, GenerationConfig generation_config); DecodedResults operator()(std::vector text, GenerationConfig generation_config); DecodedResults operator()(std::initializer_list text, GenerationConfig generation_config); - + + // generate with streamers + std::string generate(std::string text, StreamerVariant streamer); + std::string generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer); + std::string operator()(std::string text, StreamerVariant streamer); + std::string operator()(std::string text, GenerationConfig generation_config, StreamerVariant streamer); + GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); - void set_streamer(std::function callback); - void set_streamer(std::shared_ptr streamer); - void set_streamer(); - void start_chat(); void finish_chat(); void reset_state(); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp index 224fac3237..d4ff8ffc74 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp @@ -52,7 +52,7 @@ size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { } } -bool GenerationConfigHelper::is_greedy_sampling() const { +bool GenerationConfigHelper::is_greedy_decoding() const { return !config.do_sample && !is_beam_search() && !is_speculative(); } diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp index c433594c2c..f428829773 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp @@ -24,7 +24,7 @@ class GenerationConfigHelper { // return assistive; // } - bool is_greedy_sampling() const; + bool is_greedy_decoding() const; bool is_beam_search() const; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index dae07fb6fc..2751a6b884 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -5,6 +5,7 @@ #include "llm_pipeline.hpp" #include #include +#include #include "generation_config_helper.hpp" #include "text_callback_streamer.hpp" #include "utils.hpp" @@ -62,14 +63,16 @@ class LLMPipeline::LLMPipelineImpl { EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); + EncodedResults generate(ov::Tensor input_ids, 
ov::Tensor attention_mask, GenerationConfig generation_config, StreamerVariant streamer); std::string apply_chat_template(std::string prompt, std::string role = "user") const; - std::shared_ptr m_streamer; + // std::shared_ptr m_streamer; bool is_chat_conversation = false; - std::string call(std::string text); - std::string call(std::string text, GenerationConfig generation_config); + std::string generate(std::string text); + std::string generate(std::string text, GenerationConfig generation_config); + std::string generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer); DecodedResults generate(std::vector text, GenerationConfig generation_config); }; @@ -180,19 +183,29 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Te return results; } -std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text) { - return call(text, m_generation_config); +std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text) { + return generate(text, m_generation_config); } std::string ov::LLMPipeline::generate(std::string text) { - return m_pimpl->call(text); + return m_pimpl->generate(text); } std::string ov::LLMPipeline::generate(std::string text, GenerationConfig generation_config) { - return m_pimpl->call(text, generation_config); + return m_pimpl->generate(text, generation_config); +} + +std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text, GenerationConfig generation_config) { + std::cout << "WE ARE HEEEEEEEEEEEEEEERE" << std::endl; + StreamerVariant var; + return generate(text, generation_config, var); } -std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationConfig generation_config) { +std::string ov::LLMPipeline::LLMPipelineImpl::generate( + std::string text, + GenerationConfig generation_config, + StreamerVariant streamer +) { if (is_chat_conversation) { text = apply_chat_template(text); } @@ -232,7 +245,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::call(std::string text, GenerationC for (size_t i = 0; i < tmp_attn_mask.size(); i++) attention_mask.data()[i] = tmp_attn_mask.data()[i]; - auto generate_results = generate(input_ids, attention_mask, generation_config); + auto generate_results = generate(input_ids, attention_mask, generation_config, streamer); return m_tokenizer.decode(generate_results.tokens)[0]; } @@ -273,11 +286,27 @@ ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, } ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { + return generate(input_ids, attention_mask, generation_config); +} + +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( + ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig generation_config, + StreamerVariant streamer +) { ov::EncodedResults result; GenerationConfigHelper config_helper = generation_config; + std::shared_ptr streamer_ptr; - if (config_helper.is_greedy_sampling()) { - result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask, generation_config, m_streamer, is_chat_conversation); + if (auto streamer_obj = std::get_if>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if>(&streamer)) { + streamer_ptr = std::make_shared(m_tokenizer, *callback); + } + + if (config_helper.is_greedy_decoding()) { + result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask, generation_config, streamer_ptr, is_chat_conversation); } 
else if (config_helper.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask, generation_config); @@ -307,6 +336,23 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), m_pimpl->m_generation_config); } +std::string ov::LLMPipeline::generate(std::string text, StreamerVariant streamer) { + return ""; +} + +std::string ov::LLMPipeline::generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer) { + return m_pimpl->generate(text, generation_config, streamer); +} + +std::string ov::LLMPipeline::operator()(std::string text, StreamerVariant streamer) { + return ""; +} + +std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig generation_config, StreamerVariant streamer) { + return generate(text, generation_config, streamer); +} + + ov::Tokenizer ov::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } @@ -341,18 +387,6 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr return result_prompt.str(); } -void ov::LLMPipeline::set_streamer(std::function callback) { - m_pimpl->m_streamer = std::make_shared(m_pimpl->m_tokenizer, callback); -} - -void ov::LLMPipeline::set_streamer(std::shared_ptr streamer) { - m_pimpl->m_streamer = streamer; -} - -void ov::LLMPipeline::set_streamer() { - m_pimpl->m_streamer = nullptr; -} - void ov::LLMPipeline::start_chat() { m_pimpl->is_chat_conversation = true; } diff --git a/text_generation/causal_lm/generate_tests/test_greedy.py b/text_generation/causal_lm/generate_tests/test_greedy.py index fda0b36d41..c9c09dbb87 100644 --- a/text_generation/causal_lm/generate_tests/test_greedy.py +++ b/text_generation/causal_lm/generate_tests/test_greedy.py @@ -7,7 +7,7 @@ def test_tiny_llama(): tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - max_new_tokens = 500 + max_new_tokens = 32 prompt = 'table is made of' encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) From cc44bc89da771e3640276b687a615aff2fe2cd56 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 8 May 2024 12:23:37 +0200 Subject: [PATCH 40/97] use std::optional --- .../include/llm_pipeline.hpp | 31 +++--- .../python/py_generate_pipeline.cpp | 7 +- .../generate_pipeline/src/llm_pipeline.cpp | 96 +++++++------------ 3 files changed, 45 insertions(+), 89 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 3dd799b478..5c7990e140 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -9,6 +9,7 @@ #include "llm_tokenizer.hpp" #include "streamer_base.hpp" #include +#include using namespace std; @@ -18,6 +19,7 @@ namespace ov { using StreamerVariant = std::variant, std::shared_ptr>; +using OptionalGenerationConfig = std::optional; class EncodedResults { public: @@ -45,28 +47,19 @@ class LLMPipeline { ~LLMPipeline(); - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask); - EncodedResults generate(ov::Tensor input_ids); - EncodedResults generate(ov::Tensor input_ids, GenerationConfig 
generation_config); - - ov::Tokenizer get_tokenizer(); - - std::string generate(std::string text); - std::string generate(std::string text, GenerationConfig generation_config); - DecodedResults generate(std::vector text, GenerationConfig generation_config); + EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config); + std::string generate(std::string text, OptionalGenerationConfig generation_config); + DecodedResults generate(std::vector text, OptionalGenerationConfig generation_config); - std::string operator()(std::string text); - std::string operator()(std::string text, GenerationConfig generation_config); - DecodedResults operator()(std::vector text, GenerationConfig generation_config); - DecodedResults operator()(std::initializer_list text, GenerationConfig generation_config); + std::string operator()(std::string text, OptionalGenerationConfig generation_config); + DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config); + DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config); // generate with streamers - std::string generate(std::string text, StreamerVariant streamer); - std::string generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer); - std::string operator()(std::string text, StreamerVariant streamer); - std::string operator()(std::string text, GenerationConfig generation_config, StreamerVariant streamer); - + std::string generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); + std::string operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); + + ov::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp index ff4b400e63..09ca8fb2bb 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp @@ -46,13 +46,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) - .def("__call__", py::overload_cast(&ov::LLMPipeline::operator()), "Process single text input") - .def("__call__", py::overload_cast(&ov::LLMPipeline::operator()), "Process text input with specific generation config") - .def("__call__", py::overload_cast, ov::GenerationConfig>(&ov::LLMPipeline::operator()), "Process multiple text inputs with generation config") .def("__call__", &call_with_config) - .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor, GenerationConfig)) &LLMPipeline::generate) - .def("generate", (EncodedResults (LLMPipeline::*)(ov::Tensor, ov::Tensor)) &LLMPipeline::generate) - // Bind other methods similarly .def("get_tokenizer", &LLMPipeline::get_tokenizer) .def("start_chat", &ov::LLMPipeline::start_chat) .def("finish_chat", &ov::LLMPipeline::finish_chat) @@ -71,6 +65,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("decode", py::overload_cast(&ov::Tokenizer::decode), "Decode a tensor of tokens") .def("decode", py::overload_cast>>(&ov::Tokenizer::decode), "Decode multiple lines of 
tokens"); + // Binding for GenerationConfig py::class_(m, "GenerationConfig") .def(py::init<>()) .def(py::init()) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 2751a6b884..ead61b4bb6 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -62,18 +62,18 @@ class LLMPipeline::LLMPipelineImpl { EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config, StreamerVariant streamer); + EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config); + + EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, StreamerVariant streamer); std::string apply_chat_template(std::string prompt, std::string role = "user") const; // std::shared_ptr m_streamer; bool is_chat_conversation = false; - std::string generate(std::string text); - std::string generate(std::string text, GenerationConfig generation_config); - std::string generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer); - DecodedResults generate(std::vector text, GenerationConfig generation_config); + std::string generate(std::string text, OptionalGenerationConfig generation_config); + std::string generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); + DecodedResults generate(std::vector text, OptionalGenerationConfig generation_config); }; @@ -183,19 +183,11 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Te return results; } -std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text) { - return generate(text, m_generation_config); -} - -std::string ov::LLMPipeline::generate(std::string text) { - return m_pimpl->generate(text); -} - -std::string ov::LLMPipeline::generate(std::string text, GenerationConfig generation_config) { +std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config) { return m_pimpl->generate(text, generation_config); } -std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text, GenerationConfig generation_config) { +std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text, OptionalGenerationConfig generation_config) { std::cout << "WE ARE HEEEEEEEEEEEEEEERE" << std::endl; StreamerVariant var; return generate(text, generation_config, var); @@ -203,9 +195,11 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text, Generat std::string ov::LLMPipeline::LLMPipelineImpl::generate( std::string text, - GenerationConfig generation_config, + OptionalGenerationConfig generation_config, StreamerVariant streamer ) { + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + if (is_chat_conversation) { text = apply_chat_template(text); } @@ -214,7 +208,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( // previous prompt generation in chat dialog stops with the end of sentence token, // need to append this token to the current prompt if (is_chat_conversation && kv_cache_len > 0) { - text = generation_config.eos_token + text; + text = config.eos_token + text; } auto [input_ids, attention_mask] = m_tokenizer.encode(text); @@ -245,15 +239,15 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( for (size_t i = 0; i < tmp_attn_mask.size(); i++) attention_mask.data()[i] = tmp_attn_mask.data()[i]; - auto generate_results = generate(input_ids, attention_mask, generation_config, streamer); + auto generate_results = generate(input_ids, attention_mask, config, streamer); return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::generate(std::vector text, GenerationConfig generation_config) { +ov::DecodedResults ov::LLMPipeline::generate(std::vector text, OptionalGenerationConfig generation_config) { return m_pimpl->generate(text, generation_config); } -ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector text, GenerationConfig generation_config) { +ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector text, OptionalGenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(text); auto generate_results = generate(input_ids, attention_mask, generation_config); @@ -261,42 +255,35 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector streamer) { -// return ""; -// } - -ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, GenerationConfig generation_config) { +ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, OptionalGenerationConfig generation_config) { return generate(text, generation_config); } -ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, GenerationConfig generation_config) { +ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, OptionalGenerationConfig generation_config) { return generate(text, generation_config); } -ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { +ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config) { return m_pimpl->generate(input_ids, attention_mask, generation_config); } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config) { +ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config) { return generate(input_ids, attention_mask, generation_config); } ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( ov::Tensor input_ids, - ov::Tensor attention_mask, - GenerationConfig generation_config, + std::optional attention_mask, OptionalGenerationConfig generation_config, StreamerVariant streamer ) { ov::EncodedResults result; - GenerationConfigHelper config_helper = generation_config; + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + GenerationConfigHelper config_helper = config; + std::shared_ptr streamer_ptr; if (auto streamer_obj = std::get_if>(&streamer)) { @@ -305,15 +292,17 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } + auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids); + if (config_helper.is_greedy_decoding()) { - result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask, generation_config, streamer_ptr, is_chat_conversation); + result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); } else if (config_helper.is_beam_search()) { - result = beam_search(m_model_runner, input_ids, attention_mask, generation_config); + result = beam_search(m_model_runner, input_ids, attention_mask_data, config); } else if (config_helper.is_multimomial()) { - result = multinomial_sampling(input_ids, generation_config); + result = multinomial_sampling(input_ids, config); } else { - result = ov::assistive_decoding(m_model_runner, input_ids, attention_mask, generation_config); + result = ov::assistive_decoding(m_model_runner, input_ids, attention_mask_data, config); } if (!is_chat_conversation) @@ -323,32 +312,11 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( return result; } -ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, ov::Tensor attention_mask) { - return generate(input_ids, attention_mask, m_pimpl->m_generation_config); -} - -ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, GenerationConfig sampling_params) { - - return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), sampling_params); -} - -ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids) { - return generate(input_ids, ov::generate_utils::init_attention_mask(input_ids), m_pimpl->m_generation_config); -} - -std::string ov::LLMPipeline::generate(std::string text, StreamerVariant streamer) { - return ""; -} - -std::string ov::LLMPipeline::generate(std::string text, GenerationConfig generation_config, StreamerVariant streamer) { +std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } -std::string ov::LLMPipeline::operator()(std::string text, StreamerVariant streamer) { - return ""; -} - -std::string ov::LLMPipeline::operator()(std::string text, GenerationConfig generation_config, StreamerVariant streamer) { +std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { return generate(text, generation_config, streamer); } From d8cab05e2079870bbd51d97066a84455cd9cc648 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 8 May 2024 12:57:59 +0200 Subject: [PATCH 41/97] started to add Readme docs --- .../causal_lm/cpp/generate_pipeline/README.md | 112 ++++++++++++++++++ .../include/llm_pipeline.hpp | 3 +- .../generate_pipeline/src/llm_pipeline.cpp | 3 + 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 text_generation/causal_lm/cpp/generate_pipeline/README.md diff --git a/text_generation/causal_lm/cpp/generate_pipeline/README.md b/text_generation/causal_lm/cpp/generate_pipeline/README.md new file mode 100644 index 0000000000..0540e19557 --- /dev/null +++ 
b/text_generation/causal_lm/cpp/generate_pipeline/README.md @@ -0,0 +1,112 @@ +# OpenVINO Generate API + +## Usage + +### In C++ + + +```cpp +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + cout << pipe.generate("The Sun is yellow bacause"); +} +``` + +Using Group Beam Search Decoding +```cpp +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + ov::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + cout << pipe.generate("The Sun is yellow bacause", config); +} +``` + +A simplest chat in C++ +``` cpp +int main(int argc, char* argv[]) { + std::string prompt; + + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + + pipe.start_chat(); + for (size_t i = 0; i < questions.size(); i++) { + std::cout << "question:\n"; + std::getline(std::cin, prompt); + + std::cout << pipe(prompt) << std::endl>>; + } + pipe.finish_chat(); +} +``` + +Specifying generation_config to use grouped beam search +``` cpp +int main(int argc, char* argv[]) { + std::string prompt; + + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + + ov::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + auto streamer = [](std::string word) { std::cout << word << std::flush; }; + + pipe.start_chat(); + for (size_t i = 0; i < questions.size(); i++) { + + std::cout << "question:\n"; + cout << prompt << endl; + + auto answer = pipe(prompt, config, streamer); + // no need to print answer, streamer will do that + } + pipe.finish_chat(); +} +``` + +### In Python + + +``` python +pip install openvino-genai +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" +``` + + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + +max_new_tokens = 32 +prompt = 'table is made of' + +encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) +hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) +hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) +print(f'hf_output: {hf_output}') + +import sys +sys.path.append('build-Debug/') +import py_generate_pipeline as genai # set more friendly module name + +pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') +ov_output = pipe(prompt, max_new_tokens=max_new_tokens) +print(f'ov_output: {ov_output}') + +assert hf_output == ov_output + +``` \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index 5c7990e140..a0ae229423 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -46,7 +46,7 @@ class LLMPipeline { LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); ~LLMPipeline(); - + 
EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config); std::string generate(std::string text, OptionalGenerationConfig generation_config); DecodedResults generate(std::vector text, OptionalGenerationConfig generation_config); @@ -58,6 +58,7 @@ class LLMPipeline { // generate with streamers std::string generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); std::string operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); + std::string operator()(std::string text, StreamerVariant streamer); ov::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index ead61b4bb6..6979c08f4d 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -320,6 +320,9 @@ std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConf return generate(text, generation_config, streamer); } +std::string ov::LLMPipeline::operator()(std::string text, StreamerVariant streamer) { + return generate(text, m_pimpl->m_generation_config, streamer); +} ov::Tokenizer ov::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; From 2535394154daef4afa39bce469b9b90221151679 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 8 May 2024 13:07:29 +0200 Subject: [PATCH 42/97] reoder Readme --- .../causal_lm/cpp/generate_pipeline/README.md | 101 +++++++++++------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/README.md b/text_generation/causal_lm/cpp/generate_pipeline/README.md index 0540e19557..13bd408af1 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/README.md +++ b/text_generation/causal_lm/cpp/generate_pipeline/README.md @@ -2,9 +2,73 @@ ## Usage -### In C++ +Firs of all you need to convert your model with optimum-cli +``` sh +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" +pip install openvino-genai +``` + +LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration. 
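Consolidating the pieces above — construction from the exported folder, overriding the default generation config, and streaming — a minimal C++ sketch might look like the following. It assumes the `ov::LLMPipeline`, `ov::GenerationConfig` and lambda-streamer interfaces introduced in these patches; the `llm_pipeline.hpp` include path, the command-line model path and the prompt are illustrative only.

```cpp
#include <iostream>
#include <string>

#include "llm_pipeline.hpp"  // assumed include path for this sketch

int main(int argc, char* argv[]) {
    // Folder produced by `optimum-cli export openvino ...`; the pipeline picks up
    // the model, tokenizer, detokenizer and default generation config from it.
    std::string model_path = argv[1];
    ov::LLMPipeline pipe(model_path, "CPU");

    // Start from the defaults loaded with the model and override only what is needed.
    ov::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 100;

    // Optional streamer: prints each word as soon as it is decoded.
    auto streamer = [](std::string word) { std::cout << word << std::flush; };

    std::cout << pipe.generate("The Sun is yellow because", config, streamer) << std::endl;
}
```

The same call shape is available through `operator()`, so `pipe(prompt, config, streamer)` behaves like `pipe.generate(prompt, config, streamer)`.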
+ +### In Python + +A minimalist example: +```python +import py_generate_pipeline as genai # set more friendly module name +pipe = genai.LLMPipeline(model_path, "CPU") +print(pipe.generate("The Sun is yellow bacause")) +``` + +A simples chat in python: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path) + +config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1} +pipe.set_generation_cofnig(config) + +pipe.start_chat() +while True: +    print('question:') +    prompt = input() + if prompt == 'Stop!': +        break +    print(pipe(prompt)) +pipe.finish_chat() +``` +Test to compare with Huggingface outputs +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") + +max_new_tokens = 32 +prompt = 'table is made of' + +encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) +hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) +hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) +print(f'hf_output: {hf_output}') + +import sys +sys.path.append('build-Debug/') +import py_generate_pipeline as genai # set more friendly module name + +pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') +ov_output = pipe(prompt, max_new_tokens=max_new_tokens) +print(f'ov_output: {ov_output}') + +assert hf_output == ov_output + +``` + + +### In C++ + +Minimalistc example ```cpp int main(int argc, char* argv[]) { std::string model_path = argv[1]; @@ -75,38 +139,3 @@ int main(int argc, char* argv[]) { pipe.finish_chat(); } ``` - -### In Python - - -``` python -pip install openvino-genai -optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" -``` - - -```python -from transformers import AutoTokenizer, AutoModelForCausalLM - -tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") -model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - -max_new_tokens = 32 -prompt = 'table is made of' - -encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) -hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) -hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) -print(f'hf_output: {hf_output}') - -import sys -sys.path.append('build-Debug/') -import py_generate_pipeline as genai # set more friendly module name - -pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') -ov_output = pipe(prompt, max_new_tokens=max_new_tokens) -print(f'ov_output: {ov_output}') - -assert hf_output == ov_output - -``` \ No newline at end of file From 95c1bfbfc76facec8f83cda7d843ed59e4f8b897 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Thu, 9 May 2024 12:56:38 +0400 Subject: [PATCH 43/97] rm generate_pipeline/python --- .../generate_pipeline/python/CMakeLists.txt | 22 ---------------- .../cpp/generate_pipeline/python/__init__.py | 0 .../cpp/generate_pipeline/python/python.cpp | 26 ------------------- 3 files changed, 48 deletions(-) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt delete mode 100644 
text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt b/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt deleted file mode 100644 index b16294c320..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) -project(py_continuous_batching) - -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 -) - -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() - -pybind11_add_module(py_continuous_batching python.cpp) -# TODO: how to link with tokenizers \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp deleted file mode 100644 index 7485a503ec..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python/python.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -#include "pybind11/pybind11.h" -#include - -struct GenerationConfig { - bool do_sample; - -}; - -namespace py = pybind11; - - -PYBIND11_MODULE(py_continuous_batching, m) { - py::class_(m, "GenerationConfig") - .def(py::init<>()) - .def_readwrite("do_sample", &GenerationConfig::do_sample); - -} From 4510f714012754c9f28032976f683b768e36371d Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 9 May 2024 10:09:13 +0200 Subject: [PATCH 44/97] update Readme; cleanup LLMPipeline and add docstring --- .../causal_lm/cpp/generate_pipeline/README.md | 34 +++++++- .../include/generation_config.hpp | 8 +- .../include/llm_pipeline.hpp | 82 ++++++++++++++++--- .../generate_pipeline/src/llm_pipeline.cpp | 70 ++++++---------- .../causal_lm/generate_tests/test_greedy.py | 2 +- 5 files changed, 135 insertions(+), 61 deletions(-) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/README.md b/text_generation/causal_lm/cpp/generate_pipeline/README.md index 13bd408af1..0a0f6010e6 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/README.md +++ b/text_generation/causal_lm/cpp/generate_pipeline/README.md @@ -65,7 +65,6 @@ assert hf_output == ov_output ``` - ### In C++ Minimalistc example @@ -139,3 +138,36 @@ int main(int argc, char* argv[]) { pipe.finish_chat(); } ``` + +Streaming exapmle with lambda function + +``` cpp +int main(int argc, char* argv[]) { + auto streamer = [](std::string word) { std::cout << word << std::flush; }; + + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + cout << pipe.generate("The Sun is yellow bacause", streamer); +} +``` + +Streaming with custom class +``` cpp +#include + +class CustomStreamer: publict StreamerBase { +public: + void put(int64_t token) {/* decode tokens and do process them*/}; + + void end() {/* decode tokens and do process them*/}; +}; + +int main(int argc, 
char* argv[]) { + CustomStreamer custom_streamer; + + std::string model_path = argv[1]; + ov::LLMPipeline pipe(model_path, "CPU"); + cout << pipe.generate("The Sun is yellow bacause", custom_streamer); +} +``` + diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp index b3d6accceb..12af29c656 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp @@ -3,17 +3,15 @@ #pragma once -#include -#include #include #include "llm_tokenizer.hpp" #include -// forward declaration -class Sequence; namespace ov { +enum class StopCriteria { early, heuristic, never }; + class GenerationConfig { public: GenerationConfig() = default; @@ -31,7 +29,7 @@ class GenerationConfig { float length_penalty; size_t m_num_return_sequences; size_t no_repeat_ngram_size; - std::variant early_stopping; + StopCriteria stop_criteria; // Multinomial float temperature; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp index a0ae229423..09e058b703 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp @@ -13,50 +13,110 @@ using namespace std; -class Tokenizer; // forward declaration - namespace ov { - using StreamerVariant = std::variant, std::shared_ptr>; using OptionalGenerationConfig = std::optional; +using OptionalStreamerVariant = std::optional; +/** +* @brief Structure to store resulting batched tokens and scores for each batch sequence +* +* @param tokens sequence of resulting tokens +* @param scores scores for each sequence +*/ class EncodedResults { public: std::vector> tokens; std::vector scores; }; +/** +* @brief Structure to store resulting batched text outputs and scores for each batch +* +* @param texts vector of resulting sequences +* @param scores scores for each sequence +*/ class DecodedResults { public: std::vector texts; std::vector scores; }; +/** +* @brief This class is used for generation with LLMs. + */ class LLMPipeline { public: + /** + * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir + * + * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param device optional device + * @param plugin_config optional plugin_config + */ + LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); + + /** + * @brief Constructs a LLMPipeline when model and tokenizers are in separate dirs + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param tokenizer_path path to the tokenizer + * @param detokenizer_path path to the detokenizer_path + * @param device optional device + * @param plugin_config optional plugin_config + */ LLMPipeline( std::string& model_path, std::string& tokenizer_path, // todo: make possible to specify tokenizers with ov::Model, ov::CompiledModel, etc. - std::string& detokenizer_path, + std::string& detokenizer_path, // todo: do we deen separate detokenizer path? 
std::string device="CPU", const ov::AnyMap& plugin_config={} ); - LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); - ~LLMPipeline(); - - EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config); - std::string generate(std::string text, OptionalGenerationConfig generation_config); - DecodedResults generate(std::vector text, OptionalGenerationConfig generation_config); + + /** + * @brief High level generate for the input with a single prompt which encodes inputs and returns decoded output + * + * @param text input prompt + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return std::string decoded resulting text + */ + std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + + /** + * @brief High level generate for batched prompts which encodes inputs and returns decoded outputs. + * Streamer cannot be used for multibatch inputs. + * + * @param text input prompt + * @param generation_config optional GenerationConfig + * @return DecodedResults a structure with resulting texts & scores + */ + DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. + * + * @param input_ids encoded input prompt tokens + * @param attention_mask optional attention_mask + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + EncodedResults generate(ov::Tensor input_ids, + std::optional attention_mask, + OptionalGenerationConfig generation_config, + OptionalStreamerVariant streamer); std::string operator()(std::string text, OptionalGenerationConfig generation_config); DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config); DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config); // generate with streamers - std::string generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); std::string operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); std::string operator()(std::string text, StreamerVariant streamer); diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp index 6979c08f4d..06b360de28 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp @@ -62,19 +62,13 @@ class LLMPipeline::LLMPipelineImpl { EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); - EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config); - - EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, StreamerVariant streamer); + std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); + EncodedResults 
generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); std::string apply_chat_template(std::string prompt, std::string role = "user") const; - // std::shared_ptr m_streamer; bool is_chat_conversation = false; - - std::string generate(std::string text, OptionalGenerationConfig generation_config); - std::string generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); - DecodedResults generate(std::vector text, OptionalGenerationConfig generation_config); - }; } // namespace ov @@ -183,20 +177,10 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Te return results; } -std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); -} - -std::string ov::LLMPipeline::LLMPipelineImpl::generate(std::string text, OptionalGenerationConfig generation_config) { - std::cout << "WE ARE HEEEEEEEEEEEEEEERE" << std::endl; - StreamerVariant var; - return generate(text, generation_config, var); -} - std::string ov::LLMPipeline::LLMPipelineImpl::generate( std::string text, OptionalGenerationConfig generation_config, - StreamerVariant streamer + OptionalStreamerVariant streamer ) { GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; @@ -243,55 +227,55 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::generate(std::vector text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); +ov::DecodedResults ov::LLMPipeline::generate(std::vector texts, OptionalGenerationConfig generation_config) { + return m_pimpl->generate(texts, generation_config); } -ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector text, OptionalGenerationConfig generation_config) { - auto [input_ids, attention_mask] = m_tokenizer.encode(text); +ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { + auto [input_ids, attention_mask] = m_tokenizer.encode(texts); - auto generate_results = generate(input_ids, attention_mask, generation_config); + auto generate_results = generate(input_ids, attention_mask, generation_config, {}); return {m_tokenizer.decode(generate_results.tokens), generate_results.scores}; } std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config) { - return generate(text, generation_config); + return generate(text, generation_config, {}); } -ov::DecodedResults ov::LLMPipeline::operator()(std::vector text, OptionalGenerationConfig generation_config) { - return generate(text, generation_config); +ov::DecodedResults ov::LLMPipeline::operator()(std::vector texts, OptionalGenerationConfig generation_config) { + return m_pimpl-> generate(texts, generation_config); } ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, OptionalGenerationConfig generation_config) { - return generate(text, generation_config); -} - -ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(input_ids, attention_mask, generation_config); + return m_pimpl->generate(text, generation_config); } -ov::EncodedResults 
ov::LLMPipeline::LLMPipelineImpl::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config) { - return generate(input_ids, attention_mask, generation_config); +ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, + std::optional attention_mask, + OptionalGenerationConfig generation_config, + OptionalStreamerVariant streamer) { + return m_pimpl->generate(input_ids, attention_mask, generation_config, streamer); } ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, - StreamerVariant streamer + OptionalStreamerVariant streamer ) { ov::EncodedResults result; GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; GenerationConfigHelper config_helper = config; std::shared_ptr streamer_ptr; - - if (auto streamer_obj = std::get_if>(&streamer)) { + if (!streamer.has_value()){ + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if>(&*streamer)) { streamer_ptr = *streamer_obj; - } else if (auto callback = std::get_if>(&streamer)) { + } else if (auto callback = std::get_if>(&*streamer)) { streamer_ptr = std::make_shared(m_tokenizer, *callback); } - + auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids); if (config_helper.is_greedy_decoding()) { @@ -312,16 +296,16 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( return result; } -std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { +std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { - return generate(text, generation_config, streamer); + return m_pimpl->generate(text, generation_config, streamer); } std::string ov::LLMPipeline::operator()(std::string text, StreamerVariant streamer) { - return generate(text, m_pimpl->m_generation_config, streamer); + return m_pimpl->generate(text, m_pimpl->m_generation_config, streamer); } ov::Tokenizer ov::LLMPipeline::get_tokenizer() { diff --git a/text_generation/causal_lm/generate_tests/test_greedy.py b/text_generation/causal_lm/generate_tests/test_greedy.py index c9c09dbb87..ae81a0c3b4 100644 --- a/text_generation/causal_lm/generate_tests/test_greedy.py +++ b/text_generation/causal_lm/generate_tests/test_greedy.py @@ -20,7 +20,7 @@ def test_tiny_llama(): import py_generate_pipeline as genai pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') - ov_output = pipe(prompt, max_new_tokens=max_new_tokens) + ov_output = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False) print(f'ov_output: {ov_output}') assert hf_output == ov_output From 507bc49226e21ff5601e07293470a1a387e747a5 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 9 May 2024 15:20:11 +0200 Subject: [PATCH 45/97] refactor folder structure --- CMakeLists.txt | 10 +++++ src/CMakeLists.txt | 6 +++ src/cpp/CMakeLists.txt | 25 ++++++++++++ .../cpp}/include/generation_config.hpp | 0 .../cpp}/include/llm_pipeline.hpp | 0 .../cpp}/include/llm_tokenizer.hpp | 0 .../cpp}/include/streamer_base.hpp | 0 .../cpp}/src/assistive_decoding.cpp | 0 
.../cpp}/src/beam_search_decoding.cpp | 0 .../cpp}/src/generation_config.cpp | 0 .../cpp}/src/generation_config_helper.hpp | 0 .../cpp}/src/greedy_decoding.cpp | 0 .../cpp}/src/llm_pipeline.cpp | 0 .../cpp}/src/llm_tokenizer.cpp | 0 .../cpp}/src/text_callback_streamer.cpp | 0 .../cpp}/src/text_callback_streamer.hpp | 0 .../cpp}/src/utils.cpp | 0 .../cpp}/src/utils.hpp | 0 src/python-bindings/CMakeLists.txt | 15 +++++++ .../python-bindings}/py_generate_pipeline.cpp | 0 .../tests/python_tests}/test_greedy.py | 4 +- text_generation/causal_lm/cpp/CMakeLists.txt | 40 ------------------- 22 files changed, 58 insertions(+), 42 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/cpp/CMakeLists.txt rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/include/generation_config.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/include/llm_pipeline.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/include/llm_tokenizer.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/include/streamer_base.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/assistive_decoding.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/beam_search_decoding.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/generation_config.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/generation_config_helper.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/greedy_decoding.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/llm_pipeline.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/llm_tokenizer.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/text_callback_streamer.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/text_callback_streamer.hpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/utils.cpp (100%) rename {text_generation/causal_lm/cpp/generate_pipeline => src/cpp}/src/utils.hpp (100%) create mode 100644 src/python-bindings/CMakeLists.txt rename {text_generation/causal_lm/cpp/generate_pipeline/python => src/python-bindings}/py_generate_pipeline.cpp (100%) rename {text_generation/causal_lm/generate_tests => src/tests/python_tests}/test_greedy.py (94%) diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..0c55fba075 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.15) +project(openvino_genai) + + +add_subdirectory(src) +add_subdirectory(text_generation/causal_lm/cpp) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..dad9bd54a1 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,6 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +add_subdirectory(python-bindings) +add_subdirectory(cpp) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt new file mode 100644 index 0000000000..bbdea5b1ab --- /dev/null +++ b/src/cpp/CMakeLists.txt @@ -0,0 +1,25 @@ +# Generate Pipeline library + +set(JINJA2CPP_DEPS_MODE internal) + +add_subdirectory(../../thirdparty/openvino_tokenizers/ 
"${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +add_subdirectory(../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") + +# todo: remove hardcodes and make submodule work +# include_directories($ENV{HOME}/opt/jinja2cpp/include) +# add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") +# include_directories(../../../thirdparty/inja/include/Jinja2Cpp) + + +set(TARGET_NAME generate_pipeline_lib) +file(GLOB SOURCE_FILES "src/*.cpp") +add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) +target_include_directories(${TARGET_NAME} PRIVATE ../../text_generation/causal_lm/cpp/) +target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) +target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) +target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +# target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) +set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp b/src/cpp/include/generation_config.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/include/generation_config.hpp rename to src/cpp/include/generation_config.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp b/src/cpp/include/llm_pipeline.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/include/llm_pipeline.hpp rename to src/cpp/include/llm_pipeline.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp b/src/cpp/include/llm_tokenizer.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/include/llm_tokenizer.hpp rename to src/cpp/include/llm_tokenizer.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp b/src/cpp/include/streamer_base.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/include/streamer_base.hpp rename to src/cpp/include/streamer_base.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp b/src/cpp/src/assistive_decoding.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/assistive_decoding.cpp rename to src/cpp/src/assistive_decoding.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/beam_search_decoding.cpp b/src/cpp/src/beam_search_decoding.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/beam_search_decoding.cpp rename to src/cpp/src/beam_search_decoding.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp b/src/cpp/src/generation_config.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/generation_config.cpp rename to src/cpp/src/generation_config.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/generation_config_helper.hpp rename to src/cpp/src/generation_config_helper.hpp diff --git 
a/text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/greedy_decoding.cpp rename to src/cpp/src/greedy_decoding.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/llm_pipeline.cpp rename to src/cpp/src/llm_pipeline.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp b/src/cpp/src/llm_tokenizer.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/llm_tokenizer.cpp rename to src/cpp/src/llm_tokenizer.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.cpp rename to src/cpp/src/text_callback_streamer.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/text_callback_streamer.hpp rename to src/cpp/src/text_callback_streamer.hpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp b/src/cpp/src/utils.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/utils.cpp rename to src/cpp/src/utils.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp b/src/cpp/src/utils.hpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/src/utils.hpp rename to src/cpp/src/utils.hpp diff --git a/src/python-bindings/CMakeLists.txt b/src/python-bindings/CMakeLists.txt new file mode 100644 index 0000000000..a030fed156 --- /dev/null +++ b/src/python-bindings/CMakeLists.txt @@ -0,0 +1,15 @@ +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG v2.12.0 +) + +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) +target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp b/src/python-bindings/py_generate_pipeline.cpp similarity index 100% rename from text_generation/causal_lm/cpp/generate_pipeline/python/py_generate_pipeline.cpp rename to src/python-bindings/py_generate_pipeline.cpp diff --git a/text_generation/causal_lm/generate_tests/test_greedy.py b/src/tests/python_tests/test_greedy.py similarity index 94% rename from text_generation/causal_lm/generate_tests/test_greedy.py rename to src/tests/python_tests/test_greedy.py index ae81a0c3b4..47c37f5bd8 100644 --- a/text_generation/causal_lm/generate_tests/test_greedy.py +++ b/src/tests/python_tests/test_greedy.py @@ -16,9 +16,9 @@ def test_tiny_llama(): print(f'hf_output: {hf_output}') import sys - sys.path.append('build-Debug/') + sys.path.append('build-Debug/src/python-bindings') import py_generate_pipeline as genai - + pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') ov_output = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False) 
print(f'ov_output: {ov_output}') diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 431649e7c1..c659603fe3 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,20 +4,10 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -set(JINJA2CPP_DEPS_MODE internal) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") -add_subdirectory(../../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") - -# todo: remove hardcodes and make submodule work -# include_directories($ENV{HOME}/opt/jinja2cpp/include) -# add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") -# include_directories(../../../thirdparty/inja/include/Jinja2Cpp) - set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) -# todo: remove hardcode target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) @@ -52,20 +42,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) -# Generate Pipeline library -set(TARGET_NAME generate_pipeline_lib) -file(GLOB SOURCE_FILES "generate_pipeline/src/*.cpp") -add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/generate_pipeline/include) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) -target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - set(TARGET_NAME generate_sample) add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) @@ -79,19 +55,3 @@ target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 -) - -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() - -pybind11_add_module(py_generate_pipeline generate_pipeline/python/py_generate_pipeline.cpp) -target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) From af747d4c05b69cbf53d3eae9e7c293a595420dda Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 9 May 2024 17:15:56 +0200 Subject: [PATCH 46/97] cleanup generation_config and ov::Tokenizer --- src/cpp/include/generation_config.hpp | 83 +++- 
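The commit below collapses the separate tokenizer/detokenizer paths into a single ov::Tokenizer argument. A hedged sketch of the resulting construction flow follows; the model directory name is an assumption, and the include paths shown are the pre-move ones (they are relocated under openvino/genai later in the series).

// Sketch: construct the tokenizer explicitly and hand it to the pipeline,
// as enabled by the constructor change in this commit.
#include <iostream>
#include <string>
#include "llm_pipeline.hpp"
#include "llm_tokenizer.hpp"

int main() {
    std::string models_path = "TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
    // openvino_tokenizer.xml and openvino_detokenizer.xml are expected inside models_path.
    ov::Tokenizer tokenizer(models_path, "CPU");
    ov::LLMPipeline pipe(models_path, tokenizer, "CPU");

    auto [input_ids, attention_mask] = tokenizer.encode("What is OpenVINO?");
    std::cout << "prompt is " << input_ids.get_shape()[1] << " tokens long\n";
    std::cout << pipe.generate("What is OpenVINO?", {}, {}) << '\n';
    return 0;
}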
src/cpp/include/llm_pipeline.hpp | 33 +- src/cpp/include/llm_tokenizer.hpp | 55 ++- src/cpp/src/beam_search_decoding.cpp | 14 +- src/cpp/src/generation_config.cpp | 411 +++--------------- src/cpp/src/generation_config_helper.hpp | 9 +- src/cpp/src/greedy_decoding.cpp | 2 - src/cpp/src/llm_pipeline.cpp | 76 ++-- src/cpp/src/llm_tokenizer.cpp | 43 +- src/cpp/src/text_callback_streamer.cpp | 2 +- src/python-bindings/py_generate_pipeline.cpp | 55 ++- .../cpp/generate_pipeline/chat_sample.cpp | 3 +- 12 files changed, 297 insertions(+), 489 deletions(-) diff --git a/src/cpp/include/generation_config.hpp b/src/cpp/include/generation_config.hpp index 12af29c656..9aeb16b299 100644 --- a/src/cpp/include/generation_config.hpp +++ b/src/cpp/include/generation_config.hpp @@ -7,48 +7,89 @@ #include "llm_tokenizer.hpp" #include - namespace ov { +/** + * @brief controls the stopping condition for grouped beam search. The following values are possible: + * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an + * heuristic is applied and the generation stops when is it very unlikely to find better candidates; + */ enum class StopCriteria { early, heuristic, never }; +/** + * @brief structure to keep generation config parameters. + * + * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + * @param max_new_tokens the maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. + * @param ignore_eos if set to true, then generation will not stop even if token is met. + * @param num_beams number of beams for beam search. 1 means no beam search. + * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + * @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a + * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. + * [more datails in this paper](https://arxiv.org/pdf/1610.02424.pdf). + * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while + * `length_penalty` < 0.0 encourages shorter sequences. + * @param num_return_sequences the number of sequences to return for grouped beam search decoding + * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. + * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values: + * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an + * heuristic is applied and the generation stops when is it very unlikely to find better candidates; + * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). 
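To make the beam-search fields documented above concrete (the sampling fields continue just below), a hedged configuration sketch; make_beam_search_config is an illustrative name, and the divisibility requirement is the one asserted in beam_search_decoding.cpp further down.

#include "generation_config.hpp"

// Hedged sketch: grouped beam search with 6 beams split into 3 diversity groups.
ov::GenerationConfig make_beam_search_config() {
    ov::GenerationConfig config;
    config.max_new_tokens = 64;
    config.num_beams = 6;             // total beams; must be divisible by num_beam_groups
    config.num_beam_groups = 3;       // 6 / 3 = 2 beams per group
    config.diversity_penalty = 1.5f;  // only has an effect when num_beam_groups > 1
    config.length_penalty = 1.0f;
    config.num_return_sequences = 3;  // how many finished sequences are returned
    config.stop_criteria = ov::StopCriteria::heuristic;
    return config;
}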
+ * @param temperature the value used to modulate token probabilities for random sampling + * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities + * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. + * @param do_sample whether or not to use multinomial random sampling + * that add up to `top_p` or higher are kept. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * [more datails in this paper](https://arxiv.org/pdf/1909.05858.pdf). + * @param pad_token_id id of padding token + * @param bos_token_id id of token + * @param eos_token_id id of token + * @param bos_token token string representation + * @param eos_token token string representation + * @param draft_model draft model for assitive decoding + */ class GenerationConfig { public: GenerationConfig() = default; GenerationConfig(std::string json_path); // Generic - size_t max_new_tokens; - size_t max_length; - bool ignore_eos; + size_t max_new_tokens = SIZE_MAX; + size_t max_length = SIZE_MAX; + bool ignore_eos = false; // Beam search specific - size_t num_groups; - size_t group_size; - float diversity_penalty; - float length_penalty; - size_t m_num_return_sequences; - size_t no_repeat_ngram_size; - StopCriteria stop_criteria; + size_t num_beam_groups = 1; + size_t num_beams = 1; + float diversity_penalty = 1.0f; + float length_penalty = 1.0f; + size_t num_return_sequences = 1; + size_t no_repeat_ngram_size = std::numeric_limits::max(); + StopCriteria stop_criteria = StopCriteria::heuristic; // Multinomial - float temperature; - float top_p; - size_t top_k; - bool do_sample; - float repetition_penalty; + float temperature = 0.0f; + float top_p = 1.0f; + int top_k = -1; + bool do_sample = false; + float repetition_penalty = 1.0f; // special tokens - int64_t bos_token_id; - int64_t eos_token_id; - int64_t pad_token_id; + int64_t pad_token_id = 0; + int64_t bos_token_id = 1; + int64_t eos_token_id = 2; // used for chat scenario - std::string eos_token; - std::string bos_token; + std::string bos_token = ""; + std::string eos_token = ""; // speculative sampling std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, }; + } // namespace ov diff --git a/src/cpp/include/llm_pipeline.hpp b/src/cpp/include/llm_pipeline.hpp index 09e058b703..8057599b91 100644 --- a/src/cpp/include/llm_pipeline.hpp +++ b/src/cpp/include/llm_pipeline.hpp @@ -15,7 +15,7 @@ using namespace std; namespace ov { -using StreamerVariant = std::variant, std::shared_ptr>; +using StreamerVariant = std::variant, std::shared_ptr>; using OptionalGenerationConfig = std::optional; using OptionalStreamerVariant = std::optional; @@ -49,7 +49,7 @@ class DecodedResults { class LLMPipeline { public: /** - * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir + * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir. * * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device @@ -58,20 +58,18 @@ class LLMPipeline { LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); /** - * @brief Constructs a LLMPipeline when model and tokenizers are in separate dirs + * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. 
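The in-class defaults above are overridden by whatever generation_config.json provides; a hedged sketch of loading and then adjusting a config in place (the JSON path is an assumption, and max_new_tokens taking priority over max_length matches get_max_new_tokens further down).

#include "generation_config.hpp"

int main() {
    // Fields present in the JSON override the defaults; absent fields keep them (e.g. num_beams = 1).
    ov::GenerationConfig config("TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/generation_config.json");
    config.max_new_tokens = 100;  // takes priority over max_length when both are set
    config.do_sample = true;      // switch to multinomial sampling
    config.temperature = 0.8f;
    config.top_k = 50;
    return 0;
}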
* * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json - * @param tokenizer_path path to the tokenizer - * @param detokenizer_path path to the detokenizer_path + * @param tokenizer manually initialized ov::Tokenizer * @param device optional device * @param plugin_config optional plugin_config */ LLMPipeline( - std::string& model_path, - std::string& tokenizer_path, // todo: make possible to specify tokenizers with ov::Model, ov::CompiledModel, etc. - std::string& detokenizer_path, // todo: do we deen separate detokenizer path? - std::string device="CPU", - const ov::AnyMap& plugin_config={} + const std::string model_path, + const ov::Tokenizer& tokenizer, + const std::string device="CPU", + const ov::AnyMap& plugin_config = {} ); ~LLMPipeline(); @@ -85,6 +83,16 @@ class LLMPipeline { * @return std::string decoded resulting text */ std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + + + template + util::EnableIfAllStringAny generate( + std::string text, + Properties&&... properties) { + return generate(text, AnyMap{std::forward(properties)...}); + } + + std::string generate(std::string text, const ov::AnyMap& config); /** * @brief High level generate for batched prompts which encodes inputs and returns decoded outputs. @@ -133,4 +141,9 @@ class LLMPipeline { std::unique_ptr m_pimpl; }; +static constexpr ov::Property max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property temperature{"temperature"}; +static constexpr ov::Property> streamer_lambda{"streamer_lambda"}; +static constexpr ov::Property> streamer{"streamer"}; + } // namespace ov diff --git a/src/cpp/include/llm_tokenizer.hpp b/src/cpp/include/llm_tokenizer.hpp index 2033e3053e..34dda24e60 100644 --- a/src/cpp/include/llm_tokenizer.hpp +++ b/src/cpp/include/llm_tokenizer.hpp @@ -8,24 +8,59 @@ #include namespace ov { - + +/** +* @brief class used to encode prompts and decode resulting tokens +*/ class Tokenizer { public: - Tokenizer() = default; - ~Tokenizer(); - Tokenizer(std::string& tokenizers_path, std::string device="CPU"); - Tokenizer(std::string& tokenizer_path, std::string& detokenizer_path, std::string device="CPU"); - + /** + * @brief ov::Tokenizer constructor. + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + * @param device device. Currently only 'CPU' is supported + */ + Tokenizer(const std::string tokenizers_path, const std::string device="CPU"); + + /** + * @brief encode a single prompt + * @return pair of [input_ids, attention_mask] + */ std::pair encode(std::string prompt); + + /** + * @brief encode batch of prompts. Left padding will be applied by default + * @param prompts vector storing batch of prompts + * @return pair of [input_ids, attention_mask] + */ std::pair encode(std::vector prompts); - std::pair encode(std::initializer_list text); + std::pair encode(std::initializer_list prompts); + /** + * @brief decode sequence of tokens + * @param tokens vector storing tokens + * @return sequence string + */ std::string decode(std::vector tokens); + + /** + * @brief decode tokens. 
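The ov::Property shortcuts declared at the bottom of llm_pipeline.hpp enable a keyword-style generate() call; a hedged sketch is given below. print_word is an illustrative callback, and the properties are packed into an ov::AnyMap and converted back by anymap_to_generation_config later in this commit.

#include <functional>
#include <iostream>
#include <string>
#include "llm_pipeline.hpp"

int main() {
    std::string models_path = "TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
    ov::LLMPipeline pipe(models_path, "CPU");

    std::function<void(std::string)> print_word = [](std::string word) {
        std::cout << word << std::flush;  // stream decoded words as they arrive
    };
    pipe.generate("Tell me a short story.",
                  ov::max_new_tokens(128),
                  ov::streamer_lambda(print_word));
    return 0;
}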
+ * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @return vector of std::string, with size = batch_size + */ std::vector decode(ov::Tensor tokens); - std::vector decode(std::vector> lines); - int64_t m_eos_token = 2; // todo: read from rt_info - int64_t m_bos_token = 1; // todo: read from rt_info + /** + * @brief batched decoding of tokens. + * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size + * @return vector of std::string, with size equal to batch_size + */ + std::vector decode(std::vector> tokens); + + int64_t m_bos_token_id = 1; // todo: read from rt_info + int64_t m_eos_token_id = 2; // todo: read from rt_info + + Tokenizer() = default; + ~Tokenizer(); private: class TokenizerImpl; std::shared_ptr m_pimpl; diff --git a/src/cpp/src/beam_search_decoding.cpp b/src/cpp/src/beam_search_decoding.cpp index 19851acebb..b855ee9936 100644 --- a/src/cpp/src/beam_search_decoding.cpp +++ b/src/cpp/src/beam_search_decoding.cpp @@ -7,8 +7,8 @@ namespace ov { -EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params) { - GenerationConfigHelper config_helper = sampling_params; +EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config) { + GenerationConfigHelper config_helper = config; ov::Shape prompts_shape = prompts.get_shape(); size_t batch_size = prompts_shape[0]; @@ -34,9 +34,11 @@ EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, o // todo: remove this duplication and use the same SamplingParameters for both greedy and beam Parameters parameters{{std::vector{prompt_data, prompt_data + prompts.get_size()}}}; - parameters.n_groups = sampling_params.num_groups; - parameters.diversity_penalty = sampling_params.diversity_penalty; - parameters.group_size = sampling_params.group_size; + parameters.n_groups = config.num_beam_groups; + parameters.diversity_penalty = config.diversity_penalty; + parameters.group_size = config.num_beams / config.num_beam_groups; + OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); + GroupBeamSearcher group_beam_searcher{parameters}; std::vector next_tokens; @@ -78,7 +80,7 @@ EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, o std::sort(beams.begin(), beams.end(), compare_scores); ov::EncodedResults results; - for (auto beam = beams.begin(); beam != beams.begin() + sampling_params.m_num_return_sequences; ++beam) { + for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { // todo: convert to string results.scores.emplace_back(beam->score); results.tokens.emplace_back(beam->tokens); diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index d4ff8ffc74..fd49438b88 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -1,13 +1,9 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include #include #include -// #include // used only for StopCriteria #include -// #include "llm_tokenizer.hpp" #include "generation_config.hpp" #include "generation_config_helper.hpp" @@ -19,49 +15,85 @@ GenerationConfig::GenerationConfig(std::string json_path) { nlohmann::json data = nlohmann::json::parse(f); - bos_token_id = data.value("bos_token_id", 0); - eos_token_id = data.value("eos_token_id", 0); - 
eos_token = data.value("eos_token", ""); - - pad_token_id = data.value("pad_token_id", 0); - m_num_return_sequences = data.value("num_return_sequences", 1); - - max_new_tokens = data.value("max_new_tokens", SIZE_MAX); - max_length = data.value("max_length", SIZE_MAX); - - temperature = data.value("temperature", 0.0f); - do_sample = data.value("do_sample", false); - top_p = data.value("top_p", 0.0f); - - // beam_search_params - num_groups = data.value("num_beam_groups", 1); - diversity_penalty = data.value("diversity_penalty", 1.0f); - int num_beams = data.value("num_beams", 1); - group_size = num_beams / num_groups; - OPENVINO_ASSERT(num_beams % num_groups == 0, "number of beams should be divisible by number of groups"); + if (data.contains("max_new_tokens")) max_new_tokens = data["max_new_tokens"]; + if (data.contains("max_length")) max_length = data["max_length"]; + // note that ignore_eos is not present in HF GenerationConfig + if (data.contains("num_beam_groups")) num_beam_groups = data["num_beam_groups"]; + if (data.contains("num_beams")) num_beams = data["num_beams"]; + if (data.contains("diversity_penalty")) diversity_penalty = data["diversity_penalty"]; + if (data.contains("length_penalty")) length_penalty = data["length_penalty"]; + if (data.contains("num_return_sequences")) num_return_sequences = data["num_return_sequences"]; + if (data.contains("no_repeat_ngram_size")) no_repeat_ngram_size = data["no_repeat_ngram_size"]; + // stop_criteria will be processed below + if (data.contains("temperature")) temperature = data["temperature"]; + if (data.contains("top_p")) top_p = data["top_p"]; + if (data.contains("top_k")) top_k = data["top_k"]; + if (data.contains("do_sample")) do_sample = data["do_sample"]; + if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"]; + if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"]; + if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"]; + if (data.contains("eos_token_id")) eos_token_id = data["eos_token_id"]; + if (data.contains("bos_token")) bos_token = data["bos_token"]; + if (data.contains("eos_token")) eos_token = data["eos_token"]; + + if (data.contains("early_stopping")) { + auto field_type = data["early_stopping"].type(); + if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { + stop_criteria = StopCriteria::never; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { + stop_criteria = StopCriteria::early; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { + stop_criteria = StopCriteria::heuristic; + } + } } +GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::AnyMap& config_map) { + GenerationConfig config = m_config; + + if (config_map.count("max_new_tokens")) config.max_new_tokens = config_map.at("max_new_tokens").as(); + if (config_map.count("max_length")) config.max_length = config_map.at("max_length").as(); + if (config_map.count("ignore_eos")) config.ignore_eos = config_map.at("ignore_eos").as(); + if (config_map.count("num_beam_groups")) config.num_beam_groups = config_map.at("num_beam_groups").as(); + if (config_map.count("num_beams")) config.num_beams = config_map.at("num_beams").as(); + if (config_map.count("diversity_penalty")) config.diversity_penalty = config_map.at("diversity_penalty").as(); + if (config_map.count("length_penalty")) config.length_penalty = config_map.at("length_penalty").as(); + if 
(config_map.count("num_return_sequences")) config.num_return_sequences = config_map.at("num_return_sequences").as(); + if (config_map.count("no_repeat_ngram_size")) config.no_repeat_ngram_size = config_map.at("no_repeat_ngram_size").as(); + if (config_map.count("stop_criteria")) config.stop_criteria = config_map.at("stop_criteria").as(); + if (config_map.count("temperature")) config.temperature = config_map.at("temperature").as(); + if (config_map.count("top_p")) config.top_p = config_map.at("top_p").as(); + if (config_map.count("top_k")) config.top_k = config_map.at("top_k").as(); + if (config_map.count("do_sample")) config.do_sample = config_map.at("do_sample").as(); + if (config_map.count("repetition_penalty")) config.repetition_penalty = config_map.at("repetition_penalty").as(); + if (config_map.count("pad_token_id")) config.pad_token_id = config_map.at("pad_token_id").as(); + if (config_map.count("bos_token_id")) config.bos_token_id = config_map.at("bos_token_id").as(); + if (config_map.count("eos_token_id")) config.eos_token_id = config_map.at("eos_token_id").as(); + if (config_map.count("bos_token")) config.bos_token = config_map.at("bos_token").as(); + if (config_map.count("eos_token")) config.eos_token = config_map.at("eos_token").as(); + + return config; +} size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { - // max_new_tokens has priority over max_length, - // only if max_new_tokens was not specified use max_length - if (config.max_new_tokens != SIZE_MAX) { - return config.max_new_tokens; + // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length + if (m_config.max_new_tokens != SIZE_MAX) { + return m_config.max_new_tokens; } else { - return config.max_length - prompt_length; + return m_config.max_length - prompt_length; } } bool GenerationConfigHelper::is_greedy_decoding() const { - return !config.do_sample && !is_beam_search() && !is_speculative(); + return !m_config.do_sample && !is_beam_search() && !is_speculative(); } bool GenerationConfigHelper::is_beam_search() const { - return config.num_groups * config.group_size > 1; + return m_config.num_beams > 1; } bool GenerationConfigHelper::is_multimomial() const { - return config.do_sample; + return m_config.do_sample; } bool GenerationConfigHelper::is_speculative() const { @@ -80,317 +112,4 @@ ov::InferRequest GenerationConfigHelper::get_assistant_model(std::string device, } } -} // namespace ov - - - -// // forward declaration -// class Sequence; - -// // forward declaration -// namespace ov { -// class LLMPipeline; -// } - -// namespace { - -// // TODO: LEAVE ONLY ONE PLACE FOR DEFAULT VALUES -// static const ov::AnyMap default_generation_config_map = { -// // Generic -// {"max_new_tokens", SIZE_MAX}, -// {"max_length", SIZE_MAX}, -// {"m_ignore_eos", false}, -// {"m_bos_token", ""}, -// {"m_eos_token", ""}, - -// // Beam search specific -// {"m_num_groups", 1}, -// {"m_group_size", 1}, -// {"m_diversity_penalty", 1.0f}, // 0.0 means no diversity -// {"m_num_return_sequences", 1}, // is used by beam search, in other case is equal to batch size -// // {"stop_criteria", StopCriteria::heuristic}, // todo: align with the latest beam searcher - -// {"m_repetition_penalty", 1.0f}, -// {"m_length_penalty", 1.0f}, -// {"m_no_repeat_ngram_size", std::numeric_limits::max()}, -// {"early_finish", [](const Sequence&) {return false; }}, - -// // Multinomial -// {"m_temperature", 0.0f}, -// {"m_top_k", -1}, -// {"m_top_p", 1.0f}, -// {"m_do_sample", false}, - -// // 
special tokens -// {"m_bos_token_id", 0}, -// {"m_eos_token_id", 2}, // todo: check form where it's better to extract from rt_info or from tokenizer_config.json -// {"m_pad_token_id", 0}, - -// // assistive decoding -// {"m_assistant_model", ov::InferRequest()}, -// {"m_num_assistant_tokens", 5}, -// {"m_seq_len_axis", 2}, -// }; - -// } - -// namespace ov { -// size_t get_max_new_tokens(size_t prompt_length = 0) { -// // max_new_tokens has priority over max_length, -// // only if m_max_new_tokens was not specified use max_length -// if (m_max_new_tokens != SIZE_MAX) { -// return m_max_new_tokens; -// } else { -// return m_max_length - prompt_length; -// } -// } - -// void max_new_tokens(size_t max_new_tokens) { -// const auto& r = ::default_generation_config_map.find("sdf") != ::default_generation_config_map.end(); - -// m_max_new_tokens = max_new_tokens; -// } - -// void max_length(size_t max_length) { -// m_max_length = max_length; -// } - -// void ignore_eos(bool ignore_eos) { -// m_ignore_eos = ignore_eos; -// } - -// void eos_token(std::string eos_token) { -// m_eos_token = eos_token; -// } - -// void num_return_sequences(size_t num_return_sequences) { -// m_num_return_sequences = num_return_sequences; -// } - -// void num_groups(size_t num_groups) { -// m_num_groups = num_groups; -// } - -// void group_size(size_t group_size) { -// m_group_size = group_size; -// } - -// void diversity_penalty(float diversity_penalty) { -// m_diversity_penalty = diversity_penalty; -// } - -// void length_penalty(float length_penalty) { -// m_length_penalty = length_penalty; -// } - -// void no_repeat_ngram_size(size_t no_repeat_ngram_size) { -// m_no_repeat_ngram_size = no_repeat_ngram_size; -// } - -// void temperature(float temperature) { -// m_temperature = temperature; -// } - -// void top_k(size_t top_k) { -// m_top_k = top_k; -// } - -// void top_p(size_t top_p) { -// m_top_p = top_p; -// } - -// void do_sample(bool do_sample) { -// m_do_sample = do_sample; -// } - -// void repetition_penalty(float repetition_penalty) { -// m_repetition_penalty = repetition_penalty; -// } - -// void bos_token_id(int64_t bos_token_id) { -// m_bos_token_id = bos_token_id; -// } - -// void eos_token_id(int64_t eos_token_id) { -// m_eos_token_id = eos_token_id; -// } - -// void pad_token_id(int64_t pad_token_id) { -// m_pad_token_id = pad_token_id; -// } - -// GenerationConfig() = default; - -// GenerationConfig(std::string json_path) { -// std::ifstream f(json_path); -// OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); - -// nlohmann::json data = nlohmann::json::parse(f); - -// m_bos_token_id = data.value("bos_token_id", 0); -// m_eos_token_id = data.value("eos_token_id", 0); -// m_eos_token = data.value("eos_token", ""); - -// m_pad_token_id = data.value("pad_token_id", 0); -// m_num_return_sequences = data.value("num_return_sequences", 1); - -// m_max_new_tokens = data.value("max_new_tokens", SIZE_MAX); -// m_max_length = data.value("max_length", SIZE_MAX); - -// m_temperature = data.value("temperature", 0.0f); -// m_do_sample = data.value("do_sample", false); -// m_top_p = data.value("top_p", 0.0f); - -// // beam_search_params -// m_num_groups = data.value("num_beam_groups", 1); -// m_diversity_penalty = data.value("diversity_penalty", 1.0f); -// int num_beams = data.value("num_beams", 1); -// m_group_size = num_beams / m_num_groups; -// OPENVINO_ASSERT(num_beams % m_num_groups == 0, "number of beams should be divisible by number of groups"); -// } - - -// static 
GenerationConfig greedy() { -// GenerationConfig greedy_params; -// greedy_params.m_temperature = 0.0f; -// greedy_params.m_ignore_eos = true; -// return greedy_params; -// } - -// static GenerationConfig beam_search() { -// GenerationConfig beam_search; -// beam_search.m_num_groups = 3; -// beam_search.m_group_size = 5; -// beam_search.m_max_new_tokens = 10; -// beam_search.m_diversity_penalty = 2.0f; -// return beam_search; -// } - -// static GenerationConfig multimomial() { -// GenerationConfig multimomial; -// multimomial.m_temperature = 0.8f; -// multimomial.m_top_p = 0.8; -// multimomial.m_top_k = 20; -// multimomial.m_do_sample = 20; -// return multimomial; -// } - -// template -// static GenerationConfig assistive_decoding(T& assistant_model) { -// GenerationConfig assistive; -// assistive.assistant_model(assistant_model); -// return assistive; -// } - -// bool is_greedy_sampling() const { -// return !m_do_sample && !is_beam_search() && !is_speculative(); -// } - -// bool is_beam_search() const { -// return m_num_groups * m_group_size > 1; -// } - -// bool is_multimomial() const { -// return m_do_sample; -// } - -// // for speculative decoding -// void assistant_model(const ov::InferRequest& assistant_model) { -// m_assistant_model = assistant_model; -// is_assistant_request_defined = true; -// } - -// void assistant_model(ov::CompiledModel& assistant_model) { -// m_assistant_model = assistant_model.create_infer_request(); -// is_assistant_request_defined = true; -// } - -// void assistant_model(const std::shared_ptr& assistant_model) { -// m_assistant_ov_model = assistant_model; -// is_assistant_ov_defined = true; -// } - -// void assistant_model(std::string assistant_model) { -// auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; -// if (!is_xml(assistant_model)) -// assistant_model += "/openvino_model.xml"; - -// m_assistant_ov_model = ov::Core().read_model(assistant_model); -// is_assistant_ov_defined = true; -// } - -// void set_streamer(std::function&&, ov::LLMPipeline&)> callback) { -// m_callback = callback; -// } - -// ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}) { -// if (is_assistant_request_defined) { -// return m_assistant_model; -// } else if (is_assistant_ov_defined) { -// m_assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); -// is_assistant_request_defined = true; -// return m_assistant_model; -// } else { -// OPENVINO_THROW("assistant model is not specified"); -// } -// } - -// void num_assistant_tokens(int64_t num_assistant_tokens) { -// m_num_assistant_tokens = num_assistant_tokens; -// } - -// bool is_speculative() const { -// return is_assistant_ov_defined || is_assistant_request_defined; -// } - -// // for Assistive/Speculative decoding -// ov::InferRequest m_assistant_model; -// size_t m_num_assistant_tokens = 5; -// size_t m_seq_len_axis = 2; - -// static GenerationConfig anymap_to_generation_config(const ov::AnyMap& genereation_config_map = {}) { -// // need to load default values and update only those keys that are specified in genereation_config_map -// auto tmp_map = default_generation_config_map; - -// for (auto it = genereation_config_map.begin(); it != genereation_config_map.end(); ++it) { -// tmp_map[it->first] = it->second; -// } - -// GenerationConfig config; - -// // general arguments -// config.m_max_new_tokens = tmp_map.at("m_max_new_tokens").as(); -// config.m_max_length = 
tmp_map.at("m_max_length").as(); -// config.m_ignore_eos = tmp_map.at("m_ignore_eos").as(); -// config.m_eos_token = tmp_map.at("m_eos_token").as(); - -// // Beam search specific -// config.m_num_groups = tmp_map.at("m_num_groups").as(); -// config.m_group_size = tmp_map.at("m_group_size").as(); -// config.m_diversity_penalty = tmp_map.at("m_diversity_penalty").as(); -// config.m_num_return_sequences = tmp_map.at("m_num_return_sequences").as(); - -// config.m_repetition_penalty = tmp_map.at("m_repetition_penalty").as(); -// config.m_length_penalty = tmp_map.at("m_length_penalty").as(); -// config.m_no_repeat_ngram_size = tmp_map.at("m_no_repeat_ngram_size").as(); -// config.early_finish = tmp_map.at("early_finish").as>(); - -// // Multinomial -// config.m_temperature = tmp_map.at("m_temperature").as(); -// config.m_top_k = tmp_map.at("m_top_k").as(); -// config.m_top_p = tmp_map.at("m_top_p").as(); -// config.m_do_sample = tmp_map.at("m_do_sample").as(); - -// // special tokens -// config.m_bos_token_id = tmp_map.at("m_bos_token_id").as(); -// config.m_eos_token_id = tmp_map.at("m_eos_token_id").as(); -// config.m_pad_token_id = tmp_map.at("m_pad_token_id").as(); -// return config; -// } -// } - -// private: -// std::shared_ptr m_assistant_ov_model; -// bool is_assistant_request_defined = false; -// bool is_assistant_ov_defined = false; -// }; +} // namespace ov diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp index f428829773..d0722c9b10 100644 --- a/src/cpp/src/generation_config_helper.hpp +++ b/src/cpp/src/generation_config_helper.hpp @@ -9,11 +9,11 @@ namespace ov { class GenerationConfigHelper { public: - GenerationConfig config; + GenerationConfig m_config; GenerationConfigHelper() = default; - GenerationConfigHelper(const GenerationConfig& config): config(config) {}; + GenerationConfigHelper(const GenerationConfig& config): m_config(config) {}; size_t get_max_new_tokens(size_t prompt_length = 0); @@ -68,6 +68,9 @@ class GenerationConfigHelper { ov::InferRequest assistant_model; size_t num_assistant_tokens = 5; size_t seq_len_axis = 2; + + GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); + private: std::shared_ptr m_assistant_ov_model; @@ -75,4 +78,4 @@ class GenerationConfigHelper { bool is_assistant_ov_defined = false; }; -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index d3ff0108ba..56cd010208 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -81,9 +81,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, std::fill(results.scores.begin(), results.scores.end(), 0); if (is_chat_conversation && kv_cache_len > 0) { - // m_attentions_mask_cache extent with attention_mask; auto attentions_mask_history = m_model_runner.get_tensor("attention_mask"); - // print_tensor(m_attentions_mask_cache); size_t new_prompt_len = attention_mask.get_shape()[1]; size_t context_len = attentions_mask_history.get_shape()[1]; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 06b360de28..5b6087fd85 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -38,37 +38,25 @@ class LLMPipeline::LLMPipelineImpl { GenerationConfig m_generation_config; std::string m_device; ov::AnyMap m_plugin_config; - ov::Tensor m_attentions_mask_cache; std::string m_chat_template = ""; - - // TODO: add constructor for specifying manually tokenizer 
path - // dir path - // xml file path - // compiled model - // infer request - // ov::Model - + bool is_chat_conversation = false; + LLMPipelineImpl( - std::string& model_path, - std::string& tokenizer_path, - std::string& detokenizer_path, - std::string device="CPU", - const ov::AnyMap& plugin_config={} + const std::string model_path, + const ov::Tokenizer& tokenizer, + const std::string device, + const ov::AnyMap& plugin_config ); - LLMPipelineImpl(std::string& path, std::string device="CPU", const ov::AnyMap& config={}); + LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config); GenerationConfig generation_config() const; - EncodedResults multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config); - std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); std::string apply_chat_template(std::string prompt, std::string role = "user") const; - - bool is_chat_conversation = false; }; } // namespace ov @@ -107,24 +95,20 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& } ov::LLMPipeline::LLMPipeline( - std::string& model_path, - std::string& tokenizer_path, - std::string& detokenizer_path, - std::string device, + const std::string model_path, + const ov::Tokenizer& tokenizer, + const std::string device, const ov::AnyMap& plugin_config ) { - m_pimpl = make_unique(model_path, tokenizer_path, detokenizer_path, device, plugin_config); + m_pimpl = make_unique(model_path, tokenizer, device, plugin_config); } ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( - std::string& model_path, - std::string& tokenizer_path, - std::string& detokenizer_path, + const std::string model_path, + const ov::Tokenizer& tokenizer, std::string device, const ov::AnyMap& plugin_config -) { - m_device = device; - m_plugin_config = plugin_config; +): m_tokenizer(tokenizer), m_device(device), m_plugin_config(plugin_config) { ov::Core core; auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; @@ -132,9 +116,11 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( std::string full_path = model_path; if (!is_xml(full_path)) full_path += "/openvino_model.xml"; - m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); - - // todo: add loading Tokenizers from separate folders + try { + m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); + } catch (...) 
{ + OPENVINO_THROW("Cannot compile_model from path " + full_path); + } } ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { @@ -171,12 +157,6 @@ ov::GenerationConfig ov::LLMPipeline::get_generation_config() const { return m_pimpl->generation_config(); } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::multinomial_sampling(ov::Tensor prompts, GenerationConfig generation_config) { - // todo: implement - ov::EncodedResults results; - return results; -} - std::string ov::LLMPipeline::LLMPipelineImpl::generate( std::string text, OptionalGenerationConfig generation_config, @@ -284,7 +264,8 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( result = beam_search(m_model_runner, input_ids, attention_mask_data, config); } else if (config_helper.is_multimomial()) { - result = multinomial_sampling(input_ids, config); + // todo: implement multinomial sampling + // result = multinomial_sampling(input_ids, config); } else { result = ov::assistive_decoding(m_model_runner, input_ids, attention_mask_data, config); } @@ -300,6 +281,21 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig return m_pimpl->generate(text, generation_config, streamer); } + +std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { + StreamerVariant streamer = {}; + auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + + // todo: get attentions from properties? + if (config_map.count("streamer_lambda")) { + streamer = config_map.at("streamer_lambda").as>(); + } else if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); + } + + return m_pimpl->generate(text, config, streamer); +} + std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } diff --git a/src/cpp/src/llm_tokenizer.cpp b/src/cpp/src/llm_tokenizer.cpp index d9facf508f..4d3b599b22 100644 --- a/src/cpp/src/llm_tokenizer.cpp +++ b/src/cpp/src/llm_tokenizer.cpp @@ -15,21 +15,18 @@ class Tokenizer::TokenizerImpl { ov::InferRequest m_detokenizer_request; std::string m_device; + TokenizerImpl() = default; - TokenizerImpl(std::string& tokenizers_path, std::string device); - TokenizerImpl(std::string& tokenizer_path, std::string& detokenizer_path, std::string device); - std::pair encode(std::string prompt); + TokenizerImpl(const std::string tokenizers_path, const std::string device); + std::pair encode(std::string prompt); std::pair encode(std::vector prompts); - std::string decode(std::vector tokens); - std::vector decode(ov::Tensor tokens); - std::vector decode(std::vector> lines); }; -Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizers_path, std::string device): m_device(device) { +Tokenizer::TokenizerImpl::TokenizerImpl(const std::string tokenizers_path, const std::string device): m_device(device) { ov::Core core; auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; @@ -37,35 +34,19 @@ Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizers_path, std::strin if (is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not to xml file"); - // todo: add loading EOS_TOKEN_ID from IR // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt core.add_extension(OPENVINO_TOKENIZERS_PATH); - // tokenizer and detokenizer work on CPU only - - 
m_tokenize_request = core.compile_model(tokenizers_path + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + try { + m_tokenize_request = core.compile_model(tokenizers_path + "/openvino_tokenizer.xml", device).create_infer_request(); + m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", device).create_infer_request(); + } catch (...) { + OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. Please check that " + "openvino_tokenizer.xml and openvino_detokenizer.xml exit in \"" + tokenizers_path + "\""); + } // todo: read eos, bos here } -Tokenizer::TokenizerImpl::TokenizerImpl(std::string& tokenizer_path, std::string& detokenizer_path, std::string device): m_device(device) { - ov::Core core; - - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - if (!is_xml(tokenizer_path)) - OPENVINO_THROW("tokenizers_path should be a path to a xml file"); - if (!is_xml(detokenizer_path)) - OPENVINO_THROW("detokenizer_path should be a path to a xml file"); - - // todo: add loading EOS_TOKEN_ID from IR - // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - core.add_extension(OPENVINO_TOKENIZERS_PATH); - // tokenizer and detokenizer work on CPU only - - m_tokenize_request = core.compile_model(tokenizer_path, "CPU").create_infer_request(); - m_detokenizer_request = core.compile_model(detokenizer_path, "CPU").create_infer_request(); -} - -Tokenizer::Tokenizer(std::string& tokenizers_path, std::string device) { +Tokenizer::Tokenizer(const std::string tokenizers_path, const std::string device) { m_pimpl = std::make_shared(tokenizers_path, device); } diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 53830c7b90..3192520eee 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -18,7 +18,7 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin void TextCallbackStreamer::put(int64_t token) { std::stringstream res; // do not print anything and flush cache if EOS token is met - if (token == m_tokenizer.m_eos_token) { + if (token == m_tokenizer.m_eos_token_id) { end(); return; } diff --git a/src/python-bindings/py_generate_pipeline.cpp b/src/python-bindings/py_generate_pipeline.cpp index 09ca8fb2bb..189717c587 100644 --- a/src/python-bindings/py_generate_pipeline.cpp +++ b/src/python-bindings/py_generate_pipeline.cpp @@ -9,30 +9,48 @@ namespace py = pybind11; using namespace ov; +void str_to_stop_criteria(ov::GenerationConfig& config, const std::string& stop_criteria_str){ + if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; + else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; + else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; + else OPENVINO_THROW(stop_criteria_str + " is incorrect value of stop_criteria. " + "Allowed values are: \"early\", \"never\", \"heuristic\". 
"); +} + +std::string stop_criteria_to_str(const ov::GenerationConfig& config) { + switch (config.stop_criteria) { + case ov::StopCriteria::early: return "early"; + case ov::StopCriteria::heuristic: return "heuristic"; + case ov::StopCriteria::never: return "never"; + default: throw std::runtime_error("Incorrect stop_criteria"); + } +} + std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { // Create a new GenerationConfig instance and initialize from kwargs ov::GenerationConfig config = pipeline.get_generation_config(); if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); - if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); - if (kwargs.contains("num_groups")) config.num_groups = kwargs["num_groups"].cast(); - if (kwargs.contains("group_size")) config.group_size = kwargs["group_size"].cast(); + if (kwargs.contains("num_beam_groups")) config.num_beam_groups = kwargs["num_beam_groups"].cast(); + if (kwargs.contains("num_beams")) config.num_beams = kwargs["num_beams"].cast(); if (kwargs.contains("diversity_penalty")) config.diversity_penalty = kwargs["diversity_penalty"].cast(); - if (kwargs.contains("repetition_penalty")) config.repetition_penalty = kwargs["repetition_penalty"].cast(); if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); - + if (kwargs.contains("num_return_sequences")) config.num_return_sequences = kwargs["num_return_sequences"].cast(); if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); + if (kwargs.contains("stop_criteria")) str_to_stop_criteria(config, kwargs["stop_criteria"].cast()); if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); - if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); + if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); if (kwargs.contains("do_sample")) config.do_sample = kwargs["do_sample"].cast(); + if (kwargs.contains("repetition_penalty")) config.repetition_penalty = kwargs["repetition_penalty"].cast(); + if (kwargs.contains("pad_token_id")) config.pad_token_id = kwargs["pad_token_id"].cast(); if (kwargs.contains("bos_token_id")) config.bos_token_id = kwargs["bos_token_id"].cast(); if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); - if (kwargs.contains("pad_token_id")) config.pad_token_id = kwargs["pad_token_id"].cast(); + if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); + if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast(); if (kwargs.contains("draft_model")) config.draft_model = kwargs["draft_model"].cast>(); - // Call the LLMPipeline with the constructed GenerationConfig return pipeline(text, config); } @@ -41,9 +59,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "LLMPipeline") - .def(py::init(), - py::arg("model_path"), py::arg("tokenizer_path"), py::arg("detokenizer_path"), - py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) + .def(py::init(), + py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), 
py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) .def("__call__", &call_with_config) @@ -72,23 +89,25 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("max_new_tokens", &ov::GenerationConfig::max_new_tokens) .def_readwrite("max_length", &ov::GenerationConfig::max_length) .def_readwrite("ignore_eos", &ov::GenerationConfig::ignore_eos) - .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) - .def_readwrite("num_groups", &ov::GenerationConfig::num_groups) - .def_readwrite("group_size", &ov::GenerationConfig::group_size) + .def_readwrite("num_beam_groups", &ov::GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &ov::GenerationConfig::num_beams) .def_readwrite("diversity_penalty", &ov::GenerationConfig::diversity_penalty) - .def_readwrite("repetition_penalty", &ov::GenerationConfig::repetition_penalty) .def_readwrite("length_penalty", &ov::GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &ov::GenerationConfig::num_return_sequences) .def_readwrite("no_repeat_ngram_size", &ov::GenerationConfig::no_repeat_ngram_size) + .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) .def_readwrite("temperature", &ov::GenerationConfig::temperature) - .def_readwrite("top_k", &ov::GenerationConfig::top_k) .def_readwrite("top_p", &ov::GenerationConfig::top_p) + .def_readwrite("top_k", &ov::GenerationConfig::top_k) .def_readwrite("do_sample", &ov::GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &ov::GenerationConfig::repetition_penalty) + .def_readwrite("pad_token_id", &ov::GenerationConfig::pad_token_id) .def_readwrite("bos_token_id", &ov::GenerationConfig::bos_token_id) .def_readwrite("eos_token_id", &ov::GenerationConfig::eos_token_id) - .def_readwrite("pad_token_id", &ov::GenerationConfig::pad_token_id) + .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) + .def_readwrite("bos_token", &ov::GenerationConfig::bos_token) .def_readwrite("draft_model", &ov::GenerationConfig::draft_model); - py::class_(m, "DecodedResults") .def(py::init<>()) .def_readwrite("texts", &ov::DecodedResults::texts) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index 95e7ddb341..d9e28e8850 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -33,7 +33,8 @@ int main(int argc, char* argv[]) try { std::cout << "question:\n"; cout << prompt << endl; - auto answer_str = pipe(prompt, config, streamer); + // auto answer_str = pipe(prompt, config, streamer); + auto answer_str = pipe.generate(prompt, ov::max_new_tokens(10000), ov::streamer_lambda(streamer)); accumulated_str += answer_str; cout << "\n----------\n"; From c6620d91c0b8a52f00b2e58b086a28025b36f8a9 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 10 May 2024 11:52:16 +0200 Subject: [PATCH 47/97] move includes to a separate openvino/genai folder --- src/cpp/include/{ => openvino/genai}/generation_config.hpp | 2 +- src/cpp/include/{ => openvino/genai}/llm_pipeline.hpp | 6 +++--- src/cpp/include/{ => openvino/genai}/streamer_base.hpp | 2 +- .../{llm_tokenizer.hpp => openvino/genai/tokenizer.hpp} | 0 src/cpp/src/assistive_decoding.cpp | 2 +- src/cpp/src/beam_search_decoding.cpp | 2 +- src/cpp/src/generation_config.cpp | 2 +- src/cpp/src/generation_config_helper.hpp | 2 +- src/cpp/src/greedy_decoding.cpp | 2 +- src/cpp/src/llm_pipeline.cpp | 2 +- 
src/cpp/src/text_callback_streamer.hpp | 4 ++-- src/cpp/src/{llm_tokenizer.cpp => tokenizer.cpp} | 2 +- src/python-bindings/py_generate_pipeline.cpp | 2 +- .../causal_lm/cpp/generate_pipeline/chat_sample.cpp | 2 +- .../causal_lm/cpp/generate_pipeline/generate_sample.cpp | 2 +- 15 files changed, 17 insertions(+), 17 deletions(-) rename src/cpp/include/{ => openvino/genai}/generation_config.hpp (99%) rename src/cpp/include/{ => openvino/genai}/llm_pipeline.hpp (97%) rename src/cpp/include/{ => openvino/genai}/streamer_base.hpp (85%) rename src/cpp/include/{llm_tokenizer.hpp => openvino/genai/tokenizer.hpp} (100%) rename src/cpp/src/{llm_tokenizer.cpp => tokenizer.cpp} (99%) diff --git a/src/cpp/include/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp similarity index 99% rename from src/cpp/include/generation_config.hpp rename to src/cpp/include/openvino/genai/generation_config.hpp index 9aeb16b299..639d52a925 100644 --- a/src/cpp/include/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -4,7 +4,7 @@ #pragma once #include -#include "llm_tokenizer.hpp" +#include "openvino/genai/tokenizer.hpp" #include namespace ov { diff --git a/src/cpp/include/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp similarity index 97% rename from src/cpp/include/llm_pipeline.hpp rename to src/cpp/include/openvino/genai/llm_pipeline.hpp index 8057599b91..2ba4b6a13b 100644 --- a/src/cpp/include/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -5,9 +5,9 @@ #include #include -#include "generation_config.hpp" -#include "llm_tokenizer.hpp" -#include "streamer_base.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" #include #include diff --git a/src/cpp/include/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp similarity index 85% rename from src/cpp/include/streamer_base.hpp rename to src/cpp/include/openvino/genai/streamer_base.hpp index dd1ce71b08..6bc298cad6 100644 --- a/src/cpp/include/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "llm_tokenizer.hpp" +#include "openvino/genai/tokenizer.hpp" namespace ov { diff --git a/src/cpp/include/llm_tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp similarity index 100% rename from src/cpp/include/llm_tokenizer.hpp rename to src/cpp/include/openvino/genai/tokenizer.hpp diff --git a/src/cpp/src/assistive_decoding.cpp b/src/cpp/src/assistive_decoding.cpp index 3e893e053c..f852e4346c 100644 --- a/src/cpp/src/assistive_decoding.cpp +++ b/src/cpp/src/assistive_decoding.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "generation_config_helper.hpp" -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); diff --git a/src/cpp/src/beam_search_decoding.cpp b/src/cpp/src/beam_search_decoding.cpp index b855ee9936..0b80c47e53 100644 --- a/src/cpp/src/beam_search_decoding.cpp +++ b/src/cpp/src/beam_search_decoding.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "generation_config_helper.hpp" -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" #include "group_beam_searcher.hpp" namespace ov { diff --git 
a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index fd49438b88..2f2312e7a3 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -4,7 +4,7 @@ #include #include #include -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" #include "generation_config_helper.hpp" namespace ov { diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp index d0722c9b10..d3d97dde12 100644 --- a/src/cpp/src/generation_config_helper.hpp +++ b/src/cpp/src/generation_config_helper.hpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" namespace ov { diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 56cd010208..7ea134d736 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "generation_config_helper.hpp" -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" namespace { diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5b6087fd85..de984efb44 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" #include #include #include diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index 9f0bab68dd..f3d8773fb4 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "streamer_base.hpp" -#include "llm_tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" namespace ov { diff --git a/src/cpp/src/llm_tokenizer.cpp b/src/cpp/src/tokenizer.cpp similarity index 99% rename from src/cpp/src/llm_tokenizer.cpp rename to src/cpp/src/tokenizer.cpp index 4d3b599b22..2bd0c4adc1 100644 --- a/src/cpp/src/llm_tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "llm_tokenizer.hpp" +#include "openvino/genai/tokenizer.hpp" #include std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); diff --git a/src/python-bindings/py_generate_pipeline.cpp b/src/python-bindings/py_generate_pipeline.cpp index 189717c587..cf9cd64c5f 100644 --- a/src/python-bindings/py_generate_pipeline.cpp +++ b/src/python-bindings/py_generate_pipeline.cpp @@ -4,7 +4,7 @@ #include #include #include -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" namespace py = pybind11; using namespace ov; diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index d9e28e8850..c7460dd337 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" std::vector questions = { diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index 77a3cd41ce..f46b8bc682 100644 --- 
a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // #include -#include "llm_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" // The following reasons require TextStreamer to keep a cache of previous tokens: From be843457384e9c5c254a0efab75f45bda0272e2d Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 16:35:58 +0400 Subject: [PATCH 48/97] align names --- pyproject.toml | 6 +++--- src/cpp/CMakeLists.txt | 1 + src/python-bindings/CMakeLists.txt | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 71c85319bd..bdad02b7f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ select = ["C", "E", "F", "I", "W"] lines-after-imports = 2 [tool.scikit-build] -install.components = ["openvino_genai_bindings_install_target"] +install.components = ["py_generate_pipeline_install_target"] cmake.source-dir = "text_generation/causal_lm/cpp" cmake.build-type = "Release" cmake.args = [ @@ -79,9 +79,9 @@ cmake.args = [ # "-DCMAKE_INSTALL_BINDIR=lib", # "-DCMAKE_INSTALL_LIBDIR=lib" ] -cmake.targets = ["openvino_genai_bindings"] +cmake.targets = ["py_generate_pipeline"] wheel.build-tag = "000" -wheel.packages = ["text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino"] +wheel.packages = ["src/python-bindings/openvino"] wheel.install-dir = "openvino/genai" wheel.py-api = "py3" wheel.license-files = ["LICENSE", "SECURITY.md"] # TODO: Do we need third-party-programs.txt like openvino_tokenizers? diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index bbdea5b1ab..5276aac100 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nl # add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") # include_directories(../../../thirdparty/inja/include/Jinja2Cpp) +set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) set(TARGET_NAME generate_pipeline_lib) file(GLOB SOURCE_FILES "src/*.cpp") diff --git a/src/python-bindings/CMakeLists.txt b/src/python-bindings/CMakeLists.txt index a030fed156..9c9b8e9b3f 100644 --- a/src/python-bindings/CMakeLists.txt +++ b/src/python-bindings/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + include(FetchContent) FetchContent_Declare( pybind11 @@ -13,3 +16,4 @@ endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) +install(TARGETS py_generate_pipeline LIBRARY DESTINATION . 
COMPONENT py_generate_pipeline_install_target) From bced64a8963dd26904a189680e634ee1e0e8eb47 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 16:37:11 +0400 Subject: [PATCH 49/97] Dont modify text_generation/causal_lm/cpp/CMakeLists.txt --- text_generation/causal_lm/cpp/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 731d19ba51..c659603fe3 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -55,5 +55,3 @@ target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_subdirectory(generate_pipeline/python-bindings) From f4e82b6fd957c126ee36f89cfb85c4eb001ead06 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 16:38:01 +0400 Subject: [PATCH 50/97] rm -r text_generation/causal_lm/cpp/generate_pipeline/python-bindings/ --- .../python-bindings/CMakeLists.txt | 20 ------ .../python-bindings/openvino/__init__.py | 70 ------------------- .../openvino/genai/__init__.py | 0 .../openvino_genai_bindings.cpp | 26 ------- 4 files changed, 116 deletions(-) delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py delete mode 100644 text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt deleted file mode 100644 index 41dabe43b5..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 -) - -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() - -pybind11_add_module(openvino_genai_bindings openvino_genai_bindings.cpp) -target_link_libraries(openvino_genai_bindings PRIVATE generate_pipeline_lib) -# TODO: how to link with tokenizers and openvino -install(TARGETS openvino_genai_bindings LIBRARY DESTINATION . 
COMPONENT openvino_genai_bindings_install_target) diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py deleted file mode 100644 index 24a0ee92ec..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) - -# Required for Windows OS platforms -# Note: always top-level -try: - from openvino.utils import _add_openvino_libs_to_search_path - _add_openvino_libs_to_search_path() -except ImportError: - pass - -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. -# # -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as properties - -# Import most important classes and functions from openvino.runtime -from openvino.runtime import Model -from openvino.runtime import Core -from openvino.runtime import CompiledModel -from openvino.runtime import InferRequest -from openvino.runtime import AsyncInferQueue - -from openvino.runtime import Symbol -from openvino.runtime import Dimension -from openvino.runtime import Strides -from openvino.runtime import PartialShape -from openvino.runtime import Shape -from openvino.runtime import Layout -from openvino.runtime import Type -from openvino.runtime import Tensor -from openvino.runtime import OVAny - -from openvino.runtime import compile_model -from openvino.runtime import get_batch -from openvino.runtime import set_batch -from openvino.runtime import serialize -from openvino.runtime import shutdown -from openvino.runtime import tensor_from_file -from openvino.runtime import save_model -from openvino.runtime import layout_helpers - -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor - -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor - -# Set version for openvino package -from openvino.runtime import get_version -__version__ = get_version() - -# Tools -try: - # Model Conversion API - ovc should reside in the main namespace - from openvino.tools.ovc import convert_model -except ImportError: - pass diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino/genai/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp b/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp deleted file mode 100644 index 7485a503ec..0000000000 --- a/text_generation/causal_lm/cpp/generate_pipeline/python-bindings/openvino_genai_bindings.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -#include "pybind11/pybind11.h" -#include - -struct GenerationConfig { - bool do_sample; - -}; - -namespace 
py = pybind11; - - -PYBIND11_MODULE(py_continuous_batching, m) { - py::class_(m, "GenerationConfig") - .def(py::init<>()) - .def_readwrite("do_sample", &GenerationConfig::do_sample); - -} From 5b2b0cad4c2ae3913f0546a3bf78eafc92e78479 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 16:53:16 +0400 Subject: [PATCH 51/97] fix build --- pyproject.toml | 2 +- src/python-bindings/CMakeLists.txt | 2 ++ text_generation/causal_lm/cpp/CMakeLists.txt | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bdad02b7f2..35ac1f51d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ cmake.targets = ["py_generate_pipeline"] wheel.build-tag = "000" wheel.packages = ["src/python-bindings/openvino"] wheel.install-dir = "openvino/genai" -wheel.py-api = "py3" +wheel.py-api = "" wheel.license-files = ["LICENSE", "SECURITY.md"] # TODO: Do we need third-party-programs.txt like openvino_tokenizers? sdist.exclude = ["dist", "tests", "examples", "python/tests"] sdist.cmake = true diff --git a/src/python-bindings/CMakeLists.txt b/src/python-bindings/CMakeLists.txt index 9c9b8e9b3f..673a172587 100644 --- a/src/python-bindings/CMakeLists.txt +++ b/src/python-bindings/CMakeLists.txt @@ -15,5 +15,7 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) +set_target_properties(py_generate_pipeline PROPERTIES CXX_STANDARD 17) +set_target_properties(py_generate_pipeline PROPERTIES CXX_STANDARD_REQUIRED ON) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT py_generate_pipeline_install_target) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index c659603fe3..25215e9fdd 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -6,6 +6,8 @@ project(causal_lm) set(CMAKE_POSITION_INDEPENDENT_CODE ON) +add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") + set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") From 0dd8f59583bb61c46e1224afe557c0b22d1a54aa Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 16:58:18 +0400 Subject: [PATCH 52/97] add tokenizers only once --- CMakeLists.txt | 1 + src/cpp/CMakeLists.txt | 1 - text_generation/causal_lm/cpp/CMakeLists.txt | 2 -- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c55fba075..0caa8c0315 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ cmake_minimum_required(VERSION 3.15) project(openvino_genai) +add_subdirectory(./thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(src) add_subdirectory(text_generation/causal_lm/cpp) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 5276aac100..0715e91b8e 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -2,7 +2,6 @@ set(JINJA2CPP_DEPS_MODE internal) -add_subdirectory(../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") # todo: remove hardcodes and make submodule work diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt 
b/text_generation/causal_lm/cpp/CMakeLists.txt index 25215e9fdd..c659603fe3 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -6,8 +6,6 @@ project(causal_lm) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") - set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") From 23638ff00548b156552bd7beff47fb9b342c849d Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 17:03:13 +0400 Subject: [PATCH 53/97] change cmake.source-dir --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 35ac1f51d6..478bb22194 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ lines-after-imports = 2 [tool.scikit-build] install.components = ["py_generate_pipeline_install_target"] -cmake.source-dir = "text_generation/causal_lm/cpp" +cmake.source-dir = "./" cmake.build-type = "Release" cmake.args = [ "-DBUILD_SHARED_LIBS=NO" From d8c534994416e5cb0fd554e5dd83f8870dae4d46 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 17:08:38 +0400 Subject: [PATCH 54/97] restore openvino/genai inits --- src/python-bindings/openvino/__init__.py | 70 +++++++++++++++++++ .../openvino/genai/__init__.py | 0 2 files changed, 70 insertions(+) create mode 100644 src/python-bindings/openvino/__init__.py create mode 100644 src/python-bindings/openvino/genai/__init__.py diff --git a/src/python-bindings/openvino/__init__.py b/src/python-bindings/openvino/__init__.py new file mode 100644 index 0000000000..24a0ee92ec --- /dev/null +++ b/src/python-bindings/openvino/__init__.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) + +# Required for Windows OS platforms +# Note: always top-level +try: + from openvino.utils import _add_openvino_libs_to_search_path + _add_openvino_libs_to_search_path() +except ImportError: + pass + +# # +# # OpenVINO API +# # This __init__.py forces checking of runtime modules to propagate errors. +# # It is not compared with init files from openvino-dev package. 
+# # +# Import all public modules +from openvino import runtime as runtime +from openvino import frontend as frontend +from openvino import helpers as helpers +from openvino import preprocess as preprocess +from openvino import utils as utils +from openvino import properties as properties + +# Import most important classes and functions from openvino.runtime +from openvino.runtime import Model +from openvino.runtime import Core +from openvino.runtime import CompiledModel +from openvino.runtime import InferRequest +from openvino.runtime import AsyncInferQueue + +from openvino.runtime import Symbol +from openvino.runtime import Dimension +from openvino.runtime import Strides +from openvino.runtime import PartialShape +from openvino.runtime import Shape +from openvino.runtime import Layout +from openvino.runtime import Type +from openvino.runtime import Tensor +from openvino.runtime import OVAny + +from openvino.runtime import compile_model +from openvino.runtime import get_batch +from openvino.runtime import set_batch +from openvino.runtime import serialize +from openvino.runtime import shutdown +from openvino.runtime import tensor_from_file +from openvino.runtime import save_model +from openvino.runtime import layout_helpers + +from openvino._pyopenvino import RemoteContext +from openvino._pyopenvino import RemoteTensor + +# libva related: +from openvino._pyopenvino import VAContext +from openvino._pyopenvino import VASurfaceTensor + +# Set version for openvino package +from openvino.runtime import get_version +__version__ = get_version() + +# Tools +try: + # Model Conversion API - ovc should reside in the main namespace + from openvino.tools.ovc import convert_model +except ImportError: + pass diff --git a/src/python-bindings/openvino/genai/__init__.py b/src/python-bindings/openvino/genai/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 24faefe3c5b57cb96113ed83dc38da56eb4353be Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 10 May 2024 17:42:26 +0400 Subject: [PATCH 55/97] Integrate JinjaCpp --- CMakeLists.txt | 4 +- src/CMakeLists.txt | 4 +- src/cpp/CMakeLists.txt | 67 ++++++++++++++----- src/cpp/src/llm_pipeline.cpp | 40 +++++------ .../CMakeLists.txt | 4 ++ .../py_generate_pipeline.cpp | 0 thirdparty/Jinja2Cpp | 1 - thirdparty/nlohmann_json | 1 - thirdparty/openvino_tokenizers | 2 +- 9 files changed, 76 insertions(+), 47 deletions(-) rename src/{python-bindings => python}/CMakeLists.txt (84%) rename src/{python-bindings => python}/py_generate_pipeline.cpp (100%) delete mode 160000 thirdparty/Jinja2Cpp delete mode 160000 thirdparty/nlohmann_json diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c55fba075..1627347ade 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,10 @@ -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # cmake_minimum_required(VERSION 3.15) -project(openvino_genai) +project(openvino_genai) add_subdirectory(src) add_subdirectory(text_generation/causal_lm/cpp) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dad9bd54a1..99d80180a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -add_subdirectory(python-bindings) add_subdirectory(cpp) +add_subdirectory(python) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index bbdea5b1ab..247db07bdc 100644 --- a/src/cpp/CMakeLists.txt +++ 
b/src/cpp/CMakeLists.txt @@ -1,25 +1,60 @@ -# Generate Pipeline library +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# -set(JINJA2CPP_DEPS_MODE internal) +# Dependencies + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + +function(ov_genai_build_jinja2cpp) + FetchContent_Declare(jinja2cpp + URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/a5d002cbf44469775556daea14ba3ccdba1e365a.tar.gz + URL_HASH SHA256=5aa5378d9acf3c44dfb607fd7f16f48b17ffa6495c219957901e9191ffe28900) + + FetchContent_GetProperties(jinja2cpp) + if(NOT jinja2cpp_POPULATED) + FetchContent_Populate(jinja2cpp) + + set(BUILD_SHARED_LIBS OFF) + set(JINJA2CPP_INSTALL OFF CACHE BOOL "") + set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "") + set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "") + set(JINJA2CPP_USE_REGEX "std" CACHE STRING "") + set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "") + set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "") + set(JINJA2CPP_PIC ON CACHE BOOL "") + + add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) + endif() +endfunction() + +ov_genai_build_jinja2cpp() add_subdirectory(../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") -add_subdirectory(../../thirdparty/nlohmann_json/ "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json/") -# todo: remove hardcodes and make submodule work -# include_directories($ENV{HOME}/opt/jinja2cpp/include) -# add_subdirectory(../../../thirdparty/Jinja2Cpp/ "${CMAKE_CURRENT_BINARY_DIR}/Jinja2Cpp/") -# include_directories(../../../thirdparty/inja/include/Jinja2Cpp) +find_package(OpenVINO REQUIRED COMPONENTS Runtime) + +# Library +file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") set(TARGET_NAME generate_pipeline_lib) -file(GLOB SOURCE_FILES "src/*.cpp") add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) -target_include_directories(${TARGET_NAME} PRIVATE ../../text_generation/causal_lm/cpp/) -target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) -target_link_libraries(${TARGET_NAME} PUBLIC nlohmann_json::nlohmann_json) + +target_include_directories(${TARGET_NAME} + # TODO: remove it, because beam_search algo should not be exposed to end users + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../text_generation/causal_lm/cpp/ + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) + target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -# target_link_libraries(${TARGET_NAME} PRIVATE $ENV{HOME}/opt/jinja2cpp/lib/static/libjinja2cpp.a) # todo: remove hardcode -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) + +set_target_properties(${TARGET_NAME} PROPERTIES + CXX_STANDARD_REQUIRED ON + CXX_STANDARD 17) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index de984efb44..e9eb7ea0d8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -11,9 +11,9 @@ #include "utils.hpp" #include -// #include -// #include -// #include "generation_config.hpp" +#include 
+#include +#include "openvino/genai/generation_config.hpp" namespace ov { @@ -313,29 +313,21 @@ std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string } std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { - // todo: temporary disable for easier and faster build - // jinja2::TemplateEnv env; - // env.GetSettings().lstripBlocks = true; - // env.GetSettings().trimBlocks = true; - // jinja2::Template tpl(&env); - // tpl.Load(m_chat_template); + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); - // jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; - // jinja2::ValuesMap params = { - // {"messages", jinja2::ValuesList({message})}, - // {"bos_token", ""}, - // {"eos_token", ""}, // todo: load from config - // {"add_generation_prompt", true}, - // }; + jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + jinja2::ValuesMap params = { + {"messages", jinja2::ValuesList({message})}, + {"bos_token", ""}, + {"eos_token", ""}, // todo: load from config + {"add_generation_prompt", true}, + }; - // return tpl.RenderAsString(params).value(); - - std::stringstream result_prompt; - result_prompt << "<|user|>\n" << prompt << "\n<|assistant|>\n"; // hardcode template for TinyLlama - // result_prompt << "user\n" << prompt << "\nmodel"; // Gemma-7b-it - // result_prompt << "[INST] " << input << " [/INST]"; // LLama-2-7b - - return result_prompt.str(); + return tpl.RenderAsString(params).value(); } void ov::LLMPipeline::start_chat() { diff --git a/src/python-bindings/CMakeLists.txt b/src/python/CMakeLists.txt similarity index 84% rename from src/python-bindings/CMakeLists.txt rename to src/python/CMakeLists.txt index a030fed156..d951d25d25 100644 --- a/src/python-bindings/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -1,3 +1,7 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + include(FetchContent) FetchContent_Declare( pybind11 diff --git a/src/python-bindings/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp similarity index 100% rename from src/python-bindings/py_generate_pipeline.cpp rename to src/python/py_generate_pipeline.cpp diff --git a/thirdparty/Jinja2Cpp b/thirdparty/Jinja2Cpp deleted file mode 160000 index a853f8e9f7..0000000000 --- a/thirdparty/Jinja2Cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a853f8e9f784de53b11973a47af0b20b0167f6f3 diff --git a/thirdparty/nlohmann_json b/thirdparty/nlohmann_json deleted file mode 160000 index 199dea11b1..0000000000 --- a/thirdparty/nlohmann_json +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 199dea11b17c533721b26249e2dcaee6ca1d51d3 diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 0e4bb32ca3..c754503462 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 0e4bb32ca3412f589e1d094faa8b0aad19ee47ca +Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1 From 598dda3db198b46b58e4f1476e2b0e87c8c03a37 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 18:05:05 +0400 Subject: [PATCH 56/97] install genai lib --- pyproject.toml | 4 ++-- src/cpp/CMakeLists.txt | 2 ++ src/python-bindings/openvino/genai/__init__.py | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 478bb22194..7e0a940dfa 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -71,7 +71,7 @@ select = ["C", "E", "F", "I", "W"] lines-after-imports = 2 [tool.scikit-build] -install.components = ["py_generate_pipeline_install_target"] +install.components = ["py_generate_pipeline_install_target", "generate_pipeline_lib_install_target"] cmake.source-dir = "./" cmake.build-type = "Release" cmake.args = [ @@ -79,7 +79,7 @@ cmake.args = [ # "-DCMAKE_INSTALL_BINDIR=lib", # "-DCMAKE_INSTALL_LIBDIR=lib" ] -cmake.targets = ["py_generate_pipeline"] +cmake.targets = ["py_generate_pipeline", "generate_pipeline_lib"] wheel.build-tag = "000" wheel.packages = ["src/python-bindings/openvino"] wheel.install-dir = "openvino/genai" diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 0715e91b8e..0341b9c4d2 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -23,3 +23,5 @@ target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$ Date: Fri, 10 May 2024 18:08:03 +0400 Subject: [PATCH 57/97] import openvino for win and lin --- src/python-bindings/openvino/genai/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python-bindings/openvino/genai/__init__.py b/src/python-bindings/openvino/genai/__init__.py index a00c04f6e7..07b8c0e67e 100644 --- a/src/python-bindings/openvino/genai/__init__.py +++ b/src/python-bindings/openvino/genai/__init__.py @@ -1,8 +1,8 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import openvino # add_dll_directory for openvino lib import os if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) - import openvino # add_dll_directory for openvino lib From a27c5a7149ea49070df0affd8b76871801e66dbb Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 18:15:22 +0400 Subject: [PATCH 58/97] put the line back --- src/cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 287c350ed2..8a3861bcb2 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -54,6 +54,7 @@ target_include_directories(${TARGET_NAME} target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON CXX_STANDARD 17) From 0849c41e76caf8fd38cbc8bd9561c05b2aa8ad38 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 10 May 2024 18:18:08 +0400 Subject: [PATCH 59/97] Added cmake build type before project clause --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1627347ade..21b05adf6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,9 @@ cmake_minimum_required(VERSION 3.15) +set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") +set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebInfo" "MinSizeRel") + project(openvino_genai) add_subdirectory(src) From 34cddff80ca72c8f1e6f3e2e1965a9c218628ac0 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 18:18:27 +0400 Subject: [PATCH 60/97] one line properties --- src/python/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 33475b45a5..81f6bf1658 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -16,7 +16,8 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline 
py_generate_pipeline.cpp) -set_target_properties(py_generate_pipeline PROPERTIES CXX_STANDARD 17) -set_target_properties(py_generate_pipeline PROPERTIES CXX_STANDARD_REQUIRED ON) +set_target_properties(${TARGET_NAME} PROPERTIES + CXX_STANDARD_REQUIRED ON + CXX_STANDARD 17) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT py_generate_pipeline_install_target) From 6a5d750ffe2a1eefda5a00c9cd5edcc01bce172f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 10 May 2024 19:00:02 +0400 Subject: [PATCH 61/97] Export API symbols --- src/CMakeLists.txt | 7 +++++++ src/cpp/CMakeLists.txt | 6 +----- src/cpp/include/openvino/genai/generation_config.hpp | 8 ++++++-- src/cpp/include/openvino/genai/llm_pipeline.hpp | 12 +++++++----- src/cpp/include/openvino/genai/streamer_base.hpp | 1 + src/cpp/include/openvino/genai/tokenizer.hpp | 12 +++++++++--- src/cpp/include/openvino/genai/visibility.hpp | 10 ++++++++++ src/cpp/src/generation_config.cpp | 6 +++++- src/cpp/src/generation_config_helper.hpp | 1 + 9 files changed, 47 insertions(+), 16 deletions(-) create mode 100644 src/cpp/include/openvino/genai/visibility.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 99d80180a2..d154836878 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,5 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # +# Find OpenVINODeveloperPackage first to compile with SDL flags +find_package(OpenVINODeveloperPackage QUIET + PATHS "${OpenVINO_DIR}") +if(NOT OpenVINODeveloperPackage_FOUND) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) +endif() + add_subdirectory(cpp) add_subdirectory(python) diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 247db07bdc..3c53b66614 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -37,8 +37,6 @@ ov_genai_build_jinja2cpp() add_subdirectory(../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) - # Library file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") @@ -55,6 +53,4 @@ target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_j target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -set_target_properties(${TARGET_NAME} PROPERTIES - CXX_STANDARD_REQUIRED ON - CXX_STANDARD 17) +target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 639d52a925..4843d3c76a 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -4,8 +4,12 @@ #pragma once #include -#include "openvino/genai/tokenizer.hpp" #include +#include + +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/genai/tokenizer.hpp" namespace ov { @@ -52,7 +56,7 @@ enum class StopCriteria { early, heuristic, never }; * @param eos_token token string representation * @param draft_model draft model for assitive decoding */ -class GenerationConfig { +class OPENVINO_GENAI_EXPORTS GenerationConfig { public: GenerationConfig() = default; GenerationConfig(std::string json_path); diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 2ba4b6a13b..cd8c3a4a1b 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ 
b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -1,15 +1,17 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -// #pragma once +#pragma once -#include +#include +#include + +#include #include + #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" -#include -#include using namespace std; @@ -46,7 +48,7 @@ class DecodedResults { /** * @brief This class is used for generation with LLMs. */ -class LLMPipeline { +class OPENVINO_GENAI_EXPORTS LLMPipeline { public: /** * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir. diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 6bc298cad6..0d32f9fcda 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -1,5 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 + #pragma once #include "openvino/genai/tokenizer.hpp" diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 34dda24e60..702aca83f9 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -3,16 +3,22 @@ #pragma once -#include -#include #include +#include +#include +#include +#include + +#include + +#include "openvino/genai/visibility.hpp" namespace ov { /** * @brief class used to encode prompts and decode resulting tokens */ -class Tokenizer { +class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** * @brief ov::Tokenizer constructor. diff --git a/src/cpp/include/openvino/genai/visibility.hpp b/src/cpp/include/openvino/genai/visibility.hpp new file mode 100644 index 0000000000..3a143da92a --- /dev/null +++ b/src/cpp/include/openvino/genai/visibility.hpp @@ -0,0 +1,10 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/core/visibility.hpp" + +#ifdef generate_pipeline_lib_EXPORTS +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_EXPORTS +#else +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_IMPORTS +#endif // generate_pipeline_lib_EXPORTS diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 2f2312e7a3..e17dd0eadf 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -1,10 +1,14 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include #include #include + +#include +#include + #include "openvino/genai/generation_config.hpp" + #include "generation_config_helper.hpp" namespace ov { diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp index d3d97dde12..400843994d 100644 --- a/src/cpp/src/generation_config_helper.hpp +++ b/src/cpp/src/generation_config_helper.hpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once + #include "openvino/genai/generation_config.hpp" namespace ov { From 9ef488c51684cf5d8a729a28e3ee78ad43884cf3 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 19:56:29 +0400 Subject: [PATCH 62/97] rename --- pyproject.toml | 72 +- src/cpp/CMakeLists.txt | 3 +- src/python/CMakeLists.txt | 6 +- .../openvino/__init__.py | 0 .../openvino/genai/__init__.py | 0 third-party-programs.txt | 1181 ++++++++++++++++- 6 files changed, 1190 insertions(+), 72 deletions(-) rename src/{python-bindings => 
python}/openvino/__init__.py (100%) rename src/{python-bindings => python}/openvino/genai/__init__.py (100%) diff --git a/pyproject.toml b/pyproject.toml index 7e0a940dfa..2d60b44e30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,13 @@ [project] -name = "openvino.genai" +name = "openvino_genai" version = "2024.2.0.0" description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" readme = {file = "text_generation/causal_lm/cpp/README.md", content-type="text/markdown"} license = {text = "OSI Approved :: Apache Software License"} - authors = [ { name = "OpenVINO Developers", email = "openvino@intel.com" }, ] - classifiers = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", @@ -17,76 +15,20 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] - dependencies = [ - "openvino_tokenizers~=2024.1.0.0" -] - -[project.optional-dependencies] -# TODO: do I need to propagate all this to openvino_tokenizers -transformers = [ - "transformers[sentencepiece] >= 4.36.0", - "tiktoken" -] -# chatglm2 custom tokenizer file imports torch, have to add torch dependency for tests -torch = [ - 'torch' -] -dev = [ - "ruff", - "bandit", - "pytest", - "pytest_harvest", - "pandas", - "openvino_tokenizers[transformers, torch]" -] -benchmark = [ - "pandas", - "seaborn", - "tqdm", - "openvino_tokenizers[transformers]" -] -# don't include fuzzing to avoid windows CI issues -fuzzing = [ - "atheris", - "openvino_tokenizers[transformers]" + "openvino_tokenizers==2024.1.0.0" ] -all = [ - "openvino_tokenizers[dev, transformers]" -] - - -[tool.ruff] -line-length = 119 - -[tool.ruff.lint] -ignore = ["C901", "E501", "E741", "W605"] -select = ["C", "E", "F", "I", "W"] - -[tool.ruff.lint.per-file-ignores] -"__init__.py" = ["F401"] -"openvino_tokenizers/hf_parser.py" = ["F821"] - -[tool.ruff.lint.isort] -lines-after-imports = 2 [tool.scikit-build] -install.components = ["py_generate_pipeline_install_target", "generate_pipeline_lib_install_target"] cmake.source-dir = "./" cmake.build-type = "Release" -cmake.args = [ - "-DBUILD_SHARED_LIBS=NO" -# "-DCMAKE_INSTALL_BINDIR=lib", -# "-DCMAKE_INSTALL_LIBDIR=lib" -] cmake.targets = ["py_generate_pipeline", "generate_pipeline_lib"] -wheel.build-tag = "000" -wheel.packages = ["src/python-bindings/openvino"] +install.components = ["genai", "genai_python"] +sdist.cmake = true +wheel.packages = ["src/python/openvino"] wheel.install-dir = "openvino/genai" wheel.py-api = "" -wheel.license-files = ["LICENSE", "SECURITY.md"] # TODO: Do we need third-party-programs.txt like openvino_tokenizers? -sdist.exclude = ["dist", "tests", "examples", "python/tests"] -sdist.cmake = true +wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] [[tool.scikit-build.generate]] path = "openvino/genai/__version__.py" @@ -95,5 +37,5 @@ __version__ = "${version}" ''' [build-system] -requires = ["scikit-build-core~=0.8.0"] +requires = ["scikit-build-core>=0.8.0"] build-backend = "scikit_build_core.build" diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 2825955a3c..8bcaf8ab13 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -52,5 +52,4 @@ target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_j target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) -install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . 
COMPONENT generate_pipeline_lib_install_target - RUNTIME DESTINATION . COMPONENT generate_pipeline_lib_install_target) +install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . COMPONENT genai RUNTIME DESTINATION . COMPONENT genai) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 81f6bf1658..6261cd9bc7 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -16,8 +16,6 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) -set_target_properties(${TARGET_NAME} PROPERTIES - CXX_STANDARD_REQUIRED ON - CXX_STANDARD 17) +target_compile_features(py_generate_pipeline PUBLIC cxx_std_17) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) -install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT py_generate_pipeline_install_target) +install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT genai_python) diff --git a/src/python-bindings/openvino/__init__.py b/src/python/openvino/__init__.py similarity index 100% rename from src/python-bindings/openvino/__init__.py rename to src/python/openvino/__init__.py diff --git a/src/python-bindings/openvino/genai/__init__.py b/src/python/openvino/genai/__init__.py similarity index 100% rename from src/python-bindings/openvino/genai/__init__.py rename to src/python/openvino/genai/__init__.py diff --git a/third-party-programs.txt b/third-party-programs.txt index 60d40abdd0..146aea0684 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -1 +1,1180 @@ -TODO: do I need it? \ No newline at end of file +OpenVINO Tokenizers Third Party Programs File + +This file contains the list of third party software ("third party programs") +contained in the Intel software and their required notices and/or license +terms. This third party software, even if included with the distribution of +the Intel software, may be governed by separate license terms, including +without limitation, third party license terms, other Intel software license +terms, and open source software license terms. These separate license terms +govern your use of the third party programs as set forth in the +"third-party-programs.txt" or other similarly-named text file. + +Third party programs and their corresponding required notices and/or license +terms are listed below. + +------------------------------------------------------------- + +transformers + +Copyright 2018- The Hugging Face team. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------------------------------------------------- + +fast_tokenizer + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. 
+ +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +------------------------------------------------------------- + +re2 + +// Copyright (c) 2009 The RE2 Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------- + +icu4c + +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +------------------------------------------------------------- + +sentencepiece + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. 
For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. 
You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. 
+ +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +------------------------------------------------------------- + +tensorflow + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +## Some of TensorFlow's code is derived from Caffe, which is subject to the following copyright notice: + +COPYRIGHT + +All contributions by the University of California: + +Copyright (c) 2014, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: + +Copyright (c) 2014, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, +or otherwise, the contributor releases their content to the +license and copyright terms herein. + +------------------------------------------------------------- + +tensorflow-text + +Copyright 2018 The TensorFlow Authors. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017, The TensorFlow Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. From 4fad7d534df4aad5f01c03551e9f0cbf1bff7fb1 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 19:58:35 +0400 Subject: [PATCH 63/97] add .github/workflows/genai_lib.yml --- .github/workflows/genai_lib.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/genai_lib.yml diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml new file mode 100644 index 0000000000..e14e3b0315 --- /dev/null +++ b/.github/workflows/genai_lib.yml @@ -0,0 +1,14 @@ +name: genai_lib +jobs: + genai_lib: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: python -m pip install scikit-build + - run: python -m pip isntall . 
+ - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From 51e03a272cdf79819b4f0df7a4a71b0d437d1689 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 19:59:50 +0400 Subject: [PATCH 64/97] on: pull_request --- .github/workflows/genai_lib.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index e14e3b0315..c53a531b98 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -1,4 +1,5 @@ name: genai_lib +on: pull_request jobs: genai_lib: runs-on: ubuntu-20.04 From e23a7bb427dfe09a83e28ce17cf6158b328a8df6 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 20:00:48 +0400 Subject: [PATCH 65/97] spelling --- .github/workflows/genai_lib.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index c53a531b98..8c0202fe2f 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -11,5 +11,5 @@ jobs: with: python-version: 3.8 - run: python -m pip install scikit-build - - run: python -m pip isntall . + - run: python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From fc5b7539536eb226073596659151c0375c28c450 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Fri, 10 May 2024 20:02:15 +0400 Subject: [PATCH 66/97] install openvino --- .github/workflows/genai_lib.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 8c0202fe2f..c7c711b4e5 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -10,6 +10,11 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: python -m pip install scikit-build - - run: python -m pip install . + - run: source ./ov/setupvars.sh && python -m pip install . 
- run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From 09f88061a22bbbe11b08b39e5762b8dedc57c033 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 10 May 2024 21:27:21 +0200 Subject: [PATCH 67/97] add syntacis sugar for geenrate, optimize value passing by reference where possible, read special tokens from IR --- .../openvino/genai/generation_config.hpp | 2 - .../include/openvino/genai/llm_pipeline.hpp | 36 +++-- src/cpp/include/openvino/genai/tokenizer.hpp | 26 ++-- src/cpp/src/llm_pipeline.cpp | 73 ++++------ src/cpp/src/text_callback_streamer.cpp | 2 +- src/cpp/src/tokenizer.cpp | 127 +++++++++++++----- src/cpp/src/utils.cpp | 2 + src/cpp/src/utils.hpp | 2 +- src/python/CMakeLists.txt | 2 +- src/python/py_generate_pipeline.cpp | 5 +- text_generation/causal_lm/cpp/CMakeLists.txt | 2 - .../cpp/generate_pipeline/generate_sample.cpp | 100 ++++---------- 12 files changed, 205 insertions(+), 174 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 4843d3c76a..10f6b5b1b6 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -31,7 +31,6 @@ enum class StopCriteria { early, heuristic, never }; * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. * @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. - * [more datails in this paper](https://arxiv.org/pdf/1610.02424.pdf). * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while @@ -48,7 +47,6 @@ enum class StopCriteria { early, heuristic, never }; * @param do_sample whether or not to use multinomial random sampling * that add up to `top_p` or higher are kept. * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. - * [more datails in this paper](https://arxiv.org/pdf/1909.05858.pdf). * @param pad_token_id id of padding token * @param bos_token_id id of token * @param eos_token_id id of token diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index cd8c3a4a1b..47c7fcec85 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -3,12 +3,10 @@ #pragma once -#include #include +#include -#include #include - #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" @@ -93,9 +91,16 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { Properties&&... properties) { return generate(text, AnyMap{std::forward(properties)...}); } - std::string generate(std::string text, const ov::AnyMap& config); + template + util::EnableIfAllStringAny generate( + ov::Tensor input_ids, + Properties&&... 
properties) { + return generate(input_ids, AnyMap{std::forward(properties)...}); + } + EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config); + /** * @brief High level generate for batched prompts which encodes inputs and returns decoded outputs. * Streamer cannot be used for multibatch inputs. @@ -121,14 +126,21 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); - - std::string operator()(std::string text, OptionalGenerationConfig generation_config); + + template + util::EnableIfAllStringAny operator()( + InputsType text, + Properties&&... properties) { + return generate(text, AnyMap{std::forward(properties)...}); + } + std::string operator()(std::string text, OptionalGenerationConfig generation_config={}); + DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config); DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config); // generate with streamers - std::string operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer); - std::string operator()(std::string text, StreamerVariant streamer); + std::string operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + std::string operator()(std::string text, OptionalStreamerVariant streamer); ov::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; @@ -143,8 +155,16 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { std::unique_ptr m_pimpl; }; +/* + * utils that allow to use generate and operarator() in the folllowing way: + * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) + * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) + * All names match to names in cofnig except streamer. +*/ static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property temperature{"temperature"}; + +// It's problematic to store and automaticall convert std::variant in AnyMap static constexpr ov::Property> streamer_lambda{"streamer_lambda"}; static constexpr ov::Property> streamer{"streamer"}; diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 702aca83f9..54e11eaf9f 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -3,14 +3,10 @@ #pragma once -#include #include #include -#include #include - #include - #include "openvino/genai/visibility.hpp" namespace ov { @@ -25,21 +21,22 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path * @param device device. Currently only 'CPU' is supported */ - Tokenizer(const std::string tokenizers_path, const std::string device="CPU"); + Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU"); /** * @brief encode a single prompt * @return pair of [input_ids, attention_mask] */ - std::pair encode(std::string prompt); + std::pair encode(const std::string prompt); // todo: passing by reference fails /** * @brief encode batch of prompts. 
Left padding will be applied by default * @param prompts vector storing batch of prompts * @return pair of [input_ids, attention_mask] */ - std::pair encode(std::vector prompts); - std::pair encode(std::initializer_list prompts); + std::pair encode(std::vector& prompts); + std::pair encode(std::vector&& prompts); + std::pair encode(std::initializer_list& prompts); /** * @brief decode sequence of tokens @@ -62,8 +59,17 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { */ std::vector decode(std::vector> tokens); - int64_t m_bos_token_id = 1; // todo: read from rt_info - int64_t m_eos_token_id = 2; // todo: read from rt_info + // information about , tokens should be public, + // they are used at least in StreamerBase descendants + int64_t get_bos_token_id() const; + int64_t get_eos_token_id() const; + int64_t get_pad_token_id() const; + + // Also need write access to set these tokens when they are not successfully read from xml rt_info. + // In the latter case values can be read from config.json in LLMPipeline + void set_bos_token_id(int64_t); + void set_eos_token_id(int64_t); + void set_pad_token_id(int64_t); Tokenizer() = default; ~Tokenizer(); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e9eb7ea0d8..b405a12880 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -1,19 +1,20 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include "openvino/genai/llm_pipeline.hpp" #include #include #include -#include "generation_config_helper.hpp" -#include "text_callback_streamer.hpp" -#include "utils.hpp" -#include +#include #include #include + +#include #include "openvino/genai/generation_config.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" +#include "generation_config_helper.hpp" +#include "text_callback_streamer.hpp" namespace ov { @@ -63,36 +64,7 @@ class LLMPipeline::LLMPipelineImpl { using namespace std; -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { - const size_t batch_size = input_ids.get_shape()[0]; - const size_t sequence_length = input_ids.get_shape()[1]; - int64_t* inputs_data = input_ids.data(); - int64_t* attention_mask_data = attention_mask.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - - // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != pad_token) - continue; - - size_t pad_tokens_number = 0; - for (int i = sequence_length - 1; i >= 0; i--) { - const size_t token_offset = batch_offset + i; - if (inputs_data[token_offset] == pad_token) - continue; - - if (pad_tokens_number == 0) - pad_tokens_number = sequence_length - i - 1; - - std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); - std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); - } - } - - return {input_ids, attention_mask}; -} ov::LLMPipeline::LLMPipeline( const std::string model_path, @@ -111,10 +83,8 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( ): m_tokenizer(tokenizer), m_device(device), m_plugin_config(plugin_config) { ov::Core core; - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - std::string full_path = model_path; - if (!is_xml(full_path)) + if (!ov::generate_utils::is_xml(full_path)) full_path += "/openvino_model.xml"; try { m_model_runner = 
core.compile_model(full_path, device, plugin_config).create_infer_request(); @@ -220,7 +190,8 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { @@ -283,7 +254,7 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { - StreamerVariant streamer = {}; + OptionalStreamerVariant streamer; auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); // todo: get attentions from properties? @@ -296,11 +267,25 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config return m_pimpl->generate(text, config, streamer); } -std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer) { +ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { + OptionalStreamerVariant streamer; + auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + + // todo: get attentions from properties? + if (config_map.count("streamer_lambda")) { + streamer = config_map.at("streamer_lambda").as>(); + } else if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); + } + std::optional attention_mask; + return m_pimpl->generate(input_ids, attention_mask, config, streamer); +} + +std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } -std::string ov::LLMPipeline::operator()(std::string text, StreamerVariant streamer) { +std::string ov::LLMPipeline::operator()(std::string text, OptionalStreamerVariant streamer) { return m_pimpl->generate(text, m_pimpl->m_generation_config, streamer); } @@ -322,8 +307,8 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; jinja2::ValuesMap params = { {"messages", jinja2::ValuesList({message})}, - {"bos_token", ""}, - {"eos_token", ""}, // todo: load from config + {"bos_token", m_generation_config.bos_token}, + {"eos_token", m_generation_config.eos_token}, {"add_generation_prompt", true}, }; diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 3192520eee..6e5bd4ee8d 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -18,7 +18,7 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin void TextCallbackStreamer::put(int64_t token) { std::stringstream res; // do not print anything and flush cache if EOS token is met - if (token == m_tokenizer.m_eos_token_id) { + if (token == m_tokenizer.get_eos_token_id()) { end(); return; } diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 2bd0c4adc1..1d7879a8d6 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -3,9 +3,43 @@ #include #include "openvino/genai/tokenizer.hpp" -#include +#include "utils.hpp" -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token=2); +namespace { + +// todo: remove when openvino-tokenizers will support left padding +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { + const size_t batch_size = input_ids.get_shape()[0]; + 
const size_t sequence_length = input_ids.get_shape()[1]; + int64_t* inputs_data = input_ids.data(); + int64_t* attention_mask_data = attention_mask.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (inputs_data[batch_offset + sequence_length - 1] != pad_token) + continue; + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + if (inputs_data[token_offset] == pad_token) + continue; + + if (pad_tokens_number == 0) + pad_tokens_number = sequence_length - i - 1; + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +} namespace ov { @@ -14,74 +48,79 @@ class Tokenizer::TokenizerImpl { ov::InferRequest m_tokenize_request; ov::InferRequest m_detokenizer_request; std::string m_device; - + int64_t m_pad_token_id = 0; + int64_t m_bos_token_id = 1; + int64_t m_eos_token_id = 2; TokenizerImpl() = default; - TokenizerImpl(const std::string tokenizers_path, const std::string device); + TokenizerImpl(std::string tokenizers_path, const std::string device); std::pair encode(std::string prompt); - std::pair encode(std::vector prompts); + std::pair encode(std::vector& prompts); std::string decode(std::vector tokens); std::vector decode(ov::Tensor tokens); std::vector decode(std::vector> lines); }; -Tokenizer::TokenizerImpl::TokenizerImpl(const std::string tokenizers_path, const std::string device): m_device(device) { +Tokenizer::TokenizerImpl::TokenizerImpl(std::string tokenizers_path, std::string device): m_device(device) { ov::Core core; - auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - - if (is_xml(tokenizers_path)) - OPENVINO_THROW("tokenizers_path should be a path to a dir not to xml file"); + if (ov::generate_utils::is_xml(tokenizers_path)) + OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt core.add_extension(OPENVINO_TOKENIZERS_PATH); + + std::shared_ptr tokenizer_model, detokenizer_model; try { - m_tokenize_request = core.compile_model(tokenizers_path + "/openvino_tokenizer.xml", device).create_infer_request(); - m_detokenizer_request = core.compile_model(tokenizers_path + "/openvino_detokenizer.xml", device).create_infer_request(); + tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); + detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml"); } catch (...) { OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. 
Please check that " - "openvino_tokenizer.xml and openvino_detokenizer.xml exit in \"" + tokenizers_path + "\""); + "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\""); } - // todo: read eos, bos here + m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request(); + m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request(); + + auto rt_info = tokenizer_model->get_rt_info(); + if (rt_info.count("eos_token_id") > 0) + m_eos_token_id = rt_info["eos_token_id"].as(); + if (rt_info.count("bos_token_id") > 0) + m_bos_token_id = rt_info["bos_token_id"].as(); + if (rt_info.count("pad_token_id") > 0) + m_pad_token_id = rt_info["pad_token_id"].as(); } -Tokenizer::Tokenizer(const std::string tokenizers_path, const std::string device) { +Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { m_pimpl = std::make_shared(tokenizers_path, device); } -std::pair Tokenizer::encode(std::string prompt) { +std::pair Tokenizer::encode(const std::string prompt) { return m_pimpl->encode(prompt); } std::pair Tokenizer::TokenizerImpl::encode(std::string prompt) { - size_t batch_size = 1; - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + size_t batch_size = 1; + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt[0]}); m_tokenize_request.infer(); - std::vector> input_ids_vec; - input_ids_vec.reserve(1); - auto res_tensor = m_tokenize_request.get_tensor("input_ids"); - auto res_shape = res_tensor.get_shape(); - - for (int i = 0; i < res_shape[0]; ++i) { - int64_t* start = res_tensor.data() + i * res_shape[1]; - input_ids_vec.emplace_back(std::vector(start, start + res_shape[1])); - } - return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } -std::pair Tokenizer::encode(std::vector prompts) { +std::pair Tokenizer::encode(std::vector& prompts) { + return m_pimpl->encode(prompts); +} + +std::pair Tokenizer::encode(std::vector&& prompts) { return m_pimpl->encode(prompts); } -std::pair Tokenizer::TokenizerImpl::encode(std::vector prompts) { +std::pair Tokenizer::TokenizerImpl::encode(std::vector& prompts) { m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = m_tokenize_request.get_input_tensor().get_shape(); m_tokenize_request.infer(); - pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")); + ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); // todo: fix mask filled with '2' instead of '0' ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); int64_t* attention_mask_data = attention_mask.data(); @@ -90,7 +129,7 @@ std::pair Tokenizer::TokenizerImpl::encode(std::vector Tokenizer::encode(std::initializer_list text) { +std::pair Tokenizer::encode(std::initializer_list& text) { return encode(std::vector(text.begin(), text.end())); } @@ -144,6 +183,30 @@ std::vector Tokenizer::TokenizerImpl::decode(std::vectorm_bos_token_id; +} + +int64_t Tokenizer::get_eos_token_id() const { + return m_pimpl->m_eos_token_id; +} + +int64_t Tokenizer::get_pad_token_id() const { + return m_pimpl->m_pad_token_id; +} + +void Tokenizer::set_pad_token_id(int64_t pad_token_id) { + m_pimpl->m_pad_token_id = pad_token_id; +} + +void Tokenizer::set_bos_token_id(int64_t bos_token_id) { + 
m_pimpl->m_bos_token_id = bos_token_id; +} + +void Tokenizer::set_eos_token_id(int64_t eos_token_id) { + m_pimpl->m_eos_token_id = eos_token_id; +} + Tokenizer::~Tokenizer() = default; } // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 2b30b75838..a5f109b791 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -27,6 +27,8 @@ void print_tensor(const ov::Tensor& tensor) { std::cout << "]" << std::endl; } +bool is_xml(const std::string& path) { return path.compare(path.length() - 4, 4, ".xml") == 0;} + std::pair softmax(const ov::Tensor& logits, const size_t batch_idx) { if (logits.get_shape()[0] <= batch_idx) { OPENVINO_THROW("logits batch size doesn't match the number of beams"); diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 3ef4c8e106..ac5ac76158 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -14,7 +14,7 @@ void print_tensor(const ov::Tensor& tensor); std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); -enum class StopCriteria { early, heuristic, never }; +bool is_xml(const std::string& path); } // namespace generate_utils } // namespace ov \ No newline at end of file diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index d951d25d25..0171ff87d4 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -8,7 +8,7 @@ FetchContent_Declare( GIT_REPOSITORY https://github.com/pybind/pybind11 GIT_TAG v2.12.0 ) - +set(CMAKE_POSITION_INDEPENDENT_CODE ON) FetchContent_GetProperties(pybind11) if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index cf9cd64c5f..a111753765 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -76,8 +76,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "Tokenizer") .def(py::init<>()) .def(py::init(), py::arg("tokenizers_path"), py::arg("device") = "CPU") - .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") - .def("encode", py::overload_cast>(&ov::Tokenizer::encode), "Encode multiple prompts") + .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") + // TODO: common.h(1106...) 
template argument deduction/substitution failed: + // .def("encode", py::overload_cast&>(&ov::Tokenizer::encode), "Encode multiple prompts") .def("decode", py::overload_cast>(&ov::Tokenizer::decode), "Decode a list of tokens") .def("decode", py::overload_cast(&ov::Tokenizer::decode), "Decode a tensor of tokens") .def("decode", py::overload_cast>>(&ov::Tokenizer::decode), "Decode multiple lines of tokens"); diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index c659603fe3..ff4132e08f 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,8 +4,6 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - set(TARGET_NAME greedy_causal_lm) add_executable(${TARGET_NAME} greedy_causal_lm.cpp) target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index f46b8bc682..f1c7745c87 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -1,89 +1,52 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -// #include #include "openvino/genai/llm_pipeline.hpp" +using std::cout; +using std::endl; -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::Tokenizer tokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = tokenizer.decode(token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = tokenizer.decode(token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; - -int main(int argc, char* argv[]) try { - if (2 >= argc && argc <= 4) +int main(int argc, char* argv[]) { + if (2 > argc && argc > 4) throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); + std::string model_path = argv[1]; - std::string prompt = "table is made of"; + std::string prompt = "table is made of "; std::string device = "CPU"; // can be replaced with GPU - std::string model_path = argv[1]; if (argc > 2) prompt = argv[2]; if (argc > 3) device = argv[3]; - - // Example 1: TextStreaming example with greedy search + + // Example 1: Simplest example with greedy search + // Model, tokenizer and generation_config.json will be loaded from the model_path. + // If generation_config.json is not found default velues for gready search will be used ov::LLMPipeline pipe(model_path, device); - // Will try to load config from generation_config.json. 
- // but if not found default velues for gready search will be used - ov::GenerationConfig config = pipe.get_generation_config(); + // cout << prompt << pipe(prompt) << endl; - auto text_streamer = TextStreamer{pipe.get_tokenizer()}; - auto text_streamer_callback = [&text_streamer](std::vector&& tokens, ov::LLMPipeline& pipe){ - text_streamer.put(tokens[0]); - }; + // todo: syntactic sugar to specify generation configs in place + // cout << prompt << pipe(prompt, ov::max_new_tokens(100)) << endl; - cout << "greedy generate streaming mode:" << endl; - config.max_new_tokens = 20; - // config.m_set_streamer(text_streamer_callback); - pipe(prompt, config); - text_streamer.end(); - - // Example 2: Grouped Beam Search decoding example - // pipe = ov::LLMPipeline(model_path, device); - // config = pipe.generation_config(); - // // will return vector with num_return_sequences strings - // auto num_return_sequences = 3; - // config.max_new_tokens(20).num_groups(3).group_size(5).num_return_sequences(num_return_sequences); - + auto tokenizer = ov::Tokenizer(model_path); + auto [input_ids, attention_mask] = tokenizer.encode("table is made of "); + auto resuling_tokens = pipe.generate(input_ids, ov::max_new_tokens(1000)); + cout << tokenizer.decode(resuling_tokens.tokens[0]) << endl; + + // Example 2: Modifying generation_cofnig to use grouped beam search + ov::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 100; + config.num_beams = 15; + config.num_beam_groups = 3; + // cout << prompt << pipe(prompt, config) << endl; + // cout << endl << "grouped beam search generated candidates:" << endl; - // auto generation_results = pipe({prompt}, config); // for (int i = 0; i < num_return_sequences; ++i) - // cout << generation_results[i].score << ": " << generation_results[i].text << endl; - + // will return vector with num_return_sequences strings + // auto num_return_sequences = 3; + // // Example 3: Greedy Decoding with multiple batch // pipe = ov::LLMPipeline(model_path, device); // config = pipe.generation_config(); @@ -126,10 +89,5 @@ int main(int argc, char* argv[]) try { // } // } -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return EXIT_FAILURE; -} catch (...) 
{ - std::cerr << "Non-exception object thrown\n"; - return EXIT_FAILURE; + return 0; } From af22a8a1cf3d213b4fb3ed4b8ee65b5b0b7da4f8 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Sat, 11 May 2024 11:15:29 +0200 Subject: [PATCH 68/97] remove speculative decoding --- .../openvino/genai/generation_config.hpp | 4 - src/cpp/src/assistive_decoding.cpp | 211 ------------------ src/cpp/src/generation_config.cpp | 18 +- src/cpp/src/generation_config_helper.hpp | 51 ----- src/cpp/src/llm_pipeline.cpp | 13 +- src/cpp/src/tokenizer.cpp | 2 +- src/python/py_generate_pipeline.cpp | 4 +- 7 files changed, 6 insertions(+), 297 deletions(-) delete mode 100644 src/cpp/src/assistive_decoding.cpp diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 10f6b5b1b6..e1f2151d49 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -88,10 +88,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // used for chat scenario std::string bos_token = ""; std::string eos_token = ""; - - // speculative sampling - std::variant draft_model; // todo: remove or try to add ov::Model const ov::Model&, }; - } // namespace ov diff --git a/src/cpp/src/assistive_decoding.cpp b/src/cpp/src/assistive_decoding.cpp deleted file mode 100644 index f852e4346c..0000000000 --- a/src/cpp/src/assistive_decoding.cpp +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "generation_config_helper.hpp" -#include "openvino/genai/llm_pipeline.hpp" - -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len); -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len); - -ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { - // Copy elements from the old to a new tensor and return it. - // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], - // It that's not the case for your model please implement your own trim method. 
- OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); - - auto old_tensor_data = tensor.data(); - auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; - - OPENVINO_ASSERT(new_seq_len <= old_seq_len); - - // if new_seq_len equal to old one no need to copy tensor, return as is - if (old_seq_len == new_seq_len) - return tensor; - - if (seq_len_axis == 0) { - shape[0] = new_seq_len; - tensor.set_shape(shape); - } - - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {batch_size, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < batch_size; ++batch){ - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } - return new_tensor; -} - -void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { - // trim kv_cache values up to the new_seq_len - for (auto& state: request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } -} - -/* Speculative decoding works the following way. The draft model predicts the next K -tokens one by one in an autoregressive manner, while the main model validates these -predictions and corrects them if necessary. We go through each predicted token, and -if a difference is detected between the draft and main model, we stop and keep the -last token predicted by the main model. Then the draft model gets the latest main -prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, -enhancing performance. For instance, in more predictable parts of text generation, -the draft model can, in best-case scenarios, generate the next K tokens that exactly -match the target. In tha caste the are validated in a single inference request to -the main model (which is bigger, more accurate but slower) instead of running K -subsequent requests. 
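The removed comment above lays out the draft-and-verify scheme in prose. A minimal, self-contained sketch of just the accept loop may make it easier to follow; the token vectors below stand in for the two models, and every name in it is hypothetical, so treat this as an illustration of the idea rather than the removed implementation:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // K tokens proposed by a (hypothetical) draft model for the current step.
    std::vector<int64_t> draft_tokens = {11, 42, 7, 99, 3};
    // Tokens a (hypothetical) main model produces when it validates the same K positions in one pass.
    std::vector<int64_t> main_tokens = {11, 42, 7, 5, 8};

    // Accept positions while draft and main agree; on the first mismatch keep the
    // main model's token and stop, as the description above states.
    std::vector<int64_t> accepted;
    for (size_t i = 0; i < draft_tokens.size(); ++i) {
        accepted.push_back(main_tokens[i]);
        if (main_tokens[i] != draft_tokens[i])
            break;
    }

    std::cout << "accepted " << accepted.size() << " tokens from one main-model inference\n";
    return 0;
}

In the best case all K positions match and K tokens are produced for the price of a single main-model inference; in the worst case only the first main-model token survives and the cycle restarts from there.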
-*/ - -namespace ov { -ov::EncodedResults assistive_decoding(ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, ov::GenerationConfig generation_config) { - ov::GenerationConfigHelper config_helper = generation_config; - - auto batch_size = input_ids.get_shape()[0]; - OPENVINO_ASSERT(batch_size == 1); - auto draft_model = config_helper.get_assistant_model(); // todo: add config getting m_device, m_plugin_config - auto main_model = m_model_runner; - - auto draft_input_ids = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - input_ids.copy_to(draft_input_ids); - auto draft_attention_mask = ov::Tensor{input_ids.get_element_type(), input_ids.get_shape()}; - - draft_model.set_tensor("input_ids", draft_input_ids); - draft_model.set_tensor("attention_mask", draft_attention_mask); - - ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); - draft_position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); - uint64_t seq_len = draft_input_ids.get_shape()[1]; - - // Input tensors for the main model should not be mixed with draft. - // Do not feed the same draft_postion_ids to the main, but copy input_ids from the draft_input_ids - // auto input_ids = main_model.get_tensor("input_ids"); - // input_ids.set_shape(draft_input_ids.get_shape()); - // draft_input_ids.copy_to(input_ids); - - // auto attention_mask = main_model.get_tensor("attention_mask"); - // attention_mask.set_shape(draft_input_ids.get_shape()); - // std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - auto position_ids = main_model.get_tensor("position_ids"); - position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - - // set beam_idx for stateful model: no beam search is used and batch_size = 1 - draft_model.get_tensor("beam_idx").set_shape({batch_size}); - draft_model.get_tensor("beam_idx").data()[0] = 0; - main_model.get_tensor("beam_idx").set_shape({batch_size}); - main_model.get_tensor("beam_idx").data()[0] = 0; - - main_model.set_tensor("input_ids", input_ids); - main_model.set_tensor("attention_mask", attention_mask); - main_model.set_tensor("position_ids", position_ids); - - // To coollect kv-cache for the and to get the next token run the very first infer request - draft_model.infer(); - main_model.infer(); - - size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); - OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for the both models"); - - // logits shape is [batch_size, seq_len, vocab_size] - auto logits = main_model.get_tensor("logits"); - auto data_logits = logits.data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - // the first token which is fed to both draft and main netwoks on each iteration - auto first_token = out_token; - - ov::EncodedResults results; - results.tokens.resize(batch_size); - - results.tokens[0].emplace_back(out_token); - - // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = config_helper.num_assistant_tokens; - std::vector draft_tokens; - - // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. 
- draft_input_ids.set_shape({batch_size, 1}); - draft_position_ids.set_shape({batch_size, 1}); - - int max_sequence_length = generation_config.max_new_tokens; - auto eos_token = generation_config.eos_token_id; - - while (out_token != eos_token && seq_len < max_sequence_length) { - // infer the K next tokens with draft model - for (int i = 0; i < K; ++i) { - draft_input_ids.data()[0] = out_token; - draft_attention_mask.set_shape({batch_size, seq_len + i + 1}); - std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); - draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); - - draft_model.infer(); - - auto draft_logits = draft_model.get_tensor("logits").data(); - int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; - out_token = arg_max_token; - draft_tokens.emplace_back(arg_max_token); - } - - // For the main network, K tokens will be fed at once in a single infer request. - input_ids.set_shape({batch_size, K}); - // Set the first token for the main model to be the same as for the draft model. - input_ids.data()[0] = first_token; - for (int i = 0; i < K - 1; i++) - input_ids.data()[i + 1] = draft_tokens[i]; - - attention_mask.set_shape({batch_size, seq_len + K}); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - - position_ids.set_shape({batch_size, K}); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); - - main_model.infer(); - - data_logits = logits.data(); // [batch_size, K, vocab_size] - size_t disagree_idx = K - 1; - // Iterate through the predicted tokens from the main model and compare them with draft predictions. - // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. - // In the best-case scenario, all elements match, and K predicted tokens will be taken. - for (size_t i = 0; i < K; i++) { - auto start = data_logits + vocab_size * i; - auto stop = data_logits + vocab_size * (i + 1); - out_token = std::max_element(start, stop) - start; - results.tokens[0].emplace_back(out_token); - - // m_streamer->put(out_token); - - disagree_idx = i; - if (out_token != draft_tokens[i] || out_token == eos_token || seq_len + disagree_idx + 1 >= max_sequence_length) - break; - } - - // After the inference request, key/values have shape [batch_size, seq_len + K, vocab_size]. - // Increment the sequence length by the number of matched tokens, and - // trim the KV cache to match the new sequence length. 
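The bookkeeping described in the comment above (the cache grows by K positions during verification and is then rolled back to seq_len plus the number of matched tokens) can be sketched on a toy buffer. Here a flat std::vector stands in for one [num_heads, seq_len, head_size] key/value state; the per-head repacking shows why a plain reshape is not enough when the sequence axis is not the outermost one. All names and sizes are illustrative assumptions:

#include <cstddef>
#include <iostream>
#include <vector>

// Keep only the first new_seq_len positions of a [num_heads, old_seq_len, head_size] buffer.
std::vector<float> trim_state(const std::vector<float>& state,
                              size_t num_heads, size_t old_seq_len, size_t head_size,
                              size_t new_seq_len) {
    std::vector<float> trimmed(num_heads * new_seq_len * head_size);
    for (size_t h = 0; h < num_heads; ++h)
        for (size_t s = 0; s < new_seq_len; ++s)
            for (size_t d = 0; d < head_size; ++d)
                trimmed[(h * new_seq_len + s) * head_size + d] =
                    state[(h * old_seq_len + s) * head_size + d];
    return trimmed;
}

int main() {
    const size_t num_heads = 2, head_size = 4, seq_len = 6, K = 3, accepted = 2;
    std::vector<float> state(num_heads * (seq_len + K) * head_size, 1.0f);

    // After verifying K draft tokens, only `accepted` of them survive, so the cache
    // is rolled back from seq_len + K to seq_len + accepted positions.
    const size_t new_seq_len = seq_len + accepted;
    state = trim_state(state, num_heads, seq_len + K, head_size, new_seq_len);

    std::cout << "state now covers " << new_seq_len << " positions\n";
    return 0;
}

The removed update_kv_cache above applies the same per-head copy to every decoder-layer state obtained from request.query_state(), working on ov::Tensor instead of a plain vector.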
- seq_len += disagree_idx + 1; - update_kv_cache(draft_model, config_helper.seq_len_axis, seq_len); - update_kv_cache(main_model, config_helper.seq_len_axis, seq_len); - - draft_tokens.clear(); - first_token = out_token; - } - - return results; -} - -} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index e17dd0eadf..68fa0c86ab 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -89,7 +89,7 @@ size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { } bool GenerationConfigHelper::is_greedy_decoding() const { - return !m_config.do_sample && !is_beam_search() && !is_speculative(); + return !m_config.do_sample && !is_beam_search(); } bool GenerationConfigHelper::is_beam_search() const { @@ -100,20 +100,4 @@ bool GenerationConfigHelper::is_multimomial() const { return m_config.do_sample; } -bool GenerationConfigHelper::is_speculative() const { - return is_assistant_ov_defined || is_assistant_request_defined; -} - -ov::InferRequest GenerationConfigHelper::get_assistant_model(std::string device, const ov::AnyMap& config) { - if (is_assistant_request_defined) { - return assistant_model; - } else if (is_assistant_ov_defined) { - assistant_model = ov::Core().compile_model(m_assistant_ov_model, device, config).create_infer_request(); - is_assistant_request_defined = true; - return assistant_model; - } else { - OPENVINO_THROW("assistant model is not specified"); - } -} - } // namespace ov diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp index 400843994d..f4e5839990 100644 --- a/src/cpp/src/generation_config_helper.hpp +++ b/src/cpp/src/generation_config_helper.hpp @@ -18,65 +18,14 @@ class GenerationConfigHelper { size_t get_max_new_tokens(size_t prompt_length = 0); - // template - // static GenerationConfig assistive_decoding(T& assistant_model) { - // GenerationConfig assistive; - // assistive.assistant_model(assistant_model); - // return assistive; - // } - bool is_greedy_decoding() const; bool is_beam_search() const; bool is_multimomial() const; - bool is_speculative() const; - - - // // for speculative decoding - // void set_assistant_model(const ov::InferRequest& assistant_model) { - // this->assistant_model = assistant_model; - // is_assistant_request_defined = true; - // } - - // void set_assistant_model(ov::CompiledModel& assistant_model) { - // this->assistant_model = assistant_model.create_infer_request(); - // is_assistant_request_defined = true; - // } - - // void set_assistant_model(const std::shared_ptr& assistant_model) { - // m_assistant_ov_model = assistant_model; - // is_assistant_ov_defined = true; - // } - - // void set_assistant_model(std::string assistant_model) { - // auto is_xml = [](std::string path) -> bool { return path.compare(path.length() - 4, 4, ".xml") == 0;}; - // if (!is_xml(assistant_model)) - // assistant_model += "/openvino_model.xml"; - - // m_assistant_ov_model = ov::Core().read_model(assistant_model); - // is_assistant_ov_defined = true; - // } - - ov::InferRequest get_assistant_model(std::string device="CPU", const ov::AnyMap& config={}); - - // void set_num_assistant_tokens(int64_t num_assistant_tokens) { - // this->num_assistant_tokens = num_assistant_tokens; - // } - - // for Assistive/Speculative decoding - ov::InferRequest assistant_model; - size_t num_assistant_tokens = 5; - size_t seq_len_axis = 2; - GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map 
= {}); -private: - - std::shared_ptr m_assistant_ov_model; - bool is_assistant_request_defined = false; - bool is_assistant_ov_defined = false; }; } // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index b405a12880..5ac804ade9 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -19,7 +19,6 @@ namespace ov { -ov::EncodedResults assistive_decoding(ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig generation_config); ov::EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); ov::EncodedResults greedy_decoding( @@ -113,9 +112,7 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string m_device = device; ov::Core core; - auto model_request = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); - m_model_runner = model_request; - + m_model_runner = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); m_tokenizer = Tokenizer(path); } @@ -233,16 +230,12 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); } else if (config_helper.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask_data, config); - - } else if (config_helper.is_multimomial()) { + } else { // todo: implement multinomial sampling // result = multinomial_sampling(input_ids, config); - } else { - result = ov::assistive_decoding(m_model_runner, input_ids, attention_mask_data, config); - } + } if (!is_chat_conversation) - // reset_state(); todo: implement in m_mimpl m_model_runner.reset_state(); return result; diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 1d7879a8d6..6f6179baa4 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -101,7 +101,7 @@ std::pair Tokenizer::encode(const std::string prompt) { std::pair Tokenizer::TokenizerImpl::encode(std::string prompt) { size_t batch_size = 1; - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt[0]}); + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); m_tokenize_request.infer(); return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index a111753765..69398e1aac 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -49,7 +49,6 @@ std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast(); - if (kwargs.contains("draft_model")) config.draft_model = kwargs["draft_model"].cast>(); return pipeline(text, config); } @@ -106,8 +105,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("bos_token_id", &ov::GenerationConfig::bos_token_id) .def_readwrite("eos_token_id", &ov::GenerationConfig::eos_token_id) .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) - .def_readwrite("bos_token", &ov::GenerationConfig::bos_token) - .def_readwrite("draft_model", 
&ov::GenerationConfig::draft_model); + .def_readwrite("bos_token", &ov::GenerationConfig::bos_token); py::class_(m, "DecodedResults") .def(py::init<>()) From e7db7e819e236a3ad1a5eec65728c733a6414003 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 14:15:06 +0400 Subject: [PATCH 69/97] update --- .github/workflows/genai_lib.yml | 18 +- pyproject.toml | 5 +- src/python/CMakeLists.txt | 1 - src/python/openvino/genai/__init__.py | 2 + third-party-programs.txt | 1541 +++++++------------------ 5 files changed, 410 insertions(+), 1157 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index c7c711b4e5..3127fe42b4 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -1,7 +1,7 @@ name: genai_lib on: pull_request jobs: - genai_lib: + genai_lib_ubuntu: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -15,6 +15,20 @@ jobs: mkdir ./ov/ curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - run: python -m pip install scikit-build - run: source ./ov/setupvars.sh && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + + genai_lib_windows: + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip + - run: unzip ov.zip + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . + shell: cmd + - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/pyproject.toml b/pyproject.toml index 2d60b44e30..7e497ecb06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "openvino_tokenizers==2024.1.0.0" + "openvino_tokenizers~=2024.1.0.0" ] [tool.scikit-build] @@ -27,6 +27,7 @@ install.components = ["genai", "genai_python"] sdist.cmake = true wheel.packages = ["src/python/openvino"] wheel.install-dir = "openvino/genai" +wheel.build-tag = "000" wheel.py-api = "" wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] @@ -37,5 +38,5 @@ __version__ = "${version}" ''' [build-system] -requires = ["scikit-build-core>=0.8.0"] +requires = ["scikit-build-core~=0.8.0"] # See https://github.com/openvinotoolkit/openvino_tokenizers/pull/123 build-backend = "scikit_build_core.build" diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6261cd9bc7..708ff4d2d8 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -16,6 +16,5 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) -target_compile_features(py_generate_pipeline PUBLIC cxx_std_17) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) install(TARGETS py_generate_pipeline LIBRARY DESTINATION . 
COMPONENT genai_python) diff --git a/src/python/openvino/genai/__init__.py b/src/python/openvino/genai/__init__.py index 07b8c0e67e..f604e03e84 100644 --- a/src/python/openvino/genai/__init__.py +++ b/src/python/openvino/genai/__init__.py @@ -3,6 +3,8 @@ import openvino # add_dll_directory for openvino lib import os +from .__version__ import __version__ + if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) diff --git a/third-party-programs.txt b/third-party-programs.txt index 146aea0684..e418d7b5e3 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -1,4 +1,4 @@ -OpenVINO Tokenizers Third Party Programs File +OpenVINO GenAI Third Party Programs File This file contains the list of third party software ("third party programs") contained in the Intel software and their required notices and/or license @@ -14,1167 +14,404 @@ terms are listed below. ------------------------------------------------------------- -transformers - -Copyright 2018- The Hugging Face team. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +Jinja2Cpp -------------------------------------------------------------- +Mozilla Public License Version 2.0 +================================== -fast_tokenizer - -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - -"License" shall mean the terms and conditions for use, reproduction, -and distribution as defined by Sections 1 through 9 of this document. - -"Licensor" shall mean the copyright owner or entity authorized by -the copyright owner that is granting the License. - -"Legal Entity" shall mean the union of the acting entity and all -other entities that control, are controlled by, or are under common -control with that entity. 
For the purposes of this definition, -"control" means (i) the power, direct or indirect, to cause the -direction or management of such entity, whether by contract or -otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. - -"You" (or "Your") shall mean an individual or Legal Entity -exercising permissions granted by this License. - -"Source" form shall mean the preferred form for making modifications, -including but not limited to software source code, documentation -source, and configuration files. - -"Object" form shall mean any form resulting from mechanical -transformation or translation of a Source form, including but -not limited to compiled object code, generated documentation, -and conversions to other media types. - -"Work" shall mean the work of authorship, whether in Source or -Object form, made available under the License, as indicated by a -copyright notice that is included in or attached to the work -(an example is provided in the Appendix below). - -"Derivative Works" shall mean any work, whether in Source or Object -form, that is based on (or derived from) the Work and for which the -editorial revisions, annotations, elaborations, or other modifications -represent, as a whole, an original work of authorship. For the purposes -of this License, Derivative Works shall not include works that remain -separable from, or merely link (or bind by name) to the interfaces of, -the Work and Derivative Works thereof. - -"Contribution" shall mean any work of authorship, including -the original version of the Work and any modifications or additions -to that Work or Derivative Works thereof, that is intentionally -submitted to Licensor for inclusion in the Work by the copyright owner -or by an individual or Legal Entity authorized to submit on behalf of -the copyright owner. For the purposes of this definition, "submitted" -means any form of electronic, verbal, or written communication sent -to the Licensor or its representatives, including but not limited to -communication on electronic mailing lists, source code control systems, -and issue tracking systems that are managed by, or on behalf of, the -Licensor for the purpose of discussing and improving the Work, but -excluding communication that is conspicuously marked or otherwise -designated in writing by the copyright owner as "Not a Contribution." - -"Contributor" shall mean Licensor and any individual or Legal Entity -on behalf of whom a Contribution has been received by Licensor and -subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -copyright license to reproduce, prepare Derivative Works of, -publicly display, publicly perform, sublicense, and distribute the -Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -(except as stated in this section) patent license to make, have made, -use, offer to sell, sell, import, and otherwise transfer the Work, -where such license applies only to those patent claims licensable -by such Contributor that are necessarily infringed by their -Contribution(s) alone or by combination of their Contribution(s) -with the Work to which such Contribution(s) was submitted. If You -institute patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that the Work -or a Contribution incorporated within the Work constitutes direct -or contributory patent infringement, then any patent licenses -granted to You under this License for that Work shall terminate -as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the -Work or Derivative Works thereof in any medium, with or without -modifications, and in Source or Object form, provided that You -meet the following conditions: - -(a) You must give any other recipients of the Work or -Derivative Works a copy of this License; and - -(b) You must cause any modified files to carry prominent notices -stating that You changed the files; and - -(c) You must retain, in the Source form of any Derivative Works -that You distribute, all copyright, patent, trademark, and -attribution notices from the Source form of the Work, -excluding those notices that do not pertain to any part of -the Derivative Works; and - -(d) If the Work includes a "NOTICE" text file as part of its -distribution, then any Derivative Works that You distribute must -include a readable copy of the attribution notices contained -within such NOTICE file, excluding those notices that do not -pertain to any part of the Derivative Works, in at least one -of the following places: within a NOTICE text file distributed -as part of the Derivative Works; within the Source form or -documentation, if provided along with the Derivative Works; or, -within a display generated by the Derivative Works, if and -wherever such third-party notices normally appear. The contents -of the NOTICE file are for informational purposes only and -do not modify the License. You may add Your own attribution -notices within Derivative Works that You distribute, alongside -or as an addendum to the NOTICE text from the Work, provided -that such additional attribution notices cannot be construed -as modifying the License. - -You may add Your own copyright statement to Your modifications and -may provide additional or different license terms and conditions -for use, reproduction, or distribution of Your modifications, or -for any such Derivative Works as a whole, provided Your use, -reproduction, and distribution of the Work otherwise complies with -the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, -any Contribution intentionally submitted for inclusion in the Work -by You to the Licensor shall be under the terms and conditions of -this License, without any additional terms or conditions. -Notwithstanding the above, nothing herein shall supersede or modify -the terms of any separate license agreement you may have executed -with Licensor regarding such Contributions. - -6. Trademarks. 
This License does not grant permission to use the trade -names, trademarks, service marks, or product names of the Licensor, -except as required for reasonable and customary use in describing the -origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or -agreed to in writing, Licensor provides the Work (and each -Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -implied, including, without limitation, any warranties or conditions -of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -PARTICULAR PURPOSE. You are solely responsible for determining the -appropriateness of using or redistributing the Work and assume any -risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, -whether in tort (including negligence), contract, or otherwise, -unless required by applicable law (such as deliberate and grossly -negligent acts) or agreed to in writing, shall any Contributor be -liable to You for damages, including any direct, indirect, special, -incidental, or consequential damages of any character arising as a -result of this License or out of the use or inability to use the -Work (including but not limited to damages for loss of goodwill, -work stoppage, computer failure or malfunction, or any and all -other commercial damages or losses), even if such Contributor -has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing -the Work or Derivative Works thereof, You may choose to offer, -and charge a fee for, acceptance of support, warranty, indemnity, -or other liability obligations and/or rights consistent with this -License. However, in accepting such obligations, You may act only -on Your own behalf and on Your sole responsibility, not on behalf -of any other Contributor, and only if You agree to indemnify, -defend, and hold each Contributor harmless for any liability -incurred by, or claims asserted against, such Contributor by reason -of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - -To apply the Apache License to your work, attach the following -boilerplate notice, with the fields enclosed by brackets "[]" -replaced with your own identifying information. (Don't include -the brackets!) The text should be enclosed in the appropriate -comment syntax for the file format. We also recommend that a -file or class name and description of purpose be included on the -same "printed page" as the copyright notice for easier -identification within third-party archives. - -Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +1. Definitions +-------------- -------------------------------------------------------------- +1.1. 
"Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. -re2 - -// Copyright (c) 2009 The RE2 Authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. -------------------------------------------------------------- +1.3. "Contribution" + means Covered Software of a particular Contributor. -icu4c - -UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE - -See Terms of Use -for definitions of Unicode Inc.’s Data Files and Software. - -NOTICE TO USER: Carefully read the following legal agreement. -BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S -DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), -YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE -TERMS AND CONDITIONS OF THIS AGREEMENT. -IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE -THE DATA FILES OR SOFTWARE. - -COPYRIGHT AND PERMISSION NOTICE - -Copyright © 1991-2022 Unicode, Inc. All rights reserved. -Distributed under the Terms of Use in https://www.unicode.org/copyright.html. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of the Unicode data files and any associated documentation -(the "Data Files") or Unicode software and any associated documentation -(the "Software") to deal in the Data Files or Software -without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, and/or sell copies of -the Data Files or Software, and to permit persons to whom the Data Files -or Software are furnished to do so, provided that either -(a) this copyright and permission notice appear with all copies -of the Data Files or Software, or -(b) this copyright and permission notice appear in associated -Documentation. 
- -THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT OF THIRD PARTY RIGHTS. -IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS -NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL -DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, -DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THE DATA FILES OR SOFTWARE. - -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, -use or other dealings in these Data Files or Software without prior -written authorization of the copyright holder. +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. -------------------------------------------------------------- +1.5. "Incompatible With Secondary Licenses" + means -sentencepiece - -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - -"License" shall mean the terms and conditions for use, reproduction, -and distribution as defined by Sections 1 through 9 of this document. - -"Licensor" shall mean the copyright owner or entity authorized by -the copyright owner that is granting the License. - -"Legal Entity" shall mean the union of the acting entity and all -other entities that control, are controlled by, or are under common -control with that entity. For the purposes of this definition, -"control" means (i) the power, direct or indirect, to cause the -direction or management of such entity, whether by contract or -otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. - -"You" (or "Your") shall mean an individual or Legal Entity -exercising permissions granted by this License. - -"Source" form shall mean the preferred form for making modifications, -including but not limited to software source code, documentation -source, and configuration files. - -"Object" form shall mean any form resulting from mechanical -transformation or translation of a Source form, including but -not limited to compiled object code, generated documentation, -and conversions to other media types. - -"Work" shall mean the work of authorship, whether in Source or -Object form, made available under the License, as indicated by a -copyright notice that is included in or attached to the work -(an example is provided in the Appendix below). - -"Derivative Works" shall mean any work, whether in Source or Object -form, that is based on (or derived from) the Work and for which the -editorial revisions, annotations, elaborations, or other modifications -represent, as a whole, an original work of authorship. For the purposes -of this License, Derivative Works shall not include works that remain -separable from, or merely link (or bind by name) to the interfaces of, -the Work and Derivative Works thereof. 
- -"Contribution" shall mean any work of authorship, including -the original version of the Work and any modifications or additions -to that Work or Derivative Works thereof, that is intentionally -submitted to Licensor for inclusion in the Work by the copyright owner -or by an individual or Legal Entity authorized to submit on behalf of -the copyright owner. For the purposes of this definition, "submitted" -means any form of electronic, verbal, or written communication sent -to the Licensor or its representatives, including but not limited to -communication on electronic mailing lists, source code control systems, -and issue tracking systems that are managed by, or on behalf of, the -Licensor for the purpose of discussing and improving the Work, but -excluding communication that is conspicuously marked or otherwise -designated in writing by the copyright owner as "Not a Contribution." - -"Contributor" shall mean Licensor and any individual or Legal Entity -on behalf of whom a Contribution has been received by Licensor and -subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -copyright license to reproduce, prepare Derivative Works of, -publicly display, publicly perform, sublicense, and distribute the -Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. Subject to the terms and conditions of -this License, each Contributor hereby grants to You a perpetual, -worldwide, non-exclusive, no-charge, royalty-free, irrevocable -(except as stated in this section) patent license to make, have made, -use, offer to sell, sell, import, and otherwise transfer the Work, -where such license applies only to those patent claims licensable -by such Contributor that are necessarily infringed by their -Contribution(s) alone or by combination of their Contribution(s) -with the Work to which such Contribution(s) was submitted. If You -institute patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that the Work -or a Contribution incorporated within the Work constitutes direct -or contributory patent infringement, then any patent licenses -granted to You under this License for that Work shall terminate -as of the date such litigation is filed. - -4. Redistribution. 
You may reproduce and distribute copies of the -Work or Derivative Works thereof in any medium, with or without -modifications, and in Source or Object form, provided that You -meet the following conditions: - -(a) You must give any other recipients of the Work or -Derivative Works a copy of this License; and - -(b) You must cause any modified files to carry prominent notices -stating that You changed the files; and - -(c) You must retain, in the Source form of any Derivative Works -that You distribute, all copyright, patent, trademark, and -attribution notices from the Source form of the Work, -excluding those notices that do not pertain to any part of -the Derivative Works; and - -(d) If the Work includes a "NOTICE" text file as part of its -distribution, then any Derivative Works that You distribute must -include a readable copy of the attribution notices contained -within such NOTICE file, excluding those notices that do not -pertain to any part of the Derivative Works, in at least one -of the following places: within a NOTICE text file distributed -as part of the Derivative Works; within the Source form or -documentation, if provided along with the Derivative Works; or, -within a display generated by the Derivative Works, if and -wherever such third-party notices normally appear. The contents -of the NOTICE file are for informational purposes only and -do not modify the License. You may add Your own attribution -notices within Derivative Works that You distribute, alongside -or as an addendum to the NOTICE text from the Work, provided -that such additional attribution notices cannot be construed -as modifying the License. - -You may add Your own copyright statement to Your modifications and -may provide additional or different license terms and conditions -for use, reproduction, or distribution of Your modifications, or -for any such Derivative Works as a whole, provided Your use, -reproduction, and distribution of the Work otherwise complies with -the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, -any Contribution intentionally submitted for inclusion in the Work -by You to the Licensor shall be under the terms and conditions of -this License, without any additional terms or conditions. -Notwithstanding the above, nothing herein shall supersede or modify -the terms of any separate license agreement you may have executed -with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade -names, trademarks, service marks, or product names of the Licensor, -except as required for reasonable and customary use in describing the -origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or -agreed to in writing, Licensor provides the Work (and each -Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -implied, including, without limitation, any warranties or conditions -of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A -PARTICULAR PURPOSE. You are solely responsible for determining the -appropriateness of using or redistributing the Work and assume any -risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. 
In no event and under no legal theory, -whether in tort (including negligence), contract, or otherwise, -unless required by applicable law (such as deliberate and grossly -negligent acts) or agreed to in writing, shall any Contributor be -liable to You for damages, including any direct, indirect, special, -incidental, or consequential damages of any character arising as a -result of this License or out of the use or inability to use the -Work (including but not limited to damages for loss of goodwill, -work stoppage, computer failure or malfunction, or any and all -other commercial damages or losses), even if such Contributor -has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing -the Work or Derivative Works thereof, You may choose to offer, -and charge a fee for, acceptance of support, warranty, indemnity, -or other liability obligations and/or rights consistent with this -License. However, in accepting such obligations, You may act only -on Your own behalf and on Your sole responsibility, not on behalf -of any other Contributor, and only if You agree to indemnify, -defend, and hold each Contributor harmless for any liability -incurred by, or claims asserted against, such Contributor by reason -of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - -To apply the Apache License to your work, attach the following -boilerplate notice, with the fields enclosed by brackets "[]" -replaced with your own identifying information. (Don't include -the brackets!) The text should be enclosed in the appropriate -comment syntax for the file format. We also recommend that a -file or class name and description of purpose be included on the -same "printed page" as the copyright notice for easier -identification within third-party archives. - -Copyright [yyyy] [name of copyright owner] - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or -------------------------------------------------------------- + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. 
"Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. -tensorflow - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -## Some of TensorFlow's code is derived from Caffe, which is subject to the following copyright notice: - -COPYRIGHT - -All contributions by the University of California: - -Copyright (c) 2014, The Regents of the University of California (Regents) -All rights reserved. - -All other contributions: - -Copyright (c) 2014, the respective contributors -All rights reserved. - -Caffe uses a shared copyright model: each contributor holds copyright over -their contributions to Caffe. The project versioning records all such -contribution and copyright details. If a contributor wants to further mark -their specific copyright on a particular contribution, they should indicate -their copyright solely in the commit message of the change when it is -committed. - -LICENSE - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -CONTRIBUTION AGREEMENT - -By contributing to the BVLC/caffe repository through pull-request, comment, -or otherwise, the contributor releases their content to the -license and copyright terms herein. +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. 
For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. 
However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. 
* +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. ------------------------------------------------------------- -tensorflow-text - -Copyright 2018 The TensorFlow Authors. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2017, The TensorFlow Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
+JSON for Modern C++ (https://github.com/nlohmann/json) + +MIT License + +Copyright (c) 2013-2022 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From f2793634db0b16babc7861b697f4292d59bdb706 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 14:34:30 +0400 Subject: [PATCH 70/97] add rpath --- src/python/CMakeLists.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 708ff4d2d8..ce87ed99ec 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -18,3 +18,20 @@ endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT genai_python) + +# setting RPATH / LC_RPATH depending on platform +if(LINUX) + # to find libgenerate_pipeline_lib.so in the same folder + set(rpaths "$ORIGIN") +elseif(APPLE) + # to find libgenerate_pipeline_lib.dylib in the same folder + set(rpaths "@loader_path") + if(DEFINED SKBUILD) + # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package + list(APPEND rpaths "@loader_path/../../openvino/libs") + endif() +endif() + +if(rpaths) + set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "${rpaths}") +endif() From 83d77c8dff6fb884c167055c7255cda40bafa262 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 14:40:36 +0400 Subject: [PATCH 71/97] add rpath to libopenvino.so --- src/python/CMakeLists.txt | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index ce87ed99ec..f8a7a24597 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -21,17 +21,21 @@ install(TARGETS py_generate_pipeline LIBRARY DESTINATION . 
COMPONENT genai_pytho # setting RPATH / LC_RPATH depending on platform if(LINUX) - # to find libgenerate_pipeline_lib.so in the same folder - set(rpaths "$ORIGIN") + # to find libgenerate_pipeline_lib.so in the same folder + set(rpaths "$ORIGIN") + if(DEFINED SKBUILD) + # in case we build pip package, we need to refer to libopenvino.so from 'openvino' package + list(APPEND rpaths "@ORIGIN/../../openvino/libs") + endif() elseif(APPLE) - # to find libgenerate_pipeline_lib.dylib in the same folder - set(rpaths "@loader_path") - if(DEFINED SKBUILD) - # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package - list(APPEND rpaths "@loader_path/../../openvino/libs") - endif() + # to find libgenerate_pipeline_lib.dylib in the same folder + set(rpaths "@loader_path") + if(DEFINED SKBUILD) + # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package + list(APPEND rpaths "@loader_path/../../openvino/libs") + endif() endif() if(rpaths) - set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "${rpaths}") + set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "${rpaths}") endif() From 167f9244e76df906f87e0754a3ef30d6a3c67966 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 14:43:26 +0400 Subject: [PATCH 72/97] py_generate_pipeline --- src/python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index f8a7a24597..c430b165c8 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -37,5 +37,5 @@ elseif(APPLE) endif() if(rpaths) - set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "${rpaths}") + set_target_properties(py_generate_pipeline PROPERTIES INSTALL_RPATH "${rpaths}") endif() From a111a3fcef311f0d6f80f5fb30b2dfbd431a7d1e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Sat, 11 May 2024 12:23:59 +0200 Subject: [PATCH 73/97] reorder tokenizer.cpp, add comments to BaseStreamer --- .../include/openvino/genai/llm_pipeline.hpp | 43 +++-- .../include/openvino/genai/streamer_base.hpp | 13 +- src/cpp/include/openvino/genai/tokenizer.hpp | 4 +- src/cpp/src/greedy_decoding.cpp | 2 + src/cpp/src/llm_pipeline.cpp | 23 +-- src/cpp/src/text_callback_streamer.cpp | 7 +- src/cpp/src/tokenizer.cpp | 177 ++++++++---------- src/python/py_generate_pipeline.cpp | 32 +++- .../cpp/generate_pipeline/generate_sample.cpp | 3 +- 9 files changed, 168 insertions(+), 136 deletions(-) diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 47c7fcec85..c16d0ffde4 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -82,9 +82,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param streamer optional streamer * @return std::string decoded resulting text */ - std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt); - template util::EnableIfAllStringAny generate( std::string text, @@ -124,8 +123,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, - OptionalGenerationConfig generation_config, - OptionalStreamerVariant streamer); + OptionalGenerationConfig generation_config=nullopt, + OptionalStreamerVariant streamer=nullopt); template 
util::EnableIfAllStringAny operator()( @@ -133,13 +132,12 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { Properties&&... properties) { return generate(text, AnyMap{std::forward(properties)...}); } - std::string operator()(std::string text, OptionalGenerationConfig generation_config={}); - DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config); - DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config); + DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config=nullopt); + DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config=nullopt); // generate with streamers - std::string operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt); std::string operator()(std::string text, OptionalStreamerVariant streamer); ov::Tokenizer get_tokenizer(); @@ -162,10 +160,33 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * All names match to names in cofnig except streamer. */ static constexpr ov::Property max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property max_length{"max_length"}; +static constexpr ov::Property ignore_eos{"ignore_eos"}; + +static constexpr ov::Property num_beam_groups{"num_beam_groups"}; +static constexpr ov::Property num_beams{"num_beams"}; +static constexpr ov::Property diversity_penalty{"diversity_penalty"}; +static constexpr ov::Property length_penalty{"length_penalty"}; +static constexpr ov::Property num_return_sequences{"num_return_sequences"}; +static constexpr ov::Property no_repeat_ngram_size{"no_repeat_ngram_size"}; +static constexpr ov::Property stop_criteria{"stop_criteria"}; + static constexpr ov::Property temperature{"temperature"}; +static constexpr ov::Property top_p{"top_p"}; +static constexpr ov::Property top_k{"top_k"}; +static constexpr ov::Property do_sample{"do_sample"}; +static constexpr ov::Property repetition_penalty{"repetition_penalty"}; + + +static constexpr ov::Property pad_token_id{"pad_token_id"}; +static constexpr ov::Property bos_token_id{"bos_token_id"}; +static constexpr ov::Property eos_token_id{"eos_token_id"}; + +static constexpr ov::Property bos_token{"bos_token"}; +static constexpr ov::Property eos_token{"eos_token"}; -// It's problematic to store and automaticall convert std::variant in AnyMap -static constexpr ov::Property> streamer_lambda{"streamer_lambda"}; -static constexpr ov::Property> streamer{"streamer"}; +// only lambda streamer can be set via ov::streamer(),... syntaxic sugar, +// because std::variant> can not be stored in AnyMap +static constexpr ov::Property> streamer_lambda{"streamer"}; } // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 0d32f9fcda..3f0879d702 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -7,10 +7,21 @@ namespace ov { +/** + * @brief base class for streamers. 
In order to use inherit from from this class and inplement put, and methods + * + * @param m_tokenizer tokenizer +*/ class StreamerBase { public: + Tokenizer m_tokenizer; + StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}; + StreamerBase() = default; + + /// @brief put is called every time new token is decoded virtual void put(int64_t token) = 0; - + + /// @brief end is called at the end of generation. It can be used to flush cache if your own streamer has one virtual void end() = 0; }; diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 54e11eaf9f..0d55d9b0fe 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -12,7 +12,7 @@ namespace ov { /** -* @brief class used to encode prompts and decode resulting tokens +* @brief class is used to encode prompts and decode resulting tokens */ class OPENVINO_GENAI_EXPORTS Tokenizer { public: @@ -27,7 +27,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @brief encode a single prompt * @return pair of [input_ids, attention_mask] */ - std::pair encode(const std::string prompt); // todo: passing by reference fails + std::pair encode(const std::string prompt); /** * @brief encode batch of prompts. Left padding will be applied by default diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 7ea134d736..7e437ad281 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -170,6 +170,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, if (!generation_config.ignore_eos && all_are_eos) break; } + if (streamer) + streamer->end(); return results; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5ac804ade9..2e4c49337a 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -186,11 +186,6 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { return m_pimpl-> generate(texts, generation_config); } @@ -245,16 +240,11 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig return m_pimpl->generate(text, generation_config, streamer); } - std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); - - // todo: get attentions from properties? - if (config_map.count("streamer_lambda")) { - streamer = config_map.at("streamer_lambda").as>(); - } else if (config_map.count("streamer")) { - streamer = config_map.at("streamer").as>(); + if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); } return m_pimpl->generate(text, config, streamer); @@ -263,13 +253,10 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); - - // todo: get attentions from properties? 
- if (config_map.count("streamer_lambda")) { - streamer = config_map.at("streamer_lambda").as>(); - } else if (config_map.count("streamer")) { - streamer = config_map.at("streamer").as>(); + if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); } + std::optional attention_mask; return m_pimpl->generate(input_ids, attention_mask, config, streamer); } diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 6e5bd4ee8d..a1d2f3b01d 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -1,7 +1,6 @@ #include "text_callback_streamer.hpp" namespace ov { - TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { m_tokenizer = tokenizer; @@ -17,11 +16,9 @@ TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool prin void TextCallbackStreamer::put(int64_t token) { std::stringstream res; - // do not print anything and flush cache if EOS token is met - if (token == m_tokenizer.get_eos_token_id()) { - end(); + // do nothing if token is met and if print_eos_token=false + if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id()) return; - } m_tokens_cache.push_back(token); std::string text = m_tokenizer.decode(m_tokens_cache); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 6f6179baa4..a11cfb471a 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -53,58 +53,99 @@ class Tokenizer::TokenizerImpl { int64_t m_eos_token_id = 2; TokenizerImpl() = default; - TokenizerImpl(std::string tokenizers_path, const std::string device); + TokenizerImpl(std::string tokenizers_path, const std::string device) { + ov::Core core; + + if (ov::generate_utils::is_xml(tokenizers_path)) + OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); + + // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + core.add_extension(OPENVINO_TOKENIZERS_PATH); + + std::shared_ptr tokenizer_model, detokenizer_model; + try { + tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); + detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml"); + } catch (...) { + OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. 
Please check that " + "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\""); + } + m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request(); + m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request(); + + auto rt_info = tokenizer_model->get_rt_info(); + if (rt_info.count("eos_token_id") > 0) + m_eos_token_id = rt_info["eos_token_id"].as(); + if (rt_info.count("bos_token_id") > 0) + m_bos_token_id = rt_info["bos_token_id"].as(); + if (rt_info.count("pad_token_id") > 0) + m_pad_token_id = rt_info["pad_token_id"].as(); + } - std::pair encode(std::string prompt); - std::pair encode(std::vector& prompts); - std::string decode(std::vector tokens); - std::vector decode(ov::Tensor tokens); - std::vector decode(std::vector> lines); -}; + std::pair encode(std::string prompt) { + size_t batch_size = 1; + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + m_tokenize_request.infer(); + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; + } -Tokenizer::TokenizerImpl::TokenizerImpl(std::string tokenizers_path, std::string device): m_device(device) { - ov::Core core; - - if (ov::generate_utils::is_xml(tokenizers_path)) - OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); - - // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - core.add_extension(OPENVINO_TOKENIZERS_PATH); - - std::shared_ptr tokenizer_model, detokenizer_model; - try { - tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); - detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml"); - } catch (...) { - OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. 
Please check that " - "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\""); + std::pair encode(std::vector& prompts) { + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = m_tokenize_request.get_input_tensor().get_shape(); + m_tokenize_request.infer(); + + ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); + // todo: fix mask filled with '2' instead of '0' + ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } - m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request(); - m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request(); - auto rt_info = tokenizer_model->get_rt_info(); - if (rt_info.count("eos_token_id") > 0) - m_eos_token_id = rt_info["eos_token_id"].as(); - if (rt_info.count("bos_token_id") > 0) - m_bos_token_id = rt_info["bos_token_id"].as(); - if (rt_info.count("pad_token_id") > 0) - m_pad_token_id = rt_info["pad_token_id"].as(); -} + std::string decode(std::vector tokens) { + size_t batch_size = 1; + m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + m_detokenizer_request.infer(); + return m_detokenizer_request.get_output_tensor().data()[0]; + } + + std::vector decode(ov::Tensor tokens) { + m_detokenizer_request.set_input_tensor(tokens); + auto shape = tokens.get_shape(); + auto data = tokens.data(); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + + std::vector strings; + for (int i = 0; i < res.get_shape()[0]; ++i) { + strings.emplace_back(res.data()[i]); + } + return strings; + } + + std::vector decode(std::vector> lines) { + // todo: implement calling detokenizer in a single batch + std::vector results; + for (auto& line: lines){ + ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; + m_detokenizer_request.set_input_tensor(tokens); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + auto res_str = res.data()[0]; + results.emplace_back(res_str); + } + + return results; + } +}; Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { m_pimpl = std::make_shared(tokenizers_path, device); } std::pair Tokenizer::encode(const std::string prompt) { - return m_pimpl->encode(prompt); -} - -std::pair Tokenizer::TokenizerImpl::encode(std::string prompt) { - size_t batch_size = 1; - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); - m_tokenize_request.infer(); - - return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; + return m_pimpl->encode(std::move(prompt)); } std::pair Tokenizer::encode(std::vector& prompts) { @@ -115,74 +156,22 @@ std::pair Tokenizer::encode(std::vector&& p return m_pimpl->encode(prompts); } -std::pair Tokenizer::TokenizerImpl::encode(std::vector& prompts) { - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - auto size_ = m_tokenize_request.get_input_tensor().get_shape(); - m_tokenize_request.infer(); - - 
::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); - // todo: fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; -} - std::pair Tokenizer::encode(std::initializer_list& text) { return encode(std::vector(text.begin(), text.end())); } - std::string Tokenizer::decode(std::vector tokens) { return m_pimpl->decode(tokens); } -std::string Tokenizer::TokenizerImpl::decode(std::vector tokens) { - size_t batch_size = 1; - m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); - m_detokenizer_request.infer(); - return m_detokenizer_request.get_output_tensor().data()[0]; -} - std::vector Tokenizer::decode(ov::Tensor tokens) { return m_pimpl->decode(tokens); } -std::vector Tokenizer::TokenizerImpl::decode(ov::Tensor tokens) { - m_detokenizer_request.set_input_tensor(tokens); - auto shape = tokens.get_shape(); - auto data = tokens.data(); - m_detokenizer_request.infer(); - auto res = m_detokenizer_request.get_output_tensor(); - - std::vector strings; - for (int i = 0; i < res.get_shape()[0]; ++i) { - strings.emplace_back(res.data()[i]); - } - return strings; -} - std::vector Tokenizer::decode(std::vector> lines) { return m_pimpl->decode(lines); } -std::vector Tokenizer::TokenizerImpl::decode(std::vector> lines) { - // todo: implement calling detokenizer in a single batch - - std::vector results; - for (auto& line: lines){ - ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; - m_detokenizer_request.set_input_tensor(tokens); - m_detokenizer_request.infer(); - auto res = m_detokenizer_request.get_output_tensor(); - auto res_str = res.data()[0]; - results.emplace_back(res_str); - } - - return results; -} - int64_t Tokenizer::get_bos_token_id() const { return m_pimpl->m_bos_token_id; } diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 69398e1aac..74cbe7e27d 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -26,9 +26,7 @@ std::string stop_criteria_to_str(const ov::GenerationConfig& config) { } } -std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { - // Create a new GenerationConfig instance and initialize from kwargs - ov::GenerationConfig config = pipeline.get_generation_config(); +void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& kwargs) { if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); @@ -49,10 +47,21 @@ std::string call_with_config(ov::LLMPipeline& pipeline, const std::string& text, if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast(); +} +// operator() and generate methods are identical, operator() is just an alias for generate +std::string 
call_with_kwargs(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { + // Create a new GenerationConfig instance and initialize from kwargs + ov::GenerationConfig config = pipeline.get_generation_config(); + update_config_from_kwargs(config, kwargs); return pipeline(text, config); } +std::string call_with_config(ov::LLMPipeline& pipe, const std::string& text, const ov::GenerationConfig& config) { + std::shared_ptr streamer; + return pipe(text, config); +} + PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; @@ -62,7 +71,20 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) - .def("__call__", &call_with_config) + .def("__call__", py::overload_cast(&call_with_kwargs)) + .def("__call__", py::overload_cast(&call_with_config)) + .def("generate", py::overload_cast(&call_with_kwargs)) + .def("generate", py::overload_cast(&call_with_config)) + + // todo: if input_ids is a ov::Tensor/numpy tensor + // todo: implement calling generate/operator() with StreamerBase or lambda streamer + // signature to be implemented: + // EncodedResults generate(ov::Tensor input_ids, + // std::optional attention_mask, + // OptionalGenerationConfig generation_config=nullopt, + // OptionalStreamerVariant streamer=nullopt); + + .def("get_tokenizer", &LLMPipeline::get_tokenizer) .def("start_chat", &ov::LLMPipeline::start_chat) .def("finish_chat", &ov::LLMPipeline::finish_chat) @@ -75,6 +97,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "Tokenizer") .def(py::init<>()) .def(py::init(), py::arg("tokenizers_path"), py::arg("device") = "CPU") + + // todo: implement encode/decode when for numpy inputs and outputs .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") // TODO: common.h(1106...) template argument deduction/substitution failed: // .def("encode", py::overload_cast&>(&ov::Tokenizer::encode), "Encode multiple prompts") diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp index f1c7745c87..84e07c394b 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp @@ -23,8 +23,9 @@ int main(int argc, char* argv[]) { // Model, tokenizer and generation_config.json will be loaded from the model_path. 
// If generation_config.json is not found default velues for gready search will be used + // ov::streamer_lambda([](std::string subword){std::cout << subword << std::flush;}) ov::LLMPipeline pipe(model_path, device); - // cout << prompt << pipe(prompt) << endl; + // cout << prompt << pipe(prompt, ov::max_new_tokens(1000)) << endl; // todo: syntactic sugar to specify generation configs in place // cout << prompt << pipe(prompt, ov::max_new_tokens(100)) << endl; From 813d80abd08f9554f4cef0a48ab0891891ce7bf0 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 16:42:12 +0400 Subject: [PATCH 74/97] install centos7 --- .github/workflows/genai_lib.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 3127fe42b4..dbf966aa8d 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -10,11 +10,9 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Install OpenVINO - run: | - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_centos7_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install centos instead of ubuntu to match PyPI distribution ABI + - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From 6227b6576871e3a9c4d08deff93a674feaa51360 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 16:52:09 +0400 Subject: [PATCH 75/97] install nightly --- .github/workflows/genai_lib.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index dbf966aa8d..1e6e5ccc2a 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -11,9 +11,9 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_centos7_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install centos instead of ubuntu to match PyPI distribution ABI + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install centos instead of ubuntu to match PyPI distribution ABI - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - run: source ./ov/setupvars.sh && python -m pip install . + - run: source ./ov/setupvars.sh && python -m pip install --pre . 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" genai_lib_windows: From 9b83a7e84b0d781cb7d3b815bd810a0de978f1e4 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 19:05:24 +0400 Subject: [PATCH 76/97] propagate _GLIBCXX_USE_CXX11_ABI --- .github/workflows/genai_lib.yml | 2 +- src/cpp/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 1e6e5ccc2a..2519871f11 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -11,7 +11,7 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install centos instead of ubuntu to match PyPI distribution ABI + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 8bcaf8ab13..97acd8785d 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -30,6 +30,11 @@ function(ov_genai_build_jinja2cpp) set(JINJA2CPP_PIC ON CACHE BOOL "") add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) + # openvino::runtime exports _GLIBCXX_USE_CXX11_ABI=0 on CenOS7. + # It needs to be propagated to every lib GenAI links with. It's + # enough to propagate it to fmt, because fmt propagates to + # jinja2cpp. 
+ target_link_libraries(fmt PUBLIC openvino::runtime) endif() endfunction() From 2d157526cdf9967b9f856f60e46a04f4158f303c Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 19:57:15 +0400 Subject: [PATCH 77/97] Populate python with the libraries to allow skipping wheel installation --- .github/workflows/genai_lib.yml | 10 +++++++++- src/cpp/CMakeLists.txt | 14 +++++++++++--- src/python/CMakeLists.txt | 12 ++++++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 2519871f11..2e0a3c6bec 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -13,11 +13,17 @@ jobs: - run: mkdir ./ov/ - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: cmake --build ./build/ -j + - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python:$PYTHONPATH python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" genai_lib_windows: runs-on: windows-latest + defaults: + run: + shell: cmd steps: - uses: actions/checkout@v4 with: @@ -27,6 +33,8 @@ jobs: python-version: 3.8 - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip - run: unzip ov.zip + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: cmake --build ./build/ -j + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python:$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . - shell: cmd - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 97acd8785d..33f300fc51 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -31,10 +31,10 @@ function(ov_genai_build_jinja2cpp) add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) # openvino::runtime exports _GLIBCXX_USE_CXX11_ABI=0 on CenOS7. - # It needs to be propagated to every lib GenAI links with. It's - # enough to propagate it to fmt, because fmt propagates to + # It needs to be propagated to every library GenAI links with. + # It's enough to propagate to fmt, because fmt propagates to # jinja2cpp. - target_link_libraries(fmt PUBLIC openvino::runtime) + target_compile_definitions(fmt PUBLIC $) endif() endfunction() @@ -57,4 +57,12 @@ target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_j target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) + install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . 
COMPONENT genai RUNTIME DESTINATION . COMPONENT genai) + +# Populate python with the libraries to allow skipping wheel installation +add_custom_command(TARGET generate_pipeline_lib POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy + "$" + "${CMAKE_CURRENT_SOURCE_DIR}/../python/openvino/genai/$" + COMMENT "Copy generate_pipeline_lib to src/python/openvino/genai") diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 406cbe4ba1..da01b0e194 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -17,16 +17,13 @@ endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE generate_pipeline_lib) + install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT genai_python) # setting RPATH / LC_RPATH depending on platform if(LINUX) # to find libgenerate_pipeline_lib.so in the same folder set(rpaths "$ORIGIN") - if(DEFINED SKBUILD) - # in case we build pip package, we need to refer to libopenvino.so from 'openvino' package - list(APPEND rpaths "@ORIGIN/../../openvino/libs") - endif() elseif(APPLE) # to find libgenerate_pipeline_lib.dylib in the same folder set(rpaths "@loader_path") @@ -39,3 +36,10 @@ endif() if(rpaths) set_target_properties(py_generate_pipeline PROPERTIES INSTALL_RPATH "${rpaths}") endif() + +# Populate python with the libraries to allow skipping wheel installation +add_custom_command(TARGET py_generate_pipeline POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy + "$" + "${CMAKE_CURRENT_SOURCE_DIR}/openvino/genai/$" + COMMENT "Copy py_generate_pipeline to src/python/openvino/genai") From 8025554bb925ca1db8eba2902978418acb33e6e2 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 20:00:55 +0400 Subject: [PATCH 78/97] run setupvars --- .github/workflows/genai_lib.yml | 4 ++-- src/python/openvino/genai/__version__.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 src/python/openvino/genai/__version__.py diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 2e0a3c6bec..53b852a061 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -14,7 +14,7 @@ jobs: - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: cmake --build ./build/ -j + - run: source ./ov/setupvars.sh && cmake --build ./build/ -j - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python:$PYTHONPATH python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre . 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" @@ -34,7 +34,7 @@ jobs: - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: cmake --build ./build/ -j + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python:$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/src/python/openvino/genai/__version__.py b/src/python/openvino/genai/__version__.py new file mode 100644 index 0000000000..dd095d7149 --- /dev/null +++ b/src/python/openvino/genai/__version__.py @@ -0,0 +1,2 @@ +# this property will be overwritten by value from pyproject.toml +__version__ = "0.0.0.0" From 2b142865dfe19b83f1e2dc17b4d5ab62a3063960 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 20:09:57 +0400 Subject: [PATCH 79/97] update .gitignore, install numpy --- .github/workflows/genai_lib.yml | 2 ++ .gitignore | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 53b852a061..fe82d7d58a 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -15,6 +15,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ -j + - run: python -m pip install numpy<1.27 - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python:$PYTHONPATH python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" @@ -35,6 +36,7 @@ jobs: - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j + - run: python -m pip install numpy<1.27 - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python:$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . 
- run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/.gitignore b/.gitignore index ae479f4faa..931487b5e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# They are copied to python folder during the build to allow skipping wheel installation +src/python/openvino/genai/*generate_pipeline_lib* +src/python/openvino/genai/py_generate_pipeline* + # build/artifact dirs _* [Bb]uild*/ From 1c11bc73ae9734974d9c9f046450759feecc820f Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 20:14:17 +0400 Subject: [PATCH 80/97] quotes --- .github/workflows/genai_lib.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index fe82d7d58a..288ca84d84 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -15,7 +15,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ -j - - run: python -m pip install numpy<1.27 + - run: python -m pip install "numpy<1.27" - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python:$PYTHONPATH python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" @@ -36,7 +36,7 @@ jobs: - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j - - run: python -m pip install numpy<1.27 + - run: python -m pip install "numpy<1.27" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python:$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From e7fce82dd92e37f024d1497ec859af41f5f8a507 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Mon, 13 May 2024 22:07:50 +0400 Subject: [PATCH 81/97] fix PYTHONPATH --- .github/workflows/genai_lib.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 288ca84d84..25e3b1bb36 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -15,9 +15,9 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ -j - - run: python -m pip install "numpy<1.27" - - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python:$PYTHONPATH python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - - run: source ./ov/setupvars.sh && python -m pip install --pre . 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: python -m pip install openvino # Can't load CenOS libraries from the archive + - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python/ python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: source ./ov/setupvars.sh && python -m pip install --pre --upgrade . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" genai_lib_windows: @@ -37,6 +37,6 @@ jobs: - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j - run: python -m pip install "numpy<1.27" - - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python:$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python;$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From 64608d1c6669a7e3ea0e96e52fc978954aea3606 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 14 May 2024 00:13:04 +0400 Subject: [PATCH 82/97] fix PYTHONPATH --- .github/workflows/genai_lib.yml | 4 ++-- pyproject.toml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 25e3b1bb36..14db153304 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -16,7 +16,7 @@ jobs: - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ -j - run: python -m pip install openvino # Can't load CenOS libraries from the archive - - run: source ./ov/setupvars.sh && PYTHONPATH=./src/python/ python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: PYTHONPATH=./src/python/ python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre --upgrade . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" @@ -37,6 +37,6 @@ jobs: - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j - run: python -m pip install "numpy<1.27" - - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python;$PYTHONPATH && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python;%PYTHONPATH% && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . 
- run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/pyproject.toml b/pyproject.toml index 7e497ecb06..c0b38545e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,6 @@ sdist.cmake = true wheel.packages = ["src/python/openvino"] wheel.install-dir = "openvino/genai" wheel.build-tag = "000" -wheel.py-api = "" wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] [[tool.scikit-build.generate]] From 43b87c7342e4ec607ff7a828040dd82f5b6a6772 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 14 May 2024 00:41:49 +0400 Subject: [PATCH 83/97] quotes --- .github/workflows/genai_lib.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 14db153304..d9434e958f 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -14,7 +14,7 @@ jobs: - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14758-22bd6ff0494/l_openvino_toolkit_centos7_2024.1.0.dev20240315_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: source ./ov/setupvars.sh && cmake --build ./build/ -j + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - run: python -m pip install openvino # Can't load CenOS libraries from the archive - run: PYTHONPATH=./src/python/ python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre --upgrade . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly @@ -35,8 +35,8 @@ jobs: - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ -j + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - run: python -m pip install "numpy<1.27" - - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set PYTHONPATH=./src/python;%PYTHONPATH% && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set "PYTHONPATH=./src/python;%PYTHONPATH%" && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . 
- run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" From fef967421dca97aea36b18112fe3a8105e287298 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 14 May 2024 11:38:12 +0400 Subject: [PATCH 84/97] reorder vars --- .github/workflows/genai_lib.yml | 2 +- src/cpp/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index d9434e958f..582c91603a 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -37,6 +37,6 @@ jobs: - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - run: python -m pip install "numpy<1.27" - - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && set "PYTHONPATH=./src/python;%PYTHONPATH%" && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 33f300fc51..7b27f28399 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -30,7 +30,7 @@ function(ov_genai_build_jinja2cpp) set(JINJA2CPP_PIC ON CACHE BOOL "") add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) - # openvino::runtime exports _GLIBCXX_USE_CXX11_ABI=0 on CenOS7. + # openvino::runtime exports _GLIBCXX_USE_CXX11_ABI=0 on CentOS7. # It needs to be propagated to every library GenAI links with. # It's enough to propagate to fmt, because fmt propagates to # jinja2cpp. 
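For reference, the C++ surface that PATCH 73 converges on (the variadic operator() that packs ov::Property values into an ov::AnyMap, plus the lambda streamer exposed as ov::streamer_lambda) can be exercised roughly as in the sketch below. It is assembled only from the headers and the commented-out lines in generate_sample.cpp shown above, not from a finished release: the prompt text, the "CPU" device string and the argv handling are assumptions, and the exact property spellings keep changing later in the series.

// Usage sketch only; see the note above about assumed names.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];          // directory holding the model, tokenizers and (optionally) generation_config.json
    ov::LLMPipeline pipe(model_path, "CPU");   // generation config is picked up from the directory when present

    // Each property becomes a key/value pair in the AnyMap built by operator();
    // the "streamer" entry is read back inside generate() and the lambda is
    // called with every decoded subword as it is produced.
    std::string result = pipe(
        "The Sun is yellow because",
        ov::max_new_tokens(100),
        ov::streamer_lambda([](std::string subword) { std::cout << subword << std::flush; }));
    std::cout << '\n' << result << '\n';
    return 0;
}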
From b21286c348f496b8032c3bced56877d04ef5d211 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Tue, 14 May 2024 12:44:59 +0400 Subject: [PATCH 85/97] openvino.genai- --- .github/workflows/genai_lib.yml | 8 +-- pyproject.toml | 6 +- src/cpp/CMakeLists.txt | 4 +- src/python/CMakeLists.txt | 6 +- src/python/openvino/__init__.py | 70 ------------------- .../genai => openvino_genai}/__init__.py | 0 .../genai => openvino_genai}/__version__.py | 0 7 files changed, 12 insertions(+), 82 deletions(-) delete mode 100644 src/python/openvino/__init__.py rename src/python/{openvino/genai => openvino_genai}/__init__.py (100%) rename src/python/{openvino/genai => openvino_genai}/__version__.py (100%) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 582c91603a..57e1d8d36f 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -16,9 +16,9 @@ jobs: - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - run: python -m pip install openvino # Can't load CenOS libraries from the archive - - run: PYTHONPATH=./src/python/ python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: PYTHONPATH=./src/python/ python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre --upgrade . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" genai_lib_windows: runs-on: windows-latest @@ -37,6 +37,6 @@ jobs: - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - run: python -m pip install "numpy<1.27" - - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat && python -m pip install . 
- - run: python -c "from openvino.genai.py_generate_pipeline import LLMPipeline" + - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" diff --git a/pyproject.toml b/pyproject.toml index c0b38545e2..007dcb11f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,13 +25,13 @@ cmake.build-type = "Release" cmake.targets = ["py_generate_pipeline", "generate_pipeline_lib"] install.components = ["genai", "genai_python"] sdist.cmake = true -wheel.packages = ["src/python/openvino"] -wheel.install-dir = "openvino/genai" +wheel.packages = ["src/python/openvino_genai"] +wheel.install-dir = "openvino_genai" wheel.build-tag = "000" wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] [[tool.scikit-build.generate]] -path = "openvino/genai/__version__.py" +path = "openvino_genai/__version__.py" template = ''' __version__ = "${version}" ''' diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 7b27f28399..d706d32356 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -64,5 +64,5 @@ install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . COMPONENT genai RUNTIME DES add_custom_command(TARGET generate_pipeline_lib POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy "$" - "${CMAKE_CURRENT_SOURCE_DIR}/../python/openvino/genai/$" - COMMENT "Copy generate_pipeline_lib to src/python/openvino/genai") + "${CMAKE_CURRENT_SOURCE_DIR}/../python/openvino_genai/$" + COMMENT "Copy generate_pipeline_lib to src/python/openvino_genai") diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index da01b0e194..b73950e828 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -29,7 +29,7 @@ elseif(APPLE) set(rpaths "@loader_path") if(DEFINED SKBUILD) # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package - list(APPEND rpaths "@loader_path/../../openvino/libs") + list(APPEND rpaths "@loader_path/../openvino/libs") endif() endif() @@ -41,5 +41,5 @@ endif() add_custom_command(TARGET py_generate_pipeline POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy "$" - "${CMAKE_CURRENT_SOURCE_DIR}/openvino/genai/$" - COMMENT "Copy py_generate_pipeline to src/python/openvino/genai") + "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/$" + COMMENT "Copy py_generate_pipeline to src/python/openvino_genai/") diff --git a/src/python/openvino/__init__.py b/src/python/openvino/__init__.py deleted file mode 100644 index 24a0ee92ec..0000000000 --- a/src/python/openvino/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -__path__ = __import__("pkgutil").extend_path(__path__, __name__) - -# Required for Windows OS platforms -# Note: always top-level -try: - from openvino.utils import _add_openvino_libs_to_search_path - _add_openvino_libs_to_search_path() -except ImportError: - pass - -# # -# # OpenVINO API -# # This __init__.py forces checking of runtime modules to propagate errors. -# # It is not compared with init files from openvino-dev package. 
-# # -# Import all public modules -from openvino import runtime as runtime -from openvino import frontend as frontend -from openvino import helpers as helpers -from openvino import preprocess as preprocess -from openvino import utils as utils -from openvino import properties as properties - -# Import most important classes and functions from openvino.runtime -from openvino.runtime import Model -from openvino.runtime import Core -from openvino.runtime import CompiledModel -from openvino.runtime import InferRequest -from openvino.runtime import AsyncInferQueue - -from openvino.runtime import Symbol -from openvino.runtime import Dimension -from openvino.runtime import Strides -from openvino.runtime import PartialShape -from openvino.runtime import Shape -from openvino.runtime import Layout -from openvino.runtime import Type -from openvino.runtime import Tensor -from openvino.runtime import OVAny - -from openvino.runtime import compile_model -from openvino.runtime import get_batch -from openvino.runtime import set_batch -from openvino.runtime import serialize -from openvino.runtime import shutdown -from openvino.runtime import tensor_from_file -from openvino.runtime import save_model -from openvino.runtime import layout_helpers - -from openvino._pyopenvino import RemoteContext -from openvino._pyopenvino import RemoteTensor - -# libva related: -from openvino._pyopenvino import VAContext -from openvino._pyopenvino import VASurfaceTensor - -# Set version for openvino package -from openvino.runtime import get_version -__version__ = get_version() - -# Tools -try: - # Model Conversion API - ovc should reside in the main namespace - from openvino.tools.ovc import convert_model -except ImportError: - pass diff --git a/src/python/openvino/genai/__init__.py b/src/python/openvino_genai/__init__.py similarity index 100% rename from src/python/openvino/genai/__init__.py rename to src/python/openvino_genai/__init__.py diff --git a/src/python/openvino/genai/__version__.py b/src/python/openvino_genai/__version__.py similarity index 100% rename from src/python/openvino/genai/__version__.py rename to src/python/openvino_genai/__version__.py From 11e872b0df0b3e9dea53d0f0651d8296511bf9c4 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 14 May 2024 13:33:51 +0200 Subject: [PATCH 86/97] Update CMakeLists.txt --- text_generation/causal_lm/cpp/CMakeLists.txt | 69 +++++++++----------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 87cbbda618..26277c1405 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,28 +4,26 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -set(TARGET_NAME greedy_causal_lm) -add_executable(${TARGET_NAME} greedy_causal_lm.cpp) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") + +add_executable(greedy_causal_lm greedy_causal_lm.cpp) +target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -set(TARGET_NAME 
beam_search_causal_lm) -add_executable(${TARGET_NAME} beam_search_causal_lm.cpp) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) +set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) + +add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(beam_search_causal_lm PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -set(TARGET_NAME speculative_decoding_lm) -add_executable(${TARGET_NAME} speculative_decoding_lm.cpp) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) +set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) + +add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(speculative_decoding_lm PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) @@ -33,10 +31,9 @@ set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED O find_package(TBB REQUIRED COMPONENTS tbb) target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) -set(TARGET_NAME prompt_lookup_decoding_lm) -add_executable(${TARGET_NAME} prompt_lookup_decoding_lm.cpp) -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(${TARGET_NAME} PRIVATE ./) +add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) +target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_include_directories(prompt_lookup_decoding_lm PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) @@ -44,16 +41,14 @@ set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED find_package(TBB REQUIRED COMPONENTS tbb) target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) -set(TARGET_NAME generate_sample) -add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) - -set(TARGET_NAME chat_sample) -add_executable(${TARGET_NAME} generate_pipeline/chat_sample.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE generate_pipeline_lib) -target_include_directories(${TARGET_NAME} 
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) +add_executable(generate_sample generate_pipeline/generate_sample.cpp) +target_link_libraries(generate_sample PRIVATE generate_pipeline_lib) +target_include_directories(generate_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set_target_properties(generate_sample PROPERTIES CXX_STANDARD 17) +set_target_properties(generate_sample PROPERTIES CXX_STANDARD_REQUIRED ON) + +add_executable(chat_sample generate_pipeline/chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE generate_pipeline_lib) +target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17) +set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON) From 442dcbfc9203b4df7f10a25fdab2cd0e4e10b171 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 13 May 2024 16:47:00 +0200 Subject: [PATCH 87/97] move group beam searcher to src --- .../cpp/src/group_beam_searcher.cpp | 86 +++++------ src/cpp/src/group_beam_searcher.hpp | 69 +++++++++ text_generation/causal_lm/cpp/CMakeLists.txt | 19 ++- .../causal_lm/cpp/greedy_causal_lm.cpp | 137 +++--------------- 4 files changed, 139 insertions(+), 172 deletions(-) rename text_generation/causal_lm/cpp/group_beam_searcher.hpp => src/cpp/src/group_beam_searcher.cpp (86%) create mode 100644 src/cpp/src/group_beam_searcher.hpp diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.cpp similarity index 86% rename from text_generation/causal_lm/cpp/group_beam_searcher.hpp rename to src/cpp/src/group_beam_searcher.cpp index 6cc90386df..f9f3f3b17b 100644 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -1,8 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#pragma once #include +#include "group_beam_searcher.hpp" // Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { @@ -45,11 +45,12 @@ std::vector kmp_search(const std::vector& haystack, const std: return res; } -struct Token { - float log_prob; - int64_t idx; -}; +// struct Token { +// float log_prob; +// int64_t idx; +// }; +namespace { std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx) { if (logits.get_shape().at(0) <= batch_idx) { throw std::runtime_error("logits batch size doesn't match the number of beams"); @@ -70,12 +71,13 @@ std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx) } return tokens; } +} -struct Beam { - float score = -std::numeric_limits::infinity(); // The bigger, the better - std::vector tokens; - size_t global_beam_idx = 0; -}; +// struct Beam { +// float score = -std::numeric_limits::infinity(); // The bigger, the better +// std::vector tokens; +// size_t global_beam_idx = 0; +// }; bool greater(const Beam& left, const Beam& right) { return left.score > right.score; @@ -83,28 +85,28 @@ bool greater(const Beam& left, const Beam& right) { enum class StopCriteria { early, heuristic, never }; -struct Parameters { - std::vector> prompts; - int64_t eos_token; - size_t n_groups = 3; - size_t group_size = 5; - float diversity_penalty = 1.0; - size_t max_new_tokens = 20; - StopCriteria stop_criteria = StopCriteria::heuristic; - float 
length_penalty = 1.0; - size_t no_repeat_ngram_size = std::numeric_limits::max(); +// struct Parameters { +// std::vector> prompts; +// int64_t eos_token; +// size_t n_groups = 3; +// size_t group_size = 5; +// float diversity_penalty = 1.0; +// size_t max_new_tokens = 20; +// ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; +// float length_penalty = 1.0; +// size_t no_repeat_ngram_size = std::numeric_limits::max(); - std::function early_finish = [](const Beam&) { - return false; - }; -}; +// std::function early_finish = [](const Beam&) { +// return false; +// }; +// }; -struct Group { - std::vector ongoing; // Best beams in front - std::vector min_heap; // The worst of the best completed beams is the first - bool done = false; +// struct Group { +// std::vector ongoing; // Best beams in front +// std::vector min_heap; // The worst of the best completed beams is the first +// bool done = false; - void finish(Beam&& beam, const Parameters& parameters) { + void Group::finish(Beam&& beam, const Parameters& parameters) { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); // HF implementation counts eos_token for length penalty calculation @@ -119,7 +121,7 @@ struct Group { min_heap.pop_back(); } } - void is_done(const Parameters& parameters) { + void Group::is_done(const Parameters& parameters) { if (min_heap.size() < parameters.group_size) { return; } @@ -127,15 +129,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case StopCriteria::early: + case ov::StopCriteria::early: done = true; return; - case StopCriteria::heuristic: { + case ov::StopCriteria::heuristic: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case StopCriteria::never: { + case ov::StopCriteria::never: { size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; @@ -145,16 +147,16 @@ struct Group { throw std::runtime_error("Never reached"); } } -}; +// }; // GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values // are used for next inference. 
select_next_tokens() returns empty, if all groups are completed -struct GroupBeamSearcher { - Parameters parameters; - std::vector> prompts_groups; +// struct GroupBeamSearcher { + // Parameters parameters; + // std::vector> prompts_groups; - GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { + GroupBeamSearcher::GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { if (parameters.no_repeat_ngram_size == 0) { throw std::runtime_error("no_repeat_ngram_size must be positive"); } @@ -167,7 +169,7 @@ struct GroupBeamSearcher { } } - std::pair, std::vector> select_next_tokens(const ov::Tensor& logits) { + std::pair, std::vector> GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits) { std::vector next_tokens; std::vector next_beams; @@ -210,7 +212,7 @@ struct GroupBeamSearcher { return {next_tokens, next_beams}; } - std::pair, std::vector> select_prompt_next_tokens(const ov::Tensor& logits, + std::pair, std::vector> GroupBeamSearcher::select_prompt_next_tokens(const ov::Tensor& logits, const std::vector& prompt, std::vector& groups) { std::vector next_tokens; @@ -225,7 +227,7 @@ struct GroupBeamSearcher { std::vector candidates; candidates.reserve(parameters.group_size * 2 * parameters.group_size); for (const Beam& beam : group->ongoing) { - std::vector tokens = log_softmax(logits, beam.global_beam_idx); + std::vector tokens = ::log_softmax(logits, beam.global_beam_idx); for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) { for (const Beam& prev_beam : prev_group->ongoing) { if (prev_beam.tokens.size() > beam.tokens.size()) { @@ -291,7 +293,7 @@ struct GroupBeamSearcher { } return {next_tokens, next_beams}; } -}; +// }; // Consume group_beam_searcher because beams are consumed std::vector>> finalize(GroupBeamSearcher&& group_beam_searcher) { diff --git a/src/cpp/src/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.hpp new file mode 100644 index 0000000000..ae446813e2 --- /dev/null +++ b/src/cpp/src/group_beam_searcher.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include +#include "openvino/genai/generation_config.hpp" + +// Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack +std::vector kmp_search(const std::vector& haystack, const std::vector& needle); + +struct Token { + float log_prob; + int64_t idx; +}; + +// std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx); + +struct Beam { + float score = -std::numeric_limits::infinity(); // The bigger, the better + std::vector tokens; + size_t global_beam_idx = 0; +}; + +bool greater(const Beam& left, const Beam& right); + +// enum class StopCriteria { early, heuristic, never }; + +struct Parameters { + std::vector> prompts; + int64_t eos_token; + size_t n_groups = 3; + size_t group_size = 5; + float diversity_penalty = 1.0; + size_t max_new_tokens = 20; + ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; + float length_penalty = 1.0; + size_t no_repeat_ngram_size = std::numeric_limits::max(); + + std::function early_finish = [](const Beam&) { + return false; + }; +}; + +struct Group { + std::vector ongoing; // Best beams in front + std::vector min_heap; // The worst of the best completed beams is the first + bool done = false; + + void finish(Beam&& beam, const Parameters& parameters); + + void is_done(const Parameters& 
parameters); +}; + + +struct GroupBeamSearcher { + Parameters parameters; + std::vector> prompts_groups; + + GroupBeamSearcher(Parameters parameters); + + std::pair, std::vector> select_next_tokens(const ov::Tensor& logits); + + std::pair, std::vector> select_prompt_next_tokens(const ov::Tensor& logits, + const std::vector& prompt, + std::vector& groups); +}; + +// Consume group_beam_searcher because beams are consumed +std::vector>> finalize(GroupBeamSearcher&& group_beam_searcher); diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 26277c1405..efff680ca4 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,22 +4,21 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +# add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_executable(greedy_causal_lm greedy_causal_lm.cpp) target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) +target_link_libraries(greedy_causal_lm PRIVATE generate_pipeline_lib) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(beam_search_causal_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +# add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +# target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +# target_include_directories(beam_search_causal_lm PRIVATE ./) +# find_package(OpenVINO REQUIRED COMPONENTS Runtime) +# target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) +# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) +# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp index d75d32d0e0..7b1dde4dc8 100644 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp @@ -1,129 +1,26 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include - -namespace { -std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - constexpr size_t BATCH_SIZE = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, 
tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; -} +#include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (argc != 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); - } - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - //Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = core.compile_model( - std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - auto seq_len = input_ids.get_size(); - - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + seq_len, 0); - constexpr size_t BATCH_SIZE = 1; - // Input values are persistent between inference calls. 
- // That allows to set values, which aren't going to change, only once - lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - lm.get_tensor("beam_idx").data()[0] = 0; - lm.infer(); - size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - position_ids.set_shape({BATCH_SIZE, 1}); - TextStreamer text_streamer{std::move(detokenizer)}; - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; + if (3 > argc || argc > 4) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); - if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - int max_sequence_length = 100; - while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { - ++seq_len; - lm.get_tensor("input_ids").data()[0] = out_token; - lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, seq_len}); - std::fill_n(lm.get_tensor("attention_mask").data(), seq_len, 1); - position_ids.data()[0] = int64_t(seq_len - 1); - lm.start_async(); - text_streamer.put(out_token); - lm.wait(); - logits = lm.get_tensor("logits").data(); - out_token = std::max_element(logits, logits + vocab_size) - logits; - } - text_streamer.end(); - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. 
- // While it is not required to reset context in this sample as only one sequence is processed, - // it is called for education purposes: - lm.reset_state(); + std::string model_path = argv[1]; + std::string prompt = argv[2]; + + // GPU can be used as well + std::string device = "CPU"; + if (argc > 3) device = argv[3]; + + ov::LLMPipeline pipe(model_path, device); + ov::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 100; + auto streamer = [](std::string subword){std::cout << subword << std::flush;}; + + // since streamer is set results will be printed each time a new token is generated + pipe.generate(prompt, config, streamer); } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return EXIT_FAILURE; From 53d534edc36fa3d44d2ae0f73eef14c7afdf40ac Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 15 May 2024 08:47:43 +0400 Subject: [PATCH 88/97] Update .gitignore (#5) * Update .gitignore * spelling --- .github/workflows/genai_lib.yml | 2 +- .gitignore | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/genai_lib.yml b/.github/workflows/genai_lib.yml index 57e1d8d36f..5b8d6db3fe 100644 --- a/.github/workflows/genai_lib.yml +++ b/.github/workflows/genai_lib.yml @@ -15,7 +15,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: python -m pip install openvino # Can't load CenOS libraries from the archive + - run: python -m pip install openvino # Can't load CentOS libraries from the archive - run: PYTHONPATH=./src/python/ python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" - run: source ./ov/setupvars.sh && python -m pip install --pre --upgrade . 
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" diff --git a/.gitignore b/.gitignore index 931487b5e9..1546c18f71 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # They are copied to python folder during the build to allow skipping wheel installation -src/python/openvino/genai/*generate_pipeline_lib* -src/python/openvino/genai/py_generate_pipeline* +src/python/openvino_genai/*generate_pipeline_lib* +src/python/openvino_genai/py_generate_pipeline* # build/artifact dirs _* From 72c045eed0513aacd41fc9498b49b1391fc96fe5 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 10:52:19 +0200 Subject: [PATCH 89/97] fixed difference between old greddy sample and generate --- .../include/openvino/genai/llm_pipeline.hpp | 1 + src/cpp/src/beam_search_decoding.cpp | 91 ------- src/cpp/src/greedy_decoding.cpp | 59 +---- src/cpp/src/group_beam_searcher.cpp | 218 ++++++++++++---- src/cpp/src/group_beam_searcher.hpp | 67 +---- src/cpp/src/llm_pipeline.cpp | 11 +- src/cpp/src/utils.cpp | 82 ++++++ src/cpp/src/utils.hpp | 6 + src/tests/python_tests/test_greedy.py | 4 +- text_generation/causal_lm/cpp/CMakeLists.txt | 11 +- .../causal_lm/cpp/beam_search_causal_lm.cpp | 239 ++---------------- 11 files changed, 298 insertions(+), 491 deletions(-) delete mode 100644 src/cpp/src/beam_search_decoding.cpp diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index c16d0ffde4..5c3e23aa7e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -109,6 +109,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @return DecodedResults a structure with resulting texts & scores */ DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); + DecodedResults generate(std::initializer_list text, OptionalGenerationConfig generation_config); /** * @brief Low level generate to be called with already encoded input_ids tokens. 
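Editor's note (not part of the patches): the hunk above adds an initializer_list overload of LLMPipeline::generate(), and the patches that follow route beam search through GenerationConfig. A minimal sketch of how a caller might exercise that interface, assuming only the LLMPipeline and GenerationConfig signatures shown in these patches; the model directory is a placeholder, not a path from the patch series:

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Hypothetical model directory exported to OpenVINO IR; replace with a real one.
    ov::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0/", "CPU");

    ov::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 20;
    config.num_beam_groups = 3;
    config.num_beams = 15;            // must be divisible by num_beam_groups
    config.diversity_penalty = 1.0f;

    // The initializer_list overload added above accepts several prompts in one call
    // and returns DecodedResults (texts plus scores).
    ov::DecodedResults results = pipe.generate(
        {"Hello, my name is", "The Sun is yellow because"}, config);
}

This is a sketch under those assumptions, not a definitive sample from the repository.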
diff --git a/src/cpp/src/beam_search_decoding.cpp b/src/cpp/src/beam_search_decoding.cpp deleted file mode 100644 index 0b80c47e53..0000000000 --- a/src/cpp/src/beam_search_decoding.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "generation_config_helper.hpp" -#include "openvino/genai/llm_pipeline.hpp" -#include "group_beam_searcher.hpp" - -namespace ov { - -EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config) { - GenerationConfigHelper config_helper = config; - - ov::Shape prompts_shape = prompts.get_shape(); - size_t batch_size = prompts_shape[0]; - // todo: implement for batch > 1 - OPENVINO_ASSERT(batch_size == 1); - - // initialize inputs - auto attention_mask = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - auto position_ids = ov::Tensor{ov::element::i64, prompts.get_shape()}; - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - auto prompt_len = prompts.get_shape()[1]; - - model_runner.set_tensor("input_ids", prompts); - model_runner.set_tensor("attention_mask", attention_mask); - model_runner.set_tensor("position_ids", position_ids); - - // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - model_runner.get_tensor("beam_idx").set_shape({batch_size}); - model_runner.get_tensor("beam_idx").data()[0] = 0; - - const int64_t* prompt_data = prompts.data(); - - // todo: remove this duplication and use the same SamplingParameters for both greedy and beam - Parameters parameters{{std::vector{prompt_data, prompt_data + prompts.get_size()}}}; - parameters.n_groups = config.num_beam_groups; - parameters.diversity_penalty = config.diversity_penalty; - parameters.group_size = config.num_beams / config.num_beam_groups; - OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); - - - GroupBeamSearcher group_beam_searcher{parameters}; - std::vector next_tokens; - std::vector next_beams; - for (size_t length_count = 0; length_count < config_helper.get_max_new_tokens(prompt_len); ++length_count) { - model_runner.infer(); - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(model_runner.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - model_runner.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - model_runner.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - ov::Tensor attention_mask = model_runner.get_tensor("attention_mask"); - ov::Shape mask_shape{batch_size, attention_mask.get_shape()[1] + 1}; - attention_mask.set_shape(mask_shape); - std::fill_n(attention_mask.data(), ov::shape_size(mask_shape), 1); - - model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(model_runner.get_tensor("position_ids").data(), batch_size, mask_shape[1] - 1); - - // todo: pass streamer here - // m_streamer.put(token_iter_results[0]); - - } - - std::vector beams; - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - for (const std::vector group : prompt_group) { - for (const Beam& beam : group) { - beams.emplace_back(beam); - } - } - } - - auto compare_scores = [](Beam left, Beam right) { return 
(left.score > right.score); }; - std::sort(beams.begin(), beams.end(), compare_scores); - - ov::EncodedResults results; - for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { - // todo: convert to string - results.scores.emplace_back(beam->score); - results.tokens.emplace_back(beam->tokens); - } - return results; -} - -} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 7e437ad281..3298553a76 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -5,59 +5,6 @@ #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" -namespace { - -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask); -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); -ov::Tensor extend_attention(ov::Tensor attention_mask); - -void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape()[0]; - const size_t atten_length = attention_mask.get_shape()[1]; - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * atten_length; - position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); - } -} - -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) { - const size_t batch_size = attention_mask.get_shape()[0]; - const size_t seq_length = attention_mask.get_shape()[1]; - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - size_t sum = start_pos; - for (size_t i = 0; i < seq_length; i++) { - const size_t element_offset = batch * seq_length + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -ov::Tensor extend_attention(ov::Tensor attention_mask) { - auto shape = attention_mask.get_shape(); - auto batch_size = shape[0]; - auto seq_len = shape[1]; - - ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; - auto old_data = attention_mask.data(); - auto new_data = new_atten_mask.data(); - for (size_t batch = 0; batch < batch_size; ++batch) { - std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); - new_data[batch * (seq_len + 1) + seq_len] = 1; - } - return new_atten_mask; -} - -} - namespace ov { ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, @@ -73,7 +20,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, // todo: make this work even if position_ids are not specified auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - initialize_position_ids(position_ids, attention_mask, kv_cache_len); + generate_utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); ov::EncodedResults results; results.scores.resize(batch_size); @@ -139,8 +86,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, return results; for (size_t i = 0; i < max_tokens - 1; ++i) { - update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask")); - m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask"))); + 
generate_utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", generate_utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index f9f3f3b17b..1e27f36a0a 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -3,6 +3,10 @@ #include #include "group_beam_searcher.hpp" +#include "generation_config_helper.hpp" +#include "utils.hpp" + +namespace { // Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { @@ -45,12 +49,11 @@ std::vector kmp_search(const std::vector& haystack, const std: return res; } -// struct Token { -// float log_prob; -// int64_t idx; -// }; +struct Token { + float log_prob; + int64_t idx; +}; -namespace { std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx) { if (logits.get_shape().at(0) <= batch_idx) { throw std::runtime_error("logits batch size doesn't match the number of beams"); @@ -71,42 +74,39 @@ std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx) } return tokens; } -} -// struct Beam { -// float score = -std::numeric_limits::infinity(); // The bigger, the better -// std::vector tokens; -// size_t global_beam_idx = 0; -// }; +struct Beam { + float score = -std::numeric_limits::infinity(); // The bigger, the better + std::vector tokens; + size_t global_beam_idx = 0; +}; bool greater(const Beam& left, const Beam& right) { return left.score > right.score; } -enum class StopCriteria { early, heuristic, never }; - -// struct Parameters { -// std::vector> prompts; -// int64_t eos_token; -// size_t n_groups = 3; -// size_t group_size = 5; -// float diversity_penalty = 1.0; -// size_t max_new_tokens = 20; -// ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; -// float length_penalty = 1.0; -// size_t no_repeat_ngram_size = std::numeric_limits::max(); - -// std::function early_finish = [](const Beam&) { -// return false; -// }; -// }; - -// struct Group { -// std::vector ongoing; // Best beams in front -// std::vector min_heap; // The worst of the best completed beams is the first -// bool done = false; - - void Group::finish(Beam&& beam, const Parameters& parameters) { +struct Parameters { + std::vector> prompts; + int64_t eos_token; + size_t n_groups = 3; + size_t group_size = 5; + float diversity_penalty = 1.0; + size_t max_new_tokens = 20; + ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; + float length_penalty = 1.0; + size_t no_repeat_ngram_size = std::numeric_limits::max(); + + std::function early_finish = [](const Beam&) { + return false; + }; +}; + +struct Group { + std::vector ongoing; // Best beams in front + std::vector min_heap; // The worst of the best completed beams is the first + bool done = false; + + void finish(Beam&& beam, const Parameters& parameters) { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); // HF implementation counts eos_token for length penalty calculation @@ -115,13 +115,13 @@ enum class StopCriteria { early, heuristic, never }; } min_heap.push_back(std::move(beam)); - std::push_heap(min_heap.begin(), min_heap.end(), ::greater); + std::push_heap(min_heap.begin(), 
min_heap.end(), greater); if (min_heap.size() > parameters.group_size) { - std::pop_heap(min_heap.begin(), min_heap.end(), ::greater); + std::pop_heap(min_heap.begin(), min_heap.end(), greater); min_heap.pop_back(); } } - void Group::is_done(const Parameters& parameters) { + void is_done(const Parameters& parameters) { if (min_heap.size() < parameters.group_size) { return; } @@ -147,16 +147,16 @@ enum class StopCriteria { early, heuristic, never }; throw std::runtime_error("Never reached"); } } -// }; +}; // GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values // are used for next inference. select_next_tokens() returns empty, if all groups are completed -// struct GroupBeamSearcher { - // Parameters parameters; - // std::vector> prompts_groups; +struct GroupBeamSearcher { + Parameters parameters; + std::vector> prompts_groups; - GroupBeamSearcher::GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { + GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { if (parameters.no_repeat_ngram_size == 0) { throw std::runtime_error("no_repeat_ngram_size must be positive"); } @@ -169,7 +169,7 @@ enum class StopCriteria { early, heuristic, never }; } } - std::pair, std::vector> GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits) { + std::pair, std::vector> select_next_tokens(const ov::Tensor& logits) { std::vector next_tokens; std::vector next_beams; @@ -212,7 +212,7 @@ enum class StopCriteria { early, heuristic, never }; return {next_tokens, next_beams}; } - std::pair, std::vector> GroupBeamSearcher::select_prompt_next_tokens(const ov::Tensor& logits, + std::pair, std::vector> select_prompt_next_tokens(const ov::Tensor& logits, const std::vector& prompt, std::vector& groups) { std::vector next_tokens; @@ -227,7 +227,7 @@ enum class StopCriteria { early, heuristic, never }; std::vector candidates; candidates.reserve(parameters.group_size * 2 * parameters.group_size); for (const Beam& beam : group->ongoing) { - std::vector tokens = ::log_softmax(logits, beam.global_beam_idx); + std::vector tokens = log_softmax(logits, beam.global_beam_idx); for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) { for (const Beam& prev_beam : prev_group->ongoing) { if (prev_beam.tokens.size() > beam.tokens.size()) { @@ -267,7 +267,7 @@ enum class StopCriteria { early, heuristic, never }; throw std::runtime_error("No beams left to search"); } auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size); - std::partial_sort(candidates.begin(), to_sort, candidates.end(), ::greater); + std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { @@ -293,7 +293,7 @@ enum class StopCriteria { early, heuristic, never }; } return {next_tokens, next_beams}; } -// }; +}; // Consume group_beam_searcher because beams are consumed std::vector>> finalize(GroupBeamSearcher&& group_beam_searcher) { @@ -316,3 +316,125 @@ std::vector>> finalize(GroupBeamSearcher&& group_b return finalized; } + +void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + 
request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + ov::generate_utils::initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + + +void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t sequence_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* mask_start = attention_mask.data() + batch * sequence_length; + position_ids.data()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); + } +} + +} // namespace + + +namespace ov { + +EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig config) { + GenerationConfigHelper config_helper = config; + OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); + + // Initialize beam search + const int64_t* prompt_data = input_ids.data(); + std::vector> prompts; + prompts.reserve(input_ids.get_shape().at(0)); + for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { + size_t sequence_length = input_ids.get_shape().at(1); + size_t batch_offset = batch * sequence_length; + const int64_t* prompt_start = prompt_data + batch_offset; + prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); + } + + initialize_inputs(input_ids, attention_mask, lm); + + Parameters parameters{std::move(prompts)}; + parameters.max_new_tokens = config.max_new_tokens; + parameters.eos_token = config.eos_token_id; + parameters.n_groups = config.num_beam_groups; + parameters.group_size = config.num_beams / config.num_beam_groups; + parameters.diversity_penalty = config.diversity_penalty; + parameters.length_penalty = config.length_penalty; + parameters.stop_criteria = config.stop_criteria; + parameters.no_repeat_ngram_size = config.no_repeat_ngram_size; + GroupBeamSearcher group_beam_searcher{parameters}; + + std::vector next_tokens; + std::vector next_beams; + + for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + lm.infer(); + + std::tie(next_tokens, next_beams) = 
group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); + update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + } + + std::vector beams; + for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + for (const std::vector group : prompt_group) { + for (const Beam& beam : group) { + beams.emplace_back(beam); + } + } + } + + // return sorted scores + auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; + std::sort(beams.begin(), beams.end(), compare_scores); + + ov::EncodedResults results; + for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { + results.scores.emplace_back(beam->score); + results.tokens.emplace_back(beam->tokens); + } + return results; +} + +} // namespace ov diff --git a/src/cpp/src/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.hpp index ae446813e2..91f3ef4096 100644 --- a/src/cpp/src/group_beam_searcher.hpp +++ b/src/cpp/src/group_beam_searcher.hpp @@ -1,69 +1,12 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 + #pragma once #include #include "openvino/genai/generation_config.hpp" +#include "openvino/genai/llm_pipeline.hpp" -// Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack -std::vector kmp_search(const std::vector& haystack, const std::vector& needle); - -struct Token { - float log_prob; - int64_t idx; -}; - -// std::vector log_softmax(const ov::Tensor& logits, const size_t batch_idx); - -struct Beam { - float score = -std::numeric_limits::infinity(); // The bigger, the better - std::vector tokens; - size_t global_beam_idx = 0; -}; - -bool greater(const Beam& left, const Beam& right); - -// enum class StopCriteria { early, heuristic, never }; - -struct Parameters { - std::vector> prompts; - int64_t eos_token; - size_t n_groups = 3; - size_t group_size = 5; - float diversity_penalty = 1.0; - size_t max_new_tokens = 20; - ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; - float length_penalty = 1.0; - size_t no_repeat_ngram_size = std::numeric_limits::max(); - - std::function early_finish = [](const Beam&) { - return false; - }; -}; - -struct Group { - std::vector ongoing; // Best beams in front - std::vector min_heap; // The worst of the best completed beams is the first - bool done = false; - - void finish(Beam&& beam, const Parameters& parameters); - - void is_done(const Parameters& parameters); -}; - - -struct GroupBeamSearcher { - Parameters parameters; - std::vector> prompts_groups; - - GroupBeamSearcher(Parameters parameters); - - std::pair, std::vector> select_next_tokens(const ov::Tensor& logits); - - std::pair, std::vector> select_prompt_next_tokens(const ov::Tensor& logits, - const std::vector& prompt, - std::vector& groups); -}; - -// Consume group_beam_searcher because beams are consumed -std::vector>> finalize(GroupBeamSearcher&& group_beam_searcher); +namespace ov { + EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); +} 
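Editor's note (not part of the patches): the beam_search() implementation above relies on the mask/position bookkeeping helpers (initialize_position_ids, update_position_ids, extend_attention) that the next patch hunks move into utils.cpp. A self-contained sketch of the invariant they maintain, using plain OpenVINO tensors and assuming only the public ov::Tensor API; it is illustrative, not code taken from the patches:

#include <numeric>
#include <openvino/openvino.hpp>

int main() {
    // One sequence of length 3, fully attended.
    ov::Tensor attention_mask{ov::element::i64, {1, 3}};
    std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1);

    // The position id fed in for the next generated token is the number of
    // attended tokens accumulated so far in that row of the mask.
    ov::Tensor position_ids{ov::element::i64, {1, 1}};
    const int64_t* row = attention_mask.data<int64_t>();
    position_ids.data<int64_t>()[0] =
        std::accumulate(row, row + attention_mask.get_shape()[1], int64_t{0});
    // position_ids now holds 3 for this example.

    // After the step, the mask is extended by one column of ones per row,
    // which is what extend_attention() does for every batch element.
    ov::Tensor extended{ov::element::i64, {1, 4}};
    std::fill_n(extended.data<int64_t>(), extended.get_size(), 1);
}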
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2e4c49337a..30ed23e9f8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -14,13 +14,12 @@ #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" #include "generation_config_helper.hpp" +#include "group_beam_searcher.hpp" #include "text_callback_streamer.hpp" namespace ov { -ov::EncodedResults beam_search(ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); - ov::EncodedResults greedy_decoding( ov::InferRequest& model_runner, ov::Tensor prompts, @@ -150,11 +149,11 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( auto size = input_ids.get_shape(); int64_t* inputs_data = input_ids.data(); std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 - tmp_ids.erase(tmp_ids.begin()); + // tmp_ids.erase(tmp_ids.begin()); auto attention_mask_data = attention_mask.data(); std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); - tmp_attn_mask.erase(tmp_attn_mask.begin()); + // tmp_attn_mask.erase(tmp_attn_mask.begin()); std::vector prefixes_to_exclude = {"", ""}; // todo: for TinyLlama, need to get them form generation_config auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; @@ -178,6 +177,10 @@ ov::DecodedResults ov::LLMPipeline::generate(std::vector texts, Opt return m_pimpl->generate(texts, generation_config); } +ov::DecodedResults ov::LLMPipeline::generate(std::initializer_list text, OptionalGenerationConfig generation_config) { + return m_pimpl->generate(text, generation_config); +} + ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(texts); diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index a5f109b791..92df3d7067 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -49,5 +49,87 @@ std::pair softmax(const ov::Tensor& logits, const size_t batch_i return {out_token, log_sum}; } +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = start_pos; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +void initialize_beam_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + + +void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor 
original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t atten_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * atten_length; + // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 + position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); + } +} + +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + } // namespace generate_utils } // namespace ov \ No newline at end of file diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index ac5ac76158..7510c59e46 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -14,6 +14,12 @@ void print_tensor(const ov::Tensor& tensor); std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); + +ov::Tensor extend_attention(ov::Tensor attention_mask); + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); + bool is_xml(const std::string& path); } // namespace generate_utils diff --git a/src/tests/python_tests/test_greedy.py b/src/tests/python_tests/test_greedy.py index 47c37f5bd8..f33909721b 100644 --- a/src/tests/python_tests/test_greedy.py +++ b/src/tests/python_tests/test_greedy.py @@ -10,13 +10,13 @@ def test_tiny_llama(): max_new_tokens = 32 prompt = 'table is made of' - encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) print(f'hf_output: {hf_output}') import sys - sys.path.append('build-Debug/src/python-bindings') + sys.path.append('src/python/openvino_genai/') import py_generate_pipeline as genai pipe = 
genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index efff680ca4..30678f3ad5 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -12,13 +12,10 @@ target_link_libraries(greedy_causal_lm PRIVATE generate_pipeline_lib) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -# add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -# target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -# target_include_directories(beam_search_causal_lm PRIVATE ./) -# find_package(OpenVINO REQUIRED COMPONENTS Runtime) -# target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -# set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +target_link_libraries(beam_search_causal_lm PRIVATE generate_pipeline_lib) +set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 110ac47178..056c923224 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -1,232 +1,29 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include - -namespace { - -enum SPECIAL_TOKEN { PAD_TOKEN = 2 }; - -std::string detokenize(ov::InferRequest& detokenizer, const std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - ov::Tensor inp = detokenizer.get_input_tensor(); - inp.set_shape({BATCH_SIZE, tokens.size()}); - for (size_t idx = 0; idx < tokens.size(); ++idx) { - inp.data()[idx] = tokens.at(idx); - } - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask) { - const size_t batch_size = input_ids.get_shape().at(0); - const size_t sequence_length = input_ids.get_shape().at(1); - int64_t* inputs_data = input_ids.data(); - int64_t* attention_mask_data = attention_mask.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - - // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - size_t pad_tokens_number = 0; - for (int i = sequence_length - 1; i >= 0; i--) { - const size_t token_offset = batch_offset + i; - - if (inputs_data[token_offset] == SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - if (pad_tokens_number == 0) { - pad_tokens_number = sequence_length - i - 1; - } - - std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); - std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); - } - } - - return {input_ids, attention_mask}; -} - -std::pair tokenize(ov::InferRequest& tokenizer, 
std::vector prompts) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - - tokenizer.infer(); - - pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")); - - // fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - size_t sum = 0; - - for (size_t i = 0; i < sequence_length; i++) { - const size_t element_offset = batch_offset + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { - request.set_tensor("input_ids", input_ids); - request.set_tensor("attention_mask", attention_mask); - - ov::Shape input_shape = input_ids.get_shape(); - - ov::Tensor position_ids = request.get_tensor("position_ids"); - position_ids.set_shape(input_shape); - initialize_position_ids(position_ids, attention_mask); - - ov::Tensor beam_idx = request.get_tensor("beam_idx"); - beam_idx.set_shape({input_shape.at(0)}); - std::fill_n(beam_idx.data(), input_shape.at(0), 0); -} - -void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { - ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; - ov::Shape original_shape = original_mask.get_shape(); - attention_mask.copy_to(original_mask); - - ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; - attention_mask.set_shape(new_shape); - - for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { - const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); - const size_t result_prompt_offset = beam_id * new_shape.at(1); - - int64_t* dest = attention_mask.data() + result_prompt_offset; - const int64_t* src = original_mask.data() + original_prompt_offset; - - std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); - attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; - } -} - -void set_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* mask_start = attention_mask.data() + batch * sequence_length; - position_ids.data()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); - } -} - -std::vector prompts_arguments_to_vector(int argc, char* argv[]) { - std::vector prompts; - prompts.reserve(argc - 2); - for (size_t i = 2; i < argc; i++) { - prompts.push_back(std::string{argv[i]}); - } - return prompts; -} - -} // namespace +#include int main(int argc, char* argv[]) try { if (argc < 3) 
{ throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); } - - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); - ov::InferRequest detokenizer = - core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = - core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - - auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv)); - - // Initialize beam search - const int64_t* prompt_data = input_ids.data(); - std::vector> prompts; - prompts.reserve(input_ids.get_shape().at(0)); - for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { - size_t sequence_length = input_ids.get_shape().at(1); - size_t batch_offset = batch * sequence_length; - const int64_t* prompt_start = prompt_data + batch_offset; - prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - } - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; - - if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; - GroupBeamSearcher group_beam_searcher{parameters}; - - initialize_inputs(input_ids, attention_mask, lm); - - std::vector next_tokens; - std::vector next_beams; - - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - lm.infer(); - - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - set_attention_mask(lm.get_tensor("attention_mask"), next_beams); - set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - } - - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - std::cout << "Prompt:\n"; - for (const std::vector group : prompt_group) { - std::cout << "Group:\n"; - for (const Beam& beam : group) { - std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - } - } - } - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. 
- // While it is not required to reset context in this sample as only one batch of sequences is processed, - // it is called for education purposes: - lm.reset_state(); + auto prompts = std::vector(argv + 2, argv + argc); + + std::string model_path = argv[1]; + std::string device = "CPU"; // GPU can be used as well + + ov::LLMPipeline pipe(model_path, device); + ov::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 20; + config.num_beam_groups = 3; + config.num_beams = 15; + config.num_return_sequences = config.num_beams * prompts.size(); + + auto beams = pipe.generate(prompts, config); + for (int i = 0; i < beams.scores.size(); i++) + std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n'; + + return 0; } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return EXIT_FAILURE; From 11fbaa27391d76aa69208d039066e67d23bc043f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 11:45:00 +0200 Subject: [PATCH 90/97] tokenizer minor fixes --- src/cpp/src/llm_pipeline.cpp | 8 ++++++-- src/cpp/src/tokenizer.cpp | 8 +------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 30ed23e9f8..47bf3495d5 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -143,7 +143,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( auto [input_ids, attention_mask] = m_tokenizer.encode(text); - // todo: W/A If sentence begins with a special tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", // but HF does not do that. Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. // Need to remove both of that tokens manually to get exact token by token alignment with HF auto size = input_ids.get_shape(); @@ -155,7 +155,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); // tmp_attn_mask.erase(tmp_attn_mask.begin()); - std::vector prefixes_to_exclude = {"", ""}; // todo: for TinyLlama, need to get them form generation_config + std::vector prefixes_to_exclude = {config.eos_token, config.bos_token}; auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { tmp_ids.erase(tmp_ids.begin()); @@ -221,6 +221,10 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( } else if (auto callback = std::get_if>(&*streamer)) { streamer_ptr = std::make_shared(m_tokenizer, *callback); } + auto batch_size = input_ids.get_shape().at(0); + if ((batch_size != 1 || !config_helper.is_greedy_decoding()) && streamer_ptr) { + OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding"); + } auto attention_mask_data = attention_mask.has_value() ? 
*attention_mask : ov::generate_utils::init_attention_mask(input_ids); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index a11cfb471a..09d64460a2 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -93,13 +93,7 @@ class Tokenizer::TokenizerImpl { m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); auto size_ = m_tokenize_request.get_input_tensor().get_shape(); m_tokenize_request.infer(); - - ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); - // todo: fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - + pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } From 264e99f08d2c476b48aaf92b89cf199fa2a05200 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 12:04:36 +0200 Subject: [PATCH 91/97] apply comments --- .../include/openvino/genai/llm_pipeline.hpp | 14 ++- src/cpp/src/llm_pipeline.cpp | 2 + src/cpp/src/text_callback_streamer.cpp | 8 +- src/cpp/src/text_callback_streamer.hpp | 2 +- src/tests/python_tests/test_cpp_samples.py | 85 +++++++++++++++++++ text_generation/causal_lm/cpp/CMakeLists.txt | 4 +- .../cpp/generate_pipeline/chat_sample.cpp | 1 + 7 files changed, 102 insertions(+), 14 deletions(-) create mode 100644 src/tests/python_tests/test_cpp_samples.py diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 5c3e23aa7e..b25d11ecd4 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -11,8 +11,6 @@ #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" -using namespace std; - namespace ov { using StreamerVariant = std::variant, std::shared_ptr>; @@ -82,7 +80,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param streamer optional streamer * @return std::string decoded resulting text */ - std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt); + std::string generate(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); template util::EnableIfAllStringAny generate( @@ -124,8 +122,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, - OptionalGenerationConfig generation_config=nullopt, - OptionalStreamerVariant streamer=nullopt); + OptionalGenerationConfig generation_config=std::nullopt, + OptionalStreamerVariant streamer=std::nullopt); template util::EnableIfAllStringAny operator()( @@ -134,11 +132,11 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { return generate(text, AnyMap{std::forward(properties)...}); } - DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config=nullopt); - DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config=nullopt); + DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config=std::nullopt); + DecodedResults 
operator()(std::initializer_list text, OptionalGenerationConfig generation_config=std::nullopt); // generate with streamers - std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt); + std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); std::string operator()(std::string text, OptionalStreamerVariant streamer); ov::Tokenizer get_tokenizer(); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 47bf3495d5..9d4161f859 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -107,6 +107,8 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string nlohmann::json data = nlohmann::json::parse(f); m_chat_template = data.value("chat_template", ""); } + + m_device = device; diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index a1d2f3b01d..f9b3ad8ccd 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -5,7 +5,7 @@ namespace ov { TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { m_tokenizer = tokenizer; m_print_eos_token = print_eos_token; - m_callback = callback; + on_decoded_text_callback = callback; m_enabled = true; } @@ -55,18 +55,18 @@ void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { } void TextCallbackStreamer::set_callback(std::function callback) { - m_callback = callback; + on_decoded_text_callback = callback; m_enabled = true; } void TextCallbackStreamer::set_callback() { - m_callback = [](std::string words){ ;}; + on_decoded_text_callback = [](std::string words){}; m_enabled = false; } void TextCallbackStreamer::on_finalized_text(const std::string& subword) { if (m_enabled) { - m_callback(subword); + on_decoded_text_callback(subword); } } diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index f3d8773fb4..d9c1ba3ee5 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -21,7 +21,7 @@ class TextCallbackStreamer: public StreamerBase { void set_callback(std::function callback); void set_callback(); - std::function m_callback = [](std::string words){ ;}; + std::function on_decoded_text_callback = [](std::string words){}; bool m_enabled = false; int64_t m_eos_token; private: diff --git a/src/tests/python_tests/test_cpp_samples.py b/src/tests/python_tests/test_cpp_samples.py new file mode 100644 index 0000000000..85ab4a9fbd --- /dev/null +++ b/src/tests/python_tests/test_cpp_samples.py @@ -0,0 +1,85 @@ + +import pytest + +model_ids = [ + # ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0-skip-special-tokens"), + + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"), + ("google/gemma-2b-it", "gemma-2b-it/pytorch/dldt/FP16/"), + # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf/pytorch/dldt/FP16/"), +] + +def run_cpp_sample_command(command, cwd): + import subprocess + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd, text=True) + stdout, stderr = process.communicate() + return stdout, stderr, process.returncode + +def run_transformers_model(model_id, prompt, config=None, add_special_tokens=True): + import transformers + + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) + model = 
transformers.AutoModelForCausalLM.from_pretrained(model_id) + tokenized = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=add_special_tokens) + + default_config = dict( + num_beam_groups=3, + num_beams=15, + diversity_penalty=1.0, + num_return_sequences=15, + max_new_tokens=20, + early_stopping=False, + length_penalty=1.0, + no_repeat_ngram_size=9**9, + do_sample=False + ) + + if config is None: + config = default_config + print(tokenized) + beams = model.generate(tokenized, **config) + return map(lambda beam: tokenizer.decode(beam[tokenized.numel():], skip_special_tokens=True), beams) + +@pytest.mark.parametrize("param", model_ids) +def test_model(param): + model_id, path = param + + prompts = ["table is made of", "The Sun is yellow because"] + # prompt = " ".join([f'"{item}"' for item in prompts]) + + prompt = "table is made of" + + # cmd = 'build-Debug/greedy_causal_lm' // for old samples + cmd = 'build-Debug/text_generation/causal_lm/cpp/' + + # beam search old + cmd = 'build-Debug/beam_search_causal_lm' + cwd = '/home/epavel/devel/openvino.genai_' + config = None # None means greedy + + # greedy new + cwd = '/home/epavel/devel/openvino.genai' + cmd = 'build-Debug/text_generation/causal_lm/cpp/greedy_causal_lm' + config = dict(max_new_tokens=75, do_sample=False) + + # beam search new + cwd = '/home/epavel/devel/openvino.genai' + cmd = 'build-Debug/text_generation/causal_lm/cpp/beam_search_causal_lm' + config = None + + predictions, _, _ = run_cpp_sample_command([cmd, '/home/epavel/devel/openvino.genai/text_generation/causal_lm/' + path, prompt], cwd) + print(predictions) + + beams = run_transformers_model(model_id, prompt, config) + for beam in beams: + idx = predictions.find(beam) + if -1 == idx and beam and predictions: + raise RuntimeError(f'Missing "{beam=}" from predictions') + predictions = predictions[:idx] + predictions[idx + len(beam):] + + return True + # with open('pred.txt', 'r') as file: + # predictions = file.read() + +for model_id, path in model_ids: + test_model((model_id, path)) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 30678f3ad5..8b6281f50e 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,7 +4,9 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -# add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +if(NOT TARGET openvino_tokenizers) +add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +endif() add_executable(greedy_causal_lm greedy_causal_lm.cpp) target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp index c7460dd337..b1ecb5f5f4 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp @@ -4,6 +4,7 @@ #include #include "openvino/genai/llm_pipeline.hpp" +using namespace std; std::vector questions = { "1+1=", From 11032b43bf9f17a0929eedd6f2f1573786dabe88 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 12:12:03 +0200 Subject: [PATCH 92/97] remove accidentally added test_cpp_samples.py --- src/tests/python_tests/test_cpp_samples.py | 85 ---------------------- 1 file changed, 85 deletions(-) delete mode 100644 
src/tests/python_tests/test_cpp_samples.py diff --git a/src/tests/python_tests/test_cpp_samples.py b/src/tests/python_tests/test_cpp_samples.py deleted file mode 100644 index 85ab4a9fbd..0000000000 --- a/src/tests/python_tests/test_cpp_samples.py +++ /dev/null @@ -1,85 +0,0 @@ - -import pytest - -model_ids = [ - # ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0-skip-special-tokens"), - - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"), - ("google/gemma-2b-it", "gemma-2b-it/pytorch/dldt/FP16/"), - # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf/pytorch/dldt/FP16/"), -] - -def run_cpp_sample_command(command, cwd): - import subprocess - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd, text=True) - stdout, stderr = process.communicate() - return stdout, stderr, process.returncode - -def run_transformers_model(model_id, prompt, config=None, add_special_tokens=True): - import transformers - - tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) - model = transformers.AutoModelForCausalLM.from_pretrained(model_id) - tokenized = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=add_special_tokens) - - default_config = dict( - num_beam_groups=3, - num_beams=15, - diversity_penalty=1.0, - num_return_sequences=15, - max_new_tokens=20, - early_stopping=False, - length_penalty=1.0, - no_repeat_ngram_size=9**9, - do_sample=False - ) - - if config is None: - config = default_config - print(tokenized) - beams = model.generate(tokenized, **config) - return map(lambda beam: tokenizer.decode(beam[tokenized.numel():], skip_special_tokens=True), beams) - -@pytest.mark.parametrize("param", model_ids) -def test_model(param): - model_id, path = param - - prompts = ["table is made of", "The Sun is yellow because"] - # prompt = " ".join([f'"{item}"' for item in prompts]) - - prompt = "table is made of" - - # cmd = 'build-Debug/greedy_causal_lm' // for old samples - cmd = 'build-Debug/text_generation/causal_lm/cpp/' - - # beam search old - cmd = 'build-Debug/beam_search_causal_lm' - cwd = '/home/epavel/devel/openvino.genai_' - config = None # None means greedy - - # greedy new - cwd = '/home/epavel/devel/openvino.genai' - cmd = 'build-Debug/text_generation/causal_lm/cpp/greedy_causal_lm' - config = dict(max_new_tokens=75, do_sample=False) - - # beam search new - cwd = '/home/epavel/devel/openvino.genai' - cmd = 'build-Debug/text_generation/causal_lm/cpp/beam_search_causal_lm' - config = None - - predictions, _, _ = run_cpp_sample_command([cmd, '/home/epavel/devel/openvino.genai/text_generation/causal_lm/' + path, prompt], cwd) - print(predictions) - - beams = run_transformers_model(model_id, prompt, config) - for beam in beams: - idx = predictions.find(beam) - if -1 == idx and beam and predictions: - raise RuntimeError(f'Missing "{beam=}" from predictions') - predictions = predictions[:idx] + predictions[idx + len(beam):] - - return True - # with open('pred.txt', 'r') as file: - # predictions = file.read() - -for model_id, path in model_ids: - test_model((model_id, path)) From 7d0c80b397dc82c841383672459580b29f06c25c Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 12:43:19 +0200 Subject: [PATCH 93/97] fix build --- .github/workflows/causal_lm_cpp.yml | 54 ++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index df03bab7c6..21f35151bc 100644 --- 
a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -32,12 +32,12 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: greedy_causal_lm run: | source ./ov/setupvars.sh - ./build/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -60,13 +60,13 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -82,7 +82,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -98,7 +98,7 @@ jobs: " echo "69" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -114,7 +114,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -130,7 +130,7 @@ jobs: " echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -146,7 +146,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -188,7 +188,7 @@ jobs: python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare shell: cmd @@ -229,12 +229,12 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -257,12 +257,12 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Run run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
> ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -285,12 +285,12 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -313,12 +313,12 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -342,13 +342,13 @@ jobs: sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -380,7 +380,7 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | @@ -394,7 +394,7 @@ jobs: A:' > ./prompt.txt ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -425,13 +425,13 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 
- cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -470,13 +470,13 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " From 2e3cd73eab715282823c49454ffa03254b234fd7 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 13:50:13 +0200 Subject: [PATCH 94/97] fix causal_lm comparison error --- .github/workflows/causal_lm_cpp.yml | 4 ++-- text_generation/causal_lm/cpp/beam_search_causal_lm.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 21f35151bc..7c34ca7f66 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -347,7 +347,7 @@ jobs: - name: run and compare run: | source ./ov/setupvars.sh - ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/text_generation/causal_lm/cpp/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: @@ -393,7 +393,7 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 056c923224..3b40529f38 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -14,7 +14,7 @@ int main(int argc, char* argv[]) try { ov::LLMPipeline pipe(model_path, device); ov::GenerationConfig config = 
pipe.get_generation_config(); - config.max_new_tokens = 20; + config.max_new_tokens = 100; config.num_beam_groups = 3; config.num_beams = 15; config.num_return_sequences = config.num_beams * prompts.size(); From e7fa9743f4f9aaeafab88061d7b1831572ac4a40 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 15 May 2024 16:36:21 +0200 Subject: [PATCH 95/97] fix different outputs --- .github/workflows/causal_lm_cpp.yml | 18 +++++++++--------- src/cpp/src/generation_config.cpp | 7 ++++++- src/cpp/src/tokenizer.cpp | 9 ++++++++- .../causal_lm/cpp/beam_search_causal_lm.cpp | 9 ++++++++- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 7c34ca7f66..23d9006d07 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -74,7 +74,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -90,7 +90,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('69', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -106,7 +106,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Hi', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -122,7 +122,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('return 0', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, 
diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -138,7 +138,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -160,7 +160,7 @@ jobs: for prompt in prompts: tokenized = tokenizer(prompt, return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -201,7 +201,7 @@ jobs: echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py - echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py + echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py echo if -1 == idx: >> ref.py echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py @@ -441,7 +441,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') tokenized = tokenizer('Alan Turing was a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -486,7 +486,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') tokenized = tokenizer('Alan Turing was 
a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref}" from predictions') diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 68fa0c86ab..b392e44b3b 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -36,7 +36,12 @@ GenerationConfig::GenerationConfig(std::string json_path) { if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"]; if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"]; if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"]; - if (data.contains("eos_token_id")) eos_token_id = data["eos_token_id"]; + + if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) { + // todo: qwen contains several eos_token_id + eos_token_id = data["eos_token_id"]; + } + if (data.contains("bos_token")) bos_token = data["bos_token"]; if (data.contains("eos_token")) eos_token = data["eos_token"]; diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 09d64460a2..75c18734d3 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -80,7 +80,7 @@ class Tokenizer::TokenizerImpl { m_bos_token_id = rt_info["bos_token_id"].as(); if (rt_info.count("pad_token_id") > 0) m_pad_token_id = rt_info["pad_token_id"].as(); - } + } std::pair encode(std::string prompt) { size_t batch_size = 1; @@ -94,6 +94,13 @@ class Tokenizer::TokenizerImpl { auto size_ = m_tokenize_request.get_input_tensor().get_shape(); m_tokenize_request.infer(); pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); + + // todo: fix mask filled with '2' instead of '0' + // https://github.com/openvinotoolkit/openvino_tokenizers/pull/90 should've fixed this + ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; } diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 3b40529f38..1afc5f93ed 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -3,6 +3,10 @@ #include +namespace { + enum SPECIAL_TOKEN { PAD_TOKEN = 2 }; +} + int main(int argc, char* argv[]) try { if (argc < 3) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); @@ -14,11 +18,14 @@ int main(int argc, char* argv[]) try { ov::LLMPipeline pipe(model_path, device); ov::GenerationConfig config = pipe.get_generation_config(); - config.max_new_tokens = 100; + config.max_new_tokens = 20; config.num_beam_groups = 3; config.num_beams = 15; config.num_return_sequences = config.num_beams * prompts.size(); + // workaround until pad_token_id is not written into IR + pipe.get_tokenizer().set_pad_token_id(PAD_TOKEN); + auto beams = pipe.generate(prompts, config); for (int i = 0; i 
< beams.scores.size(); i++) std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n'; From 93be036ae83d7a452af70af74eb8f05a3d93e9d1 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Thu, 16 May 2024 21:07:05 +0200 Subject: [PATCH 96/97] add tests --- src/cpp/src/generation_config.cpp | 125 +++++++++++++------- src/tests/python_tests/test_generate_api.py | 102 ++++++++++++++++ src/tests/python_tests/test_greedy.py | 29 ----- 3 files changed, 181 insertions(+), 75 deletions(-) create mode 100644 src/tests/python_tests/test_generate_api.py delete mode 100644 src/tests/python_tests/test_greedy.py diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index b392e44b3b..6a2e2e407b 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -11,6 +11,45 @@ #include "generation_config_helper.hpp" +namespace { + template + struct json_type_traits {}; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_unsigned; }; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_float; }; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::string; }; + + template <> + struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::boolean; }; + + template + void read_json_param(const nlohmann::json& data, const std::string& name, T& param) { + if (data.contains(name) && data[name].type() == json_type_traits::json_value_t) { + param = data[name]; + } + } + + template + void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { + if (config_map.count(name)) { + param = config_map.at(name).as(); + } + } + +} // namespace + + namespace ov { GenerationConfig::GenerationConfig(std::string json_path) { @@ -18,32 +57,27 @@ GenerationConfig::GenerationConfig(std::string json_path) { OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); nlohmann::json data = nlohmann::json::parse(f); - - if (data.contains("max_new_tokens")) max_new_tokens = data["max_new_tokens"]; - if (data.contains("max_length")) max_length = data["max_length"]; + + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); // note that ignore_eos is not present in HF GenerationConfig - if (data.contains("num_beam_groups")) num_beam_groups = data["num_beam_groups"]; - if (data.contains("num_beams")) num_beams = data["num_beams"]; - if (data.contains("diversity_penalty")) diversity_penalty = data["diversity_penalty"]; - if (data.contains("length_penalty")) length_penalty = data["length_penalty"]; - if (data.contains("num_return_sequences")) num_return_sequences = data["num_return_sequences"]; - if (data.contains("no_repeat_ngram_size")) no_repeat_ngram_size = data["no_repeat_ngram_size"]; + read_json_param(data, "num_beam_groups", num_beam_groups); + read_json_param(data, "num_beams", num_beams); + read_json_param(data, "diversity_penalty", diversity_penalty); + read_json_param(data, "length_penalty", length_penalty); + read_json_param(data, "num_return_sequences", num_return_sequences); + read_json_param(data, "no_repeat_ngram_size", 
no_repeat_ngram_size); // stop_criteria will be processed below - if (data.contains("temperature")) temperature = data["temperature"]; - if (data.contains("top_p")) top_p = data["top_p"]; - if (data.contains("top_k")) top_k = data["top_k"]; - if (data.contains("do_sample")) do_sample = data["do_sample"]; - if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"]; - if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"]; - if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"]; - - if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) { - // todo: qwen contains several eos_token_id - eos_token_id = data["eos_token_id"]; - } - - if (data.contains("bos_token")) bos_token = data["bos_token"]; - if (data.contains("eos_token")) eos_token = data["eos_token"]; + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "repetition_penalty", repetition_penalty); + read_json_param(data, "pad_token_id", pad_token_id); + read_json_param(data, "bos_token_id", bos_token_id); + read_json_param(data, "eos_token_id", eos_token_id); + read_json_param(data, "bos_token", bos_token); + read_json_param(data, "eos_token", eos_token); if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); @@ -59,28 +93,27 @@ GenerationConfig::GenerationConfig(std::string json_path) { GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::AnyMap& config_map) { GenerationConfig config = m_config; - - if (config_map.count("max_new_tokens")) config.max_new_tokens = config_map.at("max_new_tokens").as(); - if (config_map.count("max_length")) config.max_length = config_map.at("max_length").as(); - if (config_map.count("ignore_eos")) config.ignore_eos = config_map.at("ignore_eos").as(); - if (config_map.count("num_beam_groups")) config.num_beam_groups = config_map.at("num_beam_groups").as(); - if (config_map.count("num_beams")) config.num_beams = config_map.at("num_beams").as(); - if (config_map.count("diversity_penalty")) config.diversity_penalty = config_map.at("diversity_penalty").as(); - if (config_map.count("length_penalty")) config.length_penalty = config_map.at("length_penalty").as(); - if (config_map.count("num_return_sequences")) config.num_return_sequences = config_map.at("num_return_sequences").as(); - if (config_map.count("no_repeat_ngram_size")) config.no_repeat_ngram_size = config_map.at("no_repeat_ngram_size").as(); - if (config_map.count("stop_criteria")) config.stop_criteria = config_map.at("stop_criteria").as(); - if (config_map.count("temperature")) config.temperature = config_map.at("temperature").as(); - if (config_map.count("top_p")) config.top_p = config_map.at("top_p").as(); - if (config_map.count("top_k")) config.top_k = config_map.at("top_k").as(); - if (config_map.count("do_sample")) config.do_sample = config_map.at("do_sample").as(); - if (config_map.count("repetition_penalty")) config.repetition_penalty = config_map.at("repetition_penalty").as(); - if (config_map.count("pad_token_id")) config.pad_token_id = config_map.at("pad_token_id").as(); - if (config_map.count("bos_token_id")) config.bos_token_id = config_map.at("bos_token_id").as(); - if (config_map.count("eos_token_id")) config.eos_token_id = config_map.at("eos_token_id").as(); - if (config_map.count("bos_token")) config.bos_token = 
config_map.at("bos_token").as(); - if (config_map.count("eos_token")) config.eos_token = config_map.at("eos_token").as(); - + read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); + read_anymap_param(config_map, "max_length", config.max_length); + read_anymap_param(config_map, "ignore_eos", config.ignore_eos); + read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups); + read_anymap_param(config_map, "num_beams", config.num_beams); + read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty); + read_anymap_param(config_map, "length_penalty", config.length_penalty); + read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", config.stop_criteria); + read_anymap_param(config_map, "temperature", config.temperature); + read_anymap_param(config_map, "top_p", config.top_p); + read_anymap_param(config_map, "top_k", config.top_k); + read_anymap_param(config_map, "do_sample", config.do_sample); + read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty); + read_anymap_param(config_map, "pad_token_id", config.pad_token_id); + read_anymap_param(config_map, "bos_token_id", config.bos_token_id); + read_anymap_param(config_map, "eos_token_id", config.eos_token_id); + read_anymap_param(config_map, "bos_token", config.bos_token); + read_anymap_param(config_map, "eos_token", config.eos_token); + return config; } diff --git a/src/tests/python_tests/test_generate_api.py b/src/tests/python_tests/test_generate_api.py new file mode 100644 index 0000000000..3e652d1fe3 --- /dev/null +++ b/src/tests/python_tests/test_generate_api.py @@ -0,0 +1,102 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +model_ids = [ + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/"), + # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf/pytorch/dldt/FP16/"), + # ("microsoft/phi-1_5", "phi-1_5/"), + # ("google/gemma-2b-it", "gemma-2b-it/pytorch/dldt/FP16/"), +] + + +def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt): + model_id, path, tokenizer, model = model_fixture + + generation_config_hf = generation_config.copy() + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. 
+ # HF values True, False and "never" correspond to OV GenAI values "early", "heuristic" and "never" + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) + + import sys + # sys.path.append('../../src/python/openvino_genai/') + sys.path.append('/home/epavel/devel/openvino.genai/src/python/openvino_genai/') + import py_generate_pipeline as genai + + pipe = genai.LLMPipeline(path) + ov_output = pipe.generate(prompt, **generation_config) + + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + + +def stop_criteria_map(): + return {"never": "never", "early": True, "heuristic": False} + + +@pytest.fixture(scope="module", params=model_ids) +def model_fixture(request): + model_id, path = request.param + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + return model_id, path, tokenizer, model + + +test_cases = [ + (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +def test_greedy_decoding(model_fixture, generation_config, prompt): + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +prompts = ['The Sun is yellow because'] #, 'Alan Turing was a', 'table is made of'] +@pytest.mark.parametrize("num_beam_groups", [2, 3]) +@pytest.mark.parametrize("group_size", [5, 3]) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) +@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5]) +@pytest.mark.parametrize("prompt", prompts) +def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +@pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) +def test_greedy_decoding(model_fixture, stop_criteria, prompt, max_new_tokens): + + generation_config = dict( + num_beam_groups=2, + 
num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, + stop_criteria=stop_criteria, + ) + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + diff --git a/src/tests/python_tests/test_greedy.py b/src/tests/python_tests/test_greedy.py deleted file mode 100644 index f33909721b..0000000000 --- a/src/tests/python_tests/test_greedy.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -def test_tiny_llama(): - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - - max_new_tokens = 32 - prompt = 'table is made of' - - encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) - hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) - hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) - print(f'hf_output: {hf_output}') - - import sys - sys.path.append('src/python/openvino_genai/') - import py_generate_pipeline as genai - - pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') - ov_output = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False) - print(f'ov_output: {ov_output}') - - assert hf_output == ov_output - -if __name__ == '__main__': - test_tiny_llama() From ce81ba13dc4119afc503813846e72e7f1c446255 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 17 May 2024 11:00:29 +0200 Subject: [PATCH 97/97] check windows on precommit --- .github/workflows/causal_lm_cpp.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 23d9006d07..d1110fb05b 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -194,14 +194,18 @@ jobs: shell: cmd run: | call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + echo "sample outputs" + type .\pred.txt + echo "huggingface outputs" + echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py + echo print(ref) >> ref.py echo idx = predictions.find(ref) >> ref.py echo if -1 == idx: >> ref.py echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
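Editorial note: patches 93-95 above touch text_generation/causal_lm/cpp/beam_search_causal_lm.cpp in three separate hunks. Pieced together from those hunks, the updated sample reduces to roughly the following sketch. It is illustrative only, not a verbatim copy of the file: the standard-library include names are assumed (the angle-bracketed include targets were lost in this rendering), the pad token value 2 is taken from the SPECIAL_TOKEN enum introduced in patch 95, and the genai header path is the one used by chat_sample.cpp in this series.

// Sketch of beam_search_causal_lm.cpp after patches 93-95 (assumed includes).
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (argc < 3) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>' ['<PROMPT>' ...]");
    }
    std::vector<std::string> prompts(argv + 2, argv + argc);
    std::string model_path = argv[1];

    ov::LLMPipeline pipe(model_path, "CPU");  // GPU can be used as well
    ov::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 20;
    config.num_beam_groups = 3;
    config.num_beams = 15;
    config.num_return_sequences = config.num_beams * prompts.size();

    // Workaround from patch 95: pad_token_id is not yet written into the IR.
    pipe.get_tokenizer().set_pad_token_id(2);

    auto beams = pipe.generate(prompts, config);
    for (size_t i = 0; i < beams.scores.size(); ++i) {
        std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n';
    }
    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}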