From 8a74d24a0fbceb29f0c4c52eb1e144c758a8ceb9 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 11 Dec 2024 14:57:07 +0100 Subject: [PATCH] Port from 24.6 release to master (#1356) - ~https://github.com/openvinotoolkit/openvino.genai/pull/1302~ (didn't port this PR because of the issue CVS-159227) - https://github.com/openvinotoolkit/openvino.genai/pull/1262 - https://github.com/openvinotoolkit/openvino.genai/pull/1336 - https://github.com/openvinotoolkit/openvino.genai/pull/1331 --------- Co-authored-by: Andrei Kochin Co-authored-by: Vladimir Zlobin Co-authored-by: Ilya Lavrenov --- .github/workflows/causal_lm_cpp.yml | 28 +-- samples/CMakeLists.txt | 6 +- .../CMakeLists.txt | 13 ++ .../README.md | 12 ++ .../encrypted_model_causal_lm.cpp | 59 ++++++ .../greedy_causal_lm.cpp | 0 .../visual_language_chat.cpp | 2 +- .../README.md | 0 .../greedy_causal_lm.py | 0 .../lora.py | 0 .../genai/continuous_batching_pipeline.hpp | 26 +++ .../include/openvino/genai/llm_pipeline.hpp | 17 ++ src/cpp/include/openvino/genai/tokenizer.hpp | 70 +++++++- src/cpp/src/continuous_batching_impl.cpp | 13 +- src/cpp/src/continuous_batching_impl.hpp | 14 +- src/cpp/src/continuous_batching_pipeline.cpp | 50 +++++- src/cpp/src/llm_pipeline.cpp | 168 ++++++++++++++---- src/cpp/src/llm_pipeline_static.cpp | 52 ++++-- src/cpp/src/llm_pipeline_static.hpp | 18 +- .../speculative_decoding_impl.cpp | 33 ++-- .../speculative_decoding_impl.hpp | 25 +-- src/cpp/src/tokenizer.cpp | 166 ++++++++++++----- src/cpp/src/utils.cpp | 40 +++++ src/cpp/src/utils.hpp | 7 + src/python/py_llm_pipeline.cpp | 1 + tests/python_tests/ov_genai_test_utils.py | 32 ++++ tests/python_tests/test_chat_generate_api.py | 1 + tests/python_tests/test_generate_api.py | 8 +- 28 files changed, 691 insertions(+), 170 deletions(-) rename samples/cpp/{greedy_causal_lm => text_generation}/CMakeLists.txt (58%) rename samples/cpp/{greedy_causal_lm => text_generation}/README.md (79%) create mode 100644 samples/cpp/text_generation/encrypted_model_causal_lm.cpp rename samples/cpp/{greedy_causal_lm => text_generation}/greedy_causal_lm.cpp (100%) rename samples/python/{greedy_causal_lm => text_generation}/README.md (100%) rename samples/python/{greedy_causal_lm => text_generation}/greedy_causal_lm.py (100%) rename samples/python/{greedy_causal_lm => text_generation}/lora.py (100%) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 504e303fb5..107777bf74 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -63,13 +63,13 @@ jobs: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" - | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - + && timeout 25s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + | diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - env: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && samples/python/greedy_causal_lm/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" + && samples/python/text_generation/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" 
env: PYTHONPATH: "./build" @@ -249,7 +249,7 @@ jobs: - run: > set PATH=.\build\openvino_genai\;%PATH% && call .\ov\setupvars.bat - && .\build\samples\cpp\greedy_causal_lm\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt + && .\build\samples\cpp\text_generation\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt - run: | echo import transformers > ref.py echo predictions = open('cpp.txt', 'r').read() >> ref.py @@ -266,13 +266,13 @@ jobs: set PATH=.\build\openvino_genai\;%PATH% && set "PYTHONPATH=./build/" && call .\ov\setupvars.bat - && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt + && python samples\python\text_generation\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt - run: fc .\cpp.txt .\py.txt - run: > set PATH=.\build\openvino_genai\;%PATH% && set "PYTHONPATH=./build/" && call .\ov\setupvars.bat - && python samples\python\greedy_causal_lm\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" + && python samples\python\text_generation\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?" cpp-greedy_causal_lm-Qwen-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -304,7 +304,7 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - run: > . ./ov/setupvars.sh - && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - + && timeout 2m ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/text_generation/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - env: PYTHONPATH: "./build" @@ -446,7 +446,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: @@ -504,7 +504,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -525,7 +525,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ "$( predictions_greedy.txt + ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as 
f: predicted_greedy = f.readline() @@ -566,7 +566,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -585,7 +585,7 @@ jobs: echo Phi-1_5 passed - run: > . ./ov/setupvars.sh - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" + && timeout 50s samples/python/text_generation/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" | diff ./pred_greedy.txt - env: PYTHONPATH: "./build" @@ -621,7 +621,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -640,7 +640,7 @@ jobs: echo "Alan Turing was a" passed - run: > . ./ov/setupvars.sh - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" + && timeout 50s samples/python/text_generation/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - env: PYTHONPATH: "./build" diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 92f0b3f43a..02539df6e7 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(cpp/beam_search_causal_lm) add_subdirectory(cpp/benchmark_genai) add_subdirectory(cpp/chat_sample) -add_subdirectory(cpp/greedy_causal_lm) +add_subdirectory(cpp/text_generation) add_subdirectory(cpp/lora_greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) @@ -25,7 +25,7 @@ install(DIRECTORY cpp/beam_search_causal_lm cpp/benchmark_genai cpp/chat_sample - cpp/greedy_causal_lm + cpp/text_generation cpp/image_generation cpp/lora_greedy_causal_lm cpp/multinomial_causal_lm @@ -39,7 +39,7 @@ install(DIRECTORY python/beam_search_causal_lm python/benchmark_genai python/chat_sample - python/greedy_causal_lm + python/text_generation python/image_generation python/multinomial_causal_lm python/speculative_decoding_lm diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt similarity index 58% rename from samples/cpp/greedy_causal_lm/CMakeLists.txt rename to samples/cpp/text_generation/CMakeLists.txt index ff5151676f..377682974e 100644 --- a/samples/cpp/greedy_causal_lm/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -20,3 +20,16 @@ install(TARGETS greedy_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +add_executable(encrypted_model_causal_lm encrypted_model_causal_lm.cpp) +target_link_libraries(encrypted_model_causal_lm PRIVATE openvino::genai) +set_target_properties(encrypted_model_causal_lm PROPERTIES + COMPILE_PDB_NAME encrypted_model_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(encrypted_model_causal_lm PRIVATE cxx_std_11) + +install(TARGETS encrypted_model_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/text_generation/README.md similarity index 79% rename 
from samples/cpp/greedy_causal_lm/README.md
rename to samples/cpp/text_generation/README.md
index 2f3a7751bf..6928d03927 100644
--- a/samples/cpp/greedy_causal_lm/README.md
+++ b/samples/cpp/text_generation/README.md
@@ -24,6 +24,18 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is
 See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
 
+## Using encrypted models
+
+LLMPipeline and Tokenizer objects can be initialized directly from a memory buffer, e.g., when the user stores only encrypted files and decrypts them on the fly.
+The following code snippet demonstrates how to load the model from a memory buffer:
+
+```cpp
+auto [model_str, weights_tensor] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin");
+ov::genai::Tokenizer tokenizer(models_path);
+ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device);
+```
+For the sake of brevity, the code above does not include Tokenizer decryption. For more details, refer to the encrypted_model_causal_lm sample.
+
 ### Troubleshooting
 
 #### Unicode characters encoding error on Windows
diff --git a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp
new file mode 100644
index 0000000000..3ea94d605f
--- /dev/null
+++ b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp
@@ -0,0 +1,59 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/genai/llm_pipeline.hpp"
+#include <fstream>
+
+std::pair<std::string, ov::Tensor> decrypt_model(const std::string& model_path, const std::string& weights_path) {
+    std::ifstream model_file(model_path);
+    std::ifstream weights_file(weights_path, std::ios::binary);
+    if (!model_file.is_open() || !weights_file.is_open()) {
+        throw std::runtime_error("Cannot open model or weights file");
+    }
+
+    // The user can add in-memory decryption of model_file and weights_file here.
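+    // Illustration only (not part of the sample): any in-memory transform could
+    // stand in for the decryption step here, applied to the buffers read below.
+    // For instance, a hypothetical byte-wise XOR "cipher" could look like:
+    //   auto xor_decrypt = [](std::string& buf) { for (char& c : buf) c ^= 0x5A; };
+    // A real deployment would use a proper scheme (e.g. AES) and key management.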
+
+    std::string model_str((std::istreambuf_iterator<char>(model_file)), std::istreambuf_iterator<char>());
+    std::vector<char> weights_buffer((std::istreambuf_iterator<char>(weights_file)), std::istreambuf_iterator<char>());
+    auto weights_tensor = ov::Tensor(ov::element::u8, {weights_buffer.size()}, weights_buffer.data());
+    return {model_str, weights_tensor};
+}
+
+ov::genai::Tokenizer decrypt_tokenizer(const std::string& models_path) {
+    std::string tok_model_path = models_path + "/openvino_tokenizer.xml";
+    std::string tok_weights_path = models_path + "/openvino_tokenizer.bin";
+    auto [tok_model_str, tok_weights_tensor] = decrypt_model(tok_model_path, tok_weights_path);
+
+    std::string detok_model_path = models_path + "/openvino_detokenizer.xml";
+    std::string detok_weights_path = models_path + "/openvino_detokenizer.bin";
+    auto [detok_model_str, detok_weights_tensor] = decrypt_model(detok_model_path, detok_weights_path);
+
+    return ov::genai::Tokenizer(tok_model_str, tok_weights_tensor, detok_model_str, detok_weights_tensor);
+}
+
+int main(int argc, char* argv[]) try {
+    if (3 > argc)
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\"");
+
+    std::string device = "CPU";  // GPU, NPU can be used as well
+    std::string models_path = argv[1];
+    std::string prompt = argv[2];
+
+    auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin");
+    ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path);
+
+    ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device);
+
+    std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100));
+    std::cout << result << std::endl;
+} catch (const std::exception& error) {
+    try {
+        std::cerr << error.what() << '\n';
+    } catch (const std::ios_base::failure&) {}
+    return EXIT_FAILURE;
+} catch (...) {
+    try {
+        std::cerr << "Non-exception object thrown\n";
+    } catch (const std::ios_base::failure&) {}
+    return EXIT_FAILURE;
+}
diff --git a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp b/samples/cpp/text_generation/greedy_causal_lm.cpp
similarity index 100%
rename from samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp
rename to samples/cpp/text_generation/greedy_causal_lm.cpp
diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp
index 3a655374e9..e426965e66 100644
--- a/samples/cpp/visual_language_chat/visual_language_chat.cpp
+++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try {
     std::string device = "CPU";  // GPU can be used as well
     ov::AnyMap enable_compile_cache;
-    if ("GPU" == device) {
+    if (device == "GPU") {
         // Cache compiled models on disk for GPU to save time on the
         // next run. It's not beneficial for CPU.
enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/text_generation/README.md similarity index 100% rename from samples/python/greedy_causal_lm/README.md rename to samples/python/text_generation/README.md diff --git a/samples/python/greedy_causal_lm/greedy_causal_lm.py b/samples/python/text_generation/greedy_causal_lm.py similarity index 100% rename from samples/python/greedy_causal_lm/greedy_causal_lm.py rename to samples/python/text_generation/greedy_causal_lm.py diff --git a/samples/python/greedy_causal_lm/lora.py b/samples/python/text_generation/lora.py similarity index 100% rename from samples/python/greedy_causal_lm/lora.py rename to samples/python/text_generation/lora.py diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 2bf5dd773b..4a0637f2d9 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { const ov::AnyMap& properties = {} ); + /** + * @brief Constructs a ContinuousBatchingPipeline from already existing model and tokenizer. + * + * This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model + * represented as a string and a weights tensor, along with a manually initialized tokenizer. + * This is useful when the model and tokenizer are already loaded or created in memory and do not + * need to be loaded from files. + * + * @param model_str A string representation of the model. + * @param weights_tensor A tensor containing the weights of the model. + * @param tokenizer A manually initialized ov::genai::Tokenizer. + * @param scheduler_config Configuration for the scheduler. + * @param device The device to run the pipeline on (e.g., CPU, GPU). + * @param properties Optional properties for the pipeline. + * @param generation_config Optional generation configuration for the pipeline. + */ + ContinuousBatchingPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {} + ); + ov::genai::Tokenizer get_tokenizer(); ov::genai::GenerationConfig get_config() const; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 3d62535120..44427d45b1 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -112,6 +112,15 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { const ov::AnyMap& properties = {} ); + LLMPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {} + ); + OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. 
This overload will be removed in 2025.0.0 release") explicit LLMPipeline(const std::filesystem::path& path) : LLMPipeline(path, "CPU") { } @@ -274,6 +283,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { OPENVINO_GENAI_EXPORTS std::pair streamer(StreamerVariant func); OPENVINO_GENAI_EXPORTS std::pair generation_config(const GenerationConfig& config); +OPENVINO_GENAI_EXPORTS std::pair draft_model( + std::string& model_str, + ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device = {}, + const ov::AnyMap& properties = {}, + const ov::genai::GenerationConfig& generation_config = {}); + OPENVINO_GENAI_EXPORTS std::pair draft_model( const std::filesystem::path& models_path, const std::string& device = {}, diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 36f63d2b5e..38fc0aaf8c 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -28,12 +28,72 @@ struct TokenizedInputs { class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** - * @brief ov::genai::Tokenizer constructor. - * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path - * @param properties Properties passed to ov::Core::compile_model - */ - Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}); + * @brief ov::genai::Tokenizer constructor. + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + * @param properties Properties passed to ov::Core::compile_model + */ + explicit Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}); + + /** + * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights + * + * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. + * When this constructor is used bos, eos, pad token ids are expected to be in IR. + * If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. + * @param tokenizer_model_str tokenizer model string + * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights + * @param detokenizer_model_str detokenizer model string + * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights + * @param properties Properties passed to ov::Core::compile_model + */ + Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + const ov::AnyMap& properties = {} + ); + + /** + * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. + * + * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's + * tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids + * are expected to be in IR. If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. 
+ * @param model_str model string + * @param weights_tensor ov::Tensor with model weights + * @param properties Properties passed to ov::Core::compile_model + */ + Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {}); + /** + * @brief ov::genai::Tokenizer constructor with variable number of properties + * @param tokenizer_model_str tokenizer model string + * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights + * @param detokenizer_model_str detokenizer model string + * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights + * @param properties optional properties + */ + template ::value, bool>::type = true> + Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + Properties&&... properties + ) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward(properties)...}) { } + + /** + * @brief ov::genai::Tokenizer constructor with variable number of properties + * @param model_str model string + * @param weights_tensor ov::Tensor with model weights + * @param properties optional properties + */ + template ::value, bool>::type = true> + Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, + Properties&&... properties) + : Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward(properties)...}) { } + /** * @brief ov::genai::Tokenizer constructor with variable number of properties * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 901c5c64be..d27e8934dc 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -11,22 +11,21 @@ template struct overloaded : Ts... {using Ts::operator()...;}; template overloaded(Ts...) 
-> overloaded; ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( - const std::filesystem::path& models_path, + const std::shared_ptr& model, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& properties) { + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config + ) { m_tokenizer = tokenizer; - m_generation_config = utils::from_config_json_if_exists(models_path); - + m_generation_config = generation_config; + ov::Core core; auto [core_properties, compile_properties] = utils::split_core_compile_config(properties); core.set_property(core_properties); - // The model can be compiled for GPU as well - std::shared_ptr model = core.read_model((models_path / "openvino_model.xml").string()); - DeviceConfig device_config(core, scheduler_config, device, compile_properties); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 8276edb36b..780bff6a31 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _fill_prompt_log_probs(std::vector& sequence_groups, ov::Tensor& logits); public: - ContinuousBatchingImpl(const std::filesystem::path& models_path, + ContinuousBatchingImpl(const std::shared_ptr& model, const Tokenizer& tokenizer, - const SchedulerConfig& scheduler_config, - const std::string& device, - const ov::AnyMap& properties); - - ContinuousBatchingImpl(const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, - const ov::AnyMap& tokenizer_properties) - : ContinuousBatchingImpl{ models_path, - Tokenizer(models_path, tokenizer_properties), - scheduler_config, - device, - properties } {} + const ov::genai::GenerationConfig& generation_config); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 6dcbf342eb..2faad4354e 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -20,7 +20,7 @@ using namespace ov::genai; inline ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config) { - ov::genai::ModelDesc draft_model(""); + ov::genai::ModelDesc draft_model; if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) { draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as(); config.erase(utils::DRAFT_MODEL_ARG_NAME); @@ -28,17 +28,24 @@ extract_draft_model_from_config(ov::AnyMap& config) { return draft_model; } + ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& properties, const ov::AnyMap& tokenizer_properties) { auto properties_without_draft_model = properties; - auto draft_model = extract_draft_model_from_config(properties_without_draft_model); - if (draft_model.models_path.empty()) { - m_impl = std::make_shared(models_path, scheduler_config, device, properties, tokenizer_properties); + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = 
utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties); + auto generation_config = utils::from_config_json_if_exists(models_path); + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { - m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); } } @@ -49,11 +56,36 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& device, const ov::AnyMap& properties) { auto properties_without_draft_model = properties; - auto draft_model = extract_draft_model_from_config(properties_without_draft_model); - if (draft_model.models_path.empty()) { - m_impl = std::make_shared(models_path, tokenizer, scheduler_config, device, properties); + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto generation_config = utils::from_config_json_if_exists(models_path); + + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); + } else { + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); + } +} + +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_str, + const ov::Tensor& weights_tensor, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config) { + auto properties_without_draft_model = properties; + auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model); + auto model = utils::singleton_core().read_model(model_str, weights_tensor); + + if (draft_model_desr.model == nullptr) { + m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config); } else { - m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model); + auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config); + m_impl = std::make_shared(main_model_descr, draft_model_desr); } } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5d82a96010..84f76730eb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -59,23 +59,31 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config - ) : LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(models_path)) - { + ) : StatefulLLMPipeline{ + ov::genai::utils::read_model_with_config(models_path, plugin_config), + tokenizer, + device, + plugin_config, + utils::from_config_json_if_exists(models_path) + } {} + + StatefulLLMPipeline( + const 
std::shared_ptr& model, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config, + const ov::genai::GenerationConfig& generation_config + ) : LLMPipelineImplBase(tokenizer, generation_config) { ov::Core core; + auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); + utils::slice_matmul_statefull_model(model); + if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(*filtered_plugin_config); - core.set_property(core_plugin_config); - auto model = core.read_model(models_path / "openvino_model.xml"); m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable - utils::slice_matmul_statefull_model(model); - m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + m_model_runner = core.compile_model(model, device, *filtered_plugin_config).create_infer_request(); } else { - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config); - core.set_property(core_plugin_config); - auto model = core.read_model(models_path / "openvino_model.xml"); - utils::slice_matmul_statefull_model(model); - m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + m_model_runner = core.compile_model(model, device, plugin_config).create_infer_request(); } // If eos_token_id was not provided, take value @@ -87,7 +95,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& plugin_config - ) : StatefulLLMPipeline{models_path, Tokenizer(models_path.string()), device, plugin_config} {} + ) : StatefulLLMPipeline{models_path, Tokenizer(models_path), device, plugin_config} {} DecodedResults generate( StringInputs inputs, @@ -382,14 +390,26 @@ std::pair draft_model( const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) { - ov::AnyMap plugin_config = properties; - auto it = plugin_config.find(ov::genai::scheduler_config.name()); - SchedulerConfig scheduler_config; - if (it != plugin_config.end()) { - scheduler_config = it->second.as(); - plugin_config.erase(it); - } - return { utils::DRAFT_MODEL_ARG_NAME, Any::make(models_path, device, plugin_config, scheduler_config) }; + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); + + std::filesystem::path openvino_model_name = "openvino_model.xml"; + auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string()); + auto generation_config = utils::from_config_json_if_exists(models_path); + auto tokenizer = ov::genai::Tokenizer(models_path); + return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model, tokenizer, device, plugin_config, scheduler_config, generation_config) }; +} + +std::pair draft_model( + std::string& model_str, + ov::Tensor& weights_tensor, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config) { + auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); + + auto model = utils::singleton_core().read_model(model_str, 
weights_tensor);
+    return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
 }
 }  // namespace genai
@@ -431,6 +451,23 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         m_generation_config = m_impl.get_config();
     }
 
+    ContinuousBatchingAdapter(
+        const std::string& model_str,
+        const ov::Tensor& weights_tensor,
+        const Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& plugin_config,
+        const ov::genai::GenerationConfig& generation_config
+    ): LLMPipelineImplBase{tokenizer}, m_impl{
+        model_str,
+        weights_tensor,
+        tokenizer,
+        scheduler_config,
+        device,
+        plugin_config,
+        generation_config} {}
+
     ContinuousBatchingAdapter(
         const std::filesystem::path& models_path,
         const SchedulerConfig& scheduler_config,
@@ -543,6 +580,29 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         m_impl.finish_chat();
     };
 };
+
+/*
+* NPU reads some properties from the config file, but when LLMPipeline is initialized
+* from model_str and weights_tensor, there are no files.
+* In the latter case the ModelDesc is stored in the properties.
+* This function pops the ModelDescr from the properties and returns a pair of the updated properties and the ModelDescr.
+*/
+std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+    ov::AnyMap main_properties = properties;
+    ov::genai::ModelConfigDesc model_descr;
+
+    auto pop_property = [](ov::AnyMap& orig_properties, const std::string& key, auto& value) {
+        if (orig_properties.find(key) != orig_properties.end()) {
+            value = orig_properties.at(key).as<std::decay_t<decltype(value)>>();
+            orig_properties.erase(key);
+        }
+    };
+    pop_property(main_properties, "name_or_path", model_descr.name_or_path);
+    pop_property(main_properties, "type", model_descr.type);
+    pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);
+
+    return {main_properties, model_descr};
+}
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
@@ -564,11 +624,9 @@ ov::genai::LLMPipeline::LLMPipeline(
 ){
     auto start_time = std::chrono::steady_clock::now();
     if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
-        auto config_without_scheduler_config = properties;
-        config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
-        auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
-    } else if ("NPU" == device) {
+        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
+    } else if (device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
@@ -583,12 +641,11 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& config
 ){
     auto start_time = std::chrono::steady_clock::now();
+
     if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
-        auto config_without_scheduler_config = config;
-        config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
-        auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
-        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, config_without_scheduler_config);
-    } else if ("NPU" == device) {
+        auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
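+        // For clarity, an illustrative way a caller opts into this branch is by
+        // passing a SchedulerConfig through the properties map, e.g.:
+        //   ov::AnyMap props = { ov::genai::scheduler_config(SchedulerConfig{}) };
+        //   ov::genai::LLMPipeline pipe(models_path, "CPU", props);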
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
+    } else if (device == "NPU") {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
@@ -597,6 +654,55 @@ ov::genai::LLMPipeline::LLMPipeline(
     m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
 }
 
+ov::genai::LLMPipeline::LLMPipeline(
+    const std::string& model_str,
+    const ov::Tensor& weights_tensor,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string& device,
+    const ov::AnyMap& config,
+    const ov::genai::GenerationConfig& generation_config
+){
+    auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
+
+    auto start_time = std::chrono::steady_clock::now();
+    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) {
+        auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
+        m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
+            tokenizer, scheduler_config, device, plugin_config_, generation_config);
+    } else if (device == "NPU") {
+        // TODO: CVS-158771 Currently, this is a workaround; there is probably a better solution.
+        // NPU reads some properties from the config file, but when LLMPipeline is initialized
+        // from model_str and weights_tensor, there are no files.
+        // Therefore, we need to pass these properties manually.
+        // This is necessary only for NPU; for other plugins it can be omitted.
+        // Example of usage:
+        // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
+        //                                      {"type", "llama"},
+        //                                      {"num_key_value_heads", 32}};
+        // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
+        // This will convert from AnyMap to ModelDesc.
+ auto [properties, model_descr] = split_model_descr(plugin_config); + + m_pimpl = std::make_unique( + utils::singleton_core().read_model(model_str, weights_tensor), + model_descr, + tokenizer, + device, + properties, + generation_config + ); + } else { + m_pimpl = std::make_unique( + utils::singleton_core().read_model(model_str, weights_tensor), + tokenizer, + device, + plugin_config, + generation_config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} + ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 4bb84dfc05..cb83209b4b 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -314,7 +314,7 @@ std::optional pop_option(ov::AnyMap& config, const std::string& option_ } template -std::optional get_option(ov::AnyMap& config, const std::string& option_name) { +std::optional get_option(const ov::AnyMap& config, const std::string& option_name) { if (auto it = config.find(option_name); it != config.end()) { return std::make_optional(it->second.as()); } @@ -396,18 +396,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) { return axes; } -struct ModelDesc { - std::string type; - std::string name_or_path; - int num_key_value_heads; -}; - -ModelDesc get_modeldesc_from_json(const std::filesystem::path& filepath) { +ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) { std::ifstream file(filepath); OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string()); nlohmann::json config_data = nlohmann::json::parse(file); - ModelDesc desc; + ov::genai::ModelConfigDesc desc; desc.type = config_data["model_type"].get(); // NB: In case _name_or_path field isn't presented in config.json if (config_data.contains("_name_or_path")) { @@ -664,7 +658,9 @@ StaticLLMPipeline::StaticLLMPipeline( */ const auto use_blobs = pop_or_default(properties, "USE_BLOBS", false); if (!use_blobs) { - setupAndCompileModels(models_path, device, properties); + ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); + auto model = genai::utils::singleton_core().read_model((models_path / "openvino_model.xml").string()); + setupAndCompileModels(model, device, model_desc, properties); } else { setupAndImportModels(models_path, device, properties); } @@ -684,9 +680,39 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(models_path, Tokenizer(models_path), device, properties) { } +StaticLLMPipeline::StaticLLMPipeline( + const std::shared_ptr& model, + const ModelConfigDesc& model_desc, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config +) : LLMPipelineImplBase(tokenizer, generation_config) { + + bool use_blobs = false; + auto anyopt = get_option(properties, "USE_BLOBS"); + if (anyopt.has_value()) { + use_blobs = *anyopt; + } + // Using model_str and weights_tesnor with blobs is meaningless. 
+ OPENVINO_ASSERT(!use_blobs, "blobs cannot be used with model string and weights tensor"); + + auto properties_ = properties; + setupAndCompileModels(model, device, model_desc, properties_); + + // Initialize tensors + prepare_for_new_conversation(); + + // If eos_token_id was not provided, take value + if (m_generation_config.eos_token_id == -1) { + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + } +} + void StaticLLMPipeline::setupAndCompileModels( - const std::filesystem::path& models_path, + const std::shared_ptr& model, const std::string& device, + const ModelConfigDesc& model_desc, ov::AnyMap& properties) { /* Initialization assumes multiple steps if user passes "USE_BLOBS=NO": 1) Read the template model - this will be kvcache model @@ -705,7 +731,7 @@ void StaticLLMPipeline::setupAndCompileModels( // NB: Get information about NPU if available auto npudesc = extract_npu_descriptor(core); // (1) Read the template model - this will be kvcache model - auto kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); + auto kvcache_model = model; // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Align u4 ZP constants @@ -716,7 +742,7 @@ void StaticLLMPipeline::setupAndCompileModels( // (5) Reshape both models to static shape const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u); const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u); - ModelDesc model_desc = get_modeldesc_from_json(models_path / "config.json"); + KVAxesPosition axes = get_kv_axes(model_desc.type); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len, false}; reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index d8e59d867a..7acc28c684 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -10,6 +10,12 @@ namespace ov { namespace genai { +struct ModelConfigDesc { + std::string type; + std::string name_or_path; + int num_key_value_heads; +}; + class StaticLLMPipeline final : public LLMPipelineImplBase { public: StaticLLMPipeline( @@ -19,6 +25,15 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { const ov::AnyMap& config ); + StaticLLMPipeline( + const std::shared_ptr& model, + const ModelConfigDesc& model_desc, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& properties, + const ov::genai::GenerationConfig& generation_config = {} + ); + StaticLLMPipeline( const std::filesystem::path& path, const std::string& device, @@ -26,8 +41,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { ); void setupAndCompileModels( - const std::filesystem::path& path, + const std::shared_ptr& model, const std::string& device, + const ModelConfigDesc& model_desc, ov::AnyMap& pipeline_config); void setupAndImportModels( diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index ecce79ac4e..2be67320a9 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -23,27 +23,22 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { 
lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id(); } -ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( - const std::filesystem::path& main_models_path, - const SchedulerConfig& main_scheduler_config, - const std::string& main_device, - const ov::AnyMap& main_properties, - const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& tokenizer_properties) { +ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, + const ov::genai::ModelDesc& draft_model_desc) { ov::Core core; - auto [core_properties, compile_properties] = utils::split_core_compile_config(main_properties); + auto [core_properties, compile_properties] = utils::split_core_compile_config(main_model_desc.properties); core.set_property(core_properties); - std::filesystem::path openvino_model_name = "openvino_model.xml", - draft_models_path = draft_model_desc.models_path; + auto main_model = main_model_desc.model; + auto draft_model = draft_model_desc.model; - std::shared_ptr main_model = core.read_model((main_models_path / openvino_model_name).string()), - draft_model = core.read_model((draft_models_path / openvino_model_name).string()); + auto main_scheduler_config = main_model_desc.scheduler_config; + auto main_device = main_model_desc.device; - utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction); - utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction); + utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction); + utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction); - std::string draft_device = draft_model_desc.device.empty() ? main_device : draft_model_desc.device; + std::string draft_device = draft_model_desc.device.empty() ? 
main_model_desc.device : draft_model_desc.device; bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig(); @@ -76,8 +71,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( // main and draft model can have different tokenizers // to do: support retokenization: 154103 - Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties), - draft_model_tokenizer(draft_models_path, tokenizer_properties); + Tokenizer main_model_tokenizer = main_model_desc.tokenizer; + Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer; // todo: remove this condition after support of CVS-154103 OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!"); @@ -86,10 +81,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode m_main_pipeline = std::make_shared(core, - main_model, main_model_tokenizer, utils::from_config_json_if_exists(main_models_path), + main_model, main_model_tokenizer, main_model_desc.generation_config, main_device_config, main_scheduler_config, main_device, compile_properties, true); m_draft_pipeline = std::make_shared(core, - draft_model, draft_model_tokenizer, utils::from_config_json_if_exists(draft_models_path), + draft_model, draft_model_tokenizer, draft_model_desc.generation_config, draft_device_config, draft_scheduler_config, draft_device, draft_properties, false); } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index f854713b5e..3df02ac394 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -11,19 +11,27 @@ namespace ov::genai { struct ModelDesc { - std::filesystem::path models_path; std::string device; ov::genai::SchedulerConfig scheduler_config; ov::AnyMap properties; + ov::genai::GenerationConfig generation_config; + std::shared_ptr model = nullptr; + ov::genai::Tokenizer tokenizer; - ModelDesc(const std::filesystem::path& models_path, + ModelDesc(const std::shared_ptr& model, + const ov::genai::Tokenizer& tokenizer, const std::string& device = {}, const ov::AnyMap& properties = {}, - const ov::genai::SchedulerConfig& scheduler_config = {}) : - models_path(models_path), + const ov::genai::SchedulerConfig& scheduler_config = {}, + const ov::genai::GenerationConfig& generation_config = {}) : + model(model), + tokenizer(tokenizer), device(device), properties(properties), - scheduler_config(scheduler_config) {} + scheduler_config(scheduler_config), + generation_config(generation_config) {} + + ModelDesc() = default; }; class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface { @@ -35,12 +43,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat std::map m_draft_generations; public: - SpeculativeDecodingImpl(const std::filesystem::path& main_models_path, - const SchedulerConfig& scheduler_config, - const std::string& device, - const ov::AnyMap& properties, - const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& tokenizer_properties = {}); + SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& 
input_ids, diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 41f9a6abd4..cff25f07f8 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -67,6 +67,19 @@ constexpr char bos_token_key_name[] = "bos_token"; constexpr char eos_token_key_name[] = "eos_token"; constexpr char pad_token_key_name[] = "pad_token"; +ov::Core core_with_extension() { + ov::Core core; + const char* ov_tokenizer_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); + OPENVINO_ASSERT(ov_tokenizer_path, "openvino_tokenizers path is not set"); + core.add_extension(ov_tokenizer_path); + return core; +} + +ov::Core get_core_singleton() { + static ov::Core core = core_with_extension(); + return core; +} + } // namespace namespace ov { @@ -76,7 +89,7 @@ class Tokenizer::TokenizerImpl { public: ov::CompiledModel m_tokenizer; ov::CompiledModel m_detokenizer; - + std::unique_ptr> m_ireq_queue_tokenizer; std::unique_ptr> m_ireq_queue_detokenizer; // To change the adding special tokens mode we use a statefull subgraph, @@ -135,65 +148,105 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& properties) - : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { - ov::Core core; - - OPENVINO_ASSERT(tokenizer_path.extension() != ".xml", "'tokenizer_path' parameter should be a path to a dir not a xml file"); + TokenizerImpl(const std::filesystem::path& models_papth, const ov::AnyMap& properties) { + setupTokenizer(models_papth, properties); + } - const char* ov_tokenizer_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); - OPENVINO_ASSERT(ov_tokenizer_path, "openvino_tokenizers path is not set"); - core.add_extension(ov_tokenizer_path); + TokenizerImpl(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { + setupTokenizer(models, properties); + } - read_config(tokenizer_path); - read_special_tokens_map(tokenizer_path); + void setupTokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); - // Try to read tokenizer_config if some token ids or token str are not defined. - read_tokenizer_config_if_necessary(tokenizer_path); + OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file"); - auto device = "CPU"; // currently openvino_tokenizer supports only CPU - auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml"); + std::shared_ptr ov_tokenizer = nullptr; std::shared_ptr ov_detokenizer = nullptr; - if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { - ov_detokenizer = core.read_model(tokenizer_path / "openvino_detokenizer.xml"); + + if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) { + ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml"); } - m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; - ov::pass::Manager manager_tok; - manager_tok.register_pass(); - manager_tok.run_passes(ov_tokenizer); + if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) { + ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml"); + } + + setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties); + + // If special tokens were not found from IR, try to read them from config. + // This will be triggered only for IRs older than 2024.3. 
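+        // (read_config, read_special_tokens_map and read_tokenizer_config_if_necessary
+        //  consult config.json, special_tokens_map.json and tokenizer_config.json respectively.)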
+ if (m_pad_token_id == -1 || m_bos_token_id == -1 || m_eos_token_id == -1 || + m_pad_token.empty() || m_bos_token.empty() || m_eos_token.empty()) { + read_config(models_path); + read_special_tokens_map(models_path); + // Try to read tokenizer_config if some token ids or token str are not defined. + read_tokenizer_config_if_necessary(models_path); + } - m_tokenizer = core.compile_model(ov_tokenizer, device, properties); + // If chat_template was not found in IR, try to read them from config. + if (m_chat_template.empty()) { + m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path); + } + } + + + void setupTokenizer(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { + auto [ov_tokenizer, ov_detokenizer] = models; + + m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1; + auto core = get_core_singleton(); + std::string device = "CPU"; // only CPU is supported for now + if (ov_tokenizer) { + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(ov_tokenizer); + m_tokenizer = core.compile_model(ov_tokenizer, device, properties); + + m_ireq_queue_tokenizer = std::make_unique>( + m_tokenizer.get_property(ov::optimal_number_of_infer_requests), + [this]() -> ov::InferRequest { + return std::move(this->m_tokenizer.create_infer_request()); + }); + } + if (ov_detokenizer) { ov::pass::Manager manager_detok; manager_detok.register_pass(); manager_detok.run_passes(ov_detokenizer); m_detokenizer = core.compile_model(ov_detokenizer, device, properties); - } - const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); - m_ireq_queue_tokenizer = std::make_unique>( - INFER_REQUEST_QUEUE_SIZE, - [this]() -> ov::InferRequest { - return std::move(this->m_tokenizer.create_infer_request()); - }); - if (m_detokenizer) { m_ireq_queue_detokenizer = std::make_unique>( - INFER_REQUEST_QUEUE_SIZE, + m_detokenizer.get_property(ov::optimal_number_of_infer_requests), [this]() -> ov::InferRequest { return std::move(this->m_detokenizer.create_infer_request()); }); } - - // Get special token ids by inference if they are not defined. - infer_special_tokens_if_necessary(); + // Initialize tokenizer's cache to save time later. - // infer_special_tokens_if_necessary() already could do that - // but it didn't run decode() for sure. - // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. - auto tokenized_input = encode("non empty string").input_ids; + if (m_tokenizer) { + // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. + encode("non empty string").input_ids; if (m_detokenizer) - decode(tokenized_input); + decode({1, 33, 199, 42, 42}); + } + + utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template); + utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id); + utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id); + utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id); + + m_chat_template = patch_chat_template(m_chat_template); + if (m_detokenizer) { + // Unset/-1 token causes exception in SentencePiece detokenization. 
+ if (m_pad_token_id != -1) + m_pad_token = decode(std::vector{m_pad_token_id}); + if (m_bos_token_id != -1) + m_bos_token = decode(std::vector{m_bos_token_id}); + if (m_eos_token_id != -1) + m_eos_token = decode(std::vector{m_eos_token_id}); + } } // load special tokens ids from config.json @@ -453,7 +506,7 @@ class Tokenizer::TokenizerImpl { std::string res; ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); - + return patch_chat_template(res); } @@ -518,10 +571,40 @@ class Tokenizer::TokenizerImpl { }; Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { - ScopedVar env_manager(tokenizers_relative_to_genai().string()); m_pimpl = std::make_shared(tokenizer_path, properties); } +Tokenizer::Tokenizer( + const std::string& tokenizer_model_str, + ov::Tensor& tokenizer_weights_tensor, + std::string& detokenizer_model_str, + ov::Tensor& detokenizer_weights_tensor, + const ov::AnyMap& properties +) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); + + auto ov_tokenizer = core.read_model(tokenizer_model_str, tokenizer_weights_tensor); + auto ov_detokenizer = core.read_model(detokenizer_model_str, detokenizer_weights_tensor); + m_pimpl = std::make_shared(std::make_pair(ov_tokenizer, ov_detokenizer), properties); +} + +Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + auto core = get_core_singleton(); + auto model = core.read_model(model_str, weights_tensor); + + auto parameters = model->get_parameters(); + OPENVINO_ASSERT(!parameters.empty()); + if (parameters.front()->get_element_type() == ov::element::string) { + // It's a tokenizer + m_pimpl = std::make_shared(std::make_pair(model, nullptr), properties); + } else { + // It's a detokenizer + m_pimpl = std::make_shared(std::make_pair(nullptr, model), properties); + } +} + TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) { check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()}); return m_pimpl->encode(std::move(prompt), tokenization_params); @@ -557,6 +640,7 @@ std::vector Tokenizer::decode(std::vector> lin return m_pimpl->decode(lines, detokenization_params); } + int64_t Tokenizer::get_bos_token_id() const { return m_pimpl->m_bos_token_id; } diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index f1718a8a5d..337b0ab47e 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -219,6 +219,29 @@ std::pair split_core_compile_config(const ov::AnyMap& pr return {core_properties, compile_properties}; }; +/** + * scheduler_config is a separate config for continuous batching pipeline. + * This routine splits scheduler_config from plugin_config. 
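The new public Tokenizer constructors above allow building a tokenizer without touching the filesystem, which is what in-memory IRs (for example, buffers produced by a caller-side decryption step) need. A hedged usage sketch follows; the file names are the standard converted-model names, and the plain file reads merely stand in for however the caller obtains the buffers:

    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <openvino/genai/tokenizer.hpp>

    // Helpers for the sketch: in a real application these buffers may come from a
    // decrypted blob instead of plain files on disk.
    static std::string read_text(const std::string& path) {
        std::ifstream file(path);
        std::stringstream buffer;
        buffer << file.rdbuf();
        return buffer.str();
    }

    static ov::Tensor read_weights(const std::string& path) {
        std::ifstream file(path, std::ios::binary | std::ios::ate);
        ov::Tensor weights(ov::element::u8, {static_cast<size_t>(file.tellg())});
        file.seekg(0);
        file.read(reinterpret_cast<char*>(weights.data()), weights.get_byte_size());
        return weights;
    }

    int main() {
        std::string tokenizer_xml = read_text("openvino_tokenizer.xml");
        std::string detokenizer_xml = read_text("openvino_detokenizer.xml");
        ov::Tensor tokenizer_bin = read_weights("openvino_tokenizer.bin");
        ov::Tensor detokenizer_bin = read_weights("openvino_detokenizer.bin");

        // Constructor added by this patch: tokenizer and detokenizer built from in-memory IRs.
        ov::genai::Tokenizer tokenizer(tokenizer_xml, tokenizer_bin,
                                       detokenizer_xml, detokenizer_bin, ov::AnyMap{});

        ov::Tensor input_ids = tokenizer.encode("Alan Turing was a").input_ids;
        std::cout << tokenizer.decode(input_ids) << '\n';
        return 0;
    }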
+ */ +std::pair split_scheduler_config(const ov::AnyMap& properties) { + ov::AnyMap plugin_config = properties; + auto it = plugin_config.find(ov::genai::scheduler_config.name()); + SchedulerConfig scheduler_config; + if (it != plugin_config.end()) { + scheduler_config = it->second.as(); + plugin_config.erase(it); + } + return {plugin_config, scheduler_config}; +}; + +std::shared_ptr read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + auto [core_properties, compile_properties] = split_core_compile_config(properties); + ov::Core core; + core.set_property(core_properties); + std::filesystem::path openvino_model_name = "openvino_model.xml"; + return core.read_model((models_path / openvino_model_name).string()); +} + ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) { auto minuend_size = minuend.input_ids.get_size(); auto subtrahend_size = subtrahend.input_ids.get_size(); @@ -261,6 +284,23 @@ void slice_matmul_statefull_model(std::shared_ptr model) { } } +template +void read_rt_info(std::shared_ptr& model, const char* name, T& value) { + if (!model) + return; + if (model->get_rt_info().count(name) == 0) + return; + auto str_value = model->get_rt_info().at(name).as(); + if constexpr (std::is_same::value) { + value = std::stoll(str_value); + } else if constexpr (std::is_same::value) { + value = str_value; + } +} + +template void read_rt_info(std::shared_ptr&, const char*, int64_t&); +template void read_rt_info(std::shared_ptr&, const char*, std::string&); + ov::Core singleton_core() { static ov::Core core; return core; diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index fb58022d5f..792987d383 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -78,7 +78,11 @@ ProcessorConfig from_any_map( const ProcessorConfig& initial ); + std::pair split_core_compile_config(const ov::AnyMap& properties); +std::pair split_scheduler_config(const ov::AnyMap& properties); + +std::shared_ptr read_model_with_config(const std::filesystem::path& models_path, const ov::AnyMap& properties); ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); @@ -86,6 +90,9 @@ void slice_matmul_statefull_model(std::shared_ptr model); ov::Core singleton_core(); +template +void read_rt_info(std::shared_ptr& model, const char* name, T& value); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index 7255022238..b53cc56f10 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -201,6 +201,7 @@ void init_llm_pipeline(py::module_& m) { const std::string& device, const py::kwargs& kwargs ) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return draft_model(models_path, device, pyutils::kwargs_to_any_map(kwargs)).second; }), py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 5730def0c6..b633497d32 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -223,6 +223,38 @@ def model_tmp_path(tmpdir_factory): shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) +@pytest.fixture(scope="module") +def 
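split_scheduler_config() lets callers mix a SchedulerConfig with ordinary plugin properties in one ov::AnyMap and forwards everything else to the device untouched. A sketch of what that enables on the caller side, assuming scheduler_config is the public ov::Property that the lookup by name() refers to and that passing it switches LLMPipeline to the continuous-batching backend, as the llm_pipeline changes in this patch suggest; the model directory is a placeholder:

    #include <iostream>
    #include <string>
    #include <openvino/genai/llm_pipeline.hpp>
    #include <openvino/genai/scheduler_config.hpp>

    int main() {
        // Assumption: SchedulerConfig exposes cache_size (KV-cache budget in GB).
        ov::genai::SchedulerConfig scheduler;
        scheduler.cache_size = 1;

        // scheduler_config travels next to ordinary plugin properties; the pipeline
        // splits it out internally and uses it to set up continuous batching.
        ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU",
                                    ov::genai::scheduler_config(scheduler));

        std::string result = pipe.generate("Alan Turing was a", ov::genai::max_new_tokens(32));
        std::cout << result << '\n';
        return 0;
    }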
model_tokenizers_path_tmp_path(tmpdir_factory): + model_id, path, _, _, _ = read_model(get_models_list()[0]) + temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + + # If tokens were not found in IR, it fallback to reading from config. + # There was no easy way to add tokens to IR in tests, so we remove them + # and set tokens in configs and to check if they are read and validated correctly. + import openvino as ov + + # copy openvino converted model and tokenizers + for pattern in ['*.xml', '*.bin']: + for src_file in path.glob(pattern): + core = ov.Core() + + # Update files if they are openvino_tokenizer.xml or openvino_detokenizer.xml + if src_file.name in ['openvino_tokenizer.xml', 'openvino_detokenizer.xml']: + if src_file.exists(): + # Load the XML content + ov_model = core.read_model(src_file) + # Add empty rt_info so that tokens will be read from config instead of IR + ov_model.set_rt_info("pad_token_id", "") + ov_model.set_rt_info("eos_token_id", "") + ov_model.set_rt_info("chat_template", "") + ov.save_model(ov_model, str(temp_path / src_file.name)) + + if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: + continue + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) + def load_tok(configs: List[Tuple], temp_path): # load Tokenizer where all configs are cleared. diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index 68c25e5391..9260e671d6 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -158,6 +158,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): **tokenizer_config) tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + tok.set_chat_template(tokenizer_config['chat_template']) full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) if full_history_str != full_history_str_hf: print(f'hf reference: {full_history_str_hf}') diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index d17f3c0232..d15747be63 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -509,7 +509,8 @@ def test_load_special_tokens_str_2(model_tmp_path): @pytest.mark.precommit @pytest.mark.nightly -def test_load_special_tokens_3_(model_tmp_path): +@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") +def test_load_special_tokens_3_(model_tokenizers_path_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists # will load both string and integer representations @@ -524,7 +525,7 @@ def test_load_special_tokens_3_(model_tmp_path): "eos_token": "", } - tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) + tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) assert tok.get_pad_token() == tok_config_json['pad_token'] assert tok.get_bos_token() == tok_config_json['bos_token'] assert tok.get_eos_token() == tok_config_json['eos_token'] @@ -605,7 +606,8 @@ def test_load_special_tokens_4(model_tmp_path): invalid_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), - dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests + # dict(do_sample=True), # no eos_token_id no 
max_new_tokens, no max_len dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp
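Each entry in invalid_configs above combines parameters that cannot work together: sampling without any stopping criterion, out-of-range penalties or temperatures, and so on. A minimal illustration of one such case, under the assumption that GenerationConfig::validate() applies the same checks generate() relies on:

    #include <exception>
    #include <iostream>
    #include <openvino/genai/generation_config.hpp>

    int main() {
        ov::genai::GenerationConfig config;
        config.eos_token_id = 42;
        config.max_new_tokens = 20;
        config.repetition_penalty = -1.0f;  // mirrors the "invalid penalty" case above

        try {
            config.validate();  // assumed to reject the config the same way generate() would
        } catch (const std::exception& error) {
            std::cerr << "rejected: " << error.what() << '\n';
        }
        return 0;
    }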
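The model_tokenizers_path_tmp_path fixture above blanks the token-related rt_info entries so that loading has to fall back to the JSON configs. For reference, a loose equivalent of that preparation with the C++ core API; the extension path and file locations are placeholders, and the entries are erased outright rather than overwritten with empty strings:

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        // The tokenizer IR contains custom ops, so the openvino_tokenizers extension
        // must be registered before read_model(); the library path is a placeholder.
        core.add_extension("libopenvino_tokenizers.so");

        auto model = core.read_model("openvino_tokenizer.xml");
        for (const char* key : {"pad_token_id", "eos_token_id", "chat_template"}) {
            model->get_rt_info().erase(key);  // erasing an absent key is a no-op
        }
        ov::save_model(model, "cleared/openvino_tokenizer.xml");
        return 0;
    }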