Accept buffer in LLMPipeline ctor #1262

Merged
Changes from 2 commits
Commits (22 total):
624d9f5  initial  (pavel-esir, Nov 27, 2024)
9ed8f6e  use string and ov::Tensor instead of a raw buffer  (pavel-esir, Nov 27, 2024)
dd69db2  continuous batching ctor with model from buffer  (pavel-esir, Nov 28, 2024)
3856770  revert chat sample  (pavel-esir, Nov 28, 2024)
2cdf0d3  add CTOR with model_str and ov::Tensor buffers to NPU/StaticLLMPipeline  (pavel-esir, Nov 28, 2024)
5b73eb4  fix win build, fix chat template patching  (pavel-esir, Nov 29, 2024)
0f45144  fix speculative decoding  (pavel-esir, Nov 29, 2024)
cb7f55e  improve TokenizerImpl  (pavel-esir, Nov 29, 2024)
add6268  fix typos  (pavel-esir, Nov 29, 2024)
ef736e6  add encryption sample  (pavel-esir, Dec 2, 2024)
44aede3  apply comments 1  (pavel-esir, Dec 2, 2024)
a7081c4  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 3, 2024)
ab64515  fix chat_sample and tests  (pavel-esir, Dec 3, 2024)
380966d  remove stale todos, fix github actions yml  (pavel-esir, Dec 3, 2024)
4b7c4c2  fix path greedy_causal_lm -> text_generation  (pavel-esir, Dec 4, 2024)
0fc3bbe  update encrypted_model_causal_lm sample, made model_desr setable from…  (pavel-esir, Dec 4, 2024)
62ba450  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 4, 2024)
4a45257  add ctor with Properties  (pavel-esir, Dec 4, 2024)
2457b89  fix "Yoda style" if statements, some other corrections  (pavel-esir, Dec 4, 2024)
9befd0c  simplify a bit TokenizerImpl construction  (pavel-esir, Dec 4, 2024)
bbe1b7b  fix plugin_config -> properties for NPY  (pavel-esir, Dec 4, 2024)
c05febe  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 4, 2024)
32 changes: 31 additions & 1 deletion samples/cpp/chat_sample/chat_sample.cpp
@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <regex>
#include <fstream>

int main(int argc, char* argv[]) try {
if (2 != argc) {
@@ -10,9 +12,37 @@ int main(int argc, char* argv[]) try {
std::string prompt;
std::string models_path = argv[1];

std::string model_path = models_path + "/openvino_model.xml";
std::string weights_path = std::regex_replace(model_path, std::regex(".xml"), ".bin");
std::ifstream model_file(model_path, std::ios::binary | std::ios::ate);
std::ifstream weights_file(weights_path, std::ios::binary | std::ios::ate);

if (!model_file.is_open() || !weights_file.is_open()) {
throw std::runtime_error("Cannot open model or weights file");
}

std::streamsize model_size = model_file.tellg();
std::streamsize weights_size = weights_file.tellg();

model_file.seekg(0, std::ios::beg);
weights_file.seekg(0, std::ios::beg);

std::vector<char> model_buffer(model_size);
std::vector<char> weights_buffer(weights_size);

if (!model_file.read(model_buffer.data(), model_size) || !weights_file.read(weights_buffer.data(), weights_size)) {
throw std::runtime_error("Error reading model or weights file");
}
std::vector<uint8_t> model_uint8_buffer(model_buffer.begin(), model_buffer.end());
std::vector<uint8_t> weights_uint8_buffer(weights_buffer.begin(), weights_buffer.end());


std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(models_path, device);
// ov::genai::LLMPipeline pipe(models_path, device);

ov::genai::Tokenizer tok(models_path);
ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
std::function<bool(std::string)> streamer = [](std::string word) {
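
The sample above reads both files into std::vector<uint8_t> buffers. Since the overload declared in llm_pipeline.hpp below takes the model as a std::string and the weights as an ov::Tensor, the same files can also be loaded straight into those types. A minimal sketch, not part of this PR; the helper names read_model_xml and read_weights_tensor are made up for illustration:

```cpp
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "openvino/openvino.hpp"

// Read the whole IR .xml file into a std::string.
std::string read_model_xml(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::stringstream buffer;
    buffer << file.rdbuf();
    return buffer.str();
}

// Read the IR .bin file into a u8 ov::Tensor of matching size.
ov::Tensor read_weights_tensor(const std::string& path) {
    std::ifstream file(path, std::ios::binary | std::ios::ate);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);
    ov::Tensor weights(ov::element::u8, ov::Shape{static_cast<size_t>(size)});
    if (!file.read(reinterpret_cast<char*>(weights.data()), size))
        throw std::runtime_error("Error reading " + path);
    return weights;
}
```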
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -112,6 +112,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
const ov::AnyMap& properties = {}
);

LLMPipeline(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties = {}
);

OPENVINO_DEPRECATED("Please specify the device explicitly when creating an LLMPipeline. This overload will be removed in the 2025.0.0 release")
explicit LLMPipeline(const std::filesystem::path& path) :
LLMPipeline(path, "CPU") { }
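
To illustrate the new overload above, here is a hedged sketch of a call site, reusing the hypothetical read_model_xml / read_weights_tensor helpers sketched earlier; the paths are placeholders, not part of the PR:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (argc != 2)
        throw std::runtime_error(std::string("Usage: ") + argv[0] + " <MODEL_DIR>");
    std::string dir = argv[1];

    // Model IR and weights are held in memory; only the tokenizer is still
    // loaded from the directory here.
    std::string model_str = read_model_xml(dir + "/openvino_model.xml");
    ov::Tensor weights_tensor = read_weights_tensor(dir + "/openvino_model.bin");

    ov::genai::Tokenizer tokenizer(dir);
    ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    std::cout << pipe.generate("Why is the Sun yellow?", config) << std::endl;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}
```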
35 changes: 31 additions & 4 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -28,12 +28,39 @@ struct TokenizedInputs {
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
/**
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {});

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
* @param detokenizer_model_str detokenizer model string
* @param detokenizer_weights_tensor ov::Tensor with detokenizer weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(
std::string& tokenizer_model_str,
ov::Tensor& tokenizer_weights_tensor,
std::string& detokenizer_model_str,
ov::Tensor& detokenizer_weights_tensor,
const ov::AnyMap& properties = {}
);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
* Whether it is a tokenizer or a detokenizer is deduced from the model's input signature
* @param model_str model string
* @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {});

// TODO: add constructor for ov::Properties as well

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
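
For illustration only, a sketch of how the buffer-based Tokenizer constructors declared above might be used; the file names follow the usual openvino_tokenizer.xml / openvino_detokenizer.xml layout, and the loading helpers are the hypothetical ones sketched earlier:

```cpp
// Four-buffer form: tokenizer and detokenizer models plus their weights.
std::string tok_xml   = read_model_xml("model_dir/openvino_tokenizer.xml");
ov::Tensor  tok_bin   = read_weights_tensor("model_dir/openvino_tokenizer.bin");
std::string detok_xml = read_model_xml("model_dir/openvino_detokenizer.xml");
ov::Tensor  detok_bin = read_weights_tensor("model_dir/openvino_detokenizer.bin");

ov::genai::Tokenizer tokenizer(tok_xml, tok_bin, detok_xml, detok_bin);

// Single-model form: whether the buffers describe a tokenizer or a detokenizer
// is deduced from the model's input signature.
ov::genai::Tokenizer tokenizer_only(tok_xml, tok_bin);
```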
32 changes: 32 additions & 0 deletions src/cpp/src/llm_pipeline.cpp
@@ -598,6 +598,38 @@ ov::genai::LLMPipeline::LLMPipeline(
m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}

ov::genai::LLMPipeline::LLMPipeline(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& config
){
auto start_time = std::chrono::steady_clock::now();
if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
auto config_without_scheduler_config = config;
config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
// TODO: make infer request for continuous batching
// TODO: check why compiled model does not have scheduler config
// m_pimpl = std::make_unique<ContinuousBatchingAdapter>("models_path", tokenizer, config);
} else if ("NPU" == device) {
// TODO: implement
m_pimpl = std::make_unique<StaticLLMPipeline>("models_path", device, config);
} else {
// TODO: check what's with the adapters
ov::InferRequest request;
ov::Core core = utils::singleton_core();
auto model = core.read_model(model_str, weights_tensor);

utils::slice_matmul_statefull_model(model);
request = utils::singleton_core().compile_model(model, device, config).create_infer_request();
m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer);
}
auto stop_time = std::chrono::steady_clock::now();
m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}

ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
return m_pimpl->m_generation_config;
}
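
One use case visible in the commit list (the encrypted_model_causal_lm sample) is producing these buffers by in-memory decryption rather than reading plain files, so the decrypted model never touches disk. A rough, self-contained sketch under that assumption; decrypt() is an identity placeholder and every path is an example:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

// Placeholder for whatever decryption the application actually applies.
static std::string decrypt(std::string data) {
    return data;  // identity here; real code would decrypt the bytes in memory
}

static std::string read_file(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::stringstream ss;
    ss << file.rdbuf();
    return ss.str();
}

int main(int argc, char* argv[]) try {
    if (argc != 2)
        throw std::runtime_error(std::string("Usage: ") + argv[0] + " <MODEL_DIR>");
    std::string dir = argv[1];

    std::string model_xml = decrypt(read_file(dir + "/openvino_model.xml"));
    std::string weights_raw = decrypt(read_file(dir + "/openvino_model.bin"));

    // Wrap the decrypted bytes in a u8 tensor without writing them back to disk;
    // weights_raw must stay alive while the pipeline is constructed.
    ov::Tensor weights(ov::element::u8, ov::Shape{weights_raw.size()}, weights_raw.data());

    ov::genai::Tokenizer tokenizer(dir);
    ov::genai::LLMPipeline pipe(model_xml, weights, tokenizer, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 50;
    std::cout << pipe.generate("Hello", config) << std::endl;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}
```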