diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp
index 478da4f9f8..a6e9c2fe0f 100644
--- a/samples/cpp/chat_sample/chat_sample.cpp
+++ b/samples/cpp/chat_sample/chat_sample.cpp
@@ -38,10 +38,10 @@ int main(int argc, char* argv[]) try {
 
     std::string device = "CPU";  // GPU, NPU can be used as well
-    ov::genai::LLMPipeline pipe(models_path, device);
+    // ov::genai::LLMPipeline pipe(models_path, device);
 
-    // ov::genai::Tokenizer tok(models_path);
-    // ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);
+    ov::genai::Tokenizer tok(models_path);
+    ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);
 
     ov::genai::GenerationConfig config;
     config.max_new_tokens = 100;
 
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 90db15d463..578f7d64b2 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -113,8 +113,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     );
 
     LLMPipeline(
-        std::vector<uint8_t>& model_buffer,
-        std::vector<uint8_t>& weights_buffer,
+        std::string& model_str,
+        ov::Tensor& weights_tensor,
         const ov::genai::Tokenizer& tokenizer,
         const std::string& device,
         const ov::AnyMap& properties = {}
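
Reviewer note: a minimal usage sketch of the new LLMPipeline overload, assuming an IR pair exported as openvino_model.xml / openvino_model.bin; the read_file helper and the paths are illustrative, not part of this patch. Also note the sample above still passes the old model_uint8_buffer / weights_uint8_buffer names, while the new overload takes a std::string plus ov::Tensor, as sketched here. The weight bytes are wrapped in a u8 tensor, mirroring what the removed utils::get_model_from_buffer did internally:

    #include <fstream>
    #include <iterator>
    #include <string>
    #include "openvino/genai/llm_pipeline.hpp"

    // Illustrative helper (not in this patch): slurp a whole file into a string.
    static std::string read_file(const std::string& path) {
        std::ifstream file(path, std::ios::binary);
        return {std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>()};
    }

    int main() {
        std::string model_str = read_file("openvino_model.xml");      // IR topology (XML text)
        std::string weights_bytes = read_file("openvino_model.bin");  // raw weight bytes

        // Wrap the weight bytes in a u8 tensor. The tensor aliases the string's
        // buffer, so weights_bytes must stay alive while the pipeline is built.
        ov::Tensor weights_tensor(ov::element::u8, {weights_bytes.size()}, weights_bytes.data());

        ov::genai::Tokenizer tok("models_dir");  // tokenizer still loaded from disk here
        ov::genai::LLMPipeline pipe(model_str, weights_tensor, tok, "CPU");
    }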
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 8bdfdd0814..5c62d732f5 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,31 +36,28 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
     * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
-    * @param tokenizer_model_buffer buffer with tokenizer model
-    * @param tokenizer_weights_buffer buffer with tokenizer weights
-    * @param detokenizer_model_buffer buffer with detokenizer model
-    * @param detokenizer_weights_buffer buffer with detokenizer weights
+    * @param tokenizer_model_str tokenizer model string
+    * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
+    * @param detokenizer_model_str detokenizer model string
+    * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights
     * @param properties Properties passed to ov::Core::compile_model
     */
     Tokenizer(
-        std::vector<uint8_t>& tokenizer_model_buffer,
-        std::vector<uint8_t>& tokenizer_weights_buffer,
-        std::vector<uint8_t>& detokenizer_model_buffer,
-        std::vector<uint8_t>& detokenizer_weights_buffer,
+        std::string& tokenizer_model_str,
+        ov::Tensor& tokenizer_weights_tensor,
+        std::string& detokenizer_model_str,
+        ov::Tensor& detokenizer_weights_tensor,
         const ov::AnyMap& properties = {}
     );
 
     /**
-    * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
-    * @param model_buffer buffer with model
-    * @param weights_buffer buffer with weights
+    * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
+    * Whether it is a tokenizer or a detokenizer is determined from the model's input signature.
+    * @param model_str model string
+    * @param weights_tensor ov::Tensor with model weights
     * @param properties Properties passed to ov::Core::compile_model
     */
-    Tokenizer(
-        std::vector<uint8_t>& model_buffer,
-        std::vector<uint8_t>& weights_buffer,
-        const ov::AnyMap& properties = {}
-    );
+    Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {});
 
     // TODO: add constructor for ov::Properties as well
 
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 3c84e51849..b4de46bb63 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -599,8 +599,8 @@ ov::genai::LLMPipeline::LLMPipeline(
 }
 
 ov::genai::LLMPipeline::LLMPipeline(
-    std::vector<uint8_t>& model_buffer,
-    std::vector<uint8_t>& weights_buffer,
+    std::string& model_str,
+    ov::Tensor& weights_tensor,
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& config
@@ -620,7 +620,8 @@ ov::genai::LLMPipeline::LLMPipeline(
     // TODO: check what's with the adapters
     ov::InferRequest request;
     ov::Core core = utils::singleton_core();
-    auto model = utils::get_model_from_buffer(core, model_buffer, weights_buffer);
+    auto model = core.read_model(model_str, weights_tensor);
+    utils::slice_matmul_statefull_model(model);
     request = utils::singleton_core().compile_model(model, device, config).create_infer_request();
 
     m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer);
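
Reviewer note: the corresponding call shape for the two new Tokenizer overloads. The openvino_tokenizer.* / openvino_detokenizer.* file names follow the usual openvino_tokenizers export layout and, like the read_file helper from the previous sketch, are assumptions rather than part of this patch:

    std::string tok_xml   = read_file("openvino_tokenizer.xml");
    std::string tok_bin   = read_file("openvino_tokenizer.bin");
    std::string detok_xml = read_file("openvino_detokenizer.xml");
    std::string detok_bin = read_file("openvino_detokenizer.bin");

    // The tensors alias the strings' buffers, so the strings must outlive construction.
    ov::Tensor tok_weights(ov::element::u8, {tok_bin.size()}, tok_bin.data());
    ov::Tensor detok_weights(ov::element::u8, {detok_bin.size()}, detok_bin.data());

    // Paired overload: tokenizer and detokenizer models passed explicitly.
    ov::genai::Tokenizer both(tok_xml, tok_weights, detok_xml, detok_weights);

    // Single-model overload: tokenizer vs. detokenizer is inferred from the
    // model's input signature, per the new doc comment in tokenizer.hpp.
    ov::genai::Tokenizer tokenizer_only(tok_xml, tok_weights);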
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index f92e9067b7..523345d715 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -189,18 +189,23 @@ class Tokenizer::TokenizerImpl {
         m_eos_token = decode(std::vector<int64_t>{m_eos_token_id});
     }
 
-    TokenizerImpl(std::vector<uint8_t>& tokenizer_model_buffer, std::vector<uint8_t>& tokenizer_weights_buffer,
-                  std::vector<uint8_t>& detokenizer_model_buffer, std::vector<uint8_t>& detokenizer_weights_buffer,
-                  const ov::AnyMap& properties) {
+    TokenizerImpl(
+        std::string& tokenizer_model_str,
+        ov::Tensor& tokenizer_weights_tensor,
+        std::string& detokenizer_model_str,
+        ov::Tensor& detokenizer_weights_tensor,
+        const ov::AnyMap& properties
+    ) {
         auto core = *get_core();
-        auto ov_tokenizer = utils::get_model_from_buffer(core, tokenizer_model_buffer, tokenizer_weights_buffer);
-        auto ov_detokenize = utils::get_model_from_buffer(core, detokenizer_model_buffer, detokenizer_weights_buffer);
+
+        auto ov_tokenizer = core.read_model(tokenizer_model_str, tokenizer_weights_tensor);
+        auto ov_detokenize = core.read_model(detokenizer_model_str, detokenizer_weights_tensor);
         *this = TokenizerImpl(std::make_pair(ov_tokenizer, ov_detokenize), properties);
     }
 
-    TokenizerImpl(std::vector<uint8_t>& model_buffer, std::vector<uint8_t>& weights_buffer, const ov::AnyMap& properties) {
+    TokenizerImpl(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {}) {
         auto core = *get_core();
-        auto model = utils::get_model_from_buffer(core, model_buffer, weights_buffer);
+        auto model = core.read_model(model_str, weights_tensor);
 
         auto parameters = model->get_parameters();
         OPENVINO_ASSERT(!parameters.empty());
@@ -403,27 +408,23 @@ Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyM
 }
 
 Tokenizer::Tokenizer(
-    std::vector<uint8_t>& tokenizer_model_buffer,
-    std::vector<uint8_t>& tokenizer_weights_buffer,
-    std::vector<uint8_t>& detokenizer_model_buffer,
-    std::vector<uint8_t>& detokenizer_weights_buffer,
+    std::string& tokenizer_model_str,
+    ov::Tensor& tokenizer_weights_tensor,
+    std::string& detokenizer_model_str,
+    ov::Tensor& detokenizer_weights_tensor,
    const ov::AnyMap& properties
) {
    m_pimpl = std::make_shared<TokenizerImpl>(
-        tokenizer_model_buffer,
-        tokenizer_weights_buffer,
-        detokenizer_model_buffer,
-        detokenizer_weights_buffer,
+        tokenizer_model_str,
+        tokenizer_weights_tensor,
+        detokenizer_model_str,
+        detokenizer_weights_tensor,
        properties
    );
}

-Tokenizer::Tokenizer(
-    std::vector<uint8_t>& model_buffer,
-    std::vector<uint8_t>& weights_buffer,
-    const ov::AnyMap& properties
-) {
-    m_pimpl = std::make_shared<TokenizerImpl>(model_buffer, weights_buffer, properties);
+Tokenizer::Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties) {
+    m_pimpl = std::make_shared<TokenizerImpl>(model_str, weights_tensor, properties);
 }
 
 TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index cd302e9a54..31a9abdfc8 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -260,25 +260,23 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
     }
 }
 
-std::shared_ptr<ov::Model> get_model_from_buffer(ov::Core& core, std::vector<uint8_t>& model_buffer, std::vector<uint8_t>& weights_buffer) {
-    OPENVINO_ASSERT(!model_buffer.empty(), "Model buffer is empty!");
-    OPENVINO_ASSERT(!weights_buffer.empty(), "Weights buffer is empty!");
-
-    std::string str_model(model_buffer.begin(), model_buffer.end());
-    return core.read_model(str_model, ov::Tensor(ov::element::u8, {weights_buffer.size()}, weights_buffer.data()));
-}
-
 template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const std::string& name, T& value) {
+void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
     if (!model)
         return;
     if (model->get_rt_info().count(name) == 0)
         return;
     auto str_value = model->get_rt_info().at(name).as<std::string>();
-    value = std::is_same<T, std::string>::value ? str_value : std::stoi(str_value);
-    value = std::is_same<T, std::string>::value ? str_value : str_value;
+    if constexpr (std::is_same<T, int64_t>::value) {
+        value = std::stoll(str_value);
+    } else if constexpr (std::is_same<T, std::string>::value) {
+        value = str_value;
+    }
 }
 
+template void read_rt_info(std::shared_ptr<ov::Model>&, const char*, int64_t&);
+template void read_rt_info(std::shared_ptr<ov::Model>&, const char*, std::string&);
+
 ov::Core singleton_core() {
     static ov::Core core;
     return core;
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 19968e035f..7e34f03426 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -69,7 +69,7 @@
 std::shared_ptr<ov::Model> get_model_from_buffer(ov::Core& core, std::vector<uint8_t>& model_buffer, std::vector<uint8_t>& weights_buffer);
 
 template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const std::string& name, T& value);
+void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);
 
 } // namespace utils
 } // namespace genai
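
Reviewer note: because the read_rt_info definition lives in utils.cpp, only the two explicitly instantiated types (int64_t and std::string) link; any other T fails at link time rather than silently doing nothing. Switching the key to const char* also avoids constructing a temporary std::string at every call site. A sketch of the intended call shape inside the library, continuing the earlier sketch; the "eos_token_id" and "chat_template" keys are illustrative examples of integer- and string-valued rt_info entries and may be absent from a given model:

    // Sketch only: reading optional rt_info entries from a loaded model.
    ov::Core core = utils::singleton_core();
    std::shared_ptr<ov::Model> model = core.read_model(model_str, weights_tensor);

    int64_t eos_token_id = -1;  // left unchanged if the key is missing
    utils::read_rt_info(model, "eos_token_id", eos_token_id);    // T = int64_t

    std::string chat_template;  // left empty if the key is missing
    utils::read_rt_info(model, "chat_template", chat_template);  // T = std::string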