use string and ov::Tensor instead of a raw buffer
pavel-esir committed Nov 27, 2024
1 parent 624d9f5 commit 9ed8f6e
Showing 7 changed files with 54 additions and 57 deletions.
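For context, the core pattern this commit moves to is `ov::Core::read_model(std::string, ov::Tensor)`, which accepts the model XML and weights directly instead of requiring raw `std::vector<uint8_t>` buffers to be converted first. A minimal sketch of the new calling convention (the helper name is hypothetical, not part of the commit):

```cpp
#include <openvino/openvino.hpp>

// Hypothetical helper illustrating the new convention:
// model_str holds the IR XML text, weights_tensor holds the .bin contents.
std::shared_ptr<ov::Model> read_from_memory(ov::Core& core,
                                            std::string& model_str,
                                            ov::Tensor& weights_tensor) {
    // read_model accepts the XML as a string and the weights as an ov::Tensor,
    // so no intermediate buffer-to-string/tensor conversion is needed.
    return core.read_model(model_str, weights_tensor);
}
```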
6 changes: 3 additions & 3 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -38,10 +38,10 @@ int main(int argc, char* argv[]) try {


std::string device = "CPU"; // GPU, NPU can be used as well
- ov::genai::LLMPipeline pipe(models_path, device);
+ // ov::genai::LLMPipeline pipe(models_path, device);

- // ov::genai::Tokenizer tok(models_path);
- // ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);
+ ov::genai::Tokenizer tok(models_path);
+ ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
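The diff does not show how `model_uint8_buffer` and `weights_uint8_buffer` are populated; with the new signature the first argument must be the model XML as a `std::string` and the second the weights as an `ov::Tensor`. A sketch of loading them from IR files (helper names and file names are assumptions, not part of the sample):

```cpp
#include <fstream>
#include <iterator>
#include <string>
#include <openvino/openvino.hpp>

// Read a whole file into a string (for the model XML).
std::string read_file_to_string(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    return {std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>()};
}

// Read a whole file into a u8 ov::Tensor that owns its data (for the weights).
ov::Tensor read_file_to_tensor(const std::string& path) {
    std::ifstream file(path, std::ios::binary | std::ios::ate);
    size_t size = file.tellg();
    file.seekg(0);
    ov::Tensor tensor(ov::element::u8, {size});
    file.read(reinterpret_cast<char*>(tensor.data()), size);
    return tensor;
}

// Hypothetical usage inside the sample:
// std::string model_uint8_buffer = read_file_to_string(models_path + "/openvino_model.xml");
// ov::Tensor weights_uint8_buffer = read_file_to_tensor(models_path + "/openvino_model.bin");
```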
4 changes: 2 additions & 2 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -113,8 +113,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
);

LLMPipeline(
- std::vector<uint8_t>& model_buffer,
- std::vector<uint8_t>& weights_buffer,
+ std::string& model_str,
+ ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties = {}
29 changes: 13 additions & 16 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,31 +36,28 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
- * @param tokenizer_model_buffer buffer with tokenizer model
- * @param tokenizer_weights_buffer buffer with tokenizer weights
- * @param detokenizer_model_buffer buffer with detokenizer model
- * @param detokenizer_weights_buffer buffer with detokenizer weights
+ * @param tokenizer_model_str tokenizer model string
+ * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
+ * @param detokenizer_model_str detokenizer model string
+ * @param detokenizer_weights_tensor ov::Tensor with detokenizer weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(
- std::vector<uint8_t>& tokenizer_model_buffer,
- std::vector<uint8_t>& tokenizer_weights_buffer,
- std::vector<uint8_t>& detokenizer_model_buffer,
- std::vector<uint8_t>& detokenizer_weights_buffer,
+ std::string& tokenizer_model_str,
+ ov::Tensor& tokenizer_weights_tensor,
+ std::string& detokenizer_model_str,
+ ov::Tensor& detokenizer_weights_tensor,
const ov::AnyMap& properties = {}
);

/**
- * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
- * @param model_buffer buffer with model
- * @param weights_buffer buffer with weights
+ * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
+ * Whether it's tokenizer or detokenizer is defined from model input signature
+ * @param model_str model string
+ * @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
*/
- Tokenizer(
- std::vector<uint8_t>& model_buffer,
- std::vector<uint8_t>& weights_buffer,
- const ov::AnyMap& properties = {}
- );
+ Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {});

// TODO: add constructor for ov::Properties as well

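A usage sketch for the new single-input `Tokenizer` constructor; as the doc comment notes, whether the model acts as a tokenizer or a detokenizer is inferred from its input signature. The file names follow the usual `openvino_tokenizer.*` convention but are assumptions here, and the file-reading helpers are the hypothetical ones sketched above:

```cpp
// Load the tokenizer IR as string + tensor (helpers sketched earlier).
std::string tok_xml = read_file_to_string("openvino_tokenizer.xml");
ov::Tensor tok_weights = read_file_to_tensor("openvino_tokenizer.bin");

// The constructor inspects the model's input signature to decide which
// role it plays (string input -> tokenizer, token-ids input -> detokenizer).
ov::genai::Tokenizer tokenizer(tok_xml, tok_weights);

// encode() is available when the loaded model turned out to be a tokenizer.
auto tokens = tokenizer.encode("Hello world");
```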
7 changes: 4 additions & 3 deletions src/cpp/src/llm_pipeline.cpp
@@ -599,8 +599,8 @@ ov::genai::LLMPipeline::LLMPipeline(
}

ov::genai::LLMPipeline::LLMPipeline(
- std::vector<uint8_t>& model_buffer,
- std::vector<uint8_t>& weights_buffer,
+ std::string& model_str,
+ ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& config
@@ -620,7 +620,8 @@ ov::genai::LLMPipeline::LLMPipeline(
// TODO: check what's with the adapters
ov::InferRequest request;
ov::Core core = utils::singleton_core();
- auto model = utils::get_model_from_buffer(core, model_buffer, weights_buffer);
+ auto model = core.read_model(model_str, weights_tensor);
+
utils::slice_matmul_statefull_model(model);
request = utils::singleton_core().compile_model(model, device, config).create_infer_request();
m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer);
43 changes: 22 additions & 21 deletions src/cpp/src/tokenizer.cpp
@@ -189,18 +189,23 @@ class Tokenizer::TokenizerImpl {
m_eos_token = decode(std::vector{m_eos_token_id});
}

- TokenizerImpl(std::vector<uint8_t>& tokenizer_model_buffer, std::vector<uint8_t>& tokenizer_weights_buffer,
- std::vector<uint8_t>& detokenizer_model_buffer, std::vector<uint8_t>& detokenizer_weights_buffer,
- const ov::AnyMap& properties) {
+ TokenizerImpl(
+ std::string& tokenizer_model_str,
+ ov::Tensor& tokenizer_weights_tensor,
+ std::string& detokenizer_model_str,
+ ov::Tensor& detokenizer_weights_tensor,
+ const ov::AnyMap& properties
+ ) {
auto core = *get_core();
- auto ov_tokenizer = utils::get_model_from_buffer(core, tokenizer_model_buffer, tokenizer_weights_buffer);
- auto ov_detokenize = utils::get_model_from_buffer(core, detokenizer_model_buffer, detokenizer_weights_buffer);

+ auto ov_tokenizer = core.read_model(tokenizer_model_str, tokenizer_weights_tensor);
+ auto ov_detokenize = core.read_model(detokenizer_model_str, detokenizer_weights_tensor);
*this = TokenizerImpl(std::make_pair(ov_tokenizer, ov_detokenize), properties);
}

- TokenizerImpl(std::vector<uint8_t>& model_buffer, std::vector<uint8_t>& weights_buffer, const ov::AnyMap& properties) {
+ TokenizerImpl(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {}) {
auto core = *get_core();
- auto model = utils::get_model_from_buffer(core, model_buffer, weights_buffer);
+ auto model = core.read_model(model_str, weights_tensor);
+
auto parameters = model->get_parameters();
OPENVINO_ASSERT(!parameters.empty());
@@ -403,27 +408,23 @@ Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyM
}

Tokenizer::Tokenizer(
- std::vector<uint8_t>& tokenizer_model_buffer,
- std::vector<uint8_t>& tokenizer_weights_buffer,
- std::vector<uint8_t>& detokenizer_model_buffer,
- std::vector<uint8_t>& detokenizer_weights_buffer,
+ std::string& tokenizer_model_str,
+ ov::Tensor& tokenizer_weights_tensor,
+ std::string& detokenizer_model_str,
+ ov::Tensor& detokenizer_weights_tensor,
const ov::AnyMap& properties
) {
m_pimpl = std::make_shared<TokenizerImpl>(
- tokenizer_model_buffer,
- tokenizer_weights_buffer,
- detokenizer_model_buffer,
- detokenizer_weights_buffer,
+ tokenizer_model_str,
+ tokenizer_weights_tensor,
+ detokenizer_model_str,
+ detokenizer_weights_tensor,
properties
);
}

- Tokenizer::Tokenizer(
- std::vector<uint8_t>& model_buffer,
- std::vector<uint8_t>& weights_buffer,
- const ov::AnyMap& properties
- ) {
- m_pimpl = std::make_shared<TokenizerImpl>(model_buffer, weights_buffer, properties);
+ Tokenizer::Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties) {
+ m_pimpl = std::make_shared<TokenizerImpl>(model_str, weights_tensor, properties);
}

TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
20 changes: 9 additions & 11 deletions src/cpp/src/utils.cpp
@@ -260,25 +260,23 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
}
}

- std::shared_ptr<ov::Model> get_model_from_buffer(ov::Core& core, std::vector<uint8_t>& model_buffer, std::vector<uint8_t>& weights_buffer) {
- OPENVINO_ASSERT(!model_buffer.empty(), "Model buffer is empty!");
- OPENVINO_ASSERT(!weights_buffer.empty(), "Weights buffer is empty!");
-
- std::string str_model(model_buffer.begin(), model_buffer.end());
- return core.read_model(str_model, ov::Tensor(ov::element::u8, {weights_buffer.size()}, weights_buffer.data()));
- }
-
template <typename T>
- void read_rt_info(std::shared_ptr<ov::Model>& model, std::string& name, T& value) {
+ void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
if (!model)
return;
if (model->get_rt_info().count(name) == 0)
return;
auto str_value = model->get_rt_info().at(name).as<std::string>();
- value = std::is_same<T, int64_t>::value ? str_value : std::stoi(str_value);
- value = std::is_same<T, std::string>::value ? str_value : str_value;
+ if constexpr (std::is_same<T, int64_t>::value) {
+ value = std::stoll(str_value);
+ } else if constexpr (std::is_same<T, std::string>::value) {
+ value = str_value;
+ }
}

+ template void read_rt_info<int64_t>(std::shared_ptr<ov::Model>&, const char*, int64_t&);
+ template void read_rt_info<std::string>(std::shared_ptr<ov::Model>&, const char*, std::string&);
+
ov::Core singleton_core() {
static ov::Core core;
return core;
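The move to `if constexpr` here is more than style: in a runtime ternary both arms must share a common type and be assignable to `T`, which a string/integer pair cannot satisfy, so the old one-liner could not express this dispatch correctly. With `if constexpr` only the branch whose condition matches `T` is instantiated. A standalone sketch of the same dispatch (illustrative, not the library code):

```cpp
#include <cstdint>
#include <string>
#include <type_traits>

// Only the branch matching T is compiled, so the string-to-integer
// conversion never has to type-check when T is std::string.
template <typename T>
void from_rt_string(const std::string& str_value, T& value) {
    if constexpr (std::is_same<T, int64_t>::value) {
        value = std::stoll(str_value);
    } else if constexpr (std::is_same<T, std::string>::value) {
        value = str_value;
    }
}

// Usage:
// int64_t n;     from_rt_string("42", n);        // n == 42
// std::string s; from_rt_string("template", s);  // s == "template"
```

The explicit instantiations for `int64_t` and `std::string` added at the end of utils.cpp keep the template's definition out of the header while still providing the two specializations the library links against.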
2 changes: 1 addition & 1 deletion src/cpp/src/utils.hpp
@@ -69,7 +69,7 @@ std::shared_ptr<ov::Model> get_model_from_buffer(ov::Core& core, std::vector<uin
ov::Core singleton_core();

template <typename T>
- void read_rt_info(std::shared_ptr<ov::Model>& model, const std::string& name, T& value);
+ void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);

} // namespace utils
} // namespace genai