Accept buffer in LLMPipeline ctor #1262

Merged
Changes from 2 commits
Commits (22 total):
624d9f5  initial  (pavel-esir, Nov 27, 2024)
9ed8f6e  use string and ov::Tensor instead of a raw buffer  (pavel-esir, Nov 27, 2024)
dd69db2  continuous batching ctor with model from buffer  (pavel-esir, Nov 28, 2024)
3856770  revert chat sample  (pavel-esir, Nov 28, 2024)
2cdf0d3  add CTOR with model_str and ov::Tensor buffers to NPU/StaticLLMPipeline  (pavel-esir, Nov 28, 2024)
5b73eb4  fix win build, fix chat template patching  (pavel-esir, Nov 29, 2024)
0f45144  fix speculative decoding  (pavel-esir, Nov 29, 2024)
cb7f55e  improve TokenizerImpl  (pavel-esir, Nov 29, 2024)
add6268  fix typos  (pavel-esir, Nov 29, 2024)
ef736e6  add encryption sample  (pavel-esir, Dec 2, 2024)
44aede3  apply comments 1  (pavel-esir, Dec 2, 2024)
a7081c4  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 3, 2024)
ab64515  fix chat_sample and tests  (pavel-esir, Dec 3, 2024)
380966d  remove stale todos, fix github actions yml  (pavel-esir, Dec 3, 2024)
4b7c4c2  fix path greedy_causal_lm -> text_generation  (pavel-esir, Dec 4, 2024)
0fc3bbe  update encrypted_model_causal_lm sample, made model_desr setable from…  (pavel-esir, Dec 4, 2024)
62ba450  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 4, 2024)
4a45257  add ctor with Properties  (pavel-esir, Dec 4, 2024)
2457b89  fix "Yoda style" if statements, some other corrections  (pavel-esir, Dec 4, 2024)
9befd0c  simplify a bit TokenizerImpl construction  (pavel-esir, Dec 4, 2024)
bbe1b7b  fix plugin_config -> properties for NPY  (pavel-esir, Dec 4, 2024)
c05febe  Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…  (pavel-esir, Dec 4, 2024)
32 changes: 31 additions & 1 deletion samples/cpp/chat_sample/chat_sample.cpp
@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <regex>
#include <fstream>

int main(int argc, char* argv[]) try {
if (2 != argc) {
@@ -10,9 +12,37 @@ int main(int argc, char* argv[]) try {
std::string prompt;
std::string models_path = argv[1];

std::string model_path = models_path + "/openvino_model.xml";
std::string weights_path = std::regex_replace(model_path, std::regex(".xml"), ".bin");
std::ifstream model_file(model_path, std::ios::binary | std::ios::ate);
std::ifstream weights_file(weights_path, std::ios::binary | std::ios::ate);

if (!model_file.is_open() || !weights_file.is_open()) {
throw std::runtime_error("Cannot open model or weights file");
}

std::streamsize model_size = model_file.tellg();
std::streamsize weights_size = weights_file.tellg();

model_file.seekg(0, std::ios::beg);
weights_file.seekg(0, std::ios::beg);

std::vector<char> model_buffer(model_size);
std::vector<char> weights_buffer(weights_size);

if (!model_file.read(model_buffer.data(), model_size) || !weights_file.read(weights_buffer.data(), weights_size)) {
throw std::runtime_error("Error reading model or weights file");
}
std::vector<uint8_t> model_uint8_buffer(model_buffer.begin(), model_buffer.end());
std::vector<uint8_t> weights_uint8_buffer(weights_buffer.begin(), weights_buffer.end());


std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(models_path, device);
// ov::genai::LLMPipeline pipe(models_path, device);

ov::genai::Tokenizer tok(models_path);
ov::genai::LLMPipeline pipe(model_uint8_buffer, weights_uint8_buffer, tok, device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
std::function<bool(std::string)> streamer = [](std::string word) {
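
The sample above reads both files into std::vector<uint8_t> buffers. Since the overload declared in llm_pipeline.hpp below takes the model as a std::string and the weights as an ov::Tensor, the same files can also be loaded straight into those types. A minimal sketch, not part of this PR; the helper names read_model_xml and read_weights_tensor are made up for illustration:

```cpp
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "openvino/openvino.hpp"

// Read the whole IR .xml file into a std::string.
std::string read_model_xml(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::stringstream buffer;
    buffer << file.rdbuf();
    return buffer.str();
}

// Read the IR .bin file into a u8 ov::Tensor of matching size.
ov::Tensor read_weights_tensor(const std::string& path) {
    std::ifstream file(path, std::ios::binary | std::ios::ate);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);
    ov::Tensor weights(ov::element::u8, ov::Shape{static_cast<size_t>(size)});
    if (!file.read(reinterpret_cast<char*>(weights.data()), size))
        throw std::runtime_error("Error reading " + path);
    return weights;
}
```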
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -112,6 +112,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
const ov::AnyMap& properties = {}
);

LLMPipeline(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties = {}
);

OPENVINO_DEPRECATED("Please specify the device explicitly when creating an LLMPipeline. This overload will be removed in the 2025.0.0 release")
explicit LLMPipeline(const std::filesystem::path& path) :
LLMPipeline(path, "CPU") { }
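
To illustrate the new overload above, here is a hedged sketch of a call site, reusing the hypothetical read_model_xml / read_weights_tensor helpers sketched earlier; the paths are placeholders, not part of the PR:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (argc != 2)
        throw std::runtime_error(std::string("Usage: ") + argv[0] + " <MODEL_DIR>");
    std::string dir = argv[1];

    // Model IR and weights are held in memory; only the tokenizer is still
    // loaded from the directory here.
    std::string model_str = read_model_xml(dir + "/openvino_model.xml");
    ov::Tensor weights_tensor = read_weights_tensor(dir + "/openvino_model.bin");

    ov::genai::Tokenizer tokenizer(dir);
    ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    std::cout << pipe.generate("Why is the Sun yellow?", config) << std::endl;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}
```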
35 changes: 31 additions & 4 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -28,12 +28,39 @@ struct TokenizedInputs {
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
/**
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {});

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
* @param detokenizer_model_str detokenizer model string
* @param detokenizer_weights_tensor ov::Tensor with detokenizer weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(
std::string& tokenizer_model_str,
ov::Tensor& tokenizer_weights_tensor,
std::string& detokenizer_model_str,
ov::Tensor& detokenizer_weights_tensor,
const ov::AnyMap& properties = {}
);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
* Whether it is a tokenizer or a detokenizer is deduced from the model's input signature
* @param model_str model string
* @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {});

// TODO: add constructor for ov::Properties as well

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
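
For illustration only, a sketch of how the buffer-based Tokenizer constructors declared above might be used; the file names follow the usual openvino_tokenizer.xml / openvino_detokenizer.xml layout, and the loading helpers are the hypothetical ones sketched earlier:

```cpp
// Four-buffer form: tokenizer and detokenizer models plus their weights.
std::string tok_xml   = read_model_xml("model_dir/openvino_tokenizer.xml");
ov::Tensor  tok_bin   = read_weights_tensor("model_dir/openvino_tokenizer.bin");
std::string detok_xml = read_model_xml("model_dir/openvino_detokenizer.xml");
ov::Tensor  detok_bin = read_weights_tensor("model_dir/openvino_detokenizer.bin");

ov::genai::Tokenizer tokenizer(tok_xml, tok_bin, detok_xml, detok_bin);

// Single-model form: whether the buffers describe a tokenizer or a detokenizer
// is deduced from the model's input signature.
ov::genai::Tokenizer tokenizer_only(tok_xml, tok_bin);
```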
32 changes: 32 additions & 0 deletions src/cpp/src/llm_pipeline.cpp
@@ -598,6 +598,38 @@ ov::genai::LLMPipeline::LLMPipeline(
m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}

ov::genai::LLMPipeline::LLMPipeline(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& config
){
auto start_time = std::chrono::steady_clock::now();
if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
auto config_without_scheduler_config = config;
config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
// TODO: make infer request for continuous batching
// TODO: check why compiled model does not have scheduler config
// m_pimpl = std::make_unique<ContinuousBatchingAdapter>("models_path", tokenizer, config);
} else if ("NPU" == device) {
// TODO: implement
m_pimpl = std::make_unique<StaticLLMPipeline>("models_path", device, config);
} else {
// TODO: check what's with the adapters
ov::InferRequest request;
ov::Core core = utils::singleton_core();
auto model = core.read_model(model_str, weights_tensor);

utils::slice_matmul_statefull_model(model);
request = utils::singleton_core().compile_model(model, device, config).create_infer_request();
m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer);
}
auto stop_time = std::chrono::steady_clock::now();
m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}

ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
return m_pimpl->m_generation_config;
}
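
One use case visible in the commit list (the encrypted_model_causal_lm sample) is producing these buffers by in-memory decryption rather than reading plain files, so the decrypted model never touches disk. A rough, self-contained sketch under that assumption; decrypt() is an identity placeholder and every path is an example:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

// Placeholder for whatever decryption the application actually applies.
static std::string decrypt(std::string data) {
    return data;  // identity here; real code would decrypt the bytes in memory
}

static std::string read_file(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file.is_open())
        throw std::runtime_error("Cannot open " + path);
    std::stringstream ss;
    ss << file.rdbuf();
    return ss.str();
}

int main(int argc, char* argv[]) try {
    if (argc != 2)
        throw std::runtime_error(std::string("Usage: ") + argv[0] + " <MODEL_DIR>");
    std::string dir = argv[1];

    std::string model_xml = decrypt(read_file(dir + "/openvino_model.xml"));
    std::string weights_raw = decrypt(read_file(dir + "/openvino_model.bin"));

    // Wrap the decrypted bytes in a u8 tensor without writing them back to disk;
    // weights_raw must stay alive while the pipeline is constructed.
    ov::Tensor weights(ov::element::u8, ov::Shape{weights_raw.size()}, weights_raw.data());

    ov::genai::Tokenizer tokenizer(dir);
    ov::genai::LLMPipeline pipe(model_xml, weights, tokenizer, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 50;
    std::cout << pipe.generate("Hello", config) << std::endl;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}
```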