Accept buffer in LLMPipeline ctor #1262

Merged
Commits
22 commits
624d9f5
initial
pavel-esir Nov 27, 2024
9ed8f6e
use string and ov::Tensor instead of a raw buffer
pavel-esir Nov 27, 2024
dd69db2
continuous batching ctor with model from buffer
pavel-esir Nov 28, 2024
3856770
revert chat sample
pavel-esir Nov 28, 2024
2cdf0d3
add CTOR with model_str and ov::Tensor buffers to NPU/StaticLLMPipeline
pavel-esir Nov 28, 2024
5b73eb4
fix win build, fix chat template patching
pavel-esir Nov 29, 2024
0f45144
fix speculative decoding
pavel-esir Nov 29, 2024
cb7f55e
improve TokenizerImpl
pavel-esir Nov 29, 2024
add6268
fix typos
pavel-esir Nov 29, 2024
ef736e6
add encryption sample
pavel-esir Dec 2, 2024
44aede3
apply comments 1
pavel-esir Dec 2, 2024
a7081c4
Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…
pavel-esir Dec 3, 2024
ab64515
fix chat_sample and tests
pavel-esir Dec 3, 2024
380966d
remove stale todos, fix github actions yml
pavel-esir Dec 3, 2024
4b7c4c2
fix path greedy_causal_lm -> text_generation
pavel-esir Dec 4, 2024
0fc3bbe
update encrypted_model_causal_lm sample, made model_desr setable from…
pavel-esir Dec 4, 2024
62ba450
Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…
pavel-esir Dec 4, 2024
4a45257
add ctor with Properties
pavel-esir Dec 4, 2024
2457b89
fix "Yoda style" if statements, some other corrections
pavel-esir Dec 4, 2024
9befd0c
simplify a bit TokenizerImpl construction
pavel-esir Dec 4, 2024
bbe1b7b
fix plugin_config -> properties for NPY
pavel-esir Dec 4, 2024
c05febe
Merge remote-tracking branch 'upstream/releases/2024/5' into add_new_…
pavel-esir Dec 4, 2024
26 changes: 26 additions & 0 deletions src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
const ov::AnyMap& properties = {}
);

/**
* @brief Constructs a ContinuousBatchingPipeline from an already existing model and tokenizer.
*
* This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model
* represented as a string and a weights tensor, along with a manually initialized tokenizer.
* This is useful when the model and tokenizer are already loaded or created in memory and do not
* need to be loaded from files.
*
* @param model_str A string representation of the model.
* @param weights_tensor A tensor containing the weights of the model.
* @param tokenizer A manually initialized ov::genai::Tokenizer.
* @param scheduler_config Configuration for the scheduler.
* @param device The device to run the pipeline on (e.g., CPU, GPU).
* @param properties Optional properties for the pipeline.
* @param generation_config Optional generation configuration for the pipeline.
*/
ContinuousBatchingPipeline(
const std::string& model_str,
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties = {},
const ov::genai::GenerationConfig& generation_config = {}
);

ov::genai::Tokenizer get_tokenizer();

ov::genai::GenerationConfig get_config() const;
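For illustration, a minimal sketch of how this constructor can be invoked once the IR and weights are already in memory (read_model_string and read_weights_tensor are hypothetical application-side helpers, e.g. a decryption step; they are not part of this PR):

    std::string model_str = read_model_string("openvino_model.xml");        // hypothetical helper
    ov::Tensor weights_tensor = read_weights_tensor("openvino_model.bin");  // hypothetical helper
    ov::genai::Tokenizer tokenizer("path/to/model_dir");
    ov::genai::SchedulerConfig scheduler_config;
    ov::genai::ContinuousBatchingPipeline pipe(
        model_str, weights_tensor, tokenizer, scheduler_config, "CPU");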
17 changes: 17 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -112,6 +112,15 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
const ov::AnyMap& properties = {}
);

LLMPipeline(
const std::string& model_str,
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties = {},
const ov::genai::GenerationConfig& generation_config = {}
);

OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. This overload will be removed in 2025.0.0 release")
explicit LLMPipeline(const std::filesystem::path& path) :
LLMPipeline(path, "CPU") { }
@@ -274,6 +283,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> streamer(StreamerVariant func);
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> generation_config(const GenerationConfig& config);

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device = {},
const ov::AnyMap& properties = {},
const ov::genai::GenerationConfig& generation_config = {});

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device = {},
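The LLMPipeline counterpart works the same way; a sketch assuming model_str, weights_tensor, and tokenizer were prepared as above. The new draft_model overload lets the draft model for speculative decoding come from in-memory buffers as well (it takes non-const references, so the buffers must be lvalues):

    ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, "CPU");

    // Speculative decoding: the draft model, also from buffers, is passed via the properties map.
    ov::AnyMap properties{ ov::genai::draft_model(draft_model_str, draft_weights_tensor, tokenizer) };
    ov::genai::LLMPipeline spec_pipe(model_str, weights_tensor, tokenizer, "CPU", properties);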
42 changes: 38 additions & 4 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -28,12 +28,46 @@ struct TokenizedInputs {
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
/**
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {});

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
*
* This constructor is used when the tokenizer and detokenizer are separate models already loaded into memory.
* When this constructor is used, the bos, eos, and pad token ids are expected to be present in the IR.
* If your IR is older (< 2024.3), these tokens will be undefined.
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
* @param detokenizer_model_str detokenizer model string
* @param detokenizer_weights_tensor ov::Tensor with detokenizer weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(
const std::string& tokenizer_model_str,
ov::Tensor& tokenizer_weights_tensor,
std::string& detokenizer_model_str,
ov::Tensor& detokenizer_weights_tensor,
const ov::AnyMap& properties = {}
);

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
*
* This constructor is used when the tokenizer (or detokenizer) is already loaded into memory. Whether it is a
* tokenizer or a detokenizer is determined from the model input signature. When this constructor is used, the bos, eos,
* and pad token ids are expected to be present in the IR. If your IR is older (< 2024.3), these tokens will be undefined.
* @param model_str model string
* @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
*/
Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, const ov::AnyMap& properties = {});

// TODO: add constructor for ov::Properties as well

/**
* @brief ov::genai::Tokenizer constructor with variable number of properties
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
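A sketch of both new Tokenizer constructors, assuming the model strings and weight tensors are lvalues already read into memory:

    // Tokenizer and detokenizer as two separate in-memory models:
    ov::genai::Tokenizer tokenizer(tokenizer_model_str, tokenizer_weights_tensor,
                                   detokenizer_model_str, detokenizer_weights_tensor);

    // A single model; whether it is a tokenizer or a detokenizer is inferred from its input signature:
    ov::genai::Tokenizer tokenizer_only(model_str, weights_tensor);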
14 changes: 6 additions & 8 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -11,22 +11,20 @@ template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::filesystem::path& models_path,
const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) {
m_tokenizer = tokenizer;
m_generation_config = utils::from_config_json_if_exists(models_path);

m_generation_config = generation_config;
ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
std::shared_ptr<ov::Model> model = core.read_model((models_path / "openvino_model.xml").string());

DeviceConfig device_config(core, scheduler_config, device, compile_properties);

bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
14 changes: 2 additions & 12 deletions src/cpp/src/continuous_batching_impl.hpp
@@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
public:
ContinuousBatchingImpl(const std::filesystem::path& models_path,
ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties);

ContinuousBatchingImpl(const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties)
: ContinuousBatchingImpl{ models_path,
Tokenizer(models_path, tokenizer_properties),
scheduler_config,
device,
properties } {}
const ov::genai::GenerationConfig& generation_config);

GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
53 changes: 44 additions & 9 deletions src/cpp/src/continuous_batching_pipeline.cpp
@@ -20,25 +20,34 @@ using namespace ov::genai;

inline ov::genai::ModelDesc
extract_draft_model_from_config(ov::AnyMap& config) {
ov::genai::ModelDesc draft_model("");
ov::genai::ModelDesc draft_model;
if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
config.erase(utils::DRAFT_MODEL_ARG_NAME);
}
return draft_model;
}


// TODO: Check whether this ctor is necessary.
ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, properties, tokenizer_properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);
if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties);
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

@@ -49,11 +58,37 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, tokenizer, scheduler_config, device, properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto generation_config = utils::from_config_json_if_exists(models_path);

if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

ContinuousBatchingPipeline::ContinuousBatchingPipeline(
const std::string& model_str,
const ov::Tensor& weights_tensor,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config) {
auto properties_without_draft_model = properties;
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
auto model = utils::singleton_core().read_model(model_str, weights_tensor);

if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model);
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

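Putting the pieces together, an end-to-end sketch in the spirit of the encryption sample this PR adds (the decrypt_to_* helpers are hypothetical stand-ins that simply read the files verbatim; a real application would decrypt the buffers there, and the actual sample may differ):

    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include "openvino/genai/llm_pipeline.hpp"

    // Hypothetical helper: a real application would decrypt the buffer here.
    std::string decrypt_to_string(const std::string& path) {
        std::ifstream f(path, std::ios::binary);
        return {std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>()};
    }

    // Hypothetical helper: reads the weights file into a u8 tensor.
    ov::Tensor decrypt_to_tensor(const std::string& path) {
        std::ifstream f(path, std::ios::binary | std::ios::ate);
        ov::Tensor t(ov::element::u8, {static_cast<size_t>(f.tellg())});
        f.seekg(0);
        f.read(reinterpret_cast<char*>(t.data()), t.get_byte_size());
        return t;
    }

    int main() {
        std::string model_str = decrypt_to_string("openvino_model.xml");
        ov::Tensor model_weights = decrypt_to_tensor("openvino_model.bin");

        std::string tok_str = decrypt_to_string("openvino_tokenizer.xml");
        ov::Tensor tok_weights = decrypt_to_tensor("openvino_tokenizer.bin");
        std::string detok_str = decrypt_to_string("openvino_detokenizer.xml");
        ov::Tensor detok_weights = decrypt_to_tensor("openvino_detokenizer.bin");
        ov::genai::Tokenizer tokenizer(tok_str, tok_weights, detok_str, detok_weights);

        // Nothing is read from disk by the pipeline itself.
        ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, "CPU");
        std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(100)) << '\n';
    }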