Skip to content

Commit

Permalink
continuous batching ctor with model from buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Nov 28, 2024
1 parent 9ed8f6e commit dd69db2
Show file tree
Hide file tree
Showing 11 changed files with 172 additions and 68 deletions.
26 changes: 26 additions & 0 deletions src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
const ov::AnyMap& properties = {}
);

/**
* @brief Constructs a ContinuousBatchingPipeline from an already existing model and tokenizer.
*
* This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model
* represented as a string and a weights tensor, along with a manually initialized tokenizer.
* This is useful when the model and tokenizer are already loaded or created in memory and do not
* need to be loaded from files.
*
* @param model_str A string representation of the model.
* @param weights_tensor A tensor containing the weights of the model.
* @param tokenizer A manually initialized ov::genai::Tokenizer.
* @param scheduler_config Configuration for the scheduler.
* @param device The device to run the pipeline on (e.g., CPU, GPU).
* @param properties Optional properties for the pipeline.
* @param generation_config Optional generation configuration for the pipeline.
*/
ContinuousBatchingPipeline(
const std::string& model_str,
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties = {},
const ov::genai::GenerationConfig& generation_config = {}
);

ov::genai::Tokenizer get_tokenizer();

ov::genai::GenerationConfig get_config() const;
Expand Down
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> streamer(StreamerVariant func);
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> generation_config(const GenerationConfig& config);

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device = {},
const ov::genai::GenerationConfig& generation_config = {},
const ov::AnyMap& properties = {});

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device = {},
Expand Down
7 changes: 6 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
* @param detokenizer_model_str detokenizer model string
Expand All @@ -52,7 +54,10 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
* Whether it's tokenizer or detokenizer is defined from model input signature
*
* This constructor is used when the tokenizer (or detokenizer) is already loaded into memory. Whether it's
* a tokenizer or a detokenizer is determined from the model input signature. When this constructor is used, the bos, eos and pad token ids
* are expected to be present in the IR. If your IR is older (< 2024.3), these tokens will be undefined.
* @param model_str model string
* @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
Expand Down
14 changes: 6 additions & 8 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,20 @@ template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::filesystem::path& models_path,
const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) {
m_tokenizer = tokenizer;
m_generation_config = utils::from_config_json_if_exists(models_path);

m_generation_config = generation_config;
ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
std::shared_ptr<ov::Model> model = core.read_model((models_path / "openvino_model.xml").string());

DeviceConfig device_config(core, scheduler_config, device, compile_properties);

bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
Expand Down
14 changes: 2 additions & 12 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
public:
ContinuousBatchingImpl(const std::filesystem::path& models_path,
ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties);

ContinuousBatchingImpl(const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties)
: ContinuousBatchingImpl{ models_path,
Tokenizer(models_path, tokenizer_properties),
scheduler_config,
device,
properties } {}
const ov::genai::GenerationConfig& generation_config);

GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
Expand Down
53 changes: 44 additions & 9 deletions src/cpp/src/continuous_batching_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,34 @@ using namespace ov::genai;

// Extracts the draft-model descriptor (if any) that a caller packed into the
// properties map under utils::DRAFT_MODEL_ARG_NAME, erasing the entry so the
// remaining properties can be forwarded to the plugin untouched.
// Returns a default-constructed ModelDesc (model == nullptr) when absent.
inline ov::genai::ModelDesc
extract_draft_model_from_config(ov::AnyMap& config) {
    ov::genai::ModelDesc draft_model;
    // Single lookup instead of find + at + erase (the fused diff text also
    // carried a stale duplicate declaration of draft_model, removed here).
    auto it = config.find(utils::DRAFT_MODEL_ARG_NAME);
    if (it != config.end()) {
        draft_model = it->second.as<ov::genai::ModelDesc>();
        config.erase(it);
    }
    return draft_model;
}


// TODO: Check whether this ctor is necessary.
ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, properties, tokenizer_properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);
if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties);
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

Expand All @@ -49,11 +58,37 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, tokenizer, scheduler_config, device, properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto generation_config = utils::from_config_json_if_exists(models_path);

if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

// Constructs the pipeline from an in-memory model (IR string + weights tensor)
// and an already-initialized tokenizer — no filesystem access. A draft model
// may still be supplied through `properties` to enable speculative decoding.
ContinuousBatchingPipeline::ContinuousBatchingPipeline(
    const std::string& model_str,
    const ov::Tensor& weights_tensor,
    const Tokenizer& tokenizer,
    const SchedulerConfig& scheduler_config,
    const std::string& device,
    const ov::AnyMap& properties,
    const ov::genai::GenerationConfig& generation_config) {
    auto properties_without_draft_model = properties;
    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
    auto model = utils::singleton_core().read_model(model_str, weights_tensor);

    if (draft_model_desr.model == nullptr) {
        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
    } else {
        // The fused diff text carried a stale removed line here that assigned
        // m_impl from the deleted path-based SpeculativeDecodingImpl ctor
        // (referencing undefined models_path/draft_model); only the
        // descriptor-based construction below is the post-change behavior.
        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
    }
}

Expand Down
46 changes: 38 additions & 8 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,14 +383,26 @@ std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device,
const ov::AnyMap& properties) {
ov::AnyMap plugin_config = properties;
auto it = plugin_config.find(ov::genai::scheduler_config.name());
SchedulerConfig scheduler_config;
if (it != plugin_config.end()) {
scheduler_config = it->second.as<SchedulerConfig>();
plugin_config.erase(it);
}
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(models_path, device, plugin_config, scheduler_config) };
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto generation_config = ov::genai::GenerationConfig(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

// Builds the DRAFT_MODEL_ARG_NAME property entry for speculative decoding from
// an in-memory model (IR string + weights tensor) and a pre-initialized
// tokenizer. Scheduler config embedded in `properties` is split out and kept
// on the descriptor instead of being forwarded to the plugin.
std::pair<std::string, Any> draft_model(
    std::string& model_str,
    ov::Tensor& weights_tensor,
    const ov::genai::Tokenizer& tokenizer,
    const std::string& device,
    const ov::genai::GenerationConfig& generation_config,
    const ov::AnyMap& properties) {
    auto split_result = utils::split_scheduler_config(properties);
    const auto& plugin_properties = split_result.first;
    const auto& scheduler_cfg = split_result.second;

    auto parsed_model = utils::singleton_core().read_model(model_str, weights_tensor);
    return { utils::DRAFT_MODEL_ARG_NAME,
             Any::make<ModelDesc>(parsed_model, tokenizer, device, plugin_properties, scheduler_cfg, generation_config) };
}

} // namespace genai
Expand Down Expand Up @@ -432,6 +444,24 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
m_generation_config = m_impl.get_config();
}

// Adapter ctor: builds the wrapped ContinuousBatchingPipeline directly from an
// in-memory model (IR string + weights tensor) and an existing tokenizer.
// NOTE(review): `generation_config` here is an ov::AnyMap of generation
// properties — the impl is constructed with its default GenerationConfig and
// the map is folded into m_generation_config afterwards; confirm this matches
// the file-loading ctors' behavior.
ContinuousBatchingAdapter(
std::string& model_str,
ov::Tensor& weights_tensor,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& generation_config,
const ov::AnyMap& plugin_config
): LLMPipelineImplBase{tokenizer}, m_impl{
model_str,
weights_tensor,
tokenizer,
scheduler_config,
device,
plugin_config} {
// Apply caller-supplied generation properties on top of the defaults.
m_generation_config.update_generation_config(generation_config);
}

ContinuousBatchingAdapter(
const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
Expand Down
33 changes: 14 additions & 19 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,27 +23,22 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) {
lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id();
}

ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
const std::filesystem::path& main_models_path,
const SchedulerConfig& main_scheduler_config,
const std::string& main_device,
const ov::AnyMap& main_properties,
const ov::genai::ModelDesc draft_model_desc,
const ov::AnyMap& tokenizer_properties) {
ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc,
const ov::genai::ModelDesc& draft_model_desc) {
ov::Core core;
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties);
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_model_desc.properties);
core.set_property(core_properties);

std::filesystem::path openvino_model_name = "openvino_model.xml",
draft_models_path = draft_model_desc.models_path;
auto main_model = main_model_desc.model;
auto draft_model = draft_model_desc.model;

std::shared_ptr<ov::Model> main_model = core.read_model((main_models_path / openvino_model_name).string()),
draft_model = core.read_model((draft_models_path / openvino_model_name).string());
auto main_scheduler_config = main_model_desc.scheduler_config;
auto main_device = main_model_desc.device;

utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);

std::string draft_device = draft_model_desc.device.empty() ? main_device : draft_model_desc.device;
std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device;

bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();

Expand Down Expand Up @@ -76,8 +71,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(

// main and draft model can have different tokenizers
// to do: support retokenization: 154103
Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties),
draft_model_tokenizer(draft_models_path, tokenizer_properties);
Tokenizer main_model_tokenizer = main_model_desc.tokenizer_model;
Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer_model;

// todo: remove this condition after support of CVS-154103
OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!");
Expand All @@ -86,10 +81,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(

// to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
main_model, main_model_tokenizer, utils::from_config_json_if_exists(main_models_path),
main_model, main_model_tokenizer, main_model_desc.generation_config,
main_device_config, main_scheduler_config, main_device, compile_properties, true);
m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
draft_model, draft_model_tokenizer, utils::from_config_json_if_exists(draft_models_path),
draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
}

Expand Down
25 changes: 14 additions & 11 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,27 @@
namespace ov::genai {

struct ModelDesc {
std::filesystem::path models_path;
std::string device;
ov::genai::SchedulerConfig scheduler_config;
ov::AnyMap properties;
ov::genai::GenerationConfig generation_config;
std::shared_ptr<ov::Model> model = nullptr;
ov::genai::Tokenizer tokenizer_model;

ModelDesc(const std::filesystem::path& models_path,
ModelDesc(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer_model,
const std::string& device = {},
const ov::AnyMap& properties = {},
const ov::genai::SchedulerConfig& scheduler_config = {}) :
models_path(models_path),
const ov::genai::SchedulerConfig& scheduler_config = {},
const ov::genai::GenerationConfig& generation_config = {}) :
model(model),
tokenizer_model(tokenizer_model),
device(device),
properties(properties),
scheduler_config(scheduler_config) {}
scheduler_config(scheduler_config),
generation_config(generation_config) {}

ModelDesc() = default;
};

class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
Expand All @@ -35,12 +43,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat
std::map<uint64_t, GenerationHandle> m_draft_generations;

public:
SpeculativeDecodingImpl(const std::filesystem::path& main_models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::ModelDesc draft_model_desc,
const ov::AnyMap& tokenizer_properties = {});
SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc);

GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
Expand Down
Loading

0 comments on commit dd69db2

Please sign in to comment.