From dd69db2716b871e974eff5973f4e2cc31191a541 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 28 Nov 2024 16:03:09 +0100
Subject: [PATCH] continuous batching ctor with model from buffer

---
 .../genai/continuous_batching_pipeline.hpp    | 26 +++++++++
 .../include/openvino/genai/llm_pipeline.hpp   |  8 +++
 src/cpp/include/openvino/genai/tokenizer.hpp  |  7 ++-
 src/cpp/src/continuous_batching_impl.cpp      | 14 +++--
 src/cpp/src/continuous_batching_impl.hpp      | 14 +----
 src/cpp/src/continuous_batching_pipeline.cpp  | 53 +++++++++++++++----
 src/cpp/src/llm_pipeline.cpp                  | 46 +++++++++++++---
 .../speculative_decoding_impl.cpp             | 33 +++++-------
 .../speculative_decoding_impl.hpp             | 25 +++++----
 src/cpp/src/utils.cpp                         | 13 +++++
 src/cpp/src/utils.hpp                         |  1 +
 11 files changed, 172 insertions(+), 68 deletions(-)

diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 2bf5dd773b..4a0637f2d9 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
         const ov::AnyMap& properties = {}
     );
 
+    /**
+     * @brief Constructs a ContinuousBatchingPipeline from an already existing model and tokenizer.
+     *
+     * This constructor allows a ContinuousBatchingPipeline to be created from an existing model
+     * represented as a string plus a weights tensor, together with a manually initialized tokenizer.
+     * This is useful when the model and tokenizer have already been loaded or created in memory and
+     * do not need to be read from files.
+     *
+     * @param model_str A string representation of the model.
+     * @param weights_tensor A tensor containing the weights of the model.
+     * @param tokenizer A manually initialized ov::genai::Tokenizer.
+     * @param scheduler_config Configuration for the scheduler.
+     * @param device The device to run the pipeline on (e.g., CPU, GPU).
+     * @param properties Optional properties for the pipeline.
+     * @param generation_config Optional generation configuration for the pipeline.
+     */
+    ContinuousBatchingPipeline(
+        const std::string& model_str,
+        const ov::Tensor& weights_tensor,
+        const ov::genai::Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& properties = {},
+        const ov::genai::GenerationConfig& generation_config = {}
+    );
+
     ov::genai::Tokenizer get_tokenizer();
 
     ov::genai::GenerationConfig get_config() const;
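
For illustration, a minimal usage sketch of the new buffer-based constructor. The file names, the "CPU" device, the prompt, and the read_file() helper are assumptions; any mechanism that yields the IR string and a u8 ov::Tensor with the weight bytes works.

// Usage sketch (illustrative): constructing the pipeline from in-memory buffers.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "openvino/genai/continuous_batching_pipeline.hpp"

static std::string read_file(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    std::stringstream ss;
    ss << file.rdbuf();
    return ss.str();
}

int main() {
    // In a real application these buffers could come from a network
    // download or an archive rather than from local files.
    std::string model_str = read_file("openvino_model.xml");
    std::string weights_str = read_file("openvino_model.bin");

    // Wrap the raw weight bytes in an ov::Tensor. The tensor only references
    // weights_str, so the string must stay alive while the pipeline is built.
    ov::Tensor weights_tensor(ov::element::u8, {weights_str.size()},
                              weights_str.data());

    // The tokenizer is built from a directory here for brevity; the Tokenizer
    // constructors touched by this patch allow building it from buffers too.
    ov::genai::Tokenizer tokenizer("model_dir");

    ov::genai::SchedulerConfig scheduler_config;
    ov::genai::ContinuousBatchingPipeline pipe(
        model_str, weights_tensor, tokenizer, scheduler_config, "CPU");

    std::vector<ov::genai::GenerationResult> results =
        pipe.generate({"Why is the sky blue?"}, {ov::genai::greedy()});
    std::cout << results[0].m_generation_ids[0] << std::endl;
    return 0;
}
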
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 578f7d64b2..4b33c89e5b 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -282,6 +282,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
 OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> streamer(StreamerVariant func);
 OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> generation_config(const GenerationConfig& config);
 
+OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
+    std::string& model_str,
+    ov::Tensor& weights_tensor,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string& device = {},
+    const ov::genai::GenerationConfig& generation_config = {},
+    const ov::AnyMap& properties = {});
+
 OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
     const std::filesystem::path& models_path,
     const std::string& device = {},
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 5c62d732f5..e01699eeb6 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,6 +36,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
+     *
+     * This constructor is used when the tokenizer and detokenizer are separate models already loaded into memory.
      * @param tokenizer_model_str tokenizer model string
     * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
      * @param detokenizer_model_str detokenizer model string
@@ -52,7 +54,10 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
-     * Whether it's tokenizer or detokenizer is defined from model input signature
+     *
+     * This constructor is used when a tokenizer (or detokenizer) is already loaded into memory. Whether it is a
+     * tokenizer or a detokenizer is determined from the model input signature. When this constructor is used, the bos,
+     * eos and pad token ids are expected to be present in the IR. If your IR is older (< 2024.3), these tokens will be undefined.
      * @param model_str model string
      * @param weights_tensor ov::Tensor with model weights
      * @param properties Properties passed to ov::Core::compile_model
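
A minimal sketch of the two buffer-based Tokenizer constructors described above; the file names and the read_file()/wrap_bytes() helpers are assumptions.

// Usage sketch (illustrative): building a Tokenizer from in-memory IRs.
#include <fstream>
#include <sstream>
#include <string>

#include "openvino/genai/tokenizer.hpp"

static std::string read_file(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    std::stringstream ss;
    ss << file.rdbuf();
    return ss.str();
}

// Wraps raw bytes into a u8 ov::Tensor; the buffer must outlive the tensor.
static ov::Tensor wrap_bytes(std::string& bytes) {
    return ov::Tensor(ov::element::u8, {bytes.size()}, bytes.data());
}

int main() {
    std::string tok_xml = read_file("openvino_tokenizer.xml");
    std::string tok_bin = read_file("openvino_tokenizer.bin");
    std::string detok_xml = read_file("openvino_detokenizer.xml");
    std::string detok_bin = read_file("openvino_detokenizer.bin");

    ov::Tensor tok_weights = wrap_bytes(tok_bin);
    ov::Tensor detok_weights = wrap_bytes(detok_bin);

    // Tokenizer and detokenizer as two separate in-memory models.
    ov::genai::Tokenizer tokenizer(tok_xml, tok_weights, detok_xml, detok_weights);

    // Single in-memory model; whether it is a tokenizer or a detokenizer is
    // inferred from its input signature.
    ov::genai::Tokenizer encode_only(tok_xml, tok_weights);

    auto tokens = tokenizer.encode("Hello world");
    return 0;
}
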
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 916167b63b..8e7386d9d3 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -11,22 +11,20 @@ template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
 template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
 
 ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
-    const std::filesystem::path& models_path,
+    const std::shared_ptr<ov::Model>& model,
     const Tokenizer& tokenizer,
     const SchedulerConfig& scheduler_config,
     const std::string& device,
-    const ov::AnyMap& properties) {
+    const ov::AnyMap& properties,
+    const ov::genai::GenerationConfig& generation_config
+    ) {
     m_tokenizer = tokenizer;
-    m_generation_config = utils::from_config_json_if_exists(models_path);
-
+    m_generation_config = generation_config;
+
     ov::Core core;
-
     auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
     core.set_property(core_properties);
 
-    // The model can be compiled for GPU as well
-    std::shared_ptr<ov::Model> model = core.read_model((models_path / "openvino_model.xml").string());
-
     DeviceConfig device_config(core, scheduler_config, device, compile_properties);
 
     bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 8276edb36b..780bff6a31 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::ImplInterface {
     void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
 
 public:
-    ContinuousBatchingImpl(const std::filesystem::path& models_path,
+    ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
                            const Tokenizer& tokenizer,
-                           const SchedulerConfig& scheduler_config,
-                           const std::string& device,
-                           const ov::AnyMap& properties);
-
-    ContinuousBatchingImpl(const std::filesystem::path& models_path,
                            const SchedulerConfig& scheduler_config,
                            const std::string& device,
                            const ov::AnyMap& properties,
-                           const ov::AnyMap& tokenizer_properties)
-        : ContinuousBatchingImpl{ models_path,
-                                  Tokenizer(models_path, tokenizer_properties),
-                                  scheduler_config,
-                                  device,
-                                  properties } {}
+                           const ov::genai::GenerationConfig& generation_config);
 
     GenerationHandle add_request(uint64_t request_id,
                                  const ov::Tensor& input_ids,
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 6dcbf342eb..5b224ea085 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -20,7 +20,7 @@ using namespace ov::genai;
 
 inline ov::genai::ModelDesc
 extract_draft_model_from_config(ov::AnyMap& config) {
-    ov::genai::ModelDesc draft_model("");
+    ov::genai::ModelDesc draft_model;
     if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
         draft_model = config.at(utils::DRAFT_MODEL_ARG_NAME).as<ov::genai::ModelDesc>();
         config.erase(utils::DRAFT_MODEL_ARG_NAME);
@@ -28,17 +28,26 @@ extract_draft_model_from_config(ov::AnyMap& config) {
     return draft_model;
 }
 
+
+// TODO: Check whether this ctor is necessary.
 ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
                                                         const SchedulerConfig& scheduler_config,
                                                         const std::string& device,
                                                         const ov::AnyMap& properties,
                                                         const ov::AnyMap& tokenizer_properties) {
     auto properties_without_draft_model = properties;
-    auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
-    if (draft_model.models_path.empty()) {
-        m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, properties, tokenizer_properties);
+    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+
+    std::filesystem::path openvino_model_name = "openvino_model.xml";
+    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
+    auto generation_config = utils::from_config_json_if_exists(models_path);
+    if (draft_model_desr.model == nullptr) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
-        m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties);
+        // todo: check properties
+        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
     }
 }
 
@@ -49,11 +58,37 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const std::string& device,
     const ov::AnyMap& properties) {
     auto properties_without_draft_model = properties;
-    auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
-    if (draft_model.models_path.empty()) {
-        m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, tokenizer, scheduler_config, device, properties);
+    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    std::filesystem::path openvino_model_name = "openvino_model.xml";
+    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto generation_config = utils::from_config_json_if_exists(models_path);
+
+    if (draft_model_desr.model == nullptr) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
+    } else {
+        // todo: check properties
+        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
+    }
+}
+
+ContinuousBatchingPipeline::ContinuousBatchingPipeline(
+    const std::string& model_str,
+    const ov::Tensor& weights_tensor,
+    const Tokenizer& tokenizer,
+    const SchedulerConfig& scheduler_config,
+    const std::string& device,
+    const ov::AnyMap& properties,
+    const ov::genai::GenerationConfig& generation_config) {
+    auto properties_without_draft_model = properties;
+    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
+    auto model = utils::singleton_core().read_model(model_str, weights_tensor);
+
+    if (draft_model_desr.model == nullptr) {
+        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
     } else {
-        m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model);
+        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
+        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
     }
 }
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index b4de46bb63..09105e0f03 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -383,14 +383,26 @@ std::pair<std::string, Any> draft_model(
     const std::filesystem::path& models_path,
     const std::string& device,
     const ov::AnyMap& properties) {
-    ov::AnyMap plugin_config = properties;
-    auto it = plugin_config.find(ov::genai::scheduler_config.name());
-    SchedulerConfig scheduler_config;
-    if (it != plugin_config.end()) {
-        scheduler_config = it->second.as<SchedulerConfig>();
-        plugin_config.erase(it);
-    }
-    return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(models_path, device, plugin_config, scheduler_config) };
+    auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
+
+    std::filesystem::path openvino_model_name = "openvino_model.xml";
+    auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
+    auto generation_config = ov::genai::GenerationConfig(models_path);
+    auto tokenizer = ov::genai::Tokenizer(models_path);
+    return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
+}
+
+std::pair<std::string, Any> draft_model(
+    std::string& model_str,
+    ov::Tensor& weights_tensor,
+    const ov::genai::Tokenizer& tokenizer,
+    const std::string& device,
+    const ov::genai::GenerationConfig& generation_config,
+    const ov::AnyMap& properties) {
+    auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
+
+    auto model = utils::singleton_core().read_model(model_str, weights_tensor);
+    return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
 }
 
 }  // namespace genai
@@ -432,6 +444,24 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         m_generation_config = m_impl.get_config();
     }
 
+    ContinuousBatchingAdapter(
+        std::string& model_str,
+        ov::Tensor& weights_tensor,
+        const Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device,
+        const ov::AnyMap& generation_config,
+        const ov::AnyMap& plugin_config
+    ): LLMPipelineImplBase{tokenizer}, m_impl{
+        model_str,
+        weights_tensor,
+        tokenizer,
+        scheduler_config,
+        device,
+        plugin_config} {
+        m_generation_config.update_generation_config(generation_config);
+    }
+
     ContinuousBatchingAdapter(
         const std::filesystem::path& models_path,
         const SchedulerConfig& scheduler_config,
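
To illustrate the new draft_model() overload, a sketch of enabling speculative decoding with a draft model held in memory. "main_model_dir", the draft file names, and the "CPU" device are assumptions; num_assistant_tokens follows the existing speculative-decoding GenerationConfig knob.

// Usage sketch (illustrative): speculative decoding with an in-memory draft model.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

static std::string read_file(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    std::stringstream ss;
    ss << file.rdbuf();
    return ss.str();
}

int main() {
    // Draft model IR and weights held in memory; file names are illustrative.
    std::string draft_model_str = read_file("draft/openvino_model.xml");
    std::string draft_weights_str = read_file("draft/openvino_model.bin");
    ov::Tensor draft_weights(ov::element::u8, {draft_weights_str.size()},
                             draft_weights_str.data());

    // Main and draft models must currently have equal tokenizers
    // (see the OPENVINO_ASSERT in SpeculativeDecodingImpl below).
    ov::genai::Tokenizer tokenizer("main_model_dir");

    ov::genai::LLMPipeline pipe(
        "main_model_dir",
        "CPU",
        ov::genai::draft_model(draft_model_str, draft_weights, tokenizer));

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    config.num_assistant_tokens = 5;  // tokens proposed by the draft per step

    std::cout << pipe.generate("Why is the sky blue?", config) << std::endl;
    return 0;
}
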
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 0f43555a5f..0eb6f2c0e6 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -23,27 +23,22 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) {
            lhs.get_bos_token_id() == rhs.get_bos_token_id() &&
            lhs.get_pad_token_id() == rhs.get_pad_token_id();
 }
 
-ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
-    const std::filesystem::path& main_models_path,
-    const SchedulerConfig& main_scheduler_config,
-    const std::string& main_device,
-    const ov::AnyMap& main_properties,
-    const ov::genai::ModelDesc draft_model_desc,
-    const ov::AnyMap& tokenizer_properties) {
+ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc,
+                                                                             const ov::genai::ModelDesc& draft_model_desc) {
     ov::Core core;
-    auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties);
+    auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_model_desc.properties);
     core.set_property(core_properties);
 
-    std::filesystem::path openvino_model_name = "openvino_model.xml",
-                          draft_models_path = draft_model_desc.models_path;
+    auto main_model = main_model_desc.model;
+    auto draft_model = draft_model_desc.model;
 
-    std::shared_ptr<ov::Model> main_model = core.read_model((main_models_path / openvino_model_name).string()),
-                               draft_model = core.read_model((draft_models_path / openvino_model_name).string());
+    auto main_scheduler_config = main_model_desc.scheduler_config;
+    auto main_device = main_model_desc.device;
 
-    utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction);
-    utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction);
+    utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
+    utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);
 
-    std::string draft_device = draft_model_desc.device.empty() ? main_device : draft_model_desc.device;
+    std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device;
 
     bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();
 
@@ -76,8 +71,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
 
     // main and draft model can have different tokenizers
     // to do: support retokenization: 154103
-    Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties),
-              draft_model_tokenizer(draft_models_path, tokenizer_properties);
+    Tokenizer main_model_tokenizer = main_model_desc.tokenizer_model;
+    Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer_model;
 
     // todo: remove this condition after support of CVS-154103
     OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!");
@@ -86,10 +81,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
     // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
     m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
-        main_model, main_model_tokenizer, utils::from_config_json_if_exists(main_models_path),
+        main_model, main_model_tokenizer, main_model_desc.generation_config,
         main_device_config, main_scheduler_config, main_device, compile_properties, true);
     m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
-        draft_model, draft_model_tokenizer, utils::from_config_json_if_exists(draft_models_path),
+        draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
         draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
 }
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
index f854713b5e..557b84f278 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
@@ -11,19 +11,27 @@ namespace ov::genai {
 
 struct ModelDesc {
-    std::filesystem::path models_path;
     std::string device;
     ov::genai::SchedulerConfig scheduler_config;
     ov::AnyMap properties;
+    ov::genai::GenerationConfig generation_config;
+
+    std::shared_ptr<ov::Model> model = nullptr;
+    ov::genai::Tokenizer tokenizer_model;
 
-    ModelDesc(const std::filesystem::path& models_path,
+    ModelDesc(const std::shared_ptr<ov::Model>& model,
+              const ov::genai::Tokenizer& tokenizer_model,
               const std::string& device = {},
               const ov::AnyMap& properties = {},
-              const ov::genai::SchedulerConfig& scheduler_config = {}) :
-        models_path(models_path),
+              const ov::genai::SchedulerConfig& scheduler_config = {},
+              const ov::genai::GenerationConfig& generation_config = {}) :
+        model(model),
+        tokenizer_model(tokenizer_model),
         device(device),
         properties(properties),
-        scheduler_config(scheduler_config) {}
+        scheduler_config(scheduler_config),
+        generation_config(generation_config) {}
+
+    ModelDesc() = default;
 };
 
 class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
@@ -35,12 +43,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
     std::map<uint64_t, GenerationHandle> m_draft_generations;
 
 public:
-    SpeculativeDecodingImpl(const std::filesystem::path& main_models_path,
-                            const SchedulerConfig& scheduler_config,
-                            const std::string& device,
-                            const ov::AnyMap& properties,
-                            const ov::genai::ModelDesc draft_model_desc,
-                            const ov::AnyMap& tokenizer_properties = {});
+    SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc);
 
     GenerationHandle add_request(uint64_t request_id,
                                  const ov::Tensor& input_ids,
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 31a9abdfc8..f00bdda819 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -219,6 +219,19 @@ std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties) {
     return {core_properties, compile_properties};
 };
 
+/** Splits properties into a plugin config map and the
+ *  SchedulerConfig extracted from it. */
+std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap& properties) {
+    ov::AnyMap plugin_config = properties;
+    auto it = plugin_config.find(ov::genai::scheduler_config.name());
+    SchedulerConfig scheduler_config;
+    if (it != plugin_config.end()) {
+        scheduler_config = it->second.as<SchedulerConfig>();
+        plugin_config.erase(it);
+    }
+    return {plugin_config, scheduler_config};
+};
+
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) {
     auto minuend_size = minuend.input_ids.get_size();
     auto subtrahend_size = subtrahend.input_ids.get_size();
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 7e34f03426..734303830e 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -59,6 +59,7 @@ ProcessorConfig from_any_map(
 );
 
 std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& properties);
+std::pair<ov::AnyMap, SchedulerConfig> split_scheduler_config(const ov::AnyMap& properties);
 
 ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
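
For reference, a small sketch of what the new split_scheduler_config() helper does. utils.hpp is an internal header, so this assumes compilation alongside the library sources in src/cpp/src; the property values are illustrative, and "scheduler_config" matches ov::genai::scheduler_config.name().

// Behavior sketch (illustrative) for utils::split_scheduler_config().
#include <cassert>
#include <string>

#include "utils.hpp"

int main() {
    ov::genai::SchedulerConfig sch;
    sch.max_num_batched_tokens = 32;

    // One scheduler entry to be extracted, one ordinary plugin property
    // that must survive the split untouched.
    ov::AnyMap properties = {
        {"scheduler_config", sch},
        {"CACHE_DIR", std::string("ov_cache")}
    };

    auto [plugin_config, scheduler_config] =
        ov::genai::utils::split_scheduler_config(properties);

    assert(plugin_config.count("scheduler_config") == 0);
    assert(plugin_config.count("CACHE_DIR") == 1);
    assert(scheduler_config.max_num_batched_tokens == 32);
    return 0;
}
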