Skip to content

Commit

Permalink
continuous batching ctor with model from buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Nov 28, 2024
1 parent 9ed8f6e commit dd69db2
Show file tree
Hide file tree
Showing 11 changed files with 172 additions and 68 deletions.
26 changes: 26 additions & 0 deletions src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,32 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
const ov::AnyMap& properties = {}
);

/**
* @brief Constructs a ContinuousBatchingPipeline from an already existing model and tokenizer.
*
* This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model
* represented as a string and a weights tensor, along with a manually initialized tokenizer.
* This is useful when the model and tokenizer are already loaded or created in memory and do not
* need to be loaded from files.
*
* @param model_str A string representation of the model.
* @param weights_tensor A tensor containing the weights of the model.
* @param tokenizer A manually initialized ov::genai::Tokenizer.
* @param scheduler_config Configuration for the scheduler.
* @param device The device to run the pipeline on (e.g., CPU, GPU).
* @param properties Optional properties for the pipeline.
* @param generation_config Optional generation configuration for the pipeline.
*/
ContinuousBatchingPipeline(
const std::string& model_str,
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties = {},
const ov::genai::GenerationConfig& generation_config = {}
);

ov::genai::Tokenizer get_tokenizer();

ov::genai::GenerationConfig get_config() const;
Expand Down
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,14 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> streamer(StreamerVariant func);
OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> generation_config(const GenerationConfig& config);

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
std::string& model_str,
ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device = {},
const ov::genai::GenerationConfig& generation_config = {},
const ov::AnyMap& properties = {});

OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device = {},
Expand Down
7 changes: 6 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
*
* This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
* @param tokenizer_model_str tokenizer model string
* @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
* @param detokenizer_model_str detokenizer model string
Expand All @@ -52,7 +54,10 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

/**
* @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
* Whether it's tokenizer or detokenizer is defined from model input signature
*
* This constructor is used when the tokenizer (or detokenizer) is already loaded into memory. Whether it's
* a tokenizer or a detokenizer is determined from the model input signature. When this constructor is used, the bos, eos and pad token ids
* are expected to be present in the IR. If your IR is older (< 2024.3), these tokens will be undefined.
* @param model_str model string
* @param weights_tensor ov::Tensor with model weights
* @param properties Properties passed to ov::Core::compile_model
Expand Down
14 changes: 6 additions & 8 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,20 @@ template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::filesystem::path& models_path,
const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) {
m_tokenizer = tokenizer;
m_generation_config = utils::from_config_json_if_exists(models_path);

m_generation_config = generation_config;
ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
std::shared_ptr<ov::Model> model = core.read_model((models_path / "openvino_model.xml").string());

DeviceConfig device_config(core, scheduler_config, device, compile_properties);

bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
Expand Down
14 changes: 2 additions & 12 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,12 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

void _fill_prompt_log_probs(std::vector<SequenceGroup::Ptr>& sequence_groups, ov::Tensor& logits);
public:
ContinuousBatchingImpl(const std::filesystem::path& models_path,
ContinuousBatchingImpl(const std::shared_ptr<ov::Model>& model,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties);

ContinuousBatchingImpl(const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties)
: ContinuousBatchingImpl{ models_path,
Tokenizer(models_path, tokenizer_properties),
scheduler_config,
device,
properties } {}
const ov::genai::GenerationConfig& generation_config);

GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
Expand Down
53 changes: 44 additions & 9 deletions src/cpp/src/continuous_batching_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,34 @@ using namespace ov::genai;

// Extracts the draft-model descriptor (if any) that a caller packed into the
// properties map under utils::DRAFT_MODEL_ARG_NAME, erasing the entry so the
// remaining properties can be forwarded to the plugin untouched.
// Returns a default-constructed ModelDesc (model == nullptr) when absent.
inline ov::genai::ModelDesc
extract_draft_model_from_config(ov::AnyMap& config) {
    ov::genai::ModelDesc draft_model;
    // Single lookup instead of find + at + erase (the fused diff text also
    // carried a stale duplicate declaration of draft_model, removed here).
    auto it = config.find(utils::DRAFT_MODEL_ARG_NAME);
    if (it != config.end()) {
        draft_model = it->second.as<ov::genai::ModelDesc>();
        config.erase(it);
    }
    return draft_model;
}


// TODO: Check whether this ctor is necessary.
ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& tokenizer_properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, properties, tokenizer_properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto tokenizer = ov::genai::Tokenizer(models_path, tokenizer_properties);
auto generation_config = utils::from_config_json_if_exists(models_path);
if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties);
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

Expand All @@ -49,11 +58,37 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto properties_without_draft_model = properties;
auto draft_model = extract_draft_model_from_config(properties_without_draft_model);
if (draft_model.models_path.empty()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, tokenizer, scheduler_config, device, properties);
auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto generation_config = utils::from_config_json_if_exists(models_path);

if (draft_model_desr.model == nullptr) {
m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
} else {
// todo: check properties
auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
}
}

// Constructs the pipeline from an in-memory model (IR string + weights tensor)
// and an already-initialized tokenizer — no filesystem access. A draft model
// may still be supplied through `properties` to enable speculative decoding.
ContinuousBatchingPipeline::ContinuousBatchingPipeline(
    const std::string& model_str,
    const ov::Tensor& weights_tensor,
    const Tokenizer& tokenizer,
    const SchedulerConfig& scheduler_config,
    const std::string& device,
    const ov::AnyMap& properties,
    const ov::genai::GenerationConfig& generation_config) {
    auto properties_without_draft_model = properties;
    auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
    auto model = utils::singleton_core().read_model(model_str, weights_tensor);

    if (draft_model_desr.model == nullptr) {
        m_impl = std::make_shared<ContinuousBatchingImpl>(model, tokenizer, scheduler_config, device, properties, generation_config);
    } else {
        // The fused diff text carried a stale removed line here that assigned
        // m_impl from the deleted path-based SpeculativeDecodingImpl ctor
        // (referencing undefined models_path/draft_model); only the
        // descriptor-based construction below is the post-change behavior.
        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, scheduler_config, generation_config);
        m_impl = std::make_shared<SpeculativeDecodingImpl>(main_model_descr, draft_model_desr);
    }
}

Expand Down
46 changes: 38 additions & 8 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,14 +383,26 @@ std::pair<std::string, Any> draft_model(
const std::filesystem::path& models_path,
const std::string& device,
const ov::AnyMap& properties) {
ov::AnyMap plugin_config = properties;
auto it = plugin_config.find(ov::genai::scheduler_config.name());
SchedulerConfig scheduler_config;
if (it != plugin_config.end()) {
scheduler_config = it->second.as<SchedulerConfig>();
plugin_config.erase(it);
}
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(models_path, device, plugin_config, scheduler_config) };
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model((models_path / openvino_model_name).string());
auto generation_config = ov::genai::GenerationConfig(models_path);
auto tokenizer = ov::genai::Tokenizer(models_path);
return { utils::DRAFT_MODEL_ARG_NAME, Any::make<ModelDesc>(model, tokenizer, device, plugin_config, scheduler_config, generation_config) };
}

// Builds the DRAFT_MODEL_ARG_NAME property entry for speculative decoding from
// an in-memory model (IR string + weights tensor) and a pre-initialized
// tokenizer. Scheduler config embedded in `properties` is split out and kept
// on the descriptor instead of being forwarded to the plugin.
std::pair<std::string, Any> draft_model(
    std::string& model_str,
    ov::Tensor& weights_tensor,
    const ov::genai::Tokenizer& tokenizer,
    const std::string& device,
    const ov::genai::GenerationConfig& generation_config,
    const ov::AnyMap& properties) {
    auto split_result = utils::split_scheduler_config(properties);
    const auto& plugin_properties = split_result.first;
    const auto& scheduler_cfg = split_result.second;

    auto parsed_model = utils::singleton_core().read_model(model_str, weights_tensor);
    return { utils::DRAFT_MODEL_ARG_NAME,
             Any::make<ModelDesc>(parsed_model, tokenizer, device, plugin_properties, scheduler_cfg, generation_config) };
}

} // namespace genai
Expand Down Expand Up @@ -432,6 +444,24 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
m_generation_config = m_impl.get_config();
}

// Adapter ctor: builds the wrapped ContinuousBatchingPipeline directly from an
// in-memory model (IR string + weights tensor) and an existing tokenizer.
// NOTE(review): `generation_config` here is an ov::AnyMap of generation
// properties — the impl is constructed with its default GenerationConfig and
// the map is folded into m_generation_config afterwards; confirm this matches
// the file-loading ctors' behavior.
ContinuousBatchingAdapter(
std::string& model_str,
ov::Tensor& weights_tensor,
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& generation_config,
const ov::AnyMap& plugin_config
): LLMPipelineImplBase{tokenizer}, m_impl{
model_str,
weights_tensor,
tokenizer,
scheduler_config,
device,
plugin_config} {
// Apply caller-supplied generation properties on top of the defaults.
m_generation_config.update_generation_config(generation_config);
}

ContinuousBatchingAdapter(
const std::filesystem::path& models_path,
const SchedulerConfig& scheduler_config,
Expand Down
33 changes: 14 additions & 19 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,27 +23,22 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) {
lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id();
}

ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
const std::filesystem::path& main_models_path,
const SchedulerConfig& main_scheduler_config,
const std::string& main_device,
const ov::AnyMap& main_properties,
const ov::genai::ModelDesc draft_model_desc,
const ov::AnyMap& tokenizer_properties) {
ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc,
const ov::genai::ModelDesc& draft_model_desc) {
ov::Core core;
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties);
auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_model_desc.properties);
core.set_property(core_properties);

std::filesystem::path openvino_model_name = "openvino_model.xml",
draft_models_path = draft_model_desc.models_path;
auto main_model = main_model_desc.model;
auto draft_model = draft_model_desc.model;

std::shared_ptr<ov::Model> main_model = core.read_model((main_models_path / openvino_model_name).string()),
draft_model = core.read_model((draft_models_path / openvino_model_name).string());
auto main_scheduler_config = main_model_desc.scheduler_config;
auto main_device = main_model_desc.device;

utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction);
utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction);

std::string draft_device = draft_model_desc.device.empty() ? main_device : draft_model_desc.device;
std::string draft_device = draft_model_desc.device.empty() ? main_model_desc.device : draft_model_desc.device;

bool is_scheduler_undefined = draft_model_desc.scheduler_config == SchedulerConfig();

Expand Down Expand Up @@ -76,8 +71,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(

// main and draft model can have different tokenizers
// to do: support retokenization: 154103
Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties),
draft_model_tokenizer(draft_models_path, tokenizer_properties);
Tokenizer main_model_tokenizer = main_model_desc.tokenizer_model;
Tokenizer draft_model_tokenizer = draft_model_desc.tokenizer_model;

// todo: remove this condition after support of CVS-154103
OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!");
Expand All @@ -86,10 +81,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(

// to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
main_model, main_model_tokenizer, utils::from_config_json_if_exists(main_models_path),
main_model, main_model_tokenizer, main_model_desc.generation_config,
main_device_config, main_scheduler_config, main_device, compile_properties, true);
m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
draft_model, draft_model_tokenizer, utils::from_config_json_if_exists(draft_models_path),
draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
}

Expand Down
25 changes: 14 additions & 11 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,27 @@
namespace ov::genai {

struct ModelDesc {
std::filesystem::path models_path;
std::string device;
ov::genai::SchedulerConfig scheduler_config;
ov::AnyMap properties;
ov::genai::GenerationConfig generation_config;
std::shared_ptr<ov::Model> model = nullptr;
ov::genai::Tokenizer tokenizer_model;

ModelDesc(const std::filesystem::path& models_path,
ModelDesc(const std::shared_ptr<ov::Model>& model,
const ov::genai::Tokenizer& tokenizer_model,
const std::string& device = {},
const ov::AnyMap& properties = {},
const ov::genai::SchedulerConfig& scheduler_config = {}) :
models_path(models_path),
const ov::genai::SchedulerConfig& scheduler_config = {},
const ov::genai::GenerationConfig& generation_config = {}) :
model(model),
tokenizer_model(tokenizer_model),
device(device),
properties(properties),
scheduler_config(scheduler_config) {}
scheduler_config(scheduler_config),
generation_config(generation_config) {}

ModelDesc() = default;
};

class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
Expand All @@ -35,12 +43,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat
std::map<uint64_t, GenerationHandle> m_draft_generations;

public:
SpeculativeDecodingImpl(const std::filesystem::path& main_models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& properties,
const ov::genai::ModelDesc draft_model_desc,
const ov::AnyMap& tokenizer_properties = {});
SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc);

GenerationHandle add_request(uint64_t request_id,
const ov::Tensor& input_ids,
Expand Down
Loading

0 comments on commit dd69db2

Please sign in to comment.