Added possibility to pass PREFILL/GENERATE configs and pad_token_id #28154

Open
wants to merge 3 commits into base: master
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
*/
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of prefill model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
Contributor:
NPUW_LLM_PREFILL_CONFIG and NPUW_LLM_GENERATE_CONFIG are supposed to be passed to compile(...) once and then can be forgotten. Why do we need to define properties for that?

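For reference, a minimal caller-side sketch of what this comment describes: the per-stage configs are handed to compile_model(...) once and never queried back. The option names come from this diff; the model path, the NPUW_LLM_PIPELINE key spelling, and the config contents are assumptions for illustration only.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // path is illustrative only

    // Per-stage compilation configs (contents assumed for illustration).
    ov::AnyMap prefill_cfg = {
        {"NPU_COMPILATION_MODE_PARAMS",
         "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}};
    ov::AnyMap generate_cfg = {};

    // Passed once at compile time; nothing here needs to be read back later.
    ov::AnyMap npu_cfg = {{"NPU_USE_NPUW", "YES"},
                          {"NPUW_LLM_PIPELINE", "YES"},
                          {"NPUW_LLM_PREFILL_CONFIG", prefill_cfg},
                          {"NPUW_LLM_GENERATE_CONFIG", generate_cfg}};
    auto compiled = core.compile_model(model, "NPU", npu_cfg);
    return 0;
}
```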

/**
* @brief
* Type: std::string.
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
*/
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
Contributor:
Same, I'd make it a property


/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of generate model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
} // namespace llm

} // namespace npuw
49 changes: 25 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -133,15 +133,6 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name)
return std::nullopt;
}

template <typename T>
T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
auto anyopt = pop_option(config, key);
if (anyopt.has_value()) {
return anyopt.value().as<T>();
}
return default_value;
}

ov::AnyMap get_baseline_common_config() {
ov::AnyMap config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -206,12 +197,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
}
}

void drop_cache_dir(ov::AnyMap& config) {
if (config.count("NPU_USE_NPUW") != 0u) {
pop_option(config, "CACHE_DIR");
}
}

void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it->first.find("NPUW_LLM") != it->first.npos) {
@@ -245,6 +230,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);

// Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map,
// to not pass them into ::intel_npu::Config object, as we don't need to
// preserve them somewhere.
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));

m_cfg.update(any_copy(npuw_llm_props));
Contributor:
I believe nothing from npuw_llm_props should get into m_cfg, right?

Everything related to LLM pipeline can be extracted here and then forgotten.

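A rough sketch of what that could look like, reusing the pop_option() helper already present in this file; the exact set of keys to drain is an assumption:

```cpp
// Hypothetical sketch of the suggestion: drain every LLM-related key from
// npuw_llm_props right here, so nothing LLM-specific reaches m_cfg.
const auto max_prompt_len_opt   = pop_option(npuw_llm_props, std::string("NPUW_LLM_MAX_PROMPT_LEN"));
const auto min_response_len_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_MIN_RESPONSE_LEN"));
const auto generate_hint_opt    = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_HINT"));
// ...and so on for the remaining NPUW_LLM_* keys, after which npuw_llm_props
// should be empty and m_cfg.update() would receive nothing LLM-specific.
```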

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -258,7 +250,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
prefill_model = cvt_kvcache_to_fp16(prefill_model);

LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
LOG_DEBUG("5. Optimize kvcache model to output key/values for new token.");
kvcache_model = redirect_new_kv_to_output(kvcache_model);
LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
@@ -274,18 +266,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);

auto npudesc = extract_npu_descriptor(plugin);

ov::AnyMap properties_copy = std::move(other_props);
auto prefill_config = get_default_prefill_config(model, npudesc);
// NB: GENERATE_HINT is only applicable for default generate config!

auto prefill_config =
prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();

const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
Contributor:
I'd assume it's initially extracted from npuw_llm_props

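One possible shape of that, sketched with the hint read as a plain string; the key name is from this diff, while the fallback value is an assumption:

```cpp
// Hypothetical: take the hint straight out of the extracted LLM options
// instead of reading it back through m_cfg.
const auto hint_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_HINT"));
const std::string generate_hint_str =
    hint_opt.has_value() ? hint_opt.value().as<std::string>() : std::string("FAST_COMPILE");  // default assumed
```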
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
// NB: GENERATE_HINT is only applicable for default generate config!
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
Contributor:
Do we need the npuw_llm_props.count(...) part if generate_hint was already extracted a few lines above?

OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
}
auto generate_config =
generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();

merge_config_with(prefill_config, properties_copy);
merge_config_with(generate_config, properties_copy);
// FIXME: Drop CACHE_DIR option if NPUW is enabled
Contributor:
Why is it dropped?

Contributor (Author):
Because it should be handled on the GenAI side. At this point we have already passed through the NPU plugin, which chose us (npuw::LLMCompiledModel) and checked for CACHE_DIR existence.

Contributor:
Given this config, will the NPU plugin handle CACHE_DIR? Or will it be the responsibility of NPUW?

USE_NPUW: YES,
NPUW_LLM_PIPELINE: YES,
CACHE_DIR: "..."

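For reference, a hedged sketch of the behaviour under discussion, mirroring the drop_cache_dir() helper this PR removes above; whether NPUW or GenAI should own this step is exactly the open question:

```cpp
// Hypothetical: if NPUW is enabled, strip CACHE_DIR before compiling the
// per-stage subgraphs (same logic as the removed drop_cache_dir() helper).
void drop_cache_dir(ov::AnyMap& config) {
    if (config.count("NPU_USE_NPUW") != 0u) {
        pop_option(config, "CACHE_DIR");
    }
}

// Usage with the config from the comment above:
void example() {
    ov::AnyMap user_cfg = {{"NPU_USE_NPUW", "YES"},
                           {"NPUW_LLM_PIPELINE", "YES"},
                           {"CACHE_DIR", "..."}};  // value elided in the discussion above
    drop_cache_dir(user_cfg);  // CACHE_DIR is dropped; the compiled subgraphs never see it
}
```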
drop_cache_dir(prefill_config);
drop_cache_dir(generate_config);

m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -308,6 +304,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {

ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
OPENVINO_SUPPRESS_DEPRECATED_START
if (name == ov::intel_npu::npuw::llm::prefill_config.name() ||
Contributor:
I don't believe it's really needed, see comment above

Contributor (Author):
get_property() might not be needed here at all; since it is redundant functionality, I propose at least handling everything here in a unified way so as not to create a mess.

Contributor:
Keys provided to the LLM pipeline must not be properties, so there won't be any mess.

name == ov::intel_npu::npuw::llm::generate_config.name()) {
OPENVINO_THROW(name, " is write-only option!");
}

auto&& configIterator = m_prop_to_opt.find(name);
if (configIterator != m_prop_to_opt.cend()) {
return std::get<1>(configIterator->second)(m_cfg);
@@ -324,7 +325,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
}

void ov::npuw::LLMCompiledModel::implement_properties() {
43 changes: 20 additions & 23 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -29,10 +29,9 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
}
} // anonymous namespace

ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
: ov::ISyncInferRequest(compiled_model),
m_kvcache_desc(kvcache_desc) {
m_npuw_llm_compiled_model(compiled_model) {
m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();

@@ -52,13 +51,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
// FIXME: for input_ids it must be padding from tokenizer that not available from here
// Get it from NPUW options
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
m_kvcache_desc.num_stored_tokens = 0u;
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
}

void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
Expand All @@ -82,7 +79,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);

m_prefill_request->infer();
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_need_copy_kvcache = true;

m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -96,8 +93,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
LOG_DEBUG("Calling inference for generate model...");
LOG_BLOCK();

auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
// NB: KV-cache is full, further generation is impossible
if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
OPENVINO_THROW("KV-Cache is full.");
}

@@ -116,21 +114,20 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
// taking into account kvcache dimension.
fill_tensor<ov::float16>(kvcache_in_tensor, 0);

auto prefill_out_slice =
make_tensor_slice(prefill_out_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
m_kvcache_desc.max_prompt_size);
auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
kvcache_desc.dim,
kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
kvcache_desc.max_prompt_size);

auto kvcache_in_slice =
make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
make_tensor_slice(kvcache_in_tensor, kvcache_desc.dim, 0u, kvcache_desc.num_stored_tokens);

prefill_out_slice->copy_to(kvcache_in_slice._ptr);
}
LOG_DEBUG("Prepare attention mask pattern.");
auto* attention_mask_data =
m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
attention_mask_data[kvcache_desc.total_size - 1] = 1;

m_need_copy_kvcache = false;
}
@@ -147,7 +144,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

m_kvcache_request->infer();
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
m_kvcache_desc.num_stored_tokens += 1;
kvcache_desc.num_stored_tokens += 1;

LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -157,9 +154,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.num_stored_tokens - 1,
m_kvcache_desc.num_stored_tokens);
kvcache_desc.dim,
kvcache_desc.num_stored_tokens - 1,
kvcache_desc.num_stored_tokens);
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
}
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -15,8 +15,7 @@ namespace npuw {

class LLMInferRequest final : public ov::ISyncInferRequest {
public:
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model);

void infer() override;

@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {

std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
LLMCompiledModel::KVCacheDesc m_kvcache_desc;
std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
ov::SoPtr<ov::ITensor> m_logits;
bool m_need_copy_kvcache = false;
