Fixed according to review comments
AsyaPronina committed Dec 24, 2024
1 parent b52da47 commit a263f2c
Showing 5 changed files with 11 additions and 29 deletions.
@@ -70,7 +70,6 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
 DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
-DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
 
 namespace npuw {
 namespace llm {
@@ -414,11 +414,11 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
 
 /**
  * @brief
- * Type: std::map<std::string, ov::Any>.
+ * Type: ov::AnyMap.
  * Tell NPUW the configuration for compilation of prefill model.
  * NOTE: !! Write-only !!
  */
-static constexpr ov::Property<std::string> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
+static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
 
 /**
  * @brief
@@ -431,19 +431,11 @@ static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
 
 /**
  * @brief
- * Type: std::map<std::string, ov::Any>.
+ * Type: ov::AnyMap.
  * Tell NPUW the configuration for compilation of generate model.
  * NOTE: !! Write-only !!
  */
-static constexpr ov::Property<std::string> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
-
-/**
- * @brief
- * Type: int64_t.
- * Pad token ID to fill input token ids in the conversation mode.
- * Default: 0.
- */
-static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
+static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
 }  // namespace llm
 
 }  // namespace npuw
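
Note: with prefill_config and generate_config now typed as ov::Property<ov::AnyMap>, callers pass the compiler options as a nested map rather than a pre-serialized string. A minimal sketch of the caller side, assuming an LLM model file and the NPU device (the model path is hypothetical; the NPU_COMPILATION_MODE_PARAMS value is the baseline one from this commit):

    #include "openvino/runtime/core.hpp"

    int main() {
        ov::Core core;
        auto model = core.read_model("llm.xml");  // hypothetical model path

        // Nested compiler config for the prefill model, passed as ov::AnyMap.
        ov::AnyMap prefill_cfg = {
            {"NPU_COMPILATION_MODE_PARAMS",
             "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}};

        auto compiled = core.compile_model(model,
                                           "NPU",
                                           {{"NPU_USE_NPUW", "YES"},
                                            {"NPUW_LLM", true},
                                            {"NPUW_LLM_PREFILL_CONFIG", prefill_cfg}});
        return 0;
    }

Both properties are marked write-only, so they can be set at compile time but not queried back afterwards.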
1 change: 0 additions & 1 deletion src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -61,5 +61,4 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
     desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
     desc.add<NPUW_LLM_GENERATE_HINT>();
-    desc.add<NPUW_LLM_PAD_TOKEN_ID>();
 }
16 changes: 4 additions & 12 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -133,14 +133,6 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name)
     return std::nullopt;
 }
 
-template <typename T>
-T opt_or_default(const std::optional<ov::Any>& opt, const T& default_value) {
-    if (opt.has_value()) {
-        return opt.value().as<T>();
-    }
-    return default_value;
-}
-
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -276,7 +268,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
     auto npudesc = extract_npu_descriptor(plugin);
     ov::AnyMap properties_copy = std::move(other_props);
 
-    auto prefill_config = opt_or_default(prefill_config_opt, get_default_prefill_config(prefill_model, npudesc));
+    auto prefill_config =
+        prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();
 
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
     LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
@@ -285,7 +278,7 @@
         OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
     }
     auto generate_config =
-        opt_or_default(generate_config_opt, get_default_generate_config(model, npudesc, generate_hint));
+        generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();
 
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
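
Note: the removed opt_or_default() helper is subsumed by std::optional::value_or() followed by ov::Any::as<T>(). A minimal sketch of the equivalence (resolve_config is an illustrative name, not part of the sources):

    #include <optional>

    #include "openvino/core/any.hpp"

    // value_or() yields an ov::Any whether the optional is set or not;
    // as<ov::AnyMap>() then unwraps the map, which is exactly what the
    // old opt_or_default<T>() did in one step.
    ov::AnyMap resolve_config(const std::optional<ov::Any>& user_cfg,
                              const ov::AnyMap& default_cfg) {
        return user_cfg.value_or(ov::Any(default_cfg)).as<ov::AnyMap>();
    }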
@@ -349,7 +342,6 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
         BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
         BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
         BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
-        BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
-        BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get)});
+        BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
 #undef BIND
 }
6 changes: 3 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -30,7 +30,8 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
-    : ov::ISyncInferRequest(compiled_model) {
+    : ov::ISyncInferRequest(compiled_model),
+      m_npuw_llm_compiled_model(compiled_model) {
     m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
     m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
 
@@ -50,8 +51,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
 }
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")),
-                         m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>());
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
     fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
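
Note: with the pad-token option gone, input_ids is pre-filled with a literal 0 like the neighbouring tensors. For context, a fill_tensor<T> helper such as the one called above is typically a typed fill over the tensor's whole buffer; a minimal sketch under that assumption (the real helper lives elsewhere in the NPUW sources and its signature may differ):

    #include <algorithm>

    #include "openvino/runtime/itensor.hpp"
    #include "openvino/runtime/so_ptr.hpp"

    // Fill every element of the tensor with one value; assumes T matches
    // the tensor's element type (int64_t for the ports used above).
    template <typename T>
    void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T value) {
        std::fill_n(tensor->data<T>(), tensor->get_size(), value);
    }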
