Fixed according to review comments
AsyaPronina committed Dec 24, 2024
1 parent b52da47 commit a263f2c
Showing 5 changed files with 11 additions and 29 deletions.
@@ -70,7 +70,6 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
 DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
-DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
 
 namespace npuw {
 namespace llm {
@@ -414,11 +414,11 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
 
 /**
  * @brief
- * Type: std::map<std::string, ov::Any>.
+ * Type: ov::AnyMap.
  * Tell NPUW the configuration for compilation of prefill model.
  * NOTE: !! Write-only !!
  */
-static constexpr ov::Property<std::string> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
+static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
 
 /**
  * @brief
@@ -431,19 +431,11 @@ static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
 
 /**
  * @brief
- * Type: std::map<std::string, ov::Any>.
+ * Type: ov::AnyMap.
  * Tell NPUW the configuration for compilation of generate model.
  * NOTE: !! Write-only !!
  */
-static constexpr ov::Property<std::string> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
-
-/**
- * @brief
- * Type: int64_t.
- * Pad token ID to fill input token ids in the conversation mode.
- * Default: 0.
- */
-static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
+static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
 }  // namespace llm
 
 }  // namespace npuw
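
Note: with prefill_config and generate_config now typed as ov::Property<ov::AnyMap>, callers pass the compiler options as a nested map rather than a pre-serialized string. A minimal sketch of the caller side, assuming an LLM model file and the NPU device (the model path is hypothetical; the NPU_COMPILATION_MODE_PARAMS value is the baseline one from this commit):

    #include "openvino/runtime/core.hpp"

    int main() {
        ov::Core core;
        auto model = core.read_model("llm.xml");  // hypothetical model path

        // Nested compiler config for the prefill model, passed as ov::AnyMap.
        ov::AnyMap prefill_cfg = {
            {"NPU_COMPILATION_MODE_PARAMS",
             "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}};

        auto compiled = core.compile_model(model,
                                           "NPU",
                                           {{"NPU_USE_NPUW", "YES"},
                                            {"NPUW_LLM", true},
                                            {"NPUW_LLM_PREFILL_CONFIG", prefill_cfg}});
        return 0;
    }

Both properties are marked write-only, so they can be set at compile time but not queried back afterwards.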
1 change: 0 additions & 1 deletion src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -61,5 +61,4 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
     desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
     desc.add<NPUW_LLM_GENERATE_HINT>();
-    desc.add<NPUW_LLM_PAD_TOKEN_ID>();
 }
16 changes: 4 additions & 12 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -133,14 +133,6 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name)
     return std::nullopt;
 }
 
-template <typename T>
-T opt_or_default(const std::optional<ov::Any>& opt, const T& default_value) {
-    if (opt.has_value()) {
-        return opt.value().as<T>();
-    }
-    return default_value;
-}
-
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -276,7 +268,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
     auto npudesc = extract_npu_descriptor(plugin);
     ov::AnyMap properties_copy = std::move(other_props);
 
-    auto prefill_config = opt_or_default(prefill_config_opt, get_default_prefill_config(prefill_model, npudesc));
+    auto prefill_config =
+        prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();
 
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
     LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
@@ -285,7 +278,7 @@
         OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
     }
     auto generate_config =
-        opt_or_default(generate_config_opt, get_default_generate_config(model, npudesc, generate_hint));
+        generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();
 
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
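
Note: the removed opt_or_default() helper is subsumed by std::optional::value_or() followed by ov::Any::as<T>(). A minimal sketch of the equivalence (resolve_config is an illustrative name, not part of the sources):

    #include <optional>

    #include "openvino/core/any.hpp"

    // value_or() yields an ov::Any whether the optional is set or not;
    // as<ov::AnyMap>() then unwraps the map, which is exactly what the
    // old opt_or_default<T>() did in one step.
    ov::AnyMap resolve_config(const std::optional<ov::Any>& user_cfg,
                              const ov::AnyMap& default_cfg) {
        return user_cfg.value_or(ov::Any(default_cfg)).as<ov::AnyMap>();
    }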
@@ -349,7 +342,6 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
         BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
         BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
         BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
-        BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
-        BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get)});
+        BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
 #undef BIND
 }
6 changes: 3 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -30,7 +30,8 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
-    : ov::ISyncInferRequest(compiled_model) {
+    : ov::ISyncInferRequest(compiled_model),
+      m_npuw_llm_compiled_model(compiled_model) {
     m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
     m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
 
@@ -50,8 +51,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
 }
 
 void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")),
-                         m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>());
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
     fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
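
Note: with the pad-token option gone, input_ids is pre-filled with a literal 0 like the neighbouring tensors. For context, a fill_tensor<T> helper such as the one called above is typically a typed fill over the tensor's whole buffer; a minimal sketch under that assumption (the real helper lives elsewhere in the NPUW sources and its signature may differ):

    #include <algorithm>

    #include "openvino/runtime/itensor.hpp"
    #include "openvino/runtime/so_ptr.hpp"

    // Fill every element of the tensor with one value; assumes T matches
    // the tensor's element type (int64_t for the ports used above).
    template <typename T>
    void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T value) {
        std::fill_n(tensor->data<T>(), tensor->get_size(), value);
    }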
