-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added possibility to pass PREFILL/GENERATE configs and pad_token_id #28154
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN" | |
*/ | ||
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"}; | ||
|
||
/** | ||
* @brief | ||
* Type: ov::AnyMap. | ||
* Tell NPUW the configuration for compilation of prefill model. | ||
* NOTE: !! Write-only !! | ||
*/ | ||
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"}; | ||
|
||
/** | ||
* @brief | ||
* Type: std::string. | ||
|
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_ | |
*/ | ||
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same, I'd make it as |
||
|
||
/** | ||
* @brief | ||
* Type: ov::AnyMap. | ||
* Tell NPUW the configuration for compilation of generate model. | ||
* NOTE: !! Write-only !! | ||
*/ | ||
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"}; | ||
} // namespace llm | ||
|
||
} // namespace npuw | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,15 +133,6 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name) | |
return std::nullopt; | ||
} | ||
|
||
template <typename T> | ||
T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { | ||
auto anyopt = pop_option(config, key); | ||
if (anyopt.has_value()) { | ||
return anyopt.value().as<T>(); | ||
} | ||
return default_value; | ||
} | ||
|
||
ov::AnyMap get_baseline_common_config() { | ||
ov::AnyMap config = { | ||
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}, | ||
|
@@ -206,12 +197,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { | |
} | ||
} | ||
|
||
void drop_cache_dir(ov::AnyMap& config) { | ||
if (config.count("NPU_USE_NPUW") != 0u) { | ||
pop_option(config, "CACHE_DIR"); | ||
} | ||
} | ||
|
||
void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) { | ||
for (auto it = properties.begin(); it != properties.end(); ++it) { | ||
if (it->first.find("NPUW_LLM") != it->first.npos) { | ||
|
@@ -245,6 +230,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m | |
std::map<std::string, ov::Any> npuw_llm_props; | ||
std::map<std::string, ov::Any> other_props; | ||
split_llm_properties(properties, npuw_llm_props, other_props); | ||
|
||
// Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map, | ||
// to not pass them into ::intel_npu::Config object, as we don't need to | ||
// preserve them somewhere. | ||
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG")); | ||
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG")); | ||
|
||
m_cfg.update(any_copy(npuw_llm_props)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe nothing from Everything related to LLM pipeline can be extracted here and then forgotten. |
||
|
||
LOG_DEBUG("1. Creating kvcache model as clone of passed one."); | ||
|
@@ -258,7 +250,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m | |
LOG_DEBUG("4. Converting KV-cache in prefill model to FP16."); | ||
prefill_model = cvt_kvcache_to_fp16(prefill_model); | ||
|
||
LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token."); | ||
LOG_DEBUG("5. Optimize kvcache model to output key/values for new token."); | ||
kvcache_model = redirect_new_kv_to_output(kvcache_model); | ||
LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16."); | ||
kvcache_model = cvt_kvcache_to_fp16(kvcache_model); | ||
|
@@ -274,18 +266,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m | |
reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes); | ||
|
||
auto npudesc = extract_npu_descriptor(plugin); | ||
|
||
ov::AnyMap properties_copy = std::move(other_props); | ||
auto prefill_config = get_default_prefill_config(model, npudesc); | ||
// NB: GENERATE_HINT is only applicable for default generate config! | ||
|
||
auto prefill_config = | ||
prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>(); | ||
|
||
const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd assume it initially extracted from |
||
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint))); | ||
auto generate_config = get_default_generate_config(model, npudesc, generate_hint); | ||
// NB: GENERATE_HINT is only applicable for default generate config! | ||
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) { | ||
TolyaTalamanov marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need |
||
OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!"); | ||
} | ||
auto generate_config = | ||
generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>(); | ||
|
||
merge_config_with(prefill_config, properties_copy); | ||
merge_config_with(generate_config, properties_copy); | ||
// FIXME: Drop CACHE_DIR option if NPUW is enabled | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is it dropped? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because it should be handled on the GenAI side, here we already passed through the NPU plugin, that chooses us ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given this config, will
|
||
drop_cache_dir(prefill_config); | ||
drop_cache_dir(generate_config); | ||
|
||
m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config); | ||
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config); | ||
|
@@ -308,6 +304,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) { | |
|
||
ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const { | ||
OPENVINO_SUPPRESS_DEPRECATED_START | ||
if (name == ov::intel_npu::npuw::llm::prefill_config.name() || | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't believe it's really needed, see comment above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keys provided to |
||
name == ov::intel_npu::npuw::llm::generate_config.name()) { | ||
OPENVINO_THROW(name, " is write-only option!"); | ||
} | ||
|
||
auto&& configIterator = m_prop_to_opt.find(name); | ||
if (configIterator != m_prop_to_opt.cend()) { | ||
return std::get<1>(configIterator->second)(m_cfg); | ||
|
@@ -324,7 +325,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i | |
|
||
std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() { | ||
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this()); | ||
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc); | ||
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr); | ||
} | ||
|
||
void ov::npuw::LLMCompiledModel::implement_properties() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`NPUW_LLM_PREFILL_CONFIG` and `NPUW_LLM_GENERATE_CONFIG` are supposed to be passed to `compile(...)` once and then can be forgotten. Why do we need to define properties for that?