Added possibility to pass PREFILL/GENERATE configs and pad_token_id #28154

Open
wants to merge 3 commits into base: master
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
*/
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of prefill model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
Contributor:
NPUW_LLM_PREFILL_CONFIG and NPUW_LLM_GENERATE_CONFIG are supposed to be passed to compile(...) once and then can be forgotten. Why do we need to define properties for that?

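For reference, a minimal caller-side sketch of what this comment describes: the per-stage configs are handed to compile_model(...) once and never queried back. The option names come from this diff; the model path, the NPUW_LLM_PIPELINE key spelling, and the config contents are assumptions for illustration only.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // path is illustrative only

    // Per-stage compilation configs (contents assumed for illustration).
    ov::AnyMap prefill_cfg = {
        {"NPU_COMPILATION_MODE_PARAMS",
         "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}};
    ov::AnyMap generate_cfg = {};

    // Passed once at compile time; nothing here needs to be read back later.
    ov::AnyMap npu_cfg = {{"NPU_USE_NPUW", "YES"},
                          {"NPUW_LLM_PIPELINE", "YES"},
                          {"NPUW_LLM_PREFILL_CONFIG", prefill_cfg},
                          {"NPUW_LLM_GENERATE_CONFIG", generate_cfg}};
    auto compiled = core.compile_model(model, "NPU", npu_cfg);
    return 0;
}
```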

/**
* @brief
* Type: std::string.
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
*/
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
Contributor:
Same, I'd make it a property


/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of generate model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
} // namespace llm

} // namespace npuw
49 changes: 25 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -133,15 +133,6 @@ std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name)
return std::nullopt;
}

template <typename T>
T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) {
auto anyopt = pop_option(config, key);
if (anyopt.has_value()) {
return anyopt.value().as<T>();
}
return default_value;
}

ov::AnyMap get_baseline_common_config() {
ov::AnyMap config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -206,12 +197,6 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) {
}
}

void drop_cache_dir(ov::AnyMap& config) {
if (config.count("NPU_USE_NPUW") != 0u) {
pop_option(config, "CACHE_DIR");
}
}

void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
for (auto it = properties.begin(); it != properties.end(); ++it) {
if (it->first.find("NPUW_LLM") != it->first.npos) {
@@ -245,6 +230,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);

// Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map,
// to not pass them into ::intel_npu::Config object, as we don't need to
// preserve them somewhere.
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));

m_cfg.update(any_copy(npuw_llm_props));
Contributor:
I believe nothing from npuw_llm_props should get into m_cfg, right?

Everything related to LLM pipeline can be extracted here and then forgotten.

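A rough sketch of what that could look like, reusing the pop_option() helper already present in this file; the exact set of keys to drain is an assumption:

```cpp
// Hypothetical sketch of the suggestion: drain every LLM-related key from
// npuw_llm_props right here, so nothing LLM-specific reaches m_cfg.
const auto max_prompt_len_opt   = pop_option(npuw_llm_props, std::string("NPUW_LLM_MAX_PROMPT_LEN"));
const auto min_response_len_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_MIN_RESPONSE_LEN"));
const auto generate_hint_opt    = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_HINT"));
// ...and so on for the remaining NPUW_LLM_* keys, after which npuw_llm_props
// should be empty and m_cfg.update() would receive nothing LLM-specific.
```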

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -258,7 +250,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("4. Converting KV-cache in prefill model to FP16.");
prefill_model = cvt_kvcache_to_fp16(prefill_model);

LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token.");
LOG_DEBUG("5. Optimize kvcache model to output key/values for new token.");
kvcache_model = redirect_new_kv_to_output(kvcache_model);
LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16.");
kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
@@ -274,18 +266,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);

auto npudesc = extract_npu_descriptor(plugin);

ov::AnyMap properties_copy = std::move(other_props);
auto prefill_config = get_default_prefill_config(model, npudesc);
// NB: GENERATE_HINT is only applicable for default generate config!

auto prefill_config =
prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();

const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
Contributor:
I'd assume it's initially extracted from npuw_llm_props

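One possible shape of that, sketched with the hint read as a plain string; the key name is from this diff, while the fallback value is an assumption:

```cpp
// Hypothetical: take the hint straight out of the extracted LLM options
// instead of reading it back through m_cfg.
const auto hint_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_HINT"));
const std::string generate_hint_str =
    hint_opt.has_value() ? hint_opt.value().as<std::string>() : std::string("FAST_COMPILE");  // default assumed
```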
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
// NB: GENERATE_HINT is only applicable for default generate config!
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
Contributor:
Do we need the npuw_llm_props.count(...) part if generate_hint was already extracted a few lines above?

OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
}
auto generate_config =
generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();

merge_config_with(prefill_config, properties_copy);
merge_config_with(generate_config, properties_copy);
// FIXME: Drop CACHE_DIR option if NPUW is enabled
Contributor:
Why is it dropped?

Contributor (Author):
Because it should be handled on the GenAI side. At this point we have already passed through the NPU plugin, which chose us (npuw::LLMCompiledModel) and checked for CACHE_DIR existence.

Contributor:
Given this config, will the NPU plugin handle CACHE_DIR? Or will it be the responsibility of NPUW?

USE_NPUW: YES,
NPUW_LLM_PIPELINE: YES,
CACHE_DIR: "..."

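For reference, a hedged sketch of the behaviour under discussion, mirroring the drop_cache_dir() helper this PR removes above; whether NPUW or GenAI should own this step is exactly the open question:

```cpp
// Hypothetical: if NPUW is enabled, strip CACHE_DIR before compiling the
// per-stage subgraphs (same logic as the removed drop_cache_dir() helper).
void drop_cache_dir(ov::AnyMap& config) {
    if (config.count("NPU_USE_NPUW") != 0u) {
        pop_option(config, "CACHE_DIR");
    }
}

// Usage with the config from the comment above:
void example() {
    ov::AnyMap user_cfg = {{"NPU_USE_NPUW", "YES"},
                           {"NPUW_LLM_PIPELINE", "YES"},
                           {"CACHE_DIR", "..."}};  // value elided in the discussion above
    drop_cache_dir(user_cfg);  // CACHE_DIR is dropped; the compiled subgraphs never see it
}
```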
drop_cache_dir(prefill_config);
drop_cache_dir(generate_config);

m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -308,6 +304,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {

ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
OPENVINO_SUPPRESS_DEPRECATED_START
if (name == ov::intel_npu::npuw::llm::prefill_config.name() ||
Contributor:
I don't believe it's really needed, see comment above

Contributor (Author):
get_property() might not be needed here at all; since it is redundant functionality, I propose at least handling everything here in a unified way so as not to create a mess.

Contributor:
Keys provided to the LLM pipeline must not be properties, so there won't be any mess.

name == ov::intel_npu::npuw::llm::generate_config.name()) {
OPENVINO_THROW(name, " is write-only option!");
}

auto&& configIterator = m_prop_to_opt.find(name);
if (configIterator != m_prop_to_opt.cend()) {
return std::get<1>(configIterator->second)(m_cfg);
@@ -324,7 +325,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
}

void ov::npuw::LLMCompiledModel::implement_properties() {
43 changes: 20 additions & 23 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -29,10 +29,9 @@ ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
}
} // anonymous namespace

ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
: ov::ISyncInferRequest(compiled_model),
m_kvcache_desc(kvcache_desc) {
m_npuw_llm_compiled_model(compiled_model) {
m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();

@@ -52,13 +51,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
// FIXME: for input_ids it must be padding from tokenizer that not available from here
// Get it from NPUW options
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
m_kvcache_desc.num_stored_tokens = 0u;
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
}

void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
Expand All @@ -82,7 +79,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);

m_prefill_request->infer();
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_need_copy_kvcache = true;

m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -96,8 +93,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
LOG_DEBUG("Calling inference for generate model...");
LOG_BLOCK();

auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
// NB: KV-cache is full, further generation is impossible
if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
OPENVINO_THROW("KV-Cache is full.");
}

@@ -116,21 +114,20 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
// taking into account kvcache dimension.
fill_tensor<ov::float16>(kvcache_in_tensor, 0);

auto prefill_out_slice =
make_tensor_slice(prefill_out_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
m_kvcache_desc.max_prompt_size);
auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
kvcache_desc.dim,
kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
kvcache_desc.max_prompt_size);

auto kvcache_in_slice =
make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
make_tensor_slice(kvcache_in_tensor, kvcache_desc.dim, 0u, kvcache_desc.num_stored_tokens);

prefill_out_slice->copy_to(kvcache_in_slice._ptr);
}
LOG_DEBUG("Prepare attention mask pattern.");
auto* attention_mask_data =
m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
attention_mask_data[kvcache_desc.total_size - 1] = 1;

m_need_copy_kvcache = false;
}
@@ -147,7 +144,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

m_kvcache_request->infer();
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
m_kvcache_desc.num_stored_tokens += 1;
kvcache_desc.num_stored_tokens += 1;

LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -157,9 +154,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
m_kvcache_desc.dim,
m_kvcache_desc.num_stored_tokens - 1,
m_kvcache_desc.num_stored_tokens);
kvcache_desc.dim,
kvcache_desc.num_stored_tokens - 1,
kvcache_desc.num_stored_tokens);
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
}
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -15,8 +15,7 @@ namespace npuw {

class LLMInferRequest final : public ov::ISyncInferRequest {
public:
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model);

void infer() override;

@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {

std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
LLMCompiledModel::KVCacheDesc m_kvcache_desc;
std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
ov::SoPtr<ov::ITensor> m_logits;
bool m_need_copy_kvcache = false;
