NPUW: Enable PREFILL/GENERATE configs in LLMCompiledModel (#28154)
### Details:
- *Added parsing of the passed `NPUW_LLM_PREFILL_CONFIG` and `NPUW_LLM_GENERATE_CONFIG` options (see the usage sketch below)*
- *Added parsing of the passed `NPUW_LLM_PAD_TOKEN_ID`*

### Tickets:
 - *EISW-149349*
 - *EISW-149350*

### Related PRs:
- OpenVINO GenAI: openvinotoolkit/openvino.genai#1240
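
A minimal usage sketch of the new options follows. This is illustrative only: the model path, the inner config keys, and the pad-token value are placeholders, and enabling the NPUW LLM pipeline itself is assumed to be configured elsewhere.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // placeholder model path

    // Per-stage compile-time configs are plain ov::AnyMap values; when passed,
    // they replace the default prefill/generate compile configs (see the
    // llm_compiled_model.cpp change below). The keys here are placeholders.
    ov::AnyMap prefill_cfg  = {{"SOME_NPU_OPTION", "value"}};
    ov::AnyMap generate_cfg = {{"SOME_NPU_OPTION", "value"}};

    ov::AnyMap properties = {
        {"NPUW_LLM_PREFILL_CONFIG", prefill_cfg},
        {"NPUW_LLM_GENERATE_CONFIG", generate_cfg},
        {"NPUW_LLM_PAD_TOKEN_ID", 2},  // placeholder pad-token id from the tokenizer
    };
    auto compiled = core.compile_model(model, "NPU", properties);
    return 0;
}
```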
AsyaPronina authored Jan 2, 2025
1 parent ef5678a commit a5af1e0
Showing 4 changed files with 73 additions and 38 deletions.
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
*/
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of prefill model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};

/**
* @brief
* Type: std::string.
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
*/
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};

/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of generate model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
} // namespace llm

} // namespace npuw
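
For illustration, the same two options can also be supplied through the typed property objects declared above. This is a sketch only: the include path and helper function name are assumptions, and both properties are write-only, so they are meaningful only at compile time.

```cpp
#include <openvino/openvino.hpp>
// Assumed include path for the private properties header shown above:
#include "intel_npu/npuw_private_properties.hpp"

ov::CompiledModel compile_llm(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    ov::AnyMap prefill_cfg  = {/* per-stage NPU compiler options (placeholders) */};
    ov::AnyMap generate_cfg = {/* per-stage NPU compiler options (placeholders) */};
    // prefill_config / generate_config are ov::Property<ov::AnyMap>, so they can
    // be passed directly as named properties to compile_model().
    return core.compile_model(model, "NPU",
                              ov::intel_npu::npuw::llm::prefill_config(prefill_cfg),
                              ov::intel_npu::npuw::llm::generate_config(generate_cfg));
}
```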
42 changes: 33 additions & 9 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -321,6 +321,15 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IP
return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()});
}

std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
if (auto it = config.find(option_name); it != config.end()) {
std::optional<ov::Any> found = std::make_optional(it->second);
config.erase(it);
return found;
}
return std::nullopt;
}

ov::AnyMap get_baseline_common_config() {
ov::AnyMap config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -418,6 +427,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
std::map<std::string, ov::Any> npuw_llm_props;
std::map<std::string, ov::Any> other_props;
split_llm_properties(properties, npuw_llm_props, other_props);

// Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map,
// to not pass them into ::intel_npu::Config object, as we don't need to
// preserve them somewhere.
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));

m_cfg.update(any_copy(npuw_llm_props));

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -455,17 +471,20 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
prefill_model = cvt_kvcache_to_fp16(prefill_model);

auto npudesc = extract_npu_descriptor(plugin);
ov::AnyMap properties_copy = other_props;
auto prefill_config = get_default_prefill_config(model, npudesc);
auto prefill_config =
prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();

// NB: GENERATE_HINT is only applicable for default generate config!
const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
LOG_DEBUG(
"10. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
// NB: GENERATE_HINT is only applicable for default generate config!
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
}
auto generate_config =
generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();

merge_config_with(prefill_config, properties_copy);
merge_config_with(generate_config, properties_copy);
merge_config_with(prefill_config, other_props);
merge_config_with(generate_config, other_props);

m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -488,6 +507,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {

ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
OPENVINO_SUPPRESS_DEPRECATED_START
if (name == ov::intel_npu::npuw::llm::prefill_config.name() ||
name == ov::intel_npu::npuw::llm::generate_config.name()) {
OPENVINO_THROW(name, " is write-only option!");
}

auto&& configIterator = m_prop_to_opt.find(name);
if (configIterator != m_prop_to_opt.cend()) {
return std::get<1>(configIterator->second)(m_cfg);
@@ -504,7 +528,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
}

void ov::npuw::LLMCompiledModel::implement_properties() {
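
Per the `get_property` change above, reading either of the new options back from the compiled model is expected to throw. A small sketch, using the `ov::CompiledModel` produced in the earlier example:

```cpp
#include <iostream>
#include <openvino/openvino.hpp>

// Sketch of the new write-only behaviour; `compiled` is the ov::CompiledModel
// produced in the earlier example.
void check_write_only(const ov::CompiledModel& compiled) {
    try {
        compiled.get_property("NPUW_LLM_PREFILL_CONFIG");
    } catch (const ov::Exception& e) {
        // Expected to contain "NPUW_LLM_PREFILL_CONFIG is write-only option!"
        std::cout << e.what() << '\n';
    }
}
```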
49 changes: 23 additions & 26 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -59,10 +59,9 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
}
} // anonymous namespace

ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
: ov::ISyncInferRequest(compiled_model),
m_kvcache_desc(kvcache_desc) {
m_npuw_llm_compiled_model(compiled_model) {
m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();

@@ -82,13 +81,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
// FIXME: for input_ids it must be padding from tokenizer that not available from here
// Get it from NPUW options
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
m_kvcache_desc.num_stored_tokens = 0u;
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
}

void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
@@ -112,7 +109,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);

m_prefill_request->infer();
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
m_need_copy_kvcache = true;

m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -126,8 +123,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
LOG_DEBUG("Calling inference for generate model...");
LOG_BLOCK();

auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
// NB: KV-cache is full, further generation is impossible
if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
OPENVINO_THROW("KV-Cache is full.");
}

@@ -146,17 +144,16 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
// taking into account kvcache dimension.
fill_tensor<ov::float16>(kvcache_in_tensor, 0);

const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
? 3u
: m_kvcache_desc.dim;
: kvcache_desc.dim;

auto prefill_out_slice =
make_tensor_slice(prefill_out_tensor,
kv_dim,
m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
m_kvcache_desc.max_prompt_size);
auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
kv_dim,
kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
kvcache_desc.max_prompt_size);

auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens);
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, kvcache_desc.num_stored_tokens);

if (kv_dim == 3u) {
copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
@@ -168,7 +165,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
LOG_DEBUG("Prepare attention mask pattern.");
auto* attention_mask_data =
m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
attention_mask_data[kvcache_desc.total_size - 1] = 1;

m_need_copy_kvcache = false;
}
@@ -185,7 +182,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,

m_kvcache_request->infer();
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
m_kvcache_desc.num_stored_tokens += 1;
kvcache_desc.num_stored_tokens += 1;

LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -194,13 +191,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
? 3u
: m_kvcache_desc.dim;
: kvcache_desc.dim;
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
kv_dim,
m_kvcache_desc.num_stored_tokens - 1,
m_kvcache_desc.num_stored_tokens);
kvcache_desc.num_stored_tokens - 1,
kvcache_desc.num_stored_tokens);
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
}
5 changes: 2 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -15,8 +15,7 @@ namespace npuw {

class LLMInferRequest final : public ov::ISyncInferRequest {
public:
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model);

void infer() override;

@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {

std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
LLMCompiledModel::KVCacheDesc m_kvcache_desc;
std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
ov::SoPtr<ov::ITensor> m_logits;
bool m_need_copy_kvcache = false;

