
Commit

Refactoring LLMCompiledModel according to review comments in GenAI static_llm::StatefulLLMPipeline
AsyaPronina committed Jan 5, 2025
1 parent 65f6ce8 commit b6dc8ff
Showing 4 changed files with 46 additions and 80 deletions.
48 changes: 3 additions & 45 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -68,60 +68,18 @@ DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fai
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
+DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
+DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
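Each DEFINE_OPT entry appears to bind, in order: the option key, its value type, its default, the matching ov::Property, and the option mode (CompileTime or RunTime); the three added entries follow the same pattern as the surrounding NPUW options.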

namespace npuw {
namespace llm {
-struct ModelDesc {
-    std::string type;
-    std::string name_or_path;
-    int num_key_value_heads;
-};
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
}  // namespace llm
}  // namespace npuw

-struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
-    static std::string_view key() {
-        return ov::intel_npu::npuw::llm::model_desc.name();
-    }
-
-    static constexpr std::string_view getTypeName() {
-        return "::intel_npu::npuw::llm::ModelDesc";
-    }
-
-    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
-        return {};
-    }
-
-    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
-        ::intel_npu::npuw::llm::ModelDesc res;
-        std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
-        res.type = res_map["type"];
-        res.name_or_path = res_map["name_or_path"];
-        res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
-        return res;
-    }
-
-    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
-        std::string res;
-        std::map<std::string, std::string> res_map;
-        res_map["type"] = val.type;
-        res_map["name_or_path"] = val.name_or_path;
-        res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads);
-        return OptionPrinter<std::map<std::string, std::string>>::toString(res_map);
-    }
-
-    static OptionMode mode() {
-        return OptionMode::CompileTime;
-    }
-
-    static bool isPublic() {
-        return true;
-    }
-};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::generate_hint.name();
@@ -387,43 +387,60 @@ namespace llm {
*/
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};


/**
 * @brief
- * Type: std::map<std::string, std::string>.
- * Tell NPUW about your LLM model. Use following structure for that:
- * "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
- * Default value: empty structure defined above.
+ * Type: uint32_t.
+ * Dimension of the batch in input tensor shape.
+ * Default value: 0.
 */
+static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
+
+/**
+ * @brief
+ * Type: uint32_t.
+ * Dimension of KV-Cache size in input tensor shape.
+ * Default value: 2.
+ */
-static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"};
+static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};

/**
 * @brief
 * Type: uint32_t.
- * Tell NPUW your desirable max prompt length.
+ * Desirable max prompt length.
 * Default value: 1024.
 */
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
 * @brief
 * Type: uint32_t.
- * Tell NPUW your desirable min response length.
+ * Desirable min response length.
 * Default value: 128.
 */
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

+/**
+ * @brief
+ * Type: bool.
+ * Tell NPUW to apply values transpose optimization for the model.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> optimize_v_tensors{"NPUW_LLM_OPTIMIZE_V_TENSORS"};

/**
 * @brief
 * Type: ov::AnyMap.
- * Tell NPUW the configuration for compilation of prefill model.
+ * Configuration for compilation of prefill model.
 * NOTE: !! Write-only !!
 */
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};

/**
 * @brief
 * Type: std::string.
- * Tell NPUW the preferrable hint for generation stage, that leads to usage of optimal configuration for it.
+ * Hint for the generation stage. NPUW will use an optimal configuration based on the preference passed via this hint.
+ * Hint is ignored if used with "NPUW_LLM_GENERATE_CONFIG".
 * Possible values: "FAST_COMPILE", "BEST_PERF".
 * Default value: "FAST_COMPILE".
*/
@@ -432,7 +449,7 @@ static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT
/**
 * @brief
 * Type: ov::AnyMap.
- * Tell NPUW the configuration for compilation of generate model.
+ * Configuration for compilation of generate model.
 * NOTE: !! Write-only !!
 */
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
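
For illustration only, a minimal sketch of how a client could pass these knobs when compiling an LLM for NPU. This is not part of the commit; the model path and the chosen values are hypothetical:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Sketch: enable the NPUW LLM pipeline and describe the KV-cache layout
    // explicitly, instead of the removed NPUW_LLM_MODEL_DESC string.
    auto compiled = core.compile_model(
        "llm.xml", "NPU",
        ov::AnyMap{{"NPUW_LLM", true},                     // turn the LLM path on
                   {"NPUW_LLM_BATCH_DIM", 0u},             // batch axis of KV-cache inputs
                   {"NPUW_LLM_SEQ_LEN_DIM", 2u},           // sequence-length axis
                   {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},     // aligned up to 64 internally
                   {"NPUW_LLM_MIN_RESPONSE_LEN", 128u},
                   {"NPUW_LLM_OPTIMIZE_V_TENSORS", true},  // request the V-tensor transpose
                   {"NPUW_LLM_GENERATE_HINT", "FAST_COMPILE"}});
    return 0;
}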
4 changes: 3 additions & 1 deletion src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -57,8 +57,10 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {

void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
    desc.add<NPUW_LLM>();
-    desc.add<NPUW_LLM_MODEL_DESC>();
+    desc.add<NPUW_LLM_BATCH_DIM>();
+    desc.add<NPUW_LLM_SEQ_LEN_DIM>();
    desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
    desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
+    desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
    desc.add<NPUW_LLM_GENERATE_HINT>();
}
37 changes: 13 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -280,22 +280,6 @@ void reshape_to_static(std::shared_ptr<ov::Model> model,
    model->reshape(new_shapes);
}

-KVAxesPosition get_kv_axes(const std::string& model_type) {
-    KVAxesPosition axes;
-    if (model_type == "chatglm") {
-        axes.batch = 1u;
-        axes.seq_len = 0u;
-    } else if (model_type == "qwen") {
-        // Note, qwen2 does not fall into this category and conforms to default layout
-        axes.batch = 0u;
-        axes.seq_len = 1u;
-    } else {
-        axes.batch = 0u;
-        axes.seq_len = 2u;
-    }
-    return axes;
-}
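
With the heuristic removed, the layer above (GenAI's static_llm::StatefulLLMPipeline, per the commit title) now has to supply the axes itself. A sketch of how the deleted branches map onto the new options, inferred from the code above rather than stated in the commit:

// chatglm : NPUW_LLM_BATCH_DIM=1, NPUW_LLM_SEQ_LEN_DIM=0
// qwen    : NPUW_LLM_BATCH_DIM=0, NPUW_LLM_SEQ_LEN_DIM=1 (qwen2 keeps the default layout)
// default : NPUW_LLM_BATCH_DIM=0, NPUW_LLM_SEQ_LEN_DIM=2 (matches the new options' defaults)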

bool is_cw_compressed(const std::shared_ptr<ov::Model>& model) {
    std::vector<std::string> rt_info_path = {"nncf", "weight_compression", "group_size"};
    if (!model->has_rt_info(rt_info_path)) {
@@ -444,19 +428,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
    auto prefill_model = kvcache_model->clone();
    prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");

-    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
-    const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
-    const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
-    KVAxesPosition axes = get_kv_axes(model_desc.type);
-    m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
+    const uint32_t batch_dim = m_cfg.get<::intel_npu::NPUW_LLM_BATCH_DIM>();
+    const uint32_t seq_len_dim = m_cfg.get<::intel_npu::NPUW_LLM_SEQ_LEN_DIM>();
+    KVAxesPosition axes{batch_dim, seq_len_dim};
+    const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
+    const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
+
+    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
    LOG_DEBUG("4. Make prefill model with static shapes");
    reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
    LOG_DEBUG("5. Make kvcache model with static shapes");
    reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
+    LOG_DEBUG("6. Check and apply opt layout if applicable.");

+    const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
    // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
-    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
-        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+    if (optimize_v_tensors) {
        if (optimize_value_tensors(kvcache_model)) {
            // NB: Check if TransposeValueTensors transformation was applied
            m_kvcache_desc.v_tensors_transposed = true;
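
The align_to calls above round the requested lengths up to a multiple of 64. The helper itself is outside this diff; a minimal sketch of its assumed semantics:

// Assumption: align_to rounds `value` up to the nearest multiple of `alignment`.
uint32_t align_to(uint32_t value, uint32_t alignment) {
    return (value + alignment - 1) / alignment * alignment;
}
// E.g. NPUW_LLM_MAX_PROMPT_LEN=1000 -> 1024 and NPUW_LLM_MIN_RESPONSE_LEN=100 -> 128,
// so m_kvcache_desc becomes KVCacheDesc{1024, 1152, 0u, seq_len_dim}.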
@@ -542,9 +529,11 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
}

    m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
-                          BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
+                          BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
+                          BIND(npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM, get),
                          BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
                          BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
+                          BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
                          BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
#undef BIND
}
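
Since the options are BIND-ed to properties here, they should also be readable back from the compiled model; an illustrative sketch, reusing the hypothetical `compiled` object from the earlier example:

auto seq_dim = compiled.get_property("NPUW_LLM_SEQ_LEN_DIM").as<uint32_t>();
auto hint = compiled.get_property("NPUW_LLM_GENERATE_HINT").as<std::string>();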
