
Commit

Refactoring LLMCompiledModel according to review comments in GenAI static_llm::StatefulLLMPipeline
AsyaPronina committed Jan 5, 2025
1 parent 65f6ce8 commit b6dc8ff
Showing 4 changed files with 46 additions and 80 deletions.
48 changes: 3 additions & 45 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -68,60 +68,18 @@ DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fai
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
+DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
+DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
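Each DEFINE_OPT entry appears to bind, in order: the option key, its value type, its default, the matching ov::Property, and the option mode (CompileTime or RunTime); the three added entries follow the same pattern as the surrounding NPUW options.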

namespace npuw {
namespace llm {
-struct ModelDesc {
-    std::string type;
-    std::string name_or_path;
-    int num_key_value_heads;
-};
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
}  // namespace llm
}  // namespace npuw

-struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
-    static std::string_view key() {
-        return ov::intel_npu::npuw::llm::model_desc.name();
-    }
-
-    static constexpr std::string_view getTypeName() {
-        return "::intel_npu::npuw::llm::ModelDesc";
-    }
-
-    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
-        return {};
-    }
-
-    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
-        ::intel_npu::npuw::llm::ModelDesc res;
-        std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
-        res.type = res_map["type"];
-        res.name_or_path = res_map["name_or_path"];
-        res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
-        return res;
-    }
-
-    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
-        std::string res;
-        std::map<std::string, std::string> res_map;
-        res_map["type"] = val.type;
-        res_map["name_or_path"] = val.name_or_path;
-        res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads);
-        return OptionPrinter<std::map<std::string, std::string>>::toString(res_map);
-    }
-
-    static OptionMode mode() {
-        return OptionMode::CompileTime;
-    }
-
-    static bool isPublic() {
-        return true;
-    }
-};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::generate_hint.name();
@@ -387,43 +387,60 @@ namespace llm {
*/
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};


/**
 * @brief
- * Type: std::map<std::string, std::string>.
- * Tell NPUW about your LLM model. Use following structure for that:
- * "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
- * Default value: empty structure defined above.
+ * Type: uint32_t.
+ * Dimension of the batch in input tensor shape.
+ * Default value: 0.
 */
+static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
+
+/**
+ * @brief
+ * Type: uint32_t.
+ * Dimension of KV-Cache size in input tensor shape.
+ * Default value: 2.
+ */
-static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"};
+static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};

/**
 * @brief
 * Type: uint32_t.
- * Tell NPUW your desirable max prompt length.
+ * Desirable max prompt length.
 * Default value: 1024.
 */
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
 * @brief
 * Type: uint32_t.
- * Tell NPUW your desirable min response length.
+ * Desirable min response length.
 * Default value: 128.
 */
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

+/**
+ * @brief
+ * Type: bool.
+ * Tell NPUW to apply values transpose optimization for the model.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> optimize_v_tensors{"NPUW_LLM_OPTIMIZE_V_TENSORS"};

/**
 * @brief
 * Type: ov::AnyMap.
- * Tell NPUW the configuration for compilation of prefill model.
+ * Configuration for compilation of prefill model.
 * NOTE: !! Write-only !!
 */
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};

/**
 * @brief
 * Type: std::string.
- * Tell NPUW the preferrable hint for generation stage, that leads to usage of optimal configuration for it.
+ * Hint for the generation stage. NPUW will use an optimal configuration based on the preference passed via this hint.
+ * Hint is ignored if used with "NPUW_LLM_GENERATE_CONFIG".
 * Possible values: "FAST_COMPILE", "BEST_PERF".
 * Default value: "FAST_COMPILE".
*/
@@ -432,7 +449,7 @@ static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT
/**
 * @brief
 * Type: ov::AnyMap.
- * Tell NPUW the configuration for compilation of generate model.
+ * Configuration for compilation of generate model.
 * NOTE: !! Write-only !!
 */
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
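
For illustration only, a minimal sketch of how a client could pass these knobs when compiling an LLM for NPU. This is not part of the commit; the model path and the chosen values are hypothetical:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Sketch: enable the NPUW LLM pipeline and describe the KV-cache layout
    // explicitly, instead of the removed NPUW_LLM_MODEL_DESC string.
    auto compiled = core.compile_model(
        "llm.xml", "NPU",
        ov::AnyMap{{"NPUW_LLM", true},                     // turn the LLM path on
                   {"NPUW_LLM_BATCH_DIM", 0u},             // batch axis of KV-cache inputs
                   {"NPUW_LLM_SEQ_LEN_DIM", 2u},           // sequence-length axis
                   {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},     // aligned up to 64 internally
                   {"NPUW_LLM_MIN_RESPONSE_LEN", 128u},
                   {"NPUW_LLM_OPTIMIZE_V_TENSORS", true},  // request the V-tensor transpose
                   {"NPUW_LLM_GENERATE_HINT", "FAST_COMPILE"}});
    return 0;
}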
4 changes: 3 additions & 1 deletion src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -57,8 +57,10 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {

void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
    desc.add<NPUW_LLM>();
-    desc.add<NPUW_LLM_MODEL_DESC>();
+    desc.add<NPUW_LLM_BATCH_DIM>();
+    desc.add<NPUW_LLM_SEQ_LEN_DIM>();
    desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
    desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
+    desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
    desc.add<NPUW_LLM_GENERATE_HINT>();
}
37 changes: 13 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -280,22 +280,6 @@ void reshape_to_static(std::shared_ptr<ov::Model> model,
    model->reshape(new_shapes);
}

-KVAxesPosition get_kv_axes(const std::string& model_type) {
-    KVAxesPosition axes;
-    if (model_type == "chatglm") {
-        axes.batch = 1u;
-        axes.seq_len = 0u;
-    } else if (model_type == "qwen") {
-        // Note, qwen2 does not fall into this category and conforms to default layout
-        axes.batch = 0u;
-        axes.seq_len = 1u;
-    } else {
-        axes.batch = 0u;
-        axes.seq_len = 2u;
-    }
-    return axes;
-}
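
With the heuristic removed, the layer above (GenAI's static_llm::StatefulLLMPipeline, per the commit title) now has to supply the axes itself. A sketch of how the deleted branches map onto the new options, inferred from the code above rather than stated in the commit:

// chatglm : NPUW_LLM_BATCH_DIM=1, NPUW_LLM_SEQ_LEN_DIM=0
// qwen    : NPUW_LLM_BATCH_DIM=0, NPUW_LLM_SEQ_LEN_DIM=1 (qwen2 keeps the default layout)
// default : NPUW_LLM_BATCH_DIM=0, NPUW_LLM_SEQ_LEN_DIM=2 (matches the new options' defaults)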

bool is_cw_compressed(const std::shared_ptr<ov::Model>& model) {
    std::vector<std::string> rt_info_path = {"nncf", "weight_compression", "group_size"};
    if (!model->has_rt_info(rt_info_path)) {
@@ -444,19 +428,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
    auto prefill_model = kvcache_model->clone();
    prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");

-    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
-    const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
-    const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
-    KVAxesPosition axes = get_kv_axes(model_desc.type);
-    m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
+    const uint32_t batch_dim = m_cfg.get<::intel_npu::NPUW_LLM_BATCH_DIM>();
+    const uint32_t seq_len_dim = m_cfg.get<::intel_npu::NPUW_LLM_SEQ_LEN_DIM>();
+    KVAxesPosition axes{batch_dim, seq_len_dim};
+    const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
+    const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
+
+    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
    LOG_DEBUG("4. Make prefill model with static shapes");
    reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
    LOG_DEBUG("5. Make kvcache model with static shapes");
    reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
+    LOG_DEBUG("6. Check and apply opt layout if applicable.");

+    const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
    // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
-    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
-        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
+    if (optimize_v_tensors) {
        if (optimize_value_tensors(kvcache_model)) {
            // NB: Check if TransposeValueTensors transformation was applied
            m_kvcache_desc.v_tensors_transposed = true;
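
The align_to calls above round the requested lengths up to a multiple of 64. The helper itself is outside this diff; a minimal sketch of its assumed semantics:

// Assumption: align_to rounds `value` up to the nearest multiple of `alignment`.
uint32_t align_to(uint32_t value, uint32_t alignment) {
    return (value + alignment - 1) / alignment * alignment;
}
// E.g. NPUW_LLM_MAX_PROMPT_LEN=1000 -> 1024 and NPUW_LLM_MIN_RESPONSE_LEN=100 -> 128,
// so m_kvcache_desc becomes KVCacheDesc{1024, 1152, 0u, seq_len_dim}.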
@@ -542,9 +529,11 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
}

    m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
-                          BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
+                          BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
+                          BIND(npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM, get),
                          BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
                          BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
+                          BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
                          BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
#undef BIND
}
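
Since the options are BIND-ed to properties here, they should also be readable back from the compiled model; an illustrative sketch, reusing the hypothetical `compiled` object from the earlier example:

auto seq_dim = compiled.get_property("NPUW_LLM_SEQ_LEN_DIM").as<uint32_t>();
auto hint = compiled.get_property("NPUW_LLM_GENERATE_HINT").as<std::string>();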
