Skip to content

Commit

Permalink
[NPUW] Dynamic stateful model support (#27651)
Browse files Browse the repository at this point in the history
### Details:
 - *item1*
 - *...*
 
### Related PRs:
- GenAI: openvinotoolkit/openvino.genai#1240
### Tickets:
 - *ticket-id*

---------

Co-authored-by: TolyaTalamanov <[email protected]>
  • Loading branch information
AsyaPronina and TolyaTalamanov authored Nov 30, 2024
1 parent 09d1e50 commit f332cb4
Show file tree
Hide file tree
Showing 13 changed files with 882 additions and 5 deletions.
24 changes: 24 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ struct OptionParser<int32_t> final {
static int32_t parse(std::string_view val);
};

// Parses an option value as an unsigned 32-bit integer (defined in config.cpp).
template <>
struct OptionParser<uint32_t> final {
    static uint32_t parse(std::string_view val);
};

template <>
struct OptionParser<int64_t> final {
static int64_t parse(std::string_view val);
Expand Down Expand Up @@ -167,6 +172,25 @@ struct OptionPrinter final {
}
};

// Serializes a std::map option as "key1:value1,key2:value2" — the inverse of
// the map OptionParser. Keys/values are printed via their own OptionPrinter.
template <typename K, typename V>
struct OptionPrinter<std::map<K, V>> final {
    static std::string toString(const std::map<K, V>& val) {
        std::stringstream out;
        const char* sep = "";  // no separator before the first pair
        for (const auto& [k, v] : val) {
            out << sep << OptionPrinter<K>::toString(k) << ":" << OptionPrinter<V>::toString(v);
            sep = ",";
        }
        return out.str();
    }
};

// NB: boolean config option has values YES for true, NO for false
template <>
struct OptionPrinter<bool> final {
Expand Down
107 changes: 107 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ namespace intel_npu {
//

void registerNPUWOptions(OptionsDesc& desc);
void registerNPUWLLMOptions(OptionsDesc& desc);

#define DEFINE_OPT(Name, Type, DefaultValue, PropertyKey, Mode) \
struct Name final : OptionBase<Name, Type> { \
Expand Down Expand Up @@ -66,4 +67,110 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);

namespace npuw {
namespace llm {
// Structured description of the LLM passed via NPUW_LLM_MODEL_DESC.
struct ModelDesc {
    std::string type;          // model architecture identifier
    std::string name_or_path;  // model name or filesystem path
    // Default-initialize: a default-constructed ModelDesc previously left this
    // int indeterminate (defaultValue() value-initializes, but any other
    // default construction did not).
    int num_key_value_heads = 0;
};
// Compile-time trade-off hint for the generation stage.
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
}  // namespace llm
}  // namespace npuw

// Option holding the structured LLM model description (NPUW_LLM_MODEL_DESC).
// Serialized form: "type:<type>,name_or_path:<path>,num_key_value_heads:<n>".
struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::model_desc.name();
    }

    static constexpr std::string_view getTypeName() {
        return "::intel_npu::npuw::llm::ModelDesc";
    }

    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
        return {};
    }

    // Parses the "k:v,..." string into a ModelDesc.
    // NOTE(review): a missing or non-numeric "num_key_value_heads" entry makes
    // std::stoi throw std::invalid_argument with an unhelpful message — confirm
    // whether a friendlier OPENVINO_THROW is wanted here.
    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
        ::intel_npu::npuw::llm::ModelDesc res;
        std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
        res.type = res_map["type"];
        res.name_or_path = res_map["name_or_path"];
        res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
        return res;
    }

    // Round-trips a ModelDesc back to its "k:v,..." serialized form.
    // (Removed an unused local `std::string res;` present in the original.)
    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
        std::map<std::string, std::string> res_map;
        res_map["type"] = val.type;
        res_map["name_or_path"] = val.name_or_path;
        res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads);
        return OptionPrinter<std::map<std::string, std::string>>::toString(res_map);
    }

    static OptionMode mode() {
        return OptionMode::CompileTime;
    }

    static bool isPublic() {
        return true;
    }
};

// Option selecting the generation-stage compilation hint (NPUW_LLM_GENERATE_HINT).
// Accepts exactly "FAST_COMPILE" or "BEST_PERF".
struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::generate_hint.name();
    }

    static constexpr std::string_view getTypeName() {
        return "::intel_npu::npuw::llm::GenerateHint";
    }

    static ::intel_npu::npuw::llm::GenerateHint defaultValue() {
        return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
    }

    // Maps the textual option value onto the enum; throws on anything else.
    static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) {
        if (val == "FAST_COMPILE") {
            return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
        }
        if (val == "BEST_PERF") {
            return ::intel_npu::npuw::llm::GenerateHint::BEST_PERF;
        }
        OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ",
                       val,
                       ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
    }

    // Inverse of parse(); throws for enum values with no textual form.
    static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) {
        switch (val) {
        case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE:
            return "FAST_COMPILE";
        case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF:
            return "BEST_PERF";
        default:
            OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string.");
        }
    }

    static OptionMode mode() {
        return OptionMode::CompileTime;
    }

    static bool isPublic() {
        return true;
    }
};
} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,51 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
} // namespace dump

namespace llm {
/**
 * @brief
 * Type: bool.
 * Tell NPUW that you want to pass a dynamic stateful LLM model.
 * Default value: false.
 */
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};

/**
 * @brief
 * Type: std::map<std::string, std::string>.
 * Tell NPUW about your LLM model. Use the following structure for that:
 * "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
 * Default value: an empty structure of the shape described above.
 */
static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"};

/**
 * @brief
 * Type: uint32_t.
 * Tell NPUW the maximum prompt length you want supported.
 * Default value: 1024.
 */
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
 * @brief
 * Type: uint32_t.
 * Tell NPUW the minimum response length you want supported.
 * Default value: 128.
 */
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
 * @brief
 * Type: std::string.
 * Tell NPUW the preferable hint for the generation stage, which leads to the
 * optimal configuration being used for it.
 * Possible values: "FAST_COMPILE", "BEST_PERF".
 * Default value: "FAST_COMPILE".
 */
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};

} // namespace llm

} // namespace npuw
} // namespace intel_npu
} // namespace ov
8 changes: 8 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ int32_t OptionParser<int32_t>::parse(std::string_view val) {
}
}

// Parses an option value as an unsigned 32-bit integer.
// Fixes vs. the original:
//  - std::string_view is not guaranteed to be NUL-terminated, so materialize a
//    std::string instead of handing val.data() to std::stoul;
//  - reject negative inputs, trailing garbage, and values above UINT32_MAX
//    (stoul silently wrapped/truncated them);
//  - the "%s" placeholder was never substituted — OPENVINO_THROW concatenates
//    its arguments, so stream val directly.
uint32_t OptionParser<uint32_t>::parse(std::string_view val) {
    try {
        const std::string str(val);
        std::size_t pos = 0;
        const unsigned long long parsed = std::stoull(str, &pos);
        if (pos == str.size() && parsed <= 0xFFFFFFFFull && str.find('-') == std::string::npos) {
            return static_cast<uint32_t>(parsed);
        }
    } catch (...) {
        // fall through to the uniform error below
    }
    OPENVINO_THROW("Value '", val, "' is not a valid UINT32 option");
}

int64_t OptionParser<int64_t>::parse(std::string_view val) {
try {
return std::stoll(val.data());
Expand Down
8 changes: 8 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,11 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_DUMP_IO_ITERS>();
#endif
}

// Registers all NPUW LLM-pipeline options with the descriptor so they can be
// parsed/validated through the common Config mechanism.
void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
    desc.add<NPUW_LLM>();
    desc.add<NPUW_LLM_MODEL_DESC>();
    desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
    desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
    desc.add<NPUW_LLM_GENERATE_HINT>();
}
26 changes: 25 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "intel_npu/config/config.hpp"
#include "intel_npu/config/npuw.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "llm_compiled_model.hpp"
#include "openvino/runtime/device_id_parser.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/properties.hpp"
Expand Down Expand Up @@ -85,10 +86,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<const
} // namespace npuw
} // namespace ov

// Factory: picks the LLM-specialized or the generic NPUW compiled model
// depending on whether the NPUW_LLM property is set to true.
std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
    const std::shared_ptr<ov::Model>& model,
    const std::shared_ptr<const ov::IPlugin>& plugin,
    const ov::AnyMap& properties) {
    LOG_INFO("Choosing which NPUW CompiledModel to create");
    LOG_BLOCK();
    const auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
    const auto prop_it = properties.find(use_llm_key);
    const bool use_llm = prop_it != properties.end() && prop_it->second.as<bool>() == true;
    std::shared_ptr<ov::npuw::ICompiledModel> result;
    if (use_llm) {
        LOG_INFO("ov::npuw::LLMCompiledModel will be created.");
        result = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
    } else {
        LOG_INFO("ov::npuw::CompiledModel will be created.");
        result = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
    }
    LOG_INFO("Done");
    return result;
}

// Base NPUW compiled-model constructor: forwards straight to ov::ICompiledModel.
ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model,
                                         const std::shared_ptr<const ov::IPlugin>& plugin)
    : ov::ICompiledModel(model, plugin) {}

ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties)
: ov::ICompiledModel(model, plugin),
: ov::npuw::ICompiledModel(model, plugin),
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
m_cfg(m_options_desc),
m_name(model->get_friendly_name()),
Expand Down
10 changes: 8 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,16 @@ class Plugin;

namespace ov {
namespace npuw {
// Common base for NPUW compiled models (generic CompiledModel and the
// LLM-specialized variant). NOTE(review): deletion through this base relies on
// ov::ICompiledModel providing a virtual destructor — confirm upstream.
class ICompiledModel : public ov::ICompiledModel {
public:
    // Factory selecting the concrete NPUW compiled model based on properties
    // (see NPUW_LLM / ov::intel_npu::npuw::llm::enabled).
    static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
                                                            const std::shared_ptr<const ov::IPlugin>& plugin,
                                                            const ov::AnyMap& properties);
    ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin);
};

class InferRequest;

class CompiledModel : public ov::ICompiledModel {
class CompiledModel : public ov::npuw::ICompiledModel {
using DevList = std::vector<std::string>;
using GetPropertiesMap =
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
Expand Down
Loading

0 comments on commit f332cb4

Please sign in to comment.