From 247441fb311b9072f63b8f1dc2333de8b220c2de Mon Sep 17 00:00:00 2001
From: Zhang Yi3
Date: Tue, 26 Nov 2024 18:35:00 -0800
Subject: [PATCH 1/6] [CPU] Check runtime_options from IR model

---
 src/inference/src/dev/core_impl.cpp       | 23 +++++++++++++++++--
 src/inference/src/dev/core_impl.hpp       |  6 +++++
 .../ov_executable_network/properties.cpp  | 14 +++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp
index 244d27b5eebb67..a0360b8f9c4c41 100644
--- a/src/inference/src/dev/core_impl.cpp
+++ b/src/inference/src/dev/core_impl.cpp
@@ -736,7 +736,7 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<
     ov::AnyMap config_with_batch = config;
     // if auto-batching is applicable, the below function will patch the device name and config accordingly:
     auto model = apply_auto_batching(model_, deviceName, config_with_batch);
-
+    apply_rt_info(model_, config_with_batch);
     auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(device_name));
     auto plugin = get_plugin(parsed._deviceName);
     ov::SoPtr<ov::ICompiledModel> res;
@@ -769,7 +769,7 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<
     ov::AnyMap config_with_batch = config;
     // if auto-batching is applicable, the below function will patch the device name and config accordingly:
     auto model = apply_auto_batching(model_, deviceName, config_with_batch);
-
+    apply_rt_info(model_, config_with_batch);
     auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(deviceName));
     auto plugin = get_plugin(parsed._deviceName);
     ov::SoPtr<ov::ICompiledModel> res;
@@ -1098,6 +1098,25 @@ std::shared_ptr<const ov::Model> ov::CoreImpl::apply_auto_batching(const std::sh
     return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch);
 }
 
+void ov::CoreImpl::apply_rt_info(const std::shared_ptr<const ov::Model>& model,
+                                 ov::AnyMap& config) const {
+    if (model->has_rt_info({"runtime_options", "KV_CACHE_PRECISION"})) {
+        if (config.find("KV_CACHE_PRECISION") == config.end()) {
+            const auto kv_cache_precision =
+                model->get_rt_info<ov::element::Type>({"runtime_options", "KV_CACHE_PRECISION"});
+            config.insert(ov::hint::kv_cache_precision(kv_cache_precision));
+        }
+    }
+    if (model->has_rt_info({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"})) {
+        if (config.find("DYNAMIC_QUANTIZATION_GROUP_SIZE") == config.end()) {
+            const auto dyn_quant_group_size =
+                model->get_rt_info<uint64_t>({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"});
+            config.insert(ov::hint::dynamic_quantization_group_size(dyn_quant_group_size));
+        }
+    }
+}
+
+
 void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& properties) {
     OPENVINO_ASSERT(device_name.find("HETERO:") != 0,
                     "set_property is supported only for HETERO itself (without devices). "
" diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 7cf12f3ba3280c..6fb63f2ef4e522 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -200,6 +200,12 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& model, + ov::AnyMap& config) const; + /* * @brief Register plugins according to the build configuration */ diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 8ec0900bc7d176..37845422195a95 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -327,4 +327,18 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) { ASSERT_EQ(value.as(), "CPU"); } +TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) { + ov::Core ie; + ov::Any type; + ov::Any size; + ov::CompiledModel compiledModel; + model->set_rt_info("f16", "runtime_options", "KV_CACHE_PRECISION"); + model->set_rt_info("0", "runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"); + OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName)); + OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + ASSERT_EQ(type.as(), ov::element::f16); + ASSERT_EQ(size.as(), 0); +} + } // namespace From c6eeb4db818a89b3fcd3089c93bc7c5f39c0dda1 Mon Sep 17 00:00:00 2001 From: Zhang Yi3 Date: Tue, 26 Nov 2024 18:51:04 -0800 Subject: [PATCH 2/6] fix code style --- src/inference/src/dev/core_impl.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index a0360b8f9c4c41..d5227eeab8cbef 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -1098,8 +1098,7 @@ std::shared_ptr ov::CoreImpl::apply_auto_batching(const std::sh return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch); } -void ov::CoreImpl::apply_rt_info(const std::shared_ptr& model, - ov::AnyMap& config) const { +void ov::CoreImpl::apply_rt_info(const std::shared_ptr& model, ov::AnyMap& config) const { if (model->has_rt_info({"runtime_options", "KV_CACHE_PRECISION"})) { if (config.find("KV_CACHE_PRECISION") == config.end()) { const auto kv_cache_precision = @@ -1116,7 +1115,6 @@ void ov::CoreImpl::apply_rt_info(const std::shared_ptr& model, } } - void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& properties) { OPENVINO_ASSERT(device_name.find("HETERO:") != 0, "set_property is supported only for HETERO itself (without devices). " From 0d954808e3c60cdfbe7ef30fc1a130b6b7db4c99 Mon Sep 17 00:00:00 2001 From: Zhang Yi3 Date: Wed, 27 Nov 2024 17:00:59 -0800 Subject: [PATCH 3/6] Revert "[CPU]Check runtime_options from IR model" This reverts commit 247441fb311b9072f63b8f1dc2333de8b220c2de. 
---
 src/inference/src/dev/core_impl.cpp | 21 ++-------------------
 src/inference/src/dev/core_impl.hpp |  6 ------
 2 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp
index d5227eeab8cbef..244d27b5eebb67 100644
--- a/src/inference/src/dev/core_impl.cpp
+++ b/src/inference/src/dev/core_impl.cpp
@@ -736,7 +736,7 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<
     ov::AnyMap config_with_batch = config;
     // if auto-batching is applicable, the below function will patch the device name and config accordingly:
     auto model = apply_auto_batching(model_, deviceName, config_with_batch);
-    apply_rt_info(model_, config_with_batch);
+
     auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(device_name));
     auto plugin = get_plugin(parsed._deviceName);
     ov::SoPtr<ov::ICompiledModel> res;
@@ -769,7 +769,7 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<
     ov::AnyMap config_with_batch = config;
     // if auto-batching is applicable, the below function will patch the device name and config accordingly:
     auto model = apply_auto_batching(model_, deviceName, config_with_batch);
-    apply_rt_info(model_, config_with_batch);
+
     auto parsed = parseDeviceNameIntoConfig(deviceName, config_with_batch, is_proxy_device(deviceName));
     auto plugin = get_plugin(parsed._deviceName);
     ov::SoPtr<ov::ICompiledModel> res;
@@ -1098,23 +1098,6 @@ std::shared_ptr<const ov::Model> ov::CoreImpl::apply_auto_batching(const std::sh
     return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch);
 }
 
-void ov::CoreImpl::apply_rt_info(const std::shared_ptr<const ov::Model>& model, ov::AnyMap& config) const {
-    if (model->has_rt_info({"runtime_options", "KV_CACHE_PRECISION"})) {
-        if (config.find("KV_CACHE_PRECISION") == config.end()) {
-            const auto kv_cache_precision =
-                model->get_rt_info<ov::element::Type>({"runtime_options", "KV_CACHE_PRECISION"});
-            config.insert(ov::hint::kv_cache_precision(kv_cache_precision));
-        }
-    }
-    if (model->has_rt_info({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"})) {
-        if (config.find("DYNAMIC_QUANTIZATION_GROUP_SIZE") == config.end()) {
-            const auto dyn_quant_group_size =
-                model->get_rt_info<uint64_t>({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"});
-            config.insert(ov::hint::dynamic_quantization_group_size(dyn_quant_group_size));
-        }
-    }
-}
-
 void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& properties) {
     OPENVINO_ASSERT(device_name.find("HETERO:") != 0,
                     "set_property is supported only for HETERO itself (without devices). "
" diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 6fb63f2ef4e522..7cf12f3ba3280c 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -200,12 +200,6 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& model, - ov::AnyMap& config) const; - /* * @brief Register plugins according to the build configuration */ From b64d31b258b358677dcb62c07bd45f8ad2b757b6 Mon Sep 17 00:00:00 2001 From: Zhang Yi3 Date: Wed, 27 Nov 2024 17:26:46 -0800 Subject: [PATCH 4/6] [CPU]move runtim_option check to cpu plugin --- src/plugins/intel_cpu/src/plugin.cpp | 14 +++++++++++++- src/plugins/intel_cpu/src/plugin.h | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index b74d4f7c8acbbb..7dcde369bf9ff6 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -206,6 +206,16 @@ static Config::ModelType getModelType(const std::shared_ptr& model) return Config::ModelType::Unknown; } +void Plugin::apply_rt_info(const std::shared_ptr& model, ov::intel_cpu::Config& config) const { + if (model->has_rt_info({"runtime_options", "KV_CACHE_PRECISION"})) { + config.kvCachePrecision = model->get_rt_info({"runtime_options", "KV_CACHE_PRECISION"}); + } + if (model->has_rt_info({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"})) { + config.fcDynamicQuantizationGroupSize = + model->get_rt_info({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"}); + } +} + std::shared_ptr Plugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& orig_config) const { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Plugin::compile_model"); @@ -247,6 +257,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< // update the props after the perf mode translated to configs // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not? Config conf = engConfig; + apply_rt_info(cloned_model, conf); conf.readProperties(config, modelType); Transformations transformations(cloned_model, conf); @@ -520,6 +531,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& Config conf = engConfig; Config::ModelType modelType = getModelType(model); + apply_rt_info(model, conf); conf.readProperties(config, modelType); auto context = std::make_shared(conf, fake_w_cache, false); @@ -575,7 +587,7 @@ std::shared_ptr Plugin::import_model(std::istream& model_str Config conf = engConfig; Config::ModelType modelType = getModelType(model); - + apply_rt_info(model, conf); // check ov::loaded_from_cache property and erase it to avoid exception in readProperties. auto _config = config; const auto& it = _config.find(ov::loaded_from_cache.name()); diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h index 2548ba2c1cc8af..414811a2a2a5b7 100644 --- a/src/plugins/intel_cpu/src/plugin.h +++ b/src/plugins/intel_cpu/src/plugin.h @@ -50,7 +50,7 @@ class Plugin : public ov::IPlugin { void get_performance_streams(Config& config, const std::shared_ptr& model) const; void calculate_streams(Config& conf, const std::shared_ptr& model, bool imported = false) const; - + void apply_rt_info(const std::shared_ptr& model, ov::intel_cpu::Config& config) const; Config engConfig; /* Explicily configured streams have higher priority than performance hints. 
        So track if streams is set explicitly (not auto-configured) */

From 88fec83af06035a0d95336cba4bb20343f5517a8 Mon Sep 17 00:00:00 2001
From: Zhang Yi3
Date: Thu, 28 Nov 2024 03:03:03 -0800
Subject: [PATCH 5/6] apply review comments

---
 src/plugins/intel_cpu/src/plugin.cpp                     | 8 ++++----
 .../custom/behavior/ov_executable_network/properties.cpp | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index 7dcde369bf9ff6..2b9253d0e41f2c 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -207,12 +207,12 @@ static Config::ModelType getModelType(const std::shared_ptr<ov::Model>& model)
 }
 
 void Plugin::apply_rt_info(const std::shared_ptr<const ov::Model>& model, ov::intel_cpu::Config& config) const {
-    if (model->has_rt_info({"runtime_options", "KV_CACHE_PRECISION"})) {
-        config.kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", "KV_CACHE_PRECISION"});
+    if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
+        config.kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
     }
-    if (model->has_rt_info({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"})) {
+    if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
         config.fcDynamicQuantizationGroupSize =
-            model->get_rt_info<uint64_t>({"runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE"});
+            model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
     }
 }

diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index 37845422195a95..e0a3f7e30f10f0 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -332,8 +332,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {
     ov::Core ie;
     ov::Any type;
     ov::Any size;
     ov::CompiledModel compiledModel;
-    model->set_rt_info("f16", "runtime_options", "KV_CACHE_PRECISION");
-    model->set_rt_info("0", "runtime_options", "DYNAMIC_QUANTIZATION_GROUP_SIZE");
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
     OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName));
     OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
     OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));

From 54500577aeb77e4eefcc0d8c35540903aa99936f Mon Sep 17 00:00:00 2001
From: Zhang Yi3
Date: Thu, 28 Nov 2024 22:20:18 -0800
Subject: [PATCH 6/6] [CPU] Make Config check runtime_options

---
 src/plugins/intel_cpu/src/config.cpp      | 10 ++++++++++
 src/plugins/intel_cpu/src/config.h        |  2 ++
 src/plugins/intel_cpu/src/plugin.cpp      | 16 +++-------------
 src/plugins/intel_cpu/src/plugin.h        |  1 -
 .../ov_executable_network/properties.cpp  | 17 +++++++++++++++++
 5 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index 92470ca063a4c0..8c90c5aeb11ad3 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -457,5 +457,15 @@ void Config::updateProperties() {
     _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)});
 }
 
+void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
+    if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
+        this->kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
+    }
+    if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
+        this->fcDynamicQuantizationGroupSize =
+            model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
+    }
+}
+
 } // namespace intel_cpu
 } // namespace ov

diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 5f4bb25ede350e..a8439d87803fd4 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -106,6 +106,8 @@ struct Config {
     void updateProperties();
 
+    void applyRtInfo(const std::shared_ptr<const ov::Model>& model);
+
     std::map<std::string, std::string> _config;
 
     int modelPreferThreads = -1;

diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index 2b9253d0e41f2c..6fdbf7a4ea4dee 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -206,16 +206,6 @@ static Config::ModelType getModelType(const std::shared_ptr<ov::Model>& model) {
     return Config::ModelType::Unknown;
 }
 
-void Plugin::apply_rt_info(const std::shared_ptr<const ov::Model>& model, ov::intel_cpu::Config& config) const {
-    if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
-        config.kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
-    }
-    if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
-        config.fcDynamicQuantizationGroupSize =
-            model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
-    }
-}
-
 std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<const ov::Model>& model,
                                                           const ov::AnyMap& orig_config) const {
     OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Plugin::compile_model");
@@ -247,7 +247,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     // update the props after the perf mode translated to configs
     // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
     Config conf = engConfig;
-    apply_rt_info(cloned_model, conf);
+    conf.applyRtInfo(cloned_model);
     conf.readProperties(config, modelType);
 
     Transformations transformations(cloned_model, conf);
@@ -531,7 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
 
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
-    apply_rt_info(model, conf);
+    conf.applyRtInfo(model);
     conf.readProperties(config, modelType);
 
     auto context = std::make_shared<GraphContext>(conf, fake_w_cache, false);
@@ -587,7 +577,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str
 
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
-    apply_rt_info(model, conf);
+    conf.applyRtInfo(model);
     // check ov::loaded_from_cache property and erase it to avoid exception in readProperties.
     auto _config = config;
     const auto& it = _config.find(ov::loaded_from_cache.name());

diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h
index 414811a2a2a5b7..8973478d30403f 100644
--- a/src/plugins/intel_cpu/src/plugin.h
+++ b/src/plugins/intel_cpu/src/plugin.h
@@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin {
     void get_performance_streams(Config& config, const std::shared_ptr<ov::Model>& model) const;
     void calculate_streams(Config& conf, const std::shared_ptr<ov::Model>& model, bool imported = false) const;
-    void apply_rt_info(const std::shared_ptr<const ov::Model>& model, ov::intel_cpu::Config& config) const;
     Config engConfig;
     /* Explicily configured streams have higher priority than performance hints.
        So track if streams is set explicitly (not auto-configured) */

diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index e0a3f7e30f10f0..a014eeb2cecdac 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -341,4 +341,21 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {
     ASSERT_EQ(size.as<uint64_t>(), 0);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) {
+    ov::Core ie;
+    ov::Any type;
+    ov::Any size;
+    ov::CompiledModel compiledModel;
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    ov::AnyMap config;
+    config[ov::hint::kv_cache_precision.name()] = "u8";
+    config[ov::hint::dynamic_quantization_group_size.name()] = "16";
+    OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config));
+    OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
+    OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    ASSERT_EQ(type.as<ov::element::Type>(), ov::element::u8);
+    ASSERT_EQ(size.as<uint64_t>(), 16);
+}
+
 } // namespace
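
Usage note (illustration only, not part of the patch series): the two tests above pin down the intended behavior: "runtime_options" entries embedded in a model's rt_info act as defaults, and properties passed explicitly at compile time override them, since Config::applyRtInfo() runs before Config::readProperties(). The sketch below shows that flow from the user's side; the model file name is hypothetical, while the API calls mirror those used in the tests.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        // Hypothetical IR file; any model the CPU plugin accepts would do.
        std::shared_ptr<ov::Model> model = core.read_model("model.xml");

        // Embed defaults in the model itself, as an IR's rt_info section would.
        model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
        model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());

        // No explicit properties: the rt_info defaults take effect (f16 / 0).
        auto from_rt_info = core.compile_model(model, "CPU");
        auto kv_default = from_rt_info.get_property(ov::hint::kv_cache_precision);

        // Explicit properties win over the embedded defaults (u8 / 16).
        auto from_config = core.compile_model(model,
                                              "CPU",
                                              ov::hint::kv_cache_precision(ov::element::u8),
                                              ov::hint::dynamic_quantization_group_size(16));
        auto kv_override = from_config.get_property(ov::hint::kv_cache_precision);
        return 0;
    }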