From e713b072b645580f00c079f38c00bc4af04101c8 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 17 Oct 2024 04:10:30 +0900 Subject: [PATCH 01/64] added the static scaling feature --- src/bindings/c/src/ov_property.cpp | 1 + .../tests/test_runtime/test_properties.py | 10 +++ .../common_optimizations/static_scaling.hpp | 24 +++++++ .../common_optimizations/static_scaling.cpp | 69 +++++++++++++++++++ .../src/plugin/transformations_pipeline.cpp | 4 ++ 5 files changed, 108 insertions(+) create mode 100644 src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp diff --git a/src/bindings/c/src/ov_property.cpp b/src/bindings/c/src/ov_property.cpp index 61be74ee265599..5292cf3c609b4f 100644 --- a/src/bindings/c/src/ov_property.cpp +++ b/src/bindings/c/src/ov_property.cpp @@ -36,6 +36,7 @@ const char* ov_property_key_hint_execution_mode = "EXECUTION_MODE_HINT"; const char* ov_property_key_force_tbb_terminate = "FORCE_TBB_TERMINATE"; const char* ov_property_key_enable_mmap = "ENABLE_MMAP"; const char* ov_property_key_auto_batch_timeout = "AUTO_BATCH_TIMEOUT"; +const char* ov_property_key_static_scaling = "ENABLE_STATIC_SCALING"; // Write-only property key const char* ov_property_key_cache_encryption_callbacks = "CACHE_ENCRYPTION_CALLBACKS"; diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 15e2d86ead4653..d3bf876dc9b1a8 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -417,6 +417,16 @@ def test_properties_ro(ov_property_ro, expected_value): "AVAILABLE_DEVICE_MEM_SIZE", ((128, 128),), ), + ( + hints.enable_static_scaling, + "ENABLE_STATIC_SCALING", + ( + (True, True), + (False, False), + (1, True), + (0, False), + ), + ), ], ) def test_properties_rw(ov_property_rw, expected_value, test_values): diff --git a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp new file mode 100644 index 00000000000000..d7cdf7853f4a8e --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API StaticScaling; + +} // namespace pass +} // namespace ov + +class ov::pass::StaticScaling : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("StaticScaling", "0"); + StaticScaling(); +}; \ No newline at end of file diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp new file mode 100644 index 00000000000000..58e7eb28f837f0 --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/static_scaling.hpp" + +#include + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include 
"openvino/op/convert.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::StaticScaling::StaticScaling() { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + const float default_scale_factor = 256.f; + const ov::element::Type infer_prec = ov::element::f32; + const ov::element::Type scaled_prec = ov::element::f16; + + auto input_m = any_input(); + auto weights_m = wrap_type(type_matches_any({infer_prec})); + auto convolution_m = wrap_type({ input_m, weights_m }); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(convolution_m)); + + auto conv = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); + if (!conv || transformation_callback(conv)) + return false; + + if (conv->get_input_element_type(0) != infer_prec || conv->get_output_element_type(0) != infer_prec) { + return false; + } + + auto conv_weight = std::dynamic_pointer_cast(pattern_map.at(weights_m).get_node_shared_ptr()); + auto conv_weight_convert = std::make_shared(conv_weight, scaled_prec); + ov::replace_node(conv_weight, conv_weight_convert); + + auto input = pattern_map.at(input_m); + + ov::Shape scale_const_shape = {1}; + std::vector inverse_scale_value = {(1.f / default_scale_factor)}; + std::shared_ptr inverse_scale_const = std::make_shared(infer_prec, scale_const_shape, inverse_scale_value); + auto scale_down = std::make_shared(input.get_node_shared_ptr()->output(0), + inverse_scale_const->output(0)); + auto precision_down = std::make_shared(scale_down, scaled_prec); + conv->input(0).replace_source_output(precision_down->output(0)); + + std::vector scale_value = {default_scale_factor}; + std::shared_ptr scale_const = std::make_shared(infer_prec, scale_const_shape, scale_value); + auto scale_up = std::make_shared(conv->output(0), + scale_const->output(0)); + ov::replace_node(conv, scale_up); +std::cout << "StaticScaling - converted" << std::endl; + return true; + }; + + auto m = std::make_shared(convolution_m, "StaticScaling"); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 7c7c09adcd182f..0a7d56bdbdc323 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -93,6 +93,7 @@ #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mvn_fusion.hpp" #include "transformations/common_optimizations/sdpa_scale_fusion.hpp" +#include "transformations/common_optimizations/static_scaling.hpp" #include "transformations/common_optimizations/softmax_fusion.hpp" #include "transformations/common_optimizations/glu_fusion.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" @@ -943,6 +944,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + if (config.get_property(ov::enable_static_scaling)) + manager.register_pass(); + manager.register_pass(); auto pass_config = manager.get_pass_config(); manager.register_pass(); From 1529e5880c12e8064613baebf5f37c65af139151 Mon Sep 17 00:00:00 2001 From: "Kim, 
Eddy" Date: Thu, 17 Oct 2024 04:35:54 +0900 Subject: [PATCH 02/64] added a new rt_info scale_factor --- .../common_optimizations/static_scaling.hpp | 2 +- .../common_optimizations/static_scaling.cpp | 10 ++++++---- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp index d7cdf7853f4a8e..9c6300774955f3 100644 --- a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp @@ -20,5 +20,5 @@ class TRANSFORMATIONS_API StaticScaling; class ov::pass::StaticScaling : public ov::pass::MatcherPass { public: OPENVINO_RTTI("StaticScaling", "0"); - StaticScaling(); + StaticScaling(float scale_factor = 0.f); }; \ No newline at end of file diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp index 58e7eb28f837f0..c5b2648768eb87 100644 --- a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp @@ -16,7 +16,7 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" -ov::pass::StaticScaling::StaticScaling() { +ov::pass::StaticScaling::StaticScaling(float scale_factor) { using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; @@ -24,6 +24,8 @@ ov::pass::StaticScaling::StaticScaling() { const ov::element::Type infer_prec = ov::element::f32; const ov::element::Type scaled_prec = ov::element::f16; + scale_factor = (scale_factor < 1.f) ? 
default_scale_factor : scale_factor; + auto input_m = any_input(); auto weights_m = wrap_type(type_matches_any({infer_prec})); auto convolution_m = wrap_type({ input_m, weights_m }); @@ -48,19 +50,19 @@ ov::pass::StaticScaling::StaticScaling() { auto input = pattern_map.at(input_m); ov::Shape scale_const_shape = {1}; - std::vector inverse_scale_value = {(1.f / default_scale_factor)}; + std::vector inverse_scale_value = {(1.f / scale_factor)}; std::shared_ptr inverse_scale_const = std::make_shared(infer_prec, scale_const_shape, inverse_scale_value); auto scale_down = std::make_shared(input.get_node_shared_ptr()->output(0), inverse_scale_const->output(0)); auto precision_down = std::make_shared(scale_down, scaled_prec); conv->input(0).replace_source_output(precision_down->output(0)); - std::vector scale_value = {default_scale_factor}; + std::vector scale_value = {scale_factor}; std::shared_ptr scale_const = std::make_shared(infer_prec, scale_const_shape, scale_value); auto scale_up = std::make_shared(conv->output(0), scale_const->output(0)); ov::replace_node(conv, scale_up); -std::cout << "StaticScaling - converted" << std::endl; +std::cout << "StaticScaling - converted " << scale_factor << std::endl; return true; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 0a7d56bdbdc323..75cd7a96fbc1e3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -944,8 +944,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); - if (config.get_property(ov::enable_static_scaling)) + + if (config.get_property(ov::enable_static_scaling)) { + float scale_factor = func->get_rt_info().count("scale_factor") ? 
func->get_rt_info("scale_factor") : 0.f; manager.register_pass(); + } manager.register_pass(); auto pass_config = manager.get_pass_config(); From 81f9cc16c5a6b48bdaa53725538be49feffb7316 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 24 Oct 2024 11:18:38 +0900 Subject: [PATCH 03/64] fp16 scaling for vae decoder of sdxl --- .../common_optimizations/static_scaling.hpp | 34 +- .../common_optimizations/static_scaling.cpp | 382 ++++++++++++++++-- src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 + .../src/plugin/transformations_pipeline.cpp | 9 + 4 files changed, 402 insertions(+), 24 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp index 9c6300774955f3..3fbdd538d05c54 100644 --- a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp @@ -13,6 +13,10 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API StaticScaling; +class TRANSFORMATIONS_API StaticScalingInput; +class TRANSFORMATIONS_API StaticScalingOutput; +class TRANSFORMATIONS_API StaticScalingAdd; +class TRANSFORMATIONS_API StaticScalingModel; } // namespace pass } // namespace ov @@ -21,4 +25,32 @@ class ov::pass::StaticScaling : public ov::pass::MatcherPass { public: OPENVINO_RTTI("StaticScaling", "0"); StaticScaling(float scale_factor = 0.f); -}; \ No newline at end of file +}; + +class ov::pass::StaticScalingInput : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("StaticScalingInput", "0"); + StaticScalingInput(float scale_factor = 0.f); +}; + +class ov::pass::StaticScalingOutput : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("StaticScalingOutput", "0"); + StaticScalingOutput(float scale_factor = 0.f); +}; + +class ov::pass::StaticScalingAdd : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("StaticScalingAdd", "0"); + StaticScalingAdd(float scale_factor = 0.f); +}; + +class ov::pass::StaticScalingModel : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("StaticScalingModel", "0"); + explicit StaticScalingModel(float scale_factor = 0.f); + bool run_on_model(const std::shared_ptr& model) override; + +private: + float m_scale_factor = 0.f; +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp index c5b2648768eb87..04426e3c9372c8 100644 --- a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp @@ -12,60 +12,396 @@ #include "openvino/op/convert.hpp" #include "openvino/op/convolution.hpp" #include "openvino/op/multiply.hpp" +#include "openvino/op/swish.hpp" +#include "openvino/op/sin.hpp" +#include "openvino/op/cos.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/mvn.hpp" +#include "openvino/op/group_normalization.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/transpose.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" +float get_scale_factor(float scale_factor) { + const float default_scale_factor = 256.f; + + // scale_factor = 
(scale_factor < 1) ? default_scale_factor : scale_factor; + + return default_scale_factor; +} + +ov::pass::StaticScalingModel::StaticScalingModel(float scale_factor) { + m_scale_factor = get_scale_factor(scale_factor); +} + +bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr& f) { + RUN_ON_MODEL_SCOPE(StaticScalingModel); + + std::unordered_set scaled_down_subgraph; + std::unordered_set normalized_subgraph; + std::unordered_set constant_subgraph; + + ov::Shape scale_const_shape = {1}; + std::vector scale_value = {m_scale_factor}; + std::vector inverse_scale_value = {(1.f / m_scale_factor)}; + std::shared_ptr scale_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_value); + std::shared_ptr scale_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_value); + std::shared_ptr inverse_scale_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, inverse_scale_value); + std::shared_ptr inverse_scale_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, inverse_scale_value); + + for (auto& node : f->get_ordered_ops()) { + if (node->get_friendly_name().compare("__module.transformer_blocks.0.norm1_context.linear/ov_ext::linear/MatMul") == 0) + std::cout << "!" << std::endl; + + auto parameter_node = std::dynamic_pointer_cast(node); + if (parameter_node && + (parameter_node->get_element_type() == ov::element::f16 || + parameter_node->get_element_type() == ov::element::f32)) { + std::shared_ptr inverse_scale_const = (parameter_node->get_element_type() == ov::element::f16) ? + inverse_scale_const_f16 : inverse_scale_const_f32; + auto scale_down = std::make_shared(parameter_node->output(0), + inverse_scale_const->output(0)); + ov::replace_node(parameter_node, scale_down); + scaled_down_subgraph.insert(node->get_friendly_name()); + scaled_down_subgraph.insert(scale_down->get_friendly_name()); + continue; + } + + auto const_node = std::dynamic_pointer_cast(node); + if (const_node) { + constant_subgraph.insert(node->get_friendly_name()); + continue; + } + + auto group_norm_node = std::dynamic_pointer_cast(node); + auto mvn_node = std::dynamic_pointer_cast(node); + if (group_norm_node || mvn_node) { + normalized_subgraph.insert(node->get_friendly_name()); + continue; + } + + size_t num_scaled_down_inputs = 0; + size_t num_const_inputs = 0; + size_t num_normalized_inputs = 0; + for (auto& dep: node->inputs()) { + auto dep_name = dep.get_source_output().get_node_shared_ptr()->get_friendly_name(); + + if (scaled_down_subgraph.find(dep_name) != scaled_down_subgraph.end()) { + num_scaled_down_inputs += 1; + continue; + } + if (constant_subgraph.find(dep_name) != constant_subgraph.end()) { + num_const_inputs += 1; + continue; + } + if (normalized_subgraph.find(dep_name) != normalized_subgraph.end()) { + num_normalized_inputs += 1; + continue; + } + } + + if (node->get_input_size() > 0) { + if (num_const_inputs == node->get_input_size()) { + constant_subgraph.insert(node->get_friendly_name()); + continue; + } + if ((num_const_inputs + num_normalized_inputs) == node->get_input_size()) { + normalized_subgraph.insert(node->get_friendly_name()); + continue; + } + } + + if (num_scaled_down_inputs == 0) { + continue; + } + + // input0 input1 input0 input1 + // (scaled_down) (normalized (scaled_down) (normalized + // or const) or const) + // \ / \ / + // \ / ==> \ scale_down + // \ / \ / + // add add + auto add = std::dynamic_pointer_cast(node); + if (add && num_scaled_down_inputs == 1) { + for (auto& dep: node->inputs()) { + if 
(scaled_down_subgraph.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == scaled_down_subgraph.end()) { + std::shared_ptr inverse_scale_const = (dep.get_element_type() == ov::element::f16) ? + inverse_scale_const_f16 : inverse_scale_const_f32; + auto scale_down = std::make_shared(dep.get_source_output(), + inverse_scale_const->output(0)); + dep.replace_source_output(scale_down->output(0)); + std::cout << "scale_down " << scale_down->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " + << node->get_friendly_name() << std::endl; + } + } + } + + auto sdpa = std::dynamic_pointer_cast(node); + if (sdpa) { + for (size_t i = 0; i < 2; i++) { + if (scaled_down_subgraph.find(node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_subgraph.end()) { + std::shared_ptr scale_const = (node->get_input_element_type(i) == ov::element::f16) ? scale_const_f16 : scale_const_f32; + auto transpose = std::dynamic_pointer_cast(node->input(i).get_source_output().get_node_shared_ptr()); + if (transpose) { + auto scale_up = std::make_shared(transpose->get_input_source_output(0), + scale_const->output(0)); + transpose->input(0).replace_source_output(scale_up->output(0)); + } else { + auto scale_up = std::make_shared(node->get_input_source_output(i), + scale_const->output(0)); + node->input(i).replace_source_output(scale_up->output(0)); + } + } + } + + if (scaled_down_subgraph.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_subgraph.end()) { + scaled_down_subgraph.insert(node->get_friendly_name()); + } + continue; + } + + // input(scaled_down) -- activation + // ==> + // input(scaled_down) -- multiply(scale_up) -- activation -- multiply(scale_down) + auto sin = std::dynamic_pointer_cast(node); + auto cos = std::dynamic_pointer_cast(node); + auto swish = std::dynamic_pointer_cast(node); + if ((sin || cos || swish) && num_scaled_down_inputs == 1) { + std::shared_ptr scale_const = (node->get_input_element_type(0) == ov::element::f16) ? scale_const_f16 : scale_const_f32; + auto scale_up = std::make_shared(node->get_input_source_output(0), + scale_const->output(0)); + node->input(0).replace_source_output(scale_up->output(0)); + + std::shared_ptr inverse_scale_const = (node->get_output_element_type(0) == ov::element::f16) ? + inverse_scale_const_f16 : inverse_scale_const_f32; + auto scale_down = std::make_shared(node->output(0), + inverse_scale_const->output(0)); + ov::replace_node(node, scale_down); + scaled_down_subgraph.insert(scale_down->get_friendly_name()); + std::cout << "scale activation " << node->get_friendly_name() << std::endl; + } + + if (num_scaled_down_inputs > 0) + scaled_down_subgraph.insert(node->get_friendly_name()); + } + + return true; +} + ov::pass::StaticScaling::StaticScaling(float scale_factor) { using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; - const float default_scale_factor = 256.f; - const ov::element::Type infer_prec = ov::element::f32; - const ov::element::Type scaled_prec = ov::element::f16; + const ov::element::Type infer_prec = ov::element::f16; + const ov::element::Type scaled_prec = ov::element::f32; - scale_factor = (scale_factor < 1.f) ? 
default_scale_factor : scale_factor; + scale_factor = get_scale_factor(scale_factor); auto input_m = any_input(); - auto weights_m = wrap_type(type_matches_any({infer_prec})); - auto convolution_m = wrap_type({ input_m, weights_m }); + auto swish_m = wrap_type({ input_m }); ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - OPENVINO_ASSERT(pattern_map.count(convolution_m)); + OPENVINO_ASSERT(pattern_map.count(swish_m)); - auto conv = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); - if (!conv || transformation_callback(conv)) + auto swish = std::dynamic_pointer_cast(pattern_map.at(swish_m).get_node_shared_ptr()); + if (!swish || transformation_callback(swish)) return false; - if (conv->get_input_element_type(0) != infer_prec || conv->get_output_element_type(0) != infer_prec) { + if (swish->get_input_element_type(0) != infer_prec || swish->get_output_element_type(0) != infer_prec) { return false; } - auto conv_weight = std::dynamic_pointer_cast(pattern_map.at(weights_m).get_node_shared_ptr()); - auto conv_weight_convert = std::make_shared(conv_weight, scaled_prec); - ov::replace_node(conv_weight, conv_weight_convert); - auto input = pattern_map.at(input_m); + auto precision_up = std::make_shared(input, scaled_prec); ov::Shape scale_const_shape = {1}; + std::vector scale_value = {scale_factor}; + std::shared_ptr scale_const = std::make_shared(scaled_prec, scale_const_shape, scale_value); + auto scale_up = std::make_shared(precision_up->output(0), + scale_const->output(0)); + swish->input(0).replace_source_output(scale_up->output(0)); + swish->revalidate_and_infer_types(); + std::vector inverse_scale_value = {(1.f / scale_factor)}; - std::shared_ptr inverse_scale_const = std::make_shared(infer_prec, scale_const_shape, inverse_scale_value); - auto scale_down = std::make_shared(input.get_node_shared_ptr()->output(0), + std::shared_ptr inverse_scale_const = std::make_shared(scaled_prec, scale_const_shape, inverse_scale_value); + auto scale_down = std::make_shared(swish->output(0), inverse_scale_const->output(0)); - auto precision_down = std::make_shared(scale_down, scaled_prec); - conv->input(0).replace_source_output(precision_down->output(0)); + ov::replace_node(swish, scale_down); + + auto precision_down = std::make_shared(scale_down->output(0), infer_prec); + ov::replace_node(scale_down, precision_down); +std::cout << "StaticScaling - converted " << scale_factor << std::endl; + return true; + }; + + auto m = std::make_shared(swish_m, "StaticScaling"); + this->register_matcher(m, callback); +} + +ov::pass::StaticScalingInput::StaticScalingInput(float scale_factor) { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + scale_factor = get_scale_factor(scale_factor); + + auto input_m = wrap_type(); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(input_m)); + + auto input = std::dynamic_pointer_cast(pattern_map.at(input_m).get_node_shared_ptr()); + if (!input || transformation_callback(input)) + return false; + + auto input_prec = input->get_output_element_type(0); + + ov::Shape scale_const_shape = {1}; + std::vector inverse_scale_value = {(1.f / scale_factor)}; + std::shared_ptr inverse_scale_const = std::make_shared(input_prec, scale_const_shape, inverse_scale_value); + auto scale_down = 
std::make_shared(input->output(0), + inverse_scale_const->output(0)); + + ov::replace_node(input, scale_down); +std::cout << "StaticScalingInput - converted " << scale_factor << std::endl; + return true; + }; + + auto m = std::make_shared(input_m, "StaticScalingInput"); + this->register_matcher(m, callback); +} + +ov::pass::StaticScalingOutput::StaticScalingOutput(float scale_factor) { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + scale_factor = get_scale_factor(scale_factor); + + auto input_m = any_input(); + auto output_m = wrap_type({input_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(output_m)); + + auto output = std::dynamic_pointer_cast(pattern_map.at(output_m).get_node_shared_ptr()); + if (!output || transformation_callback(output)) + return false; + + auto output_prec = output->get_input_element_type(0); + + ov::Shape scale_const_shape = {1}; std::vector scale_value = {scale_factor}; - std::shared_ptr scale_const = std::make_shared(infer_prec, scale_const_shape, scale_value); - auto scale_up = std::make_shared(conv->output(0), + std::shared_ptr scale_const = std::make_shared(output_prec, scale_const_shape, scale_value); + auto scale_up = std::make_shared(output->get_input_source_output(0), scale_const->output(0)); - ov::replace_node(conv, scale_up); -std::cout << "StaticScaling - converted " << scale_factor << std::endl; + output->input(0).replace_source_output(scale_up->output(0)); +std::cout << "StaticScalingOutput - converted " << scale_factor << std::endl; + return true; + }; + + auto m = std::make_shared(output_m, "StaticScalingOutput"); + this->register_matcher(m, callback); +} + +ov::pass::StaticScalingAdd::StaticScalingAdd(float scale_factor) { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + scale_factor = get_scale_factor(scale_factor); + + auto input_m = any_input(); + auto const_input_m = wrap_type(); + auto add_m = wrap_type({input_m, const_input_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(add_m)); + + auto add = std::dynamic_pointer_cast(pattern_map.at(add_m).get_node_shared_ptr()); + if (!add || transformation_callback(add)) + return false; + + auto const_input = std::dynamic_pointer_cast(pattern_map.at(const_input_m).get_node_shared_ptr()); + if (!const_input) + return false; + + auto runtime_prec = add->get_input_element_type(0); + + ov::Shape scale_const_shape = {1}; + std::vector inverse_scale_value = {(1.f / scale_factor)}; + std::shared_ptr inverse_scale_const = std::make_shared(runtime_prec, scale_const_shape, inverse_scale_value); + auto scale_down = std::make_shared(const_input->output(0), + inverse_scale_const->output(0)); + ov::replace_node(const_input, scale_down); +std::cout << "StaticScalingAdd - converted " << scale_factor << std::endl; return true; }; - auto m = std::make_shared(convolution_m, "StaticScaling"); + auto m = std::make_shared(add_m, "StaticScalingAdd"); this->register_matcher(m, callback); } + +// ov::pass::StaticScaling::StaticScaling(float scale_factor) { +// using namespace ov::pass::pattern; +// using ov::pass::pattern::op::Or; + +// const float default_scale_factor = 256.f; +// const ov::element::Type infer_prec = ov::element::f32; +// const ov::element::Type scaled_prec = 
ov::element::f16; + +// scale_factor = (scale_factor < 1.f) ? default_scale_factor : scale_factor; + +// auto input_m = any_input(); +// auto weights_m = wrap_type(type_matches_any({infer_prec})); +// auto convolution_m = wrap_type({ input_m, weights_m }); + +// ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { +// const auto& pattern_map = m.get_pattern_value_map(); + +// OPENVINO_ASSERT(pattern_map.count(convolution_m)); + +// auto conv = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); +// if (!conv || transformation_callback(conv)) +// return false; + +// if (conv->get_input_element_type(0) != infer_prec || conv->get_output_element_type(0) != infer_prec) { +// return false; +// } + +// auto conv_weight = std::dynamic_pointer_cast(pattern_map.at(weights_m).get_node_shared_ptr()); +// auto conv_weight_convert = std::make_shared(conv_weight, scaled_prec); +// ov::replace_node(conv_weight, conv_weight_convert); + +// auto input = pattern_map.at(input_m); + +// ov::Shape scale_const_shape = {1}; +// std::vector inverse_scale_value = {(1.f / scale_factor)}; +// std::shared_ptr inverse_scale_const = std::make_shared(infer_prec, scale_const_shape, inverse_scale_value); +// auto scale_down = std::make_shared(input.get_node_shared_ptr()->output(0), +// inverse_scale_const->output(0)); +// auto precision_down = std::make_shared(scale_down, scaled_prec); +// conv->input(0).replace_source_output(precision_down->output(0)); + +// std::vector scale_value = {scale_factor}; +// std::shared_ptr scale_const = std::make_shared(infer_prec, scale_const_shape, scale_value); +// auto scale_up = std::make_shared(conv->output(0), +// scale_const->output(0)); +// ov::replace_node(conv, scale_up); +// std::cout << "StaticScaling - converted " << scale_factor << std::endl; +// return true; +// }; + +// auto m = std::make_shared(convolution_m, "StaticScaling"); +// this->register_matcher(m, callback); +// } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 5650f5a66a2ae6..6e7178f8385813 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -126,6 +126,7 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name() + "_" + "transformed_func"; ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); + ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); } return cloned_model; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 75cd7a96fbc1e3..bb3c3fb2b578d3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -914,6 +914,15 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. manager.register_pass(); + if (config.get_property(ov::enable_static_scaling)) { + float scale_factor = func->get_rt_info().count("scale_factor") ? 
func->get_rt_info("scale_factor") : 0.f; + // manager.register_pass(scale_factor); + // manager.register_pass(scale_factor); + // manager.register_pass(scale_factor); + // manager.register_pass(scale_factor); + manager.register_pass(scale_factor); + } + manager.register_pass(); manager.register_pass(); manager.register_pass(); From 6764f23d7c9cefaa29fa9b286748c3c2668f9f9a Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 28 Oct 2024 03:13:02 +0900 Subject: [PATCH 04/64] resolved accuracy issue in transformer of flux.1 --- .../common_optimizations/static_scaling.cpp | 67 +++++++++++++++---- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp index 04426e3c9372c8..92f71a0815ec86 100644 --- a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp @@ -13,8 +13,13 @@ #include "openvino/op/convolution.hpp" #include "openvino/op/multiply.hpp" #include "openvino/op/swish.hpp" +#include "openvino/op/gelu.hpp" #include "openvino/op/sin.hpp" #include "openvino/op/cos.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/sqrt.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/matmul.hpp" #include "openvino/op/parameter.hpp" #include "openvino/op/result.hpp" #include "openvino/op/add.hpp" @@ -27,7 +32,7 @@ #include "transformations/utils/utils.hpp" float get_scale_factor(float scale_factor) { - const float default_scale_factor = 256.f; + const float default_scale_factor = 10.f; // scale_factor = (scale_factor < 1) ? default_scale_factor : scale_factor; @@ -54,9 +59,6 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr std::shared_ptr inverse_scale_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, inverse_scale_value); for (auto& node : f->get_ordered_ops()) { - if (node->get_friendly_name().compare("__module.transformer_blocks.0.norm1_context.linear/ov_ext::linear/MatMul") == 0) - std::cout << "!" << std::endl; - auto parameter_node = std::dynamic_pointer_cast(node); if (parameter_node && (parameter_node->get_element_type() == ov::element::f16 || @@ -119,6 +121,17 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr continue; } + auto result = std::dynamic_pointer_cast(node); + if (result && num_scaled_down_inputs == 1) { + auto dep = node->input(0); + std::shared_ptr scale_const = (dep.get_element_type() == ov::element::f16) ? + scale_const_f16 : scale_const_f32; + auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); + dep.replace_source_output(scale_up->output(0)); + std::cout << "result scale_up " << scale_up->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " + << node->get_friendly_name() << std::endl; + } + // input0 input1 input0 input1 // (scaled_down) (normalized (scaled_down) (normalized // or const) or const) @@ -141,6 +154,18 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr } } + auto multiply = std::dynamic_pointer_cast(node); + auto matmul = std::dynamic_pointer_cast(node); + if ((multiply || matmul) && num_scaled_down_inputs == 2) { + auto dep = node->input(1); + std::shared_ptr scale_const = (dep.get_element_type() == ov::element::f16) ? 
+ scale_const_f16 : scale_const_f32; + auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); + dep.replace_source_output(scale_up->output(0)); + std::cout << "scale_up " << scale_up->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " + << node->get_friendly_name() << std::endl; + } + auto sdpa = std::dynamic_pointer_cast(node); if (sdpa) { for (size_t i = 0; i < 2; i++) { @@ -167,22 +192,40 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr // input(scaled_down) -- activation // ==> - // input(scaled_down) -- multiply(scale_up) -- activation -- multiply(scale_down) + // input(scaled_down) -- convert(precision_up) -- multiply(scale_up) -- activation -- multiply(scale_down) -- convert(precision_down) auto sin = std::dynamic_pointer_cast(node); auto cos = std::dynamic_pointer_cast(node); auto swish = std::dynamic_pointer_cast(node); - if ((sin || cos || swish) && num_scaled_down_inputs == 1) { - std::shared_ptr scale_const = (node->get_input_element_type(0) == ov::element::f16) ? scale_const_f16 : scale_const_f32; - auto scale_up = std::make_shared(node->get_input_source_output(0), - scale_const->output(0)); + auto power = std::dynamic_pointer_cast(node); + auto sqrt = std::dynamic_pointer_cast(node); + auto gelu = std::dynamic_pointer_cast(node); + auto softmax = std::dynamic_pointer_cast(node); + if ((sin || cos || swish || power || sqrt || gelu || softmax) && num_scaled_down_inputs == 1) { + auto input_prec = node->get_input_element_type(0); + auto output_prec = node->get_output_element_type(0); + + ov::Output input_src; + if (input_prec == ov::element::f16) { + auto precision_up = std::make_shared(node->get_input_source_output(0), ov::element::f32); + input_src = precision_up->output(0); + } else { + input_src = node->get_input_source_output(0); + } + auto scale_up = std::make_shared(input_src, + scale_const_f32->output(0)); node->input(0).replace_source_output(scale_up->output(0)); + node->revalidate_and_infer_types(); - std::shared_ptr inverse_scale_const = (node->get_output_element_type(0) == ov::element::f16) ? 
- inverse_scale_const_f16 : inverse_scale_const_f32; auto scale_down = std::make_shared(node->output(0), - inverse_scale_const->output(0)); + inverse_scale_const_f32->output(0)); ov::replace_node(node, scale_down); scaled_down_subgraph.insert(scale_down->get_friendly_name()); + + if (output_prec == ov::element::f16) { + auto precision_down = std::make_shared(scale_down->output(0), ov::element::f16); + ov::replace_node(scale_down, precision_down); + scaled_down_subgraph.insert(precision_down->get_friendly_name()); + } std::cout << "scale activation " << node->get_friendly_name() << std::endl; } From cc3d8f45e69fc75de3b3b775f3c0c8bf8e8948e1 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 28 Oct 2024 04:52:46 +0900 Subject: [PATCH 05/64] removed unnecessary codes --- src/bindings/c/src/ov_property.cpp | 1 - .../tests/test_runtime/test_properties.py | 10 +- .../common_optimizations/static_scaling.hpp | 26 +- .../common_optimizations/static_scaling.cpp | 284 ++---------------- .../src/plugin/transformations_pipeline.cpp | 9 +- 5 files changed, 34 insertions(+), 296 deletions(-) diff --git a/src/bindings/c/src/ov_property.cpp b/src/bindings/c/src/ov_property.cpp index 5292cf3c609b4f..61be74ee265599 100644 --- a/src/bindings/c/src/ov_property.cpp +++ b/src/bindings/c/src/ov_property.cpp @@ -36,7 +36,6 @@ const char* ov_property_key_hint_execution_mode = "EXECUTION_MODE_HINT"; const char* ov_property_key_force_tbb_terminate = "FORCE_TBB_TERMINATE"; const char* ov_property_key_enable_mmap = "ENABLE_MMAP"; const char* ov_property_key_auto_batch_timeout = "AUTO_BATCH_TIMEOUT"; -const char* ov_property_key_static_scaling = "ENABLE_STATIC_SCALING"; // Write-only property key const char* ov_property_key_cache_encryption_callbacks = "CACHE_ENCRYPTION_CALLBACKS"; diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index d3bf876dc9b1a8..47809dd63f304b 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -418,13 +418,11 @@ def test_properties_ro(ov_property_ro, expected_value): ((128, 128),), ), ( - hints.enable_static_scaling, - "ENABLE_STATIC_SCALING", + hints.activations_scale_factor, + "ACTIVATIONS_SCALE_FACTOR", ( - (True, True), - (False, False), - (1, True), - (0, False), + (16.0, np.float32(16.0)), + (256.0, 256.0), ), ), ], diff --git a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp index 3fbdd538d05c54..fe2d514454a34c 100644 --- a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp @@ -21,34 +21,10 @@ class TRANSFORMATIONS_API StaticScalingModel; } // namespace pass } // namespace ov -class ov::pass::StaticScaling : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("StaticScaling", "0"); - StaticScaling(float scale_factor = 0.f); -}; - -class ov::pass::StaticScalingInput : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("StaticScalingInput", "0"); - StaticScalingInput(float scale_factor = 0.f); -}; - -class ov::pass::StaticScalingOutput : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("StaticScalingOutput", "0"); - StaticScalingOutput(float scale_factor = 0.f); -}; - -class ov::pass::StaticScalingAdd : public ov::pass::MatcherPass { 
-public: - OPENVINO_RTTI("StaticScalingAdd", "0"); - StaticScalingAdd(float scale_factor = 0.f); -}; - class ov::pass::StaticScalingModel : public ov::pass::ModelPass { public: OPENVINO_RTTI("StaticScalingModel", "0"); - explicit StaticScalingModel(float scale_factor = 0.f); + explicit StaticScalingModel(float scale_factor): m_scale_factor(scale_factor) {} bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp index 92f71a0815ec86..0be31155c3a339 100644 --- a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp @@ -31,24 +31,17 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" -float get_scale_factor(float scale_factor) { - const float default_scale_factor = 10.f; - - // scale_factor = (scale_factor < 1) ? default_scale_factor : scale_factor; - - return default_scale_factor; -} - -ov::pass::StaticScalingModel::StaticScalingModel(float scale_factor) { - m_scale_factor = get_scale_factor(scale_factor); -} - bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(StaticScalingModel); - std::unordered_set scaled_down_subgraph; - std::unordered_set normalized_subgraph; - std::unordered_set constant_subgraph; + if (m_scale_factor < 1.f) + return false; + + std::cout << "scale_factor: " << m_scale_factor << std::endl; + + std::unordered_set scaled_down_nodes; + std::unordered_set normal_nodes; + std::unordered_set constant_nodes; ov::Shape scale_const_shape = {1}; std::vector scale_value = {m_scale_factor}; @@ -68,51 +61,51 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr auto scale_down = std::make_shared(parameter_node->output(0), inverse_scale_const->output(0)); ov::replace_node(parameter_node, scale_down); - scaled_down_subgraph.insert(node->get_friendly_name()); - scaled_down_subgraph.insert(scale_down->get_friendly_name()); + scaled_down_nodes.insert(node->get_friendly_name()); + scaled_down_nodes.insert(scale_down->get_friendly_name()); continue; } auto const_node = std::dynamic_pointer_cast(node); if (const_node) { - constant_subgraph.insert(node->get_friendly_name()); + constant_nodes.insert(node->get_friendly_name()); continue; } auto group_norm_node = std::dynamic_pointer_cast(node); auto mvn_node = std::dynamic_pointer_cast(node); if (group_norm_node || mvn_node) { - normalized_subgraph.insert(node->get_friendly_name()); + normal_nodes.insert(node->get_friendly_name()); continue; } size_t num_scaled_down_inputs = 0; size_t num_const_inputs = 0; - size_t num_normalized_inputs = 0; + size_t num_normal_inputs = 0; for (auto& dep: node->inputs()) { auto dep_name = dep.get_source_output().get_node_shared_ptr()->get_friendly_name(); - if (scaled_down_subgraph.find(dep_name) != scaled_down_subgraph.end()) { + if (scaled_down_nodes.find(dep_name) != scaled_down_nodes.end()) { num_scaled_down_inputs += 1; continue; } - if (constant_subgraph.find(dep_name) != constant_subgraph.end()) { + if (constant_nodes.find(dep_name) != constant_nodes.end()) { num_const_inputs += 1; continue; } - if (normalized_subgraph.find(dep_name) != normalized_subgraph.end()) { - num_normalized_inputs += 1; + if (normal_nodes.find(dep_name) != normal_nodes.end()) { + num_normal_inputs += 1; 
continue; } } if (node->get_input_size() > 0) { if (num_const_inputs == node->get_input_size()) { - constant_subgraph.insert(node->get_friendly_name()); + constant_nodes.insert(node->get_friendly_name()); continue; } - if ((num_const_inputs + num_normalized_inputs) == node->get_input_size()) { - normalized_subgraph.insert(node->get_friendly_name()); + if ((num_const_inputs + num_normal_inputs) == node->get_input_size()) { + normal_nodes.insert(node->get_friendly_name()); continue; } } @@ -128,13 +121,10 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr scale_const_f16 : scale_const_f32; auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); dep.replace_source_output(scale_up->output(0)); - std::cout << "result scale_up " << scale_up->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " - << node->get_friendly_name() << std::endl; } // input0 input1 input0 input1 - // (scaled_down) (normalized (scaled_down) (normalized - // or const) or const) + // (scaled_down) (non-scaled) (scaled_down) (non-scaled) // \ / \ / // \ / ==> \ scale_down // \ / \ / @@ -142,14 +132,12 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr auto add = std::dynamic_pointer_cast(node); if (add && num_scaled_down_inputs == 1) { for (auto& dep: node->inputs()) { - if (scaled_down_subgraph.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == scaled_down_subgraph.end()) { + if (scaled_down_nodes.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == scaled_down_nodes.end()) { std::shared_ptr inverse_scale_const = (dep.get_element_type() == ov::element::f16) ? inverse_scale_const_f16 : inverse_scale_const_f32; auto scale_down = std::make_shared(dep.get_source_output(), inverse_scale_const->output(0)); dep.replace_source_output(scale_down->output(0)); - std::cout << "scale_down " << scale_down->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " - << node->get_friendly_name() << std::endl; } } } @@ -162,14 +150,12 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr scale_const_f16 : scale_const_f32; auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); dep.replace_source_output(scale_up->output(0)); - std::cout << "scale_up " << scale_up->input(0).get_source_output().get_node_shared_ptr()->get_friendly_name() << " --> " - << node->get_friendly_name() << std::endl; } auto sdpa = std::dynamic_pointer_cast(node); if (sdpa) { for (size_t i = 0; i < 2; i++) { - if (scaled_down_subgraph.find(node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_subgraph.end()) { + if (scaled_down_nodes.find(node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_nodes.end()) { std::shared_ptr scale_const = (node->get_input_element_type(i) == ov::element::f16) ? 
scale_const_f16 : scale_const_f32; auto transpose = std::dynamic_pointer_cast(node->input(i).get_source_output().get_node_shared_ptr()); if (transpose) { @@ -184,8 +170,8 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr } } - if (scaled_down_subgraph.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_subgraph.end()) { - scaled_down_subgraph.insert(node->get_friendly_name()); + if (scaled_down_nodes.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_nodes.end()) { + scaled_down_nodes.insert(node->get_friendly_name()); } continue; } @@ -219,232 +205,18 @@ bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr auto scale_down = std::make_shared(node->output(0), inverse_scale_const_f32->output(0)); ov::replace_node(node, scale_down); - scaled_down_subgraph.insert(scale_down->get_friendly_name()); + scaled_down_nodes.insert(scale_down->get_friendly_name()); if (output_prec == ov::element::f16) { auto precision_down = std::make_shared(scale_down->output(0), ov::element::f16); ov::replace_node(scale_down, precision_down); - scaled_down_subgraph.insert(precision_down->get_friendly_name()); + scaled_down_nodes.insert(precision_down->get_friendly_name()); } - std::cout << "scale activation " << node->get_friendly_name() << std::endl; } if (num_scaled_down_inputs > 0) - scaled_down_subgraph.insert(node->get_friendly_name()); + scaled_down_nodes.insert(node->get_friendly_name()); } return true; } - -ov::pass::StaticScaling::StaticScaling(float scale_factor) { - using namespace ov::pass::pattern; - using ov::pass::pattern::op::Or; - - const ov::element::Type infer_prec = ov::element::f16; - const ov::element::Type scaled_prec = ov::element::f32; - - scale_factor = get_scale_factor(scale_factor); - - auto input_m = any_input(); - auto swish_m = wrap_type({ input_m }); - - ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(swish_m)); - - auto swish = std::dynamic_pointer_cast(pattern_map.at(swish_m).get_node_shared_ptr()); - if (!swish || transformation_callback(swish)) - return false; - - if (swish->get_input_element_type(0) != infer_prec || swish->get_output_element_type(0) != infer_prec) { - return false; - } - - auto input = pattern_map.at(input_m); - auto precision_up = std::make_shared(input, scaled_prec); - - ov::Shape scale_const_shape = {1}; - std::vector scale_value = {scale_factor}; - std::shared_ptr scale_const = std::make_shared(scaled_prec, scale_const_shape, scale_value); - auto scale_up = std::make_shared(precision_up->output(0), - scale_const->output(0)); - swish->input(0).replace_source_output(scale_up->output(0)); - swish->revalidate_and_infer_types(); - - std::vector inverse_scale_value = {(1.f / scale_factor)}; - std::shared_ptr inverse_scale_const = std::make_shared(scaled_prec, scale_const_shape, inverse_scale_value); - auto scale_down = std::make_shared(swish->output(0), - inverse_scale_const->output(0)); - ov::replace_node(swish, scale_down); - - auto precision_down = std::make_shared(scale_down->output(0), infer_prec); - ov::replace_node(scale_down, precision_down); -std::cout << "StaticScaling - converted " << scale_factor << std::endl; - return true; - }; - - auto m = std::make_shared(swish_m, "StaticScaling"); - this->register_matcher(m, callback); -} - -ov::pass::StaticScalingInput::StaticScalingInput(float 
scale_factor) { - using namespace ov::pass::pattern; - using ov::pass::pattern::op::Or; - - scale_factor = get_scale_factor(scale_factor); - - auto input_m = wrap_type(); - - ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(input_m)); - - auto input = std::dynamic_pointer_cast(pattern_map.at(input_m).get_node_shared_ptr()); - if (!input || transformation_callback(input)) - return false; - - auto input_prec = input->get_output_element_type(0); - - ov::Shape scale_const_shape = {1}; - std::vector inverse_scale_value = {(1.f / scale_factor)}; - std::shared_ptr inverse_scale_const = std::make_shared(input_prec, scale_const_shape, inverse_scale_value); - auto scale_down = std::make_shared(input->output(0), - inverse_scale_const->output(0)); - - ov::replace_node(input, scale_down); -std::cout << "StaticScalingInput - converted " << scale_factor << std::endl; - return true; - }; - - auto m = std::make_shared(input_m, "StaticScalingInput"); - this->register_matcher(m, callback); -} - -ov::pass::StaticScalingOutput::StaticScalingOutput(float scale_factor) { - using namespace ov::pass::pattern; - using ov::pass::pattern::op::Or; - - scale_factor = get_scale_factor(scale_factor); - - auto input_m = any_input(); - auto output_m = wrap_type({input_m}); - - ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(output_m)); - - auto output = std::dynamic_pointer_cast(pattern_map.at(output_m).get_node_shared_ptr()); - if (!output || transformation_callback(output)) - return false; - - auto output_prec = output->get_input_element_type(0); - - ov::Shape scale_const_shape = {1}; - std::vector scale_value = {scale_factor}; - std::shared_ptr scale_const = std::make_shared(output_prec, scale_const_shape, scale_value); - auto scale_up = std::make_shared(output->get_input_source_output(0), - scale_const->output(0)); - output->input(0).replace_source_output(scale_up->output(0)); -std::cout << "StaticScalingOutput - converted " << scale_factor << std::endl; - return true; - }; - - auto m = std::make_shared(output_m, "StaticScalingOutput"); - this->register_matcher(m, callback); -} - -ov::pass::StaticScalingAdd::StaticScalingAdd(float scale_factor) { - using namespace ov::pass::pattern; - using ov::pass::pattern::op::Or; - - scale_factor = get_scale_factor(scale_factor); - - auto input_m = any_input(); - auto const_input_m = wrap_type(); - auto add_m = wrap_type({input_m, const_input_m}); - - ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(add_m)); - - auto add = std::dynamic_pointer_cast(pattern_map.at(add_m).get_node_shared_ptr()); - if (!add || transformation_callback(add)) - return false; - - auto const_input = std::dynamic_pointer_cast(pattern_map.at(const_input_m).get_node_shared_ptr()); - if (!const_input) - return false; - - auto runtime_prec = add->get_input_element_type(0); - - ov::Shape scale_const_shape = {1}; - std::vector inverse_scale_value = {(1.f / scale_factor)}; - std::shared_ptr inverse_scale_const = std::make_shared(runtime_prec, scale_const_shape, inverse_scale_value); - auto scale_down = std::make_shared(const_input->output(0), - inverse_scale_const->output(0)); - 
ov::replace_node(const_input, scale_down); -std::cout << "StaticScalingAdd - converted " << scale_factor << std::endl; - return true; - }; - - auto m = std::make_shared(add_m, "StaticScalingAdd"); - this->register_matcher(m, callback); -} - -// ov::pass::StaticScaling::StaticScaling(float scale_factor) { -// using namespace ov::pass::pattern; -// using ov::pass::pattern::op::Or; - -// const float default_scale_factor = 256.f; -// const ov::element::Type infer_prec = ov::element::f32; -// const ov::element::Type scaled_prec = ov::element::f16; - -// scale_factor = (scale_factor < 1.f) ? default_scale_factor : scale_factor; - -// auto input_m = any_input(); -// auto weights_m = wrap_type(type_matches_any({infer_prec})); -// auto convolution_m = wrap_type({ input_m, weights_m }); - -// ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { -// const auto& pattern_map = m.get_pattern_value_map(); - -// OPENVINO_ASSERT(pattern_map.count(convolution_m)); - -// auto conv = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); -// if (!conv || transformation_callback(conv)) -// return false; - -// if (conv->get_input_element_type(0) != infer_prec || conv->get_output_element_type(0) != infer_prec) { -// return false; -// } - -// auto conv_weight = std::dynamic_pointer_cast(pattern_map.at(weights_m).get_node_shared_ptr()); -// auto conv_weight_convert = std::make_shared(conv_weight, scaled_prec); -// ov::replace_node(conv_weight, conv_weight_convert); - -// auto input = pattern_map.at(input_m); - -// ov::Shape scale_const_shape = {1}; -// std::vector inverse_scale_value = {(1.f / scale_factor)}; -// std::shared_ptr inverse_scale_const = std::make_shared(infer_prec, scale_const_shape, inverse_scale_value); -// auto scale_down = std::make_shared(input.get_node_shared_ptr()->output(0), -// inverse_scale_const->output(0)); -// auto precision_down = std::make_shared(scale_down, scaled_prec); -// conv->input(0).replace_source_output(precision_down->output(0)); - -// std::vector scale_value = {scale_factor}; -// std::shared_ptr scale_const = std::make_shared(infer_prec, scale_const_shape, scale_value); -// auto scale_up = std::make_shared(conv->output(0), -// scale_const->output(0)); -// ov::replace_node(conv, scale_up); -// std::cout << "StaticScaling - converted " << scale_factor << std::endl; -// return true; -// }; - -// auto m = std::make_shared(convolution_m, "StaticScaling"); -// this->register_matcher(m, callback); -// } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index bb3c3fb2b578d3..981cb89224e138 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -914,14 +914,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. manager.register_pass(); - if (config.get_property(ov::enable_static_scaling)) { - float scale_factor = func->get_rt_info().count("scale_factor") ? 
func->get_rt_info("scale_factor") : 0.f; - // manager.register_pass(scale_factor); - // manager.register_pass(scale_factor); - // manager.register_pass(scale_factor); - // manager.register_pass(scale_factor); - manager.register_pass(scale_factor); - } + manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); manager.register_pass(); manager.register_pass(); From a025716c3146061fa86cb514e093876ec2da1c4f Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 28 Oct 2024 05:00:34 +0900 Subject: [PATCH 06/64] removed unnecessary codes --- src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 6e7178f8385813..5650f5a66a2ae6 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -126,7 +126,6 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name() + "_" + "transformed_func"; ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); - ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); } return cloned_model; } From ea0829ddc1b58e4ef4a478904ed12aa11d9295f8 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 28 Oct 2024 11:11:19 +0900 Subject: [PATCH 07/64] renamed to ActivationsScaling --- .../python/tests/test_runtime/test_properties.py | 8 -------- .../{static_scaling.hpp => activations_scaling.hpp} | 12 ++++-------- .../{static_scaling.cpp => activations_scaling.cpp} | 6 +++--- .../src/plugin/transformations_pipeline.cpp | 4 ++-- 4 files changed, 9 insertions(+), 21 deletions(-) rename src/common/transformations/include/transformations/common_optimizations/{static_scaling.hpp => activations_scaling.hpp} (51%) rename src/common/transformations/src/transformations/common_optimizations/{static_scaling.cpp => activations_scaling.cpp} (98%) diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 47809dd63f304b..15e2d86ead4653 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -417,14 +417,6 @@ def test_properties_ro(ov_property_ro, expected_value): "AVAILABLE_DEVICE_MEM_SIZE", ((128, 128),), ), - ( - hints.activations_scale_factor, - "ACTIVATIONS_SCALE_FACTOR", - ( - (16.0, np.float32(16.0)), - (256.0, 256.0), - ), - ), ], ) def test_properties_rw(ov_property_rw, expected_value, test_values): diff --git a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp similarity index 51% rename from src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp rename to src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index fe2d514454a34c..24f94fec5bc2f4 100644 --- a/src/common/transformations/include/transformations/common_optimizations/static_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -12,19 +12,15 @@ namespace ov { namespace pass { -class TRANSFORMATIONS_API StaticScaling; -class TRANSFORMATIONS_API StaticScalingInput; -class TRANSFORMATIONS_API 
StaticScalingOutput; -class TRANSFORMATIONS_API StaticScalingAdd; -class TRANSFORMATIONS_API StaticScalingModel; +class TRANSFORMATIONS_API ActivationsScaling; } // namespace pass } // namespace ov -class ov::pass::StaticScalingModel : public ov::pass::ModelPass { +class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: - OPENVINO_RTTI("StaticScalingModel", "0"); - explicit StaticScalingModel(float scale_factor): m_scale_factor(scale_factor) {} + OPENVINO_RTTI("ActivationsScaling", "0"); + explicit ActivationsScaling(float scale_factor): m_scale_factor(scale_factor) {} bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp similarity index 98% rename from src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp rename to src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 0be31155c3a339..4f02974b2e7cba 100644 --- a/src/common/transformations/src/transformations/common_optimizations/static_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/common_optimizations/static_scaling.hpp" +#include "transformations/common_optimizations/activations_scaling.hpp" #include @@ -31,8 +31,8 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" -bool ov::pass::StaticScalingModel::run_on_model(const std::shared_ptr& f) { - RUN_ON_MODEL_SCOPE(StaticScalingModel); +bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr& f) { + RUN_ON_MODEL_SCOPE(ActivationsScaling); if (m_scale_factor < 1.f) return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 981cb89224e138..5122df3ad46534 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -93,7 +93,7 @@ #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mvn_fusion.hpp" #include "transformations/common_optimizations/sdpa_scale_fusion.hpp" -#include "transformations/common_optimizations/static_scaling.hpp" +#include "transformations/common_optimizations/activations_scaling.hpp" #include "transformations/common_optimizations/softmax_fusion.hpp" #include "transformations/common_optimizations/glu_fusion.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" @@ -914,7 +914,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. 
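The hunk below swaps the renamed pass into the GPU pipeline. The same pass can also be exercised on its own; a minimal sketch, assuming a loaded model and using the names introduced by this patch (the helper apply_activations_scaling and its 256.f default are hypothetical):

    #include "openvino/core/model.hpp"
    #include "openvino/pass/manager.hpp"
    #include "transformations/common_optimizations/activations_scaling.hpp"

    // Run ActivationsScaling on a model with an explicit scale factor.
    void apply_activations_scaling(const std::shared_ptr<ov::Model>& model, float scale_factor = 256.f) {
        ov::pass::Manager manager;
        manager.register_pass<ov::pass::ActivationsScaling>(scale_factor);
        // run_on_model itself bails out on out-of-range factors, so no extra guard is needed here.
        manager.run_passes(model);
    }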
manager.register_pass(); - manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); + manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); manager.register_pass(); manager.register_pass(); From 7a50c1b4ad47897b4d22497c5369fc111052ea2f Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 28 Oct 2024 11:52:56 +0900 Subject: [PATCH 08/64] updated code style --- .../activations_scaling.hpp | 2 +- .../activations_scaling.cpp | 99 +++++++++++-------- 2 files changed, 57 insertions(+), 44 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 24f94fec5bc2f4..ad8474e8567c75 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -20,7 +20,7 @@ class TRANSFORMATIONS_API ActivationsScaling; class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: OPENVINO_RTTI("ActivationsScaling", "0"); - explicit ActivationsScaling(float scale_factor): m_scale_factor(scale_factor) {} + explicit ActivationsScaling(float scale_factor) : m_scale_factor(scale_factor) {} bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 4f02974b2e7cba..8e54e2c85bcf78 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -8,24 +8,24 @@ #include "itt.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/op/add.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/convolution.hpp" -#include "openvino/op/multiply.hpp" -#include "openvino/op/swish.hpp" -#include "openvino/op/gelu.hpp" -#include "openvino/op/sin.hpp" #include "openvino/op/cos.hpp" -#include "openvino/op/power.hpp" -#include "openvino/op/sqrt.hpp" -#include "openvino/op/softmax.hpp" +#include "openvino/op/gelu.hpp" +#include "openvino/op/group_normalization.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/mvn.hpp" #include "openvino/op/parameter.hpp" +#include "openvino/op/power.hpp" #include "openvino/op/result.hpp" -#include "openvino/op/add.hpp" -#include "openvino/op/mvn.hpp" -#include "openvino/op/group_normalization.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/op/sin.hpp" +#include "openvino/op/softmax.hpp" +#include "openvino/op/sqrt.hpp" +#include "openvino/op/swish.hpp" #include "openvino/op/transpose.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -46,20 +46,24 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr ov::Shape scale_const_shape = {1}; std::vector scale_value = {m_scale_factor}; std::vector inverse_scale_value = {(1.f / m_scale_factor)}; - std::shared_ptr scale_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_value); - std::shared_ptr scale_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_value); - std::shared_ptr inverse_scale_const_f16 = 
std::make_shared(ov::element::f16, scale_const_shape, inverse_scale_value); - std::shared_ptr inverse_scale_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, inverse_scale_value); + std::shared_ptr scale_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, scale_value); + std::shared_ptr scale_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, scale_value); + std::shared_ptr inverse_scale_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, inverse_scale_value); + std::shared_ptr inverse_scale_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, inverse_scale_value); for (auto& node : f->get_ordered_ops()) { auto parameter_node = std::dynamic_pointer_cast(node); - if (parameter_node && - (parameter_node->get_element_type() == ov::element::f16 || - parameter_node->get_element_type() == ov::element::f32)) { - std::shared_ptr inverse_scale_const = (parameter_node->get_element_type() == ov::element::f16) ? - inverse_scale_const_f16 : inverse_scale_const_f32; - auto scale_down = std::make_shared(parameter_node->output(0), - inverse_scale_const->output(0)); + if (parameter_node && (parameter_node->get_element_type() == ov::element::f16 || + parameter_node->get_element_type() == ov::element::f32)) { + std::shared_ptr inverse_scale_const = (parameter_node->get_element_type() == ov::element::f16) + ? inverse_scale_const_f16 + : inverse_scale_const_f32; + auto scale_down = + std::make_shared(parameter_node->output(0), inverse_scale_const->output(0)); ov::replace_node(parameter_node, scale_down); scaled_down_nodes.insert(node->get_friendly_name()); scaled_down_nodes.insert(scale_down->get_friendly_name()); @@ -82,7 +86,7 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr size_t num_scaled_down_inputs = 0; size_t num_const_inputs = 0; size_t num_normal_inputs = 0; - for (auto& dep: node->inputs()) { + for (auto& dep : node->inputs()) { auto dep_name = dep.get_source_output().get_node_shared_ptr()->get_friendly_name(); if (scaled_down_nodes.find(dep_name) != scaled_down_nodes.end()) { @@ -117,8 +121,8 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr auto result = std::dynamic_pointer_cast(node); if (result && num_scaled_down_inputs == 1) { auto dep = node->input(0); - std::shared_ptr scale_const = (dep.get_element_type() == ov::element::f16) ? - scale_const_f16 : scale_const_f32; + std::shared_ptr scale_const = + (dep.get_element_type() == ov::element::f16) ? scale_const_f16 : scale_const_f32; auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); dep.replace_source_output(scale_up->output(0)); } @@ -131,12 +135,14 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr // add add auto add = std::dynamic_pointer_cast(node); if (add && num_scaled_down_inputs == 1) { - for (auto& dep: node->inputs()) { - if (scaled_down_nodes.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == scaled_down_nodes.end()) { - std::shared_ptr inverse_scale_const = (dep.get_element_type() == ov::element::f16) ? - inverse_scale_const_f16 : inverse_scale_const_f32; - auto scale_down = std::make_shared(dep.get_source_output(), - inverse_scale_const->output(0)); + for (auto& dep : node->inputs()) { + if (scaled_down_nodes.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == + scaled_down_nodes.end()) { + std::shared_ptr inverse_scale_const = (dep.get_element_type() == ov::element::f16) + ? 
inverse_scale_const_f16 + : inverse_scale_const_f32; + auto scale_down = + std::make_shared(dep.get_source_output(), inverse_scale_const->output(0)); dep.replace_source_output(scale_down->output(0)); } } @@ -146,8 +152,8 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr auto matmul = std::dynamic_pointer_cast(node); if ((multiply || matmul) && num_scaled_down_inputs == 2) { auto dep = node->input(1); - std::shared_ptr scale_const = (dep.get_element_type() == ov::element::f16) ? - scale_const_f16 : scale_const_f32; + std::shared_ptr scale_const = + (dep.get_element_type() == ov::element::f16) ? scale_const_f16 : scale_const_f32; auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); dep.replace_source_output(scale_up->output(0)); } @@ -155,9 +161,13 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr auto sdpa = std::dynamic_pointer_cast(node); if (sdpa) { for (size_t i = 0; i < 2; i++) { - if (scaled_down_nodes.find(node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_nodes.end()) { - std::shared_ptr scale_const = (node->get_input_element_type(i) == ov::element::f16) ? scale_const_f16 : scale_const_f32; - auto transpose = std::dynamic_pointer_cast(node->input(i).get_source_output().get_node_shared_ptr()); + if (scaled_down_nodes.find( + node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != + scaled_down_nodes.end()) { + std::shared_ptr scale_const = + (node->get_input_element_type(i) == ov::element::f16) ? scale_const_f16 : scale_const_f32; + auto transpose = std::dynamic_pointer_cast( + node->input(i).get_source_output().get_node_shared_ptr()); if (transpose) { auto scale_up = std::make_shared(transpose->get_input_source_output(0), scale_const->output(0)); @@ -170,15 +180,18 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr } } - if (scaled_down_nodes.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != scaled_down_nodes.end()) { + if (scaled_down_nodes.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != + scaled_down_nodes.end()) { scaled_down_nodes.insert(node->get_friendly_name()); } continue; } - // input(scaled_down) -- activation + // input(scaled_down) -> (non-linear layers) // ==> - // input(scaled_down) -- convert(precision_up) -- multiply(scale_up) -- activation -- multiply(scale_down) -- convert(precision_down) + // input(scaled_down) -> convert(precision_up) -> multiply(scale_up) + // -> (non-linear layers) -> + // multiply(scale_down) -> convert(precision_down) auto sin = std::dynamic_pointer_cast(node); auto cos = std::dynamic_pointer_cast(node); auto swish = std::dynamic_pointer_cast(node); @@ -192,18 +205,18 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr ov::Output input_src; if (input_prec == ov::element::f16) { - auto precision_up = std::make_shared(node->get_input_source_output(0), ov::element::f32); + auto precision_up = + std::make_shared(node->get_input_source_output(0), ov::element::f32); input_src = precision_up->output(0); } else { input_src = node->get_input_source_output(0); } - auto scale_up = std::make_shared(input_src, - scale_const_f32->output(0)); + auto scale_up = std::make_shared(input_src, scale_const_f32->output(0)); node->input(0).replace_source_output(scale_up->output(0)); node->revalidate_and_infer_types(); - auto scale_down = std::make_shared(node->output(0), - 
inverse_scale_const_f32->output(0)); + auto scale_down = + std::make_shared(node->output(0), inverse_scale_const_f32->output(0)); ov::replace_node(node, scale_down); scaled_down_nodes.insert(scale_down->get_friendly_name()); From dcad25a95979c23fa198bdff3ce0ee5dc7cf8943 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 29 Oct 2024 22:16:46 +0900 Subject: [PATCH 09/64] updated to use multiple MatcherPass --- .../activations_scaling.hpp | 21 + .../activations_scaling.cpp | 369 +++++++++--------- .../src/plugin/transformations_pipeline.cpp | 4 +- 3 files changed, 200 insertions(+), 194 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index ad8474e8567c75..ed119d0aacefa7 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -13,6 +13,9 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API ActivationsScaling; +class TRANSFORMATIONS_API ScaleDownSingleLayer; +class TRANSFORMATIONS_API MulGroupNormFusion; +class TRANSFORMATIONS_API MulMulAddFusion; } // namespace pass } // namespace ov @@ -26,3 +29,21 @@ class ov::pass::ActivationsScaling : public ov::pass::ModelPass { private: float m_scale_factor = 0.f; }; + +class ov::pass::ScaleDownSingleLayer : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ScaleDownSingleLayer", "0"); + ScaleDownSingleLayer(float scale_factor); +}; + +class ov::pass::MulGroupNormFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulGroupNormFusion", "0"); + MulGroupNormFusion(); +}; + +class ov::pass::MulMulAddFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulMulAddFusion", "0"); + MulMulAddFusion(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 8e54e2c85bcf78..b0c56f7bd55dbe 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -3,6 +3,7 @@ // #include "transformations/common_optimizations/activations_scaling.hpp" +#include "transformations/common_optimizations/lin_op_sequence_fusion.hpp" #include @@ -10,226 +11,210 @@ #include "openvino/core/rt_info.hpp" #include "openvino/op/add.hpp" #include "openvino/op/constant.hpp" -#include "openvino/op/convert.hpp" #include "openvino/op/convolution.hpp" -#include "openvino/op/cos.hpp" -#include "openvino/op/gelu.hpp" +#include "openvino/op/divide.hpp" #include "openvino/op/group_normalization.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/multiply.hpp" -#include "openvino/op/mvn.hpp" -#include "openvino/op/parameter.hpp" -#include "openvino/op/power.hpp" -#include "openvino/op/result.hpp" -#include "openvino/op/scaled_dot_product_attention.hpp" -#include "openvino/op/sin.hpp" -#include "openvino/op/softmax.hpp" -#include "openvino/op/sqrt.hpp" -#include "openvino/op/swish.hpp" -#include "openvino/op/transpose.hpp" +#include "openvino/pass/manager.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" -bool 
ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr& f) { - RUN_ON_MODEL_SCOPE(ActivationsScaling); - - if (m_scale_factor < 1.f) +namespace { +const auto is_scalar_node = [](const ov::Output& output) -> bool { + const auto shape = output.get_partial_shape(); + if (shape.is_dynamic() || shape.rank().is_dynamic()) return false; + if (std::all_of(shape.begin(), shape.end(), [](const ov::Dimension& dimension) { return dimension == 1ul; })) + return true; + return false; +}; +} - std::cout << "scale_factor: " << m_scale_factor << std::endl; +using namespace ov::pass::pattern; +using ov::pass::pattern::op::Or; - std::unordered_set scaled_down_nodes; - std::unordered_set normal_nodes; - std::unordered_set constant_nodes; +// Add scale_down and scale_up layers around Convolution and MatMul nodes +// Conv/MatMul ==> Multiply(scale_down) --> Conv/MatMul --> Multiply(scale_up) +ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { + MATCHER_SCOPE(ScaleDownSingleLayer); + + auto activation_m = any_input(); + auto weights_m = any_input(); + auto convolution_m = wrap_type({ activation_m, weights_m }); + auto matmul_m = wrap_type({ activation_m, weights_m }); + auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; - std::vector scale_value = {m_scale_factor}; - std::vector inverse_scale_value = {(1.f / m_scale_factor)}; - std::shared_ptr scale_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, scale_value); - std::shared_ptr scale_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, scale_value); - std::shared_ptr inverse_scale_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, inverse_scale_value); - std::shared_ptr inverse_scale_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, inverse_scale_value); - - for (auto& node : f->get_ordered_ops()) { - auto parameter_node = std::dynamic_pointer_cast(node); - if (parameter_node && (parameter_node->get_element_type() == ov::element::f16 || - parameter_node->get_element_type() == ov::element::f32)) { - std::shared_ptr inverse_scale_const = (parameter_node->get_element_type() == ov::element::f16) - ? 
inverse_scale_const_f16 - : inverse_scale_const_f32; - auto scale_down = - std::make_shared(parameter_node->output(0), inverse_scale_const->output(0)); - ov::replace_node(parameter_node, scale_down); - scaled_down_nodes.insert(node->get_friendly_name()); - scaled_down_nodes.insert(scale_down->get_friendly_name()); - continue; + std::vector scale_down_value = {1.f/scale_factor}; + std::shared_ptr scale_down_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + std::shared_ptr scale_down_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); + std::vector scale_up_value = {scale_factor}; + std::shared_ptr scale_up_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m)); + + std::shared_ptr scaled_op = nullptr; + + if (pattern_map.count(convolution_m)) + scaled_op = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); + + if (pattern_map.count(matmul_m)) + scaled_op = std::dynamic_pointer_cast(pattern_map.at(matmul_m).get_node_shared_ptr()); + + if (transformation_callback(scaled_op)) + return false; + + auto scale_down = std::make_shared(scaled_op->input(0).get_source_output(), + (scaled_op->input(0).get_element_type() == ov::element::f32) ? + scale_down_const_f32 : scale_down_const_f16); + scaled_op->input(0).replace_source_output(scale_down->output(0)); + + auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); + if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { + auto add = child->shared_from_this(); + auto scale_down_bias = std::make_shared(add->input(1).get_source_output(), + (add->input(1).get_element_type() == ov::element::f32) ? + scale_down_const_f32 : scale_down_const_f16); + add->input(1).replace_source_output(scale_down_bias->output(0)); + + auto scale_up = std::make_shared(add->output(0), + (add->output(0).get_element_type() == ov::element::f32) ? + scale_up_const_f32 : scale_up_const_f16); + ov::replace_node(add, scale_up); + } else { + auto scale_up = std::make_shared(scaled_op->output(0), + (scaled_op->output(0).get_element_type() == ov::element::f32) ? 
+ scale_up_const_f32 : scale_up_const_f16); + ov::replace_node(scaled_op, scale_up); } - auto const_node = std::dynamic_pointer_cast(node); - if (const_node) { - constant_nodes.insert(node->get_friendly_name()); - continue; - } + return true; + }; - auto group_norm_node = std::dynamic_pointer_cast(node); - auto mvn_node = std::dynamic_pointer_cast(node); - if (group_norm_node || mvn_node) { - normal_nodes.insert(node->get_friendly_name()); - continue; - } + auto m = std::make_shared(scaled_op_m, "ScaleDownSingleLayer"); + this->register_matcher(m, callback); +} - size_t num_scaled_down_inputs = 0; - size_t num_const_inputs = 0; - size_t num_normal_inputs = 0; - for (auto& dep : node->inputs()) { - auto dep_name = dep.get_source_output().get_node_shared_ptr()->get_friendly_name(); - - if (scaled_down_nodes.find(dep_name) != scaled_down_nodes.end()) { - num_scaled_down_inputs += 1; - continue; - } - if (constant_nodes.find(dep_name) != constant_nodes.end()) { - num_const_inputs += 1; - continue; - } - if (normal_nodes.find(dep_name) != normal_nodes.end()) { - num_normal_inputs += 1; - continue; - } - } +// MulMulAddFusion makes the target pattern to be easy to be merged with other nodes. +// +// input_a const_a input_b const_b input_a (const_a/const_b) +// \ / \ / \ / +// Multiply_a Multiply_b ==> Multiply_a input_b +// \ / \ / +// \ / Add const_b +// \ / | / +// Add Multiply_c +// +// (input_a * const_a) + (input_b * const_b) ==> ((input_a * (const_a / const_b)) + input_b) * const_b +ov::pass::MulMulAddFusion::MulMulAddFusion() { + MATCHER_SCOPE(MulMulAddFusion); - if (node->get_input_size() > 0) { - if (num_const_inputs == node->get_input_size()) { - constant_nodes.insert(node->get_friendly_name()); - continue; - } - if ((num_const_inputs + num_normal_inputs) == node->get_input_size()) { - normal_nodes.insert(node->get_friendly_name()); - continue; - } - } + auto activation0_m = any_input(); + auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul0_m = wrap_type({ activation0_m, scale_const0_m }); - if (num_scaled_down_inputs == 0) { - continue; - } + auto activation1_m = any_input(); + auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul1_m = wrap_type({ activation1_m, scale_const1_m }); - auto result = std::dynamic_pointer_cast(node); - if (result && num_scaled_down_inputs == 1) { - auto dep = node->input(0); - std::shared_ptr scale_const = - (dep.get_element_type() == ov::element::f16) ? scale_const_f16 : scale_const_f32; - auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); - dep.replace_source_output(scale_up->output(0)); - } + auto add_m = wrap_type({ mul0_m, mul1_m }); - // input0 input1 input0 input1 - // (scaled_down) (non-scaled) (scaled_down) (non-scaled) - // \ / \ / - // \ / ==> \ scale_down - // \ / \ / - // add add - auto add = std::dynamic_pointer_cast(node); - if (add && num_scaled_down_inputs == 1) { - for (auto& dep : node->inputs()) { - if (scaled_down_nodes.find(dep.get_source_output().get_node_shared_ptr()->get_friendly_name()) == - scaled_down_nodes.end()) { - std::shared_ptr inverse_scale_const = (dep.get_element_type() == ov::element::f16) - ? 
inverse_scale_const_f16 - : inverse_scale_const_f32; - auto scale_down = - std::make_shared(dep.get_source_output(), inverse_scale_const->output(0)); - dep.replace_source_output(scale_down->output(0)); - } - } - } + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul0_m)); + OPENVINO_ASSERT(pattern_map.count(mul1_m)); + OPENVINO_ASSERT(pattern_map.count(add_m)); - auto multiply = std::dynamic_pointer_cast(node); - auto matmul = std::dynamic_pointer_cast(node); - if ((multiply || matmul) && num_scaled_down_inputs == 2) { - auto dep = node->input(1); - std::shared_ptr scale_const = - (dep.get_element_type() == ov::element::f16) ? scale_const_f16 : scale_const_f32; - auto scale_up = std::make_shared(dep.get_source_output(), scale_const->output(0)); - dep.replace_source_output(scale_up->output(0)); + auto add = std::dynamic_pointer_cast(pattern_map.at(add_m).get_node_shared_ptr()); + + if (transformation_callback(add)) { + return false; } - auto sdpa = std::dynamic_pointer_cast(node); - if (sdpa) { - for (size_t i = 0; i < 2; i++) { - if (scaled_down_nodes.find( - node->input(i).get_source_output().get_node_shared_ptr()->get_friendly_name()) != - scaled_down_nodes.end()) { - std::shared_ptr scale_const = - (node->get_input_element_type(i) == ov::element::f16) ? scale_const_f16 : scale_const_f32; - auto transpose = std::dynamic_pointer_cast( - node->input(i).get_source_output().get_node_shared_ptr()); - if (transpose) { - auto scale_up = std::make_shared(transpose->get_input_source_output(0), - scale_const->output(0)); - transpose->input(0).replace_source_output(scale_up->output(0)); - } else { - auto scale_up = std::make_shared(node->get_input_source_output(i), - scale_const->output(0)); - node->input(i).replace_source_output(scale_up->output(0)); - } - } - } - - if (scaled_down_nodes.find(node->input(2).get_source_output().get_node_shared_ptr()->get_friendly_name()) != - scaled_down_nodes.end()) { - scaled_down_nodes.insert(node->get_friendly_name()); - } - continue; + auto scale_const0 = std::dynamic_pointer_cast(pattern_map.at(scale_const0_m).get_node_shared_ptr()); + auto mul0 = std::dynamic_pointer_cast(pattern_map.at(mul0_m).get_node_shared_ptr()); + + auto scale_const1 = std::dynamic_pointer_cast(pattern_map.at(scale_const1_m).get_node_shared_ptr()); + auto mul1 = std::dynamic_pointer_cast(pattern_map.at(mul1_m).get_node_shared_ptr()); + + mul0->input(1).replace_source_output(ov::op::util::eltwise_fold(scale_const0, scale_const1)); + add->input(1).replace_source_output(mul1->get_input_source_output(0)); + + auto new_mul = register_new_node(add, scale_const1); + replace_node(add, new_mul); + + return true; + }; + + auto m = std::make_shared(add_m, "MulMulAddFusion"); + this->register_matcher(m, callback); +} + +// GroupNormalization has the following property. +// +// GroupNorm(input * const_a) = GroupNorm(input) +// +// So, we can skip Multiply that is connected to GroupNormalization. 
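The skip is safe because normalization divides the scale back out. For a positive per-tensor constant a, mean(a*x) = a*mean(x) and std(a*x) = a*std(x), so

    GroupNorm(a*x) = ((a*x - a*mean(x)) / (a*std(x))) * gamma + beta
                   = ((x - mean(x)) / std(x)) * gamma + beta
                   = GroupNorm(x)

where gamma and beta are the scale and bias inputs matched below as norm_scale_m and norm_bias_m. Positivity of a holds for the scale constants this pipeline inserts; the epsilon added to the variance makes the identity approximate rather than exact, which this rewrite treats as negligible.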
+// +// input --> Multiply --> GroupNormalization ==> input --> GroupNormalization +ov::pass::MulGroupNormFusion::MulGroupNormFusion() { + MATCHER_SCOPE(MulGroupNormFusion); + + auto activation_m = any_input(); + auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({ activation_m, scale_const_m }); + auto norm_scale_m = any_input(); + auto norm_bias_m = any_input(); + auto norm_m = wrap_type({ mul_m, norm_scale_m, norm_bias_m }); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul_m)); + OPENVINO_ASSERT(pattern_map.count(norm_m)); + + auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + + if (transformation_callback(norm)) { + return false; } - // input(scaled_down) -> (non-linear layers) - // ==> - // input(scaled_down) -> convert(precision_up) -> multiply(scale_up) - // -> (non-linear layers) -> - // multiply(scale_down) -> convert(precision_down) - auto sin = std::dynamic_pointer_cast(node); - auto cos = std::dynamic_pointer_cast(node); - auto swish = std::dynamic_pointer_cast(node); - auto power = std::dynamic_pointer_cast(node); - auto sqrt = std::dynamic_pointer_cast(node); - auto gelu = std::dynamic_pointer_cast(node); - auto softmax = std::dynamic_pointer_cast(node); - if ((sin || cos || swish || power || sqrt || gelu || softmax) && num_scaled_down_inputs == 1) { - auto input_prec = node->get_input_element_type(0); - auto output_prec = node->get_output_element_type(0); - - ov::Output input_src; - if (input_prec == ov::element::f16) { - auto precision_up = - std::make_shared(node->get_input_source_output(0), ov::element::f32); - input_src = precision_up->output(0); - } else { - input_src = node->get_input_source_output(0); - } - auto scale_up = std::make_shared(input_src, scale_const_f32->output(0)); - node->input(0).replace_source_output(scale_up->output(0)); - node->revalidate_and_infer_types(); - - auto scale_down = - std::make_shared(node->output(0), inverse_scale_const_f32->output(0)); - ov::replace_node(node, scale_down); - scaled_down_nodes.insert(scale_down->get_friendly_name()); - - if (output_prec == ov::element::f16) { - auto precision_down = std::make_shared(scale_down->output(0), ov::element::f16); - ov::replace_node(scale_down, precision_down); - scaled_down_nodes.insert(precision_down->get_friendly_name()); - } + if (mul && norm) { + norm->input(0).replace_source_output(mul->get_input_source_output(0)); + return true; } + return false; + }; + + auto m = std::make_shared(norm_m, "MulGroupNormFusion"); + this->register_matcher(m, callback); +} + +bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr& f) { + RUN_ON_FUNCTION_SCOPE(ActivationsScaling); + + if (m_scale_factor <= 0.f) + return false; + + ov::pass::Manager manager(get_pass_config(), "ActivationsScaling"); + manager.set_per_pass_validation(false); + + manager.register_pass(m_scale_factor); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); - if (num_scaled_down_inputs > 0) - scaled_down_nodes.insert(node->get_friendly_name()); - } + manager.run_passes(f); return true; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 5122df3ad46534..c6176e6592bc28 
100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -896,6 +896,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { ov::pass::Manager manager("GPU:PostLPT"); manager.set_per_pass_validation(false); + manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); + // Other ops support eltwise fusions const std::vector allowed_data_movement_ops = { ov::op::v1::Reshape::get_type_info_static(), @@ -914,8 +916,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. manager.register_pass(); - manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); - manager.register_pass(); manager.register_pass(); manager.register_pass(); From d07460280c31756bb27c51c937c43669954ded1b Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 29 Oct 2024 22:35:50 +0900 Subject: [PATCH 10/64] updated code style --- .../activations_scaling.cpp | 68 +++++++++++-------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index b0c56f7bd55dbe..9908f6ae1fa27b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -3,7 +3,6 @@ // #include "transformations/common_optimizations/activations_scaling.hpp" -#include "transformations/common_optimizations/lin_op_sequence_fusion.hpp" #include @@ -19,6 +18,7 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/common_optimizations/lin_op_sequence_fusion.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -26,7 +26,9 @@ const auto is_scalar_node = [](const ov::Output& output) -> bool { const auto shape = output.get_partial_shape(); if (shape.is_dynamic() || shape.rank().is_dynamic()) return false; - if (std::all_of(shape.begin(), shape.end(), [](const ov::Dimension& dimension) { return dimension == 1ul; })) + if (std::all_of(shape.begin(), shape.end(), [](const ov::Dimension& dimension) { + return dimension == 1ul; + })) return true; return false; }; @@ -42,17 +44,21 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { auto activation_m = any_input(); auto weights_m = any_input(); - auto convolution_m = wrap_type({ activation_m, weights_m }); - auto matmul_m = wrap_type({ activation_m, weights_m }); + auto convolution_m = wrap_type({activation_m, weights_m}); + auto matmul_m = wrap_type({activation_m, weights_m}); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; - std::vector scale_down_value = {1.f/scale_factor}; - std::shared_ptr scale_down_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); - std::shared_ptr scale_down_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); + std::vector scale_down_value = {1.f / scale_factor}; + std::shared_ptr scale_down_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + std::shared_ptr scale_down_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); std::vector scale_up_value = {scale_factor}; 
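These constants implement x -> x/s in front of the convolution and y -> y*s behind it, and when a bias Add follows, the bias is scaled down as well so the sum stays consistent, as the branch further down shows. The identity being relied on checks out with scalars; a plain C++ sketch using a 1x1 stand-in for Convolution plus bias (all values are arbitrary examples):

    #include <cassert>

    int main() {
        // y = w*x + b stands in for Convolution followed by a bias Add.
        const float x = 12.f, w = 3.f, b = 4.f, s = 128.f;

        const float reference = w * x + b;                  // original graph: 40
        const float rewritten = (w * (x / s) + b / s) * s;  // scale_down -> op -> scaled bias -> scale_up: 40
        assert(reference == rewritten);  // exact here because s is a power of two
        return 0;
    }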
- std::shared_ptr scale_up_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); - std::shared_ptr scale_up_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); @@ -70,27 +76,28 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { if (transformation_callback(scaled_op)) return false; - auto scale_down = std::make_shared(scaled_op->input(0).get_source_output(), - (scaled_op->input(0).get_element_type() == ov::element::f32) ? - scale_down_const_f32 : scale_down_const_f16); + auto scale_down = std::make_shared( + scaled_op->input(0).get_source_output(), + (scaled_op->input(0).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scaled_op->input(0).replace_source_output(scale_down->output(0)); auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { auto add = child->shared_from_this(); - auto scale_down_bias = std::make_shared(add->input(1).get_source_output(), - (add->input(1).get_element_type() == ov::element::f32) ? - scale_down_const_f32 : scale_down_const_f16); + auto scale_down_bias = std::make_shared( + add->input(1).get_source_output(), + (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); add->input(1).replace_source_output(scale_down_bias->output(0)); - auto scale_up = std::make_shared(add->output(0), - (add->output(0).get_element_type() == ov::element::f32) ? - scale_up_const_f32 : scale_up_const_f16); + auto scale_up = std::make_shared( + add->output(0), + (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); ov::replace_node(add, scale_up); } else { - auto scale_up = std::make_shared(scaled_op->output(0), - (scaled_op->output(0).get_element_type() == ov::element::f32) ? - scale_up_const_f32 : scale_up_const_f16); + auto scale_up = std::make_shared( + scaled_op->output(0), + (scaled_op->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 + : scale_up_const_f16); ov::replace_node(scaled_op, scale_up); } @@ -117,13 +124,13 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { auto activation0_m = any_input(); auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul0_m = wrap_type({ activation0_m, scale_const0_m }); + auto mul0_m = wrap_type({activation0_m, scale_const0_m}); auto activation1_m = any_input(); auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul1_m = wrap_type({ activation1_m, scale_const1_m }); + auto mul1_m = wrap_type({activation1_m, scale_const1_m}); - auto add_m = wrap_type({ mul0_m, mul1_m }); + auto add_m = wrap_type({mul0_m, mul1_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); @@ -138,13 +145,16 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { return false; } - auto scale_const0 = std::dynamic_pointer_cast(pattern_map.at(scale_const0_m).get_node_shared_ptr()); + auto scale_const0 = + std::dynamic_pointer_cast(pattern_map.at(scale_const0_m).get_node_shared_ptr()); auto mul0 = std::dynamic_pointer_cast(pattern_map.at(mul0_m).get_node_shared_ptr()); - auto scale_const1 = std::dynamic_pointer_cast(pattern_map.at(scale_const1_m).get_node_shared_ptr()); + auto scale_const1 = + std::dynamic_pointer_cast(pattern_map.at(scale_const1_m).get_node_shared_ptr()); auto mul1 = std::dynamic_pointer_cast(pattern_map.at(mul1_m).get_node_shared_ptr()); - mul0->input(1).replace_source_output(ov::op::util::eltwise_fold(scale_const0, scale_const1)); + mul0->input(1).replace_source_output( + ov::op::util::eltwise_fold(scale_const0, scale_const1)); add->input(1).replace_source_output(mul1->get_input_source_output(0)); auto new_mul = register_new_node(add, scale_const1); @@ -169,10 +179,10 @@ ov::pass::MulGroupNormFusion::MulGroupNormFusion() { auto activation_m = any_input(); auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({ activation_m, scale_const_m }); + auto mul_m = wrap_type({activation_m, scale_const_m}); auto norm_scale_m = any_input(); auto norm_bias_m = any_input(); - auto norm_m = wrap_type({ mul_m, norm_scale_m, norm_bias_m }); + auto norm_m = wrap_type({mul_m, norm_scale_m, norm_bias_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); From 5043f773832c8e8dbc6567498967677a37f7eeba Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 29 Oct 2024 22:45:38 +0900 Subject: [PATCH 11/64] updated code style --- .../common_optimizations/activations_scaling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 9908f6ae1fa27b..48ae06de35e624 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -88,7 +88,7 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { add->input(1).get_source_output(), (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); add->input(1).replace_source_output(scale_down_bias->output(0)); - + auto scale_up = std::make_shared( add->output(0), (add->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 : scale_up_const_f16); @@ -97,7 +97,7 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { auto scale_up = std::make_shared( scaled_op->output(0), (scaled_op->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 - : scale_up_const_f16); + : scale_up_const_f16); ov::replace_node(scaled_op, scale_up); } From 163c983886920a4d507e544fe8033b7ea2b5ba90 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 30 Oct 2024 03:49:31 +0900 Subject: [PATCH 12/64] added unit tests --- .../activations_scaling.hpp | 1 + .../activations_scaling.cpp | 30 +++-- .../activations_scaling_test.cpp | 110 ++++++++++++++++++ .../src/runtime/execution_config.cpp | 2 +- 4 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index ed119d0aacefa7..7848bd67755ef9 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -20,6 +20,7 @@ class TRANSFORMATIONS_API MulMulAddFusion; } // namespace pass } // namespace ov +// ActivationsScaling scales down activations to prevent overflow due to the limited range of FP16 class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: OPENVINO_RTTI("ActivationsScaling", "0"); diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 48ae06de35e624..5c6070c5724d34 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -79,26 +79,40 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { auto scale_down = std::make_shared( scaled_op->input(0).get_source_output(), (scaled_op->input(0).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(scaled_op, scale_down); scaled_op->input(0).replace_source_output(scale_down->output(0)); auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { auto add = child->shared_from_this(); + auto target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(1).get_source_output(), (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(add, scale_down_bias); add->input(1).replace_source_output(scale_down_bias->output(0)); - auto scale_up = std::make_shared( + auto scale_up = register_new_node( add->output(0), (add->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 : scale_up_const_f16); - ov::replace_node(add, scale_up); + scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(scaled_op, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } } else { - auto scale_up = std::make_shared( + auto target_inputs = scaled_op->get_output_target_inputs(0); + auto scale_up = register_new_node( scaled_op->output(0), (scaled_op->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); - ov::replace_node(scaled_op, scale_up); + scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(scaled_op, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } } return true; @@ -144,6 +158,7 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { if (transformation_callback(add)) { return false; } + auto target_inputs = add->get_output_target_inputs(0); auto scale_const0 = std::dynamic_pointer_cast(pattern_map.at(scale_const0_m).get_node_shared_ptr()); @@ -156,9 +171,10 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { mul0->input(1).replace_source_output( ov::op::util::eltwise_fold(scale_const0, scale_const1)); add->input(1).replace_source_output(mul1->get_input_source_output(0)); - - auto new_mul = register_new_node(add, scale_const1); - replace_node(add, new_mul); + mul1->input(0).replace_source_output(add); + for (auto& in : target_inputs) { + in.replace_source_output(mul1); + } return true; }; diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp new file mode 100644 index 00000000000000..211ebea4a8582f --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/activations_scaling.hpp" + +#include +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" + +#include +#include + +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/group_normalization.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/manager.hpp" + +#include "transformations/utils/utils.hpp" + +using namespace ov; +using namespace testing; + +TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { + float scale_factor = 128.f; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3, 3, 3, 3 }, { 1 }); + auto conv = std::make_shared(input, weights_const, + Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + auto convert = std::make_shared(conv, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3, 3, 3, 3 }, { 1 }); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto scale_down = std::make_shared(input, scale_down_const); + 
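A few hunks above, the reworked MulMulAddFusion rewiring keeps the value of the expression unchanged, per the identity in its comment block: (input_a * const_a) + (input_b * const_b) == ((input_a * (const_a / const_b)) + input_b) * const_b. A quick scalar check in plain C++ (example values only):

    #include <cassert>

    int main() {
        const float input_a = 3.f, const_a = 8.f, input_b = 5.f, const_b = 2.f;

        const float before = input_a * const_a + input_b * const_b;               // 34
        const float after = (input_a * (const_a / const_b) + input_b) * const_b;  // 34
        assert(before == after);  // exact for these values; equal up to FP rounding in general
        return 0;
    }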
auto conv = std::make_shared(scale_down, weights_const, + Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto scale_up = std::make_shared(conv, scale_up_const); + auto convert = std::make_shared(scale_up, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + } +} + +TEST_F(TransformationTestsF, MulMulAddFusionTest) { + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto mul0 = std::make_shared(input0, scale_const_0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto mul1 = std::make_shared(input1, scale_const_1); + auto add = std::make_shared(mul0, mul1); + auto convert = std::make_shared(add, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + manager.register_pass(); + } + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto mul0 = std::make_shared(input0, scale_const_0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto add = std::make_shared(mul0, input1); + auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto mul1 = std::make_shared(add, scale_const_1); + auto convert = std::make_shared(mul1, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + } +} + +TEST_F(TransformationTestsF, MulGroupNormFusionTest) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto mul = std::make_shared(input, scale_const); + auto norm_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); + auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); + auto group_norm = std::make_shared(mul, norm_scale_const, norm_bias_const, 1, 0.01f); + auto convert = std::make_shared(group_norm, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto norm_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); + auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); + auto group_norm = std::make_shared(input, norm_scale_const, norm_bias_const, 1, 0.01f); + auto convert = std::make_shared(group_norm, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + } +} diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 3b1376d19b4fea..0372050657f018 100644 --- 
a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -61,7 +61,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), - std::make_tuple(ov::hint::activations_scale_factor, 0.f), + std::make_tuple(ov::hint::activations_scale_factor, -1.f), // Legacy API properties std::make_tuple(ov::intel_gpu::nv12_two_inputs, false), From 4d93af3900aa00e0099d7b483dea2c9824fa2bed Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 30 Oct 2024 05:09:08 +0900 Subject: [PATCH 13/64] update code style --- .../activations_scaling_test.cpp | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 211ebea4a8582f..e31a65105655b8 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -5,20 +5,19 @@ #include "transformations/common_optimizations/activations_scaling.hpp" #include -#include "common_test_utils/graph_comparator.hpp" -#include "common_test_utils/ov_test_utils.hpp" + #include #include - +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" #include "openvino/op/add.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convolution.hpp" #include "openvino/op/group_normalization.hpp" -#include "openvino/op/parameter.hpp" #include "openvino/op/multiply.hpp" +#include "openvino/op/parameter.hpp" #include "openvino/pass/manager.hpp" - #include "transformations/utils/utils.hpp" using namespace ov; @@ -27,10 +26,14 @@ using namespace testing; TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { float scale_factor = 128.f; { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3, 3, 3, 3 }, { 1 }); - auto conv = std::make_shared(input, weights_const, - Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3, 3, 3, 3}, {1}); + auto conv = std::make_shared(input, + weights_const, + Strides{}, + CoordinateDiff{}, + CoordinateDiff{}, + Strides{}); auto convert = std::make_shared(conv, ov::element::f32); auto result = std::make_shared(convert); @@ -38,13 +41,17 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { manager.register_pass(scale_factor); } { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3, 3, 3, 3 }, { 1 }); - auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3, 3, 3, 3}, {1}); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {1.f / scale_factor}); auto scale_down = std::make_shared(input, scale_down_const); - auto conv = 
std::make_shared(scale_down, weights_const, - Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); - auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto conv = std::make_shared(scale_down, + weights_const, + Strides{}, + CoordinateDiff{}, + CoordinateDiff{}, + Strides{}); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {scale_factor}); auto scale_up = std::make_shared(conv, scale_up_const); auto convert = std::make_shared(scale_up, ov::element::f32); auto result = std::make_shared(convert); @@ -55,11 +62,11 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { TEST_F(TransformationTestsF, MulMulAddFusionTest) { { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); auto mul0 = std::make_shared(input0, scale_const_0); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); auto mul1 = std::make_shared(input1, scale_const_1); auto add = std::make_shared(mul0, mul1); auto convert = std::make_shared(add, ov::element::f32); @@ -69,12 +76,12 @@ TEST_F(TransformationTestsF, MulMulAddFusionTest) { manager.register_pass(); } { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); auto mul0 = std::make_shared(input0, scale_const_0); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto add = std::make_shared(mul0, input1); - auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); auto mul1 = std::make_shared(add, scale_const_1); auto convert = std::make_shared(mul1, ov::element::f32); auto result = std::make_shared(convert); @@ -85,12 +92,13 @@ TEST_F(TransformationTestsF, MulMulAddFusionTest) { TEST_F(TransformationTestsF, MulGroupNormFusionTest) { { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 10 }); + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); auto mul = std::make_shared(input, scale_const); - auto norm_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); - auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); - auto group_norm = std::make_shared(mul, norm_scale_const, norm_bias_const, 1, 0.01f); + auto norm_scale_const = 
ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3}, {10}); + auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3}, {10}); + auto group_norm = + std::make_shared(mul, norm_scale_const, norm_bias_const, 1, 0.01f); auto convert = std::make_shared(group_norm, ov::element::f32); auto result = std::make_shared(convert); @@ -98,10 +106,11 @@ TEST_F(TransformationTestsF, MulGroupNormFusionTest) { manager.register_pass(); } { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{ 1, 3, 16, 16 }); - auto norm_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); - auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 3 }, { 10 }); - auto group_norm = std::make_shared(input, norm_scale_const, norm_bias_const, 1, 0.01f); + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto norm_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3}, {10}); + auto norm_bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3}, {10}); + auto group_norm = + std::make_shared(input, norm_scale_const, norm_bias_const, 1, 0.01f); auto convert = std::make_shared(group_norm, ov::element::f32); auto result = std::make_shared(convert); From 8842ff09fb9b8a0a5ce663f69006cb941aed094b Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 30 Oct 2024 05:15:46 +0900 Subject: [PATCH 14/64] updated code style --- .../tests/common_optimizations/activations_scaling_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index e31a65105655b8..77cdf84f303c79 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -6,7 +6,6 @@ #include - #include #include #include "common_test_utils/graph_comparator.hpp" From ac22e5589d288e13bfae44f5b9881a157ea37712 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 30 Oct 2024 05:21:20 +0900 Subject: [PATCH 15/64] updated code style --- .../tests/common_optimizations/activations_scaling_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 77cdf84f303c79..19c373a2799e92 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -6,7 +6,9 @@ #include + #include + #include #include "common_test_utils/graph_comparator.hpp" #include "common_test_utils/ov_test_utils.hpp" From 8bc37f824499b99c845db6c482c82747ff6e1804 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 30 Oct 2024 05:27:04 +0900 Subject: [PATCH 16/64] updated code style --- .../tests/common_optimizations/activations_scaling_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 19c373a2799e92..4ef3604cfe57a4 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ 
b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -6,10 +6,9 @@ #include - +#include #include -#include #include "common_test_utils/graph_comparator.hpp" #include "common_test_utils/ov_test_utils.hpp" #include "openvino/op/add.hpp" From 51d5144a2a607b98959d34e9716c00688b1e8373 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 4 Nov 2024 15:21:35 +0900 Subject: [PATCH 17/64] updated for transformer of FLUX.1 --- .../activations_scaling.hpp | 45 ++- .../activations_scaling.cpp | 355 ++++++++++++++++-- .../activations_scaling_test.cpp | 6 +- 3 files changed, 379 insertions(+), 27 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 7848bd67755ef9..050d0e4d58142f 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -13,10 +13,19 @@ namespace ov { namespace pass { class TRANSFORMATIONS_API ActivationsScaling; + +namespace activations_scaling { + class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API MulGroupNormFusion; class TRANSFORMATIONS_API MulMulAddFusion; +class TRANSFORMATIONS_API CropTransformation; +class TRANSFORMATIONS_API ReshapeTransformation; +class TRANSFORMATIONS_API MulMulMulTransformation; +class TRANSFORMATIONS_API MulMVNTransformation; +class TRANSFORMATIONS_API ConcatTransformation; +} // namespace activations_scaling } // namespace pass } // namespace ov @@ -31,20 +40,50 @@ class ov::pass::ActivationsScaling : public ov::pass::ModelPass { float m_scale_factor = 0.f; }; -class ov::pass::ScaleDownSingleLayer : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ScaleDownSingleLayer", "0"); ScaleDownSingleLayer(float scale_factor); }; -class ov::pass::MulGroupNormFusion : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulGroupNormFusion : public ov::pass::MatcherPass { public: OPENVINO_RTTI("MulGroupNormFusion", "0"); MulGroupNormFusion(); }; -class ov::pass::MulMulAddFusion : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulMulAddFusion : public ov::pass::MatcherPass { public: OPENVINO_RTTI("MulMulAddFusion", "0"); MulMulAddFusion(); }; + +class ov::pass::activations_scaling::CropTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("CropTransformation", "0"); + CropTransformation(); +}; + +class ov::pass::activations_scaling::ReshapeTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ReshapeTransformation", "0"); + ReshapeTransformation(); +}; + +class ov::pass::activations_scaling::MulMulMulTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulMulMulTransformation", "0"); + MulMulMulTransformation(); +}; + +class ov::pass::activations_scaling::MulMVNTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulMVNTransformation", "0"); + MulMVNTransformation(); +}; + +class ov::pass::activations_scaling::ConcatTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConcatTransformation", "0"); + ConcatTransformation(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp 
b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 5c6070c5724d34..c89dbddbe5c163 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -9,12 +9,17 @@ #include "itt.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/op/add.hpp" +#include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convolution.hpp" #include "openvino/op/divide.hpp" #include "openvino/op/group_normalization.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/multiply.hpp" +#include "openvino/op/mvn.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/op/variadic_split.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -32,14 +37,24 @@ const auto is_scalar_node = [](const ov::Output& output) -> bool { return true; return false; }; + +const auto is_non_const_node = [](const ov::Output& output) -> bool { + auto node = std::dynamic_pointer_cast(output.get_node_shared_ptr()); + if (node) { + return false; + } else { + return true; + } +}; } +using namespace ov::pass::activations_scaling; using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; // Add scale_down and scale_up layers around Convolution and MatMul nodes // Conv/MatMul ==> Multiply(scale_down) --> Conv/MatMul --> Multiply(scale_up) -ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { +ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { MATCHER_SCOPE(ScaleDownSingleLayer); auto activation_m = any_input(); @@ -126,21 +141,21 @@ ov::pass::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { // // input_a const_a input_b const_b input_a (const_a/const_b) // \ / \ / \ / -// Multiply_a Multiply_b ==> Multiply_a input_b -// \ / \ / -// \ / Add const_b -// \ / | / -// Add Multiply_c +// Multiply_a Multiply_b ==> Multiply_a_mma input_b +// \ / \ / +// \ / Add const_b +// \ / | / +// Add Multiply_b_mma // // (input_a * const_a) + (input_b * const_b) ==> ((input_a * (const_a / const_b)) + input_b) * const_b -ov::pass::MulMulAddFusion::MulMulAddFusion() { +ov::pass::activations_scaling::MulMulAddFusion::MulMulAddFusion() { MATCHER_SCOPE(MulMulAddFusion); - auto activation0_m = any_input(); + auto activation0_m = any_input(is_non_const_node); auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); auto mul0_m = wrap_type({activation0_m, scale_const0_m}); - auto activation1_m = any_input(); + auto activation1_m = any_input(is_non_const_node); auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); auto mul1_m = wrap_type({activation1_m, scale_const1_m}); @@ -160,20 +175,30 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { } auto target_inputs = add->get_output_target_inputs(0); - auto scale_const0 = - std::dynamic_pointer_cast(pattern_map.at(scale_const0_m).get_node_shared_ptr()); - auto mul0 = std::dynamic_pointer_cast(pattern_map.at(mul0_m).get_node_shared_ptr()); + auto mul0 = add->get_input_source_output(0).get_node_shared_ptr(); + auto mul1 = add->get_input_source_output(1).get_node_shared_ptr(); + + size_t const0_index = ov::is_type(mul0->get_input_source_output(1).get_node()) ? 1 : 0; + size_t const1_index = ov::is_type(mul1->get_input_source_output(1).get_node()) ? 
1 : 0; - auto scale_const1 = - std::dynamic_pointer_cast(pattern_map.at(scale_const1_m).get_node_shared_ptr()); - auto mul1 = std::dynamic_pointer_cast(pattern_map.at(mul1_m).get_node_shared_ptr()); + auto scale_const0 = mul0->get_input_source_output(const0_index).get_node_shared_ptr(); + auto scale_const1 = mul1->get_input_source_output(const1_index).get_node_shared_ptr(); - mul0->input(1).replace_source_output( + auto new_mul0 = register_new_node( + mul0->get_input_source_output((const0_index == 0) ? 1 : 0), ov::op::util::eltwise_fold(scale_const0, scale_const1)); - add->input(1).replace_source_output(mul1->get_input_source_output(0)); - mul1->input(0).replace_source_output(add); + new_mul0->set_friendly_name(mul0->get_friendly_name() + "_mma"); + ov::copy_runtime_info(mul0, new_mul0); + + add->input(0).replace_source_output(new_mul0); + add->input(1).replace_source_output(mul1->get_input_source_output((const1_index == 0) ? 1 : 0)); + + auto new_mul1 = register_new_node(add, scale_const1); + new_mul1->set_friendly_name(mul1->get_friendly_name() + "_mma"); + ov::copy_runtime_info(mul1, new_mul1); + for (auto& in : target_inputs) { - in.replace_source_output(mul1); + in.replace_source_output(new_mul1); } return true; @@ -190,10 +215,10 @@ ov::pass::MulMulAddFusion::MulMulAddFusion() { // So, we can skip Multiply that is connected to GroupNormalization. // // input --> Multiply --> GroupNormalization ==> input --> GroupNormalization -ov::pass::MulGroupNormFusion::MulGroupNormFusion() { +ov::pass::activations_scaling::MulGroupNormFusion::MulGroupNormFusion() { MATCHER_SCOPE(MulGroupNormFusion); - auto activation_m = any_input(); + auto activation_m = any_input(is_non_const_node); auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); auto mul_m = wrap_type({activation_m, scale_const_m}); auto norm_scale_m = any_input(); @@ -224,6 +249,286 @@ ov::pass::MulGroupNormFusion::MulGroupNormFusion() { this->register_matcher(m, callback); } +// MVN has the following property. +// +// MVN(input * const_a) = MVN(input) +// +// So, we can skip Multiply that is connected to MVN. 
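+// A quick derivation of that property (ignoring the small epsilon added inside the
+// normalization): for a positive scalar constant a, such as the scale factors used here,
+//   MVN(a * x) = (a*x - mean(a*x)) / std(a*x) = a * (x - mean(x)) / (a * std(x)) = MVN(x).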
+// +// input --> Multiply --> MVN ==> input --> MVN +ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { + MATCHER_SCOPE(MulMVNTransformation); + + auto activation_m = any_input(is_non_const_node); + auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, scale_const_m}); + auto norm_axes_m = any_input(); + auto norm_m = wrap_type({mul_m, norm_axes_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul_m)); + OPENVINO_ASSERT(pattern_map.count(norm_m)); + + auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + + if (transformation_callback(norm)) { + return false; + } + + if (mul && norm) { + norm->input(0).replace_source_output(mul->get_input_source_output(0)); + return true; + } + return false; + }; + + auto m = std::make_shared(norm_m, "MulMVNTransformation"); + this->register_matcher(m, callback); +} + +ov::pass::activations_scaling::CropTransformation::CropTransformation() { + MATCHER_SCOPE(CropTransformation); + + auto activation_m = any_input(is_non_const_node); + auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, scale_const_m}); + auto axis_m = any_input(); + auto split_length_m = any_input(); + auto split_m = wrap_type({mul_m, axis_m, split_length_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul_m)); + OPENVINO_ASSERT(pattern_map.count(split_m)); + + auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto split = std::dynamic_pointer_cast(pattern_map.at(split_m).get_node_shared_ptr()); + + if (transformation_callback(split)) { + return false; + } + + if (mul && split) { + size_t num_split_outputs = split->get_output_size(); + + std::vector>> target_inputs; + target_inputs.resize(num_split_outputs); + for (size_t i = 0; i < num_split_outputs; i++) { + target_inputs[i] = split->get_output_target_inputs(i); + } + + split->input(0).replace_source_output(mul->input(0).get_source_output()); + + for (size_t i = 0; i < num_split_outputs; i++) { + auto new_mul = register_new_node( + split->output(i), + mul->input(1).get_source_output()); + new_mul->set_friendly_name(mul->get_friendly_name() + "_" + std::to_string(i)); + ov::copy_runtime_info(mul, new_mul); + + for (auto& in : target_inputs[i]) { + in.replace_source_output(new_mul); + } + } + + return true; + } + return false; + }; + + auto m = std::make_shared(split_m, "CropTransformation"); + this->register_matcher(m, callback); +} + +ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { + MATCHER_SCOPE(ReshapeTransformation); + + auto activation_m = any_input(is_non_const_node); + auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, scale_const_m}); + auto axes_m = any_input(); + auto reshape_m = wrap_type({mul_m, axes_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul_m)); + OPENVINO_ASSERT(pattern_map.count(reshape_m)); + + auto scale_const = std::dynamic_pointer_cast(pattern_map.at(scale_const_m).get_node_shared_ptr()); + auto 
mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto reshape = std::dynamic_pointer_cast(pattern_map.at(reshape_m).get_node_shared_ptr()); + + if (transformation_callback(reshape)) { + return false; + } + + if (scale_const && mul && reshape) { + auto target_inputs = reshape->get_output_target_inputs(0); + size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + reshape->input(0).replace_source_output(mul->input(activation_index).get_source_output()); + + auto new_mul = register_new_node(reshape, scale_const); + new_mul->set_friendly_name(mul->get_friendly_name() + "_r"); + ov::copy_runtime_info(mul, new_mul); + + for (auto& in : target_inputs) { + in.replace_source_output(new_mul); + } + + return true; + } + return false; + }; + + auto m = std::make_shared(reshape_m, "ReshapeTransformation"); + this->register_matcher(m, callback); +} + +// MulMulAddFusion makes the target pattern to be easy to be merged with other nodes. +// +// input_a const_a input_b const_b input_a input_b +// \ / \ / \ / +// Multiply_a Multiply_b ==> Multiply_c (const_a * const_b) +// \ / \ / +// \ / Multiply_c_mmm +// \ / +// Multiply_c +// +// (input_a * const_a) * (input_b * const_b) ==> (input_a * input_b) * (const_a * const_b) +ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation() { + MATCHER_SCOPE(MulMulMulTransformation); + + auto activation0_m = any_input(is_non_const_node); + auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul0_m = wrap_type({activation0_m, scale_const0_m}); + + auto activation1_m = any_input(is_non_const_node); + auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul1_m = wrap_type({activation1_m, scale_const1_m}); + + auto mul2_m = wrap_type({mul0_m, mul1_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(mul0_m)); + OPENVINO_ASSERT(pattern_map.count(mul1_m)); + OPENVINO_ASSERT(pattern_map.count(mul2_m)); + + auto mul2 = std::dynamic_pointer_cast(pattern_map.at(mul2_m).get_node_shared_ptr()); + + if (transformation_callback(mul2)) { + return false; + } + auto target_inputs = mul2->get_output_target_inputs(0); + + auto mul0 = mul2->get_input_source_output(0).get_node_shared_ptr(); + auto mul1 = mul2->get_input_source_output(1).get_node_shared_ptr(); + + size_t const0_index = ov::is_type(mul0->get_input_source_output(1).get_node()) ? 1 : 0; + size_t const1_index = ov::is_type(mul1->get_input_source_output(1).get_node()) ? 1 : 0; + + auto scale_const0 = mul0->get_input_source_output(const0_index).get_node_shared_ptr(); + auto scale_const1 = mul1->get_input_source_output(const1_index).get_node_shared_ptr(); + + mul2->input(0).replace_source_output(mul0->get_input_source_output((const0_index == 0) ? 1 : 0)); + mul2->input(1).replace_source_output(mul1->get_input_source_output((const1_index == 0) ? 
1 : 0)); + + auto new_mul = register_new_node( + mul2, + ov::op::util::eltwise_fold(scale_const0, scale_const1)); + new_mul->set_friendly_name(mul2->get_friendly_name() + "_mmm"); + ov::copy_runtime_info(mul2, new_mul); + + for (auto& in : target_inputs) { + in.replace_source_output(new_mul); + } + + return true; + }; + + auto m = std::make_shared(mul2_m, "MulMulMulTransformation"); + this->register_matcher(m, callback); +} + +ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { + MATCHER_SCOPE(ConcatTransformation); + + auto concat_m = wrap_type(); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + OPENVINO_ASSERT(pattern_map.count(concat_m)); + + auto concat = std::dynamic_pointer_cast(pattern_map.at(concat_m).get_node_shared_ptr()); + + if (transformation_callback(concat_m)) { + return false; + } + + // check if all inputs are Multiply with scalar operand + bool can_be_transformed = true; + ov::Output last_dep_const; + for (auto &input : concat->inputs()) { + auto dep_node = std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); + if (!dep_node) { + can_be_transformed = false; + break; + } + auto dep_const0 = std::dynamic_pointer_cast(dep_node->input(0).get_source_output().get_node_shared_ptr()); + auto dep_const1 = std::dynamic_pointer_cast(dep_node->input(1).get_source_output().get_node_shared_ptr()); + if (!dep_const0 && !dep_const1) { + can_be_transformed = false; + break; + } + last_dep_const = dep_const0 ? dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); + if (!is_scalar_node(last_dep_const)) { + can_be_transformed = false; + break; + } + } + + if (!can_be_transformed) + return false; + + auto target_inputs = concat->get_output_target_inputs(0); + + for (auto &input : concat->inputs()) { + auto dep_node = input.get_source_output().get_node_shared_ptr(); + auto dep_input0 = dep_node->input(0).get_source_output().get_node(); + size_t const_index = ov::is_type(dep_input0) ? 0 : 1; + size_t activation_index = ov::is_type(dep_input0) ? 
1 : 0; + + auto new_mul = register_new_node( + dep_node->input(activation_index).get_source_output(), + ov::op::util::eltwise_fold(dep_node->input(const_index).get_source_output(), last_dep_const)); + new_mul->set_friendly_name(dep_node->get_friendly_name() + "_c"); + ov::copy_runtime_info(dep_node, new_mul); + + input.replace_source_output(new_mul); + } + + auto new_mul = register_new_node(concat, last_dep_const); + new_mul->set_friendly_name(concat->get_friendly_name() + "_c"); + ov::copy_runtime_info(concat, new_mul); + + for (auto& in : target_inputs) { + in.replace_source_output(new_mul); + } + + return false; + }; + + auto m = std::make_shared(concat_m, "ConcatTransformation"); + this->register_matcher(m, callback); +} + bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr& f) { RUN_ON_FUNCTION_SCOPE(ActivationsScaling); @@ -239,6 +544,14 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.run_passes(f); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 4ef3604cfe57a4..9dd34ff27dbce6 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -38,7 +38,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(scale_factor); + manager.register_pass(scale_factor); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -73,7 +73,7 @@ TEST_F(TransformationTestsF, MulMulAddFusionTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); + manager.register_pass(); } { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -103,7 +103,7 @@ TEST_F(TransformationTestsF, MulGroupNormFusionTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); From e2cba3db5871a7a31e09f83f9c8654fbed797079 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 4 Nov 2024 20:13:48 +0900 Subject: [PATCH 18/64] disabled FullyConnectedPerLayerScaling --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index c6176e6592bc28..ba38625cc144fa 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -957,7 +957,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); - 
manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); if (!device_info.supports_immad) { manager.register_pass(); From a693e196e5e5c4ca34421cf7465c517f431dd2b0 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 4 Nov 2024 21:39:39 +0900 Subject: [PATCH 19/64] added unit tests --- .../activations_scaling.hpp | 29 ++-- .../activations_scaling.cpp | 101 +++++++---- .../activations_scaling_test.cpp | 162 +++++++++++++++++- .../src/plugin/transformations_pipeline.cpp | 10 ++ 4 files changed, 251 insertions(+), 51 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 050d0e4d58142f..f71a98725c1066 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -17,9 +17,9 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { class TRANSFORMATIONS_API ScaleDownSingleLayer; -class TRANSFORMATIONS_API MulGroupNormFusion; -class TRANSFORMATIONS_API MulMulAddFusion; -class TRANSFORMATIONS_API CropTransformation; +class TRANSFORMATIONS_API MulGroupNormTransformation; +class TRANSFORMATIONS_API MulMulAddTransformation; +class TRANSFORMATIONS_API SplitTransformation; class TRANSFORMATIONS_API ReshapeTransformation; class TRANSFORMATIONS_API MulMulMulTransformation; class TRANSFORMATIONS_API MulMVNTransformation; @@ -29,7 +29,10 @@ class TRANSFORMATIONS_API ConcatTransformation; } // namespace pass } // namespace ov -// ActivationsScaling scales down activations to prevent overflow due to the limited range of FP16 +// ActivationsScaling makes activation values smaller to prevent overflow due to the limited range of FP16. +// This feature is controlled by ov::hint::activations_scale_factor. +// For example, when this property is set to 16, activations are divided by 16. +// If ov::hint::activations_scale_factor is less than zero, it is disabled.
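+//
+// A minimal usage sketch (assuming a plugin that consumes this property, e.g. the GPU plugin
+// targeted by this series; the model path is only a placeholder):
+//   ov::Core core;
+//   auto model = core.read_model("model.xml");
+//   auto compiled = core.compile_model(model, "GPU", ov::hint::activations_scale_factor(16.f));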
class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: OPENVINO_RTTI("ActivationsScaling", "0"); @@ -46,22 +49,22 @@ class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::Mat ScaleDownSingleLayer(float scale_factor); }; -class ov::pass::activations_scaling::MulGroupNormFusion : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulGroupNormFusion", "0"); - MulGroupNormFusion(); + OPENVINO_RTTI("MulGroupNormTransformation", "0"); + MulGroupNormTransformation(); }; -class ov::pass::activations_scaling::MulMulAddFusion : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulMulAddTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulMulAddFusion", "0"); - MulMulAddFusion(); + OPENVINO_RTTI("MulMulAddTransformation", "0"); + MulMulAddTransformation(); }; -class ov::pass::activations_scaling::CropTransformation : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::SplitTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("CropTransformation", "0"); - CropTransformation(); + OPENVINO_RTTI("SplitTransformation", "0"); + SplitTransformation(); }; class ov::pass::activations_scaling::ReshapeTransformation : public ov::pass::MatcherPass { diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index c89dbddbe5c163..f3ae3d58f40001 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -53,7 +53,9 @@ using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; // Add scale_down and scale_up layers around Convolution and MatMul nodes -// Conv/MatMul ==> Multiply(scale_down) --> Conv/MatMul --> Multiply(scale_up) +// Conv/MatMul +// ==> +// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor) ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { MATCHER_SCOPE(ScaleDownSingleLayer); @@ -137,7 +139,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } -// MulMulAddFusion makes the target pattern to be easy to be merged with other nodes. +// MulMulAddTransformation makes the target pattern easier to merge with the following nodes.
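// In equation form, this is the scalar identity (x * a) + (y * b) == ((x * (a / b)) + y) * b,
// valid whenever b != 0; for example, with a = 8 and b = 2: 8x + 2y == (4x + y) * 2.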
// // input_a const_a input_b const_b input_a (const_a/const_b) // \ / \ / \ / @@ -148,8 +150,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float // Add Multiply_b_mma // // (input_a * const_a) + (input_b * const_b) ==> ((input_a * (const_a / const_b)) + input_b) * const_b -ov::pass::activations_scaling::MulMulAddFusion::MulMulAddFusion() { - MATCHER_SCOPE(MulMulAddFusion); +ov::pass::activations_scaling::MulMulAddTransformation::MulMulAddTransformation() { + MATCHER_SCOPE(MulMulAddTransformation); auto activation0_m = any_input(is_non_const_node); auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); @@ -204,7 +206,7 @@ ov::pass::activations_scaling::MulMulAddFusion::MulMulAddFusion() { return true; }; - auto m = std::make_shared(add_m, "MulMulAddFusion"); + auto m = std::make_shared(add_m, "MulMulAddTransformation"); this->register_matcher(m, callback); } @@ -214,9 +216,11 @@ ov::pass::activations_scaling::MulMulAddFusion::MulMulAddFusion() { // // So, we can skip Multiply that is connected to GroupNormalization. // -// input --> Multiply --> GroupNormalization ==> input --> GroupNormalization -ov::pass::activations_scaling::MulGroupNormFusion::MulGroupNormFusion() { - MATCHER_SCOPE(MulGroupNormFusion); +// input --> Multiply --> GroupNormalization +// ==> +// input --> GroupNormalization +ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransformation() { + MATCHER_SCOPE(MulGroupNormTransformation); auto activation_m = any_input(is_non_const_node); auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); @@ -239,13 +243,14 @@ ov::pass::activations_scaling::MulGroupNormFusion::MulGroupNormFusion() { } if (mul && norm) { - norm->input(0).replace_source_output(mul->get_input_source_output(0)); + size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); return true; } return false; }; - auto m = std::make_shared(norm_m, "MulGroupNormFusion"); + auto m = std::make_shared(norm_m, "MulGroupNormTransformation"); this->register_matcher(m, callback); } @@ -255,7 +260,9 @@ ov::pass::activations_scaling::MulGroupNormFusion::MulGroupNormFusion() { // // So, we can skip Multiply that is connected to MVN. // -// input --> Multiply --> MVN ==> input --> MVN +// input --> Multiply --> MVN +// ==> +// input --> MVN ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { MATCHER_SCOPE(MulMVNTransformation); @@ -279,7 +286,8 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { } if (mul && norm) { - norm->input(0).replace_source_output(mul->get_input_source_output(0)); + size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 
0 : 1; + norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); return true; } return false; @@ -289,8 +297,16 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { this->register_matcher(m, callback); } -ov::pass::activations_scaling::CropTransformation::CropTransformation() { - MATCHER_SCOPE(CropTransformation); +// input const input +// \ / | +// Multiply ==> VariadicSplit +// | const / | const \ const +// VariadicSplit | / | / \ / +// / | \ Multiply_a Multiply_b Multiply_c +// output_a output_b output_c | | | +// output_a output_b output_c +ov::pass::activations_scaling::SplitTransformation::SplitTransformation() { + MATCHER_SCOPE(SplitTransformation); auto activation_m = any_input(is_non_const_node); auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); @@ -321,12 +337,14 @@ ov::pass::activations_scaling::CropTransformation::CropTransformation() { target_inputs[i] = split->get_output_target_inputs(i); } - split->input(0).replace_source_output(mul->input(0).get_source_output()); + size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + size_t const_index = (activation_index == 1) ? 0 : 1; + split->input(0).replace_source_output(mul->input(activation_index).get_source_output()); for (size_t i = 0; i < num_split_outputs; i++) { auto new_mul = register_new_node( split->output(i), - mul->input(1).get_source_output()); + mul->input(const_index).get_source_output()); new_mul->set_friendly_name(mul->get_friendly_name() + "_" + std::to_string(i)); ov::copy_runtime_info(mul, new_mul); @@ -340,10 +358,15 @@ ov::pass::activations_scaling::CropTransformation::CropTransformation() { return false; }; - auto m = std::make_shared(split_m, "CropTransformation"); + auto m = std::make_shared(split_m, "SplitTransformation"); this->register_matcher(m, callback); } +// input const input +// \ / | +// Multiply ==> Reshape const +// | | / +// Reshape Multiply ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { MATCHER_SCOPE(ReshapeTransformation); @@ -389,7 +412,7 @@ ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { this->register_matcher(m, callback); } -// MulMulAddFusion makes the target pattern to be easy to be merged with other nodes. +// MulMulAddTransformation makes the target pattern to be easy to be merged with other nodes. 
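// In equation form, this is the scalar identity (x * a) * (y * b) == (x * y) * (a * b);
// for example, with a = 8 and b = 2: (8x) * (2y) == 16 * (x * y).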
// // input_a const_a input_b const_b input_a input_b // \ / \ / \ / @@ -456,6 +479,23 @@ ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation( this->register_matcher(m, callback); } +// input_a const_a input_b const_b input_c const_c +// \ / \ / \ / +// Multiply_a Multiply_b Multiply_c +// \ | / +// \ | / +// ---------- Concat ------------ +// ==> +// (const_a (const_b (const_c +// input_a /const_c) input_b /const_c) input_c /const_c) +// \ / \ / \ / +// Multiply_a Multiply_b Multiply_c +// \ | / +// \ | / +// ---------- Concat ------------ +// | const_c +// | / +// Multiply ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { MATCHER_SCOPE(ConcatTransformation); @@ -473,30 +513,23 @@ ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { } // check if all inputs are Multiply with scalar operand - bool can_be_transformed = true; ov::Output last_dep_const; for (auto &input : concat->inputs()) { auto dep_node = std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); if (!dep_node) { - can_be_transformed = false; - break; + return false; } auto dep_const0 = std::dynamic_pointer_cast(dep_node->input(0).get_source_output().get_node_shared_ptr()); auto dep_const1 = std::dynamic_pointer_cast(dep_node->input(1).get_source_output().get_node_shared_ptr()); if (!dep_const0 && !dep_const1) { - can_be_transformed = false; - break; + return false; } last_dep_const = dep_const0 ? dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); if (!is_scalar_node(last_dep_const)) { - can_be_transformed = false; - break; + return false; } } - if (!can_be_transformed) - return false; - auto target_inputs = concat->get_output_target_inputs(0); for (auto &input : concat->inputs()) { @@ -540,17 +573,17 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.register_pass(m_scale_factor); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.run_passes(f); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 9dd34ff27dbce6..8664dbffdfc6ca 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -12,11 +12,15 @@ #include "common_test_utils/graph_comparator.hpp" #include "common_test_utils/ov_test_utils.hpp" #include "openvino/op/add.hpp" +#include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convolution.hpp" #include "openvino/op/group_normalization.hpp" #include "openvino/op/multiply.hpp" +#include "openvino/op/mvn.hpp" #include "openvino/op/parameter.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/variadic_split.hpp" #include "openvino/pass/manager.hpp" #include "transformations/utils/utils.hpp" @@ -60,7 +64,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { } } -TEST_F(TransformationTestsF, 
MulMulAddFusionTest) { +TEST_F(TransformationTestsF, MulMulAddTransformationTest) { { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -73,7 +77,7 @@ TEST_F(TransformationTestsF, MulMulAddFusionTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); + manager.register_pass(); } { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -90,7 +94,7 @@ TEST_F(TransformationTestsF, MulMulAddFusionTest) { } } -TEST_F(TransformationTestsF, MulGroupNormFusionTest) { +TEST_F(TransformationTestsF, MulGroupNormTransformationTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -103,7 +107,7 @@ TEST_F(TransformationTestsF, MulGroupNormFusionTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -117,3 +121,153 @@ TEST_F(TransformationTestsF, MulGroupNormFusionTest) { model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); } } + +TEST_F(TransformationTestsF, MulMVNTransformationTest) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 224, 224}); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(input, scale_const); + auto norm_axes_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); + auto mvn = + std::make_shared(mul, norm_axes_const, true, 0.01f, ov::op::MVNEpsMode::INSIDE_SQRT); + auto convert = std::make_shared(mvn, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 224, 224}); + auto norm_axes_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); + auto mvn = + std::make_shared(input, norm_axes_const, true, 0.01f, ov::op::MVNEpsMode::INSIDE_SQRT); + auto convert = std::make_shared(mvn, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + } +} + +TEST_F(TransformationTestsF, SplitTransformationTest) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(input, scale_const); + auto axis = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {0}); + auto split_length = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); + auto split = std::make_shared(mul, axis, split_length); + auto convert0 = std::make_shared(split->output(0), ov::element::f32); + auto result0 = std::make_shared(convert0); + auto convert1 = std::make_shared(split->output(1), ov::element::f32); + auto result1 = std::make_shared(convert1); + auto convert2 = std::make_shared(split->output(2), ov::element::f32); + auto result2 = std::make_shared(convert2); + + model = 
std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto axis = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {0}); + auto split_length = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); + auto split = std::make_shared(input, axis, split_length); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul0 = std::make_shared(split->output(0), scale_const); + auto convert0 = std::make_shared(mul0, ov::element::f32); + auto result0 = std::make_shared(convert0); + auto mul1 = std::make_shared(split->output(1), scale_const); + auto convert1 = std::make_shared(mul1, ov::element::f32); + auto result1 = std::make_shared(convert1); + auto mul2 = std::make_shared(split->output(2), scale_const); + auto convert2 = std::make_shared(mul2, ov::element::f32); + auto result2 = std::make_shared(convert2); + + model_ref = std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); + } +} + +TEST_F(TransformationTestsF, ReshapeTransformationTest) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(input, scale_const); + auto shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 1, -1}); + auto reshape = std::make_shared(mul, shape, true); + auto convert = std::make_shared(reshape, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 1, -1}); + auto reshape = std::make_shared(input, shape, true); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(reshape, scale_const); + auto convert = std::make_shared(mul, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + } +} + +TEST_F(TransformationTestsF, MulMulMulTransformationTest) { + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul0 = std::make_shared(input0, scale_const0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul1 = std::make_shared(input1, scale_const1); + auto mul2 = std::make_shared(mul0, mul1); + auto convert = std::make_shared(mul2, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + manager.register_pass(); + } + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto mul = std::make_shared(input0, input1); + auto new_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto new_mul = std::make_shared(mul, new_scale_const); + auto 
convert = std::make_shared(new_mul, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + } +} + +TEST_F(TransformationTestsF, ConcatTransformationTest) { + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul0 = std::make_shared(input0, scale_const0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul1 = std::make_shared(input1, scale_const1); + auto concat = std::make_shared(OutputVector{mul0, mul1}, 0); + auto convert = std::make_shared(concat, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + manager.register_pass(); + } + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul0 = std::make_shared(input0, scale_const0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul1 = std::make_shared(input1, scale_const1); + auto concat = std::make_shared(OutputVector{mul0, mul1}, 0); + auto new_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto new_mul = std::make_shared(concat, new_scale_const); + auto convert = std::make_shared(new_mul, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + } +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index ba38625cc144fa..c5911f534a48c5 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -954,6 +954,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto pass_config = manager.get_pass_config(); + pass_config->set_callback([=](const_node_ptr& root) -> bool { + if (!root->get_input_node_ptr(0)->get_input_partial_shape(0).is_static()) { + return false; + } + const auto& gamma_shape = root->get_input_node_ptr(0)->get_input_partial_shape(0).to_shape(); + const int32_t vec_size = 8; + return static_cast((gamma_shape.back() / vec_size)) > static_cast(device_info.max_work_group_size); + }); + + // manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); From a8328a3f9eb9ddc74ca1a778244ca1c7a9d4bae0 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 4 Nov 2024 21:56:40 +0900 Subject: [PATCH 20/64] fixed code style --- .../activations_scaling.cpp | 56 +++++++++++-------- .../activations_scaling_test.cpp | 3 +- .../src/plugin/transformations_pipeline.cpp | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index f3ae3d58f40001..91563b93b57946 100644 --- 
a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -46,7 +46,7 @@ const auto is_non_const_node = [](const ov::Output& output) -> bool { return true; } }; -} +} // namespace using namespace ov::pass::activations_scaling; using namespace ov::pass::pattern; @@ -243,7 +243,8 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform } if (mul && norm) { - size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + size_t activation_index = + ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); return true; } @@ -286,7 +287,8 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { } if (mul && norm) { - size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + size_t activation_index = + ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); return true; } @@ -322,7 +324,8 @@ ov::pass::activations_scaling::SplitTransformation::SplitTransformation() { OPENVINO_ASSERT(pattern_map.count(split_m)); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto split = std::dynamic_pointer_cast(pattern_map.at(split_m).get_node_shared_ptr()); + auto split = + std::dynamic_pointer_cast(pattern_map.at(split_m).get_node_shared_ptr()); if (transformation_callback(split)) { return false; @@ -337,17 +340,17 @@ ov::pass::activations_scaling::SplitTransformation::SplitTransformation() { target_inputs[i] = split->get_output_target_inputs(i); } - size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; + size_t activation_index = + ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; size_t const_index = (activation_index == 1) ? 0 : 1; split->input(0).replace_source_output(mul->input(activation_index).get_source_output()); - + for (size_t i = 0; i < num_split_outputs; i++) { - auto new_mul = register_new_node( - split->output(i), - mul->input(const_index).get_source_output()); + auto new_mul = register_new_node(split->output(i), + mul->input(const_index).get_source_output()); new_mul->set_friendly_name(mul->get_friendly_name() + "_" + std::to_string(i)); ov::copy_runtime_info(mul, new_mul); - + for (auto& in : target_inputs[i]) { in.replace_source_output(new_mul); } @@ -382,7 +385,8 @@ ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { OPENVINO_ASSERT(pattern_map.count(mul_m)); OPENVINO_ASSERT(pattern_map.count(reshape_m)); - auto scale_const = std::dynamic_pointer_cast(pattern_map.at(scale_const_m).get_node_shared_ptr()); + auto scale_const = + std::dynamic_pointer_cast(pattern_map.at(scale_const_m).get_node_shared_ptr()); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); auto reshape = std::dynamic_pointer_cast(pattern_map.at(reshape_m).get_node_shared_ptr()); @@ -392,13 +396,14 @@ ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { if (scale_const && mul && reshape) { auto target_inputs = reshape->get_output_target_inputs(0); - size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 
0 : 1; + size_t activation_index = + ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; reshape->input(0).replace_source_output(mul->input(activation_index).get_source_output()); auto new_mul = register_new_node(reshape, scale_const); new_mul->set_friendly_name(mul->get_friendly_name() + "_r"); ov::copy_runtime_info(mul, new_mul); - + for (auto& in : target_inputs) { in.replace_source_output(new_mul); } @@ -461,7 +466,7 @@ ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation( mul2->input(0).replace_source_output(mul0->get_input_source_output((const0_index == 0) ? 1 : 0)); mul2->input(1).replace_source_output(mul1->get_input_source_output((const1_index == 0) ? 1 : 0)); - + auto new_mul = register_new_node( mul2, ov::op::util::eltwise_fold(scale_const0, scale_const1)); @@ -485,7 +490,7 @@ ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation( // \ | / // \ | / // ---------- Concat ------------ -// ==> +// ==> // (const_a (const_b (const_c // input_a /const_c) input_b /const_c) input_c /const_c) // \ / \ / \ / @@ -494,7 +499,7 @@ ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation( // \ | / // ---------- Concat ------------ // | const_c -// | / +// | / // Multiply ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { MATCHER_SCOPE(ConcatTransformation); @@ -514,17 +519,21 @@ ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { // check if all inputs are Multiply with scalar operand ov::Output last_dep_const; - for (auto &input : concat->inputs()) { - auto dep_node = std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); + for (auto& input : concat->inputs()) { + auto dep_node = + std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); if (!dep_node) { return false; } - auto dep_const0 = std::dynamic_pointer_cast(dep_node->input(0).get_source_output().get_node_shared_ptr()); - auto dep_const1 = std::dynamic_pointer_cast(dep_node->input(1).get_source_output().get_node_shared_ptr()); + auto dep_const0 = std::dynamic_pointer_cast( + dep_node->input(0).get_source_output().get_node_shared_ptr()); + auto dep_const1 = std::dynamic_pointer_cast( + dep_node->input(1).get_source_output().get_node_shared_ptr()); if (!dep_const0 && !dep_const1) { return false; } - last_dep_const = dep_const0 ? dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); + last_dep_const = + dep_const0 ? dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); if (!is_scalar_node(last_dep_const)) { return false; } @@ -532,7 +541,7 @@ ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { auto target_inputs = concat->get_output_target_inputs(0); - for (auto &input : concat->inputs()) { + for (auto& input : concat->inputs()) { auto dep_node = input.get_source_output().get_node_shared_ptr(); auto dep_input0 = dep_node->input(0).get_source_output().get_node(); size_t const_index = ov::is_type(dep_input0) ? 
0 : 1; @@ -540,7 +549,8 @@ ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { auto new_mul = register_new_node( dep_node->input(activation_index).get_source_output(), - ov::op::util::eltwise_fold(dep_node->input(const_index).get_source_output(), last_dep_const)); + ov::op::util::eltwise_fold(dep_node->input(const_index).get_source_output(), + last_dep_const)); new_mul->set_friendly_name(dep_node->get_friendly_name() + "_c"); ov::copy_runtime_info(dep_node, new_mul); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 8664dbffdfc6ca..4bee50ddb6e5d1 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -182,7 +182,8 @@ TEST_F(TransformationTestsF, SplitTransformationTest) { auto convert2 = std::make_shared(mul2, ov::element::f32); auto result2 = std::make_shared(convert2); - model_ref = std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); + model_ref = + std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); } } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index c5911f534a48c5..4d8c2f07af3273 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -963,7 +963,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return static_cast((gamma_shape.back() / vec_size)) > static_cast(device_info.max_work_group_size); }); - // manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); From 2194bda3593b4b59b600e46ba79221bf629aa7c5 Mon Sep 17 00:00:00 2001 From: Andrew Park Date: Tue, 5 Nov 2024 23:18:54 +0900 Subject: [PATCH 21/64] Enable FullyConnectedHorizontalFusion with activations scaling --- .../activations_scaling.hpp | 7 ++ .../activations_scaling.cpp | 114 +++++++++++++++++- 2 files changed, 118 insertions(+), 3 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index f71a98725c1066..a5c694d671c7f1 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -17,6 +17,7 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { class TRANSFORMATIONS_API ScaleDownSingleLayer; +class TRANSFORMATIONS_API ScaleDownMultipleLayers; class TRANSFORMATIONS_API MulGroupNormTransformation; class TRANSFORMATIONS_API MulMulAddTransformation; class TRANSFORMATIONS_API SplitTransformation; @@ -49,6 +50,12 @@ class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::Mat ScaleDownSingleLayer(float scale_factor); }; +class ov::pass::activations_scaling::ScaleDownMultipleLayers : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ScaleDownMultipleLayers", "0"); + ScaleDownMultipleLayers(float scale_factor); +}; + class ov::pass::activations_scaling::MulGroupNormTransformation : public 
ov::pass::MatcherPass { public: OPENVINO_RTTI("MulGroupNormTransformation", "0"); diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 91563b93b57946..9e5c3a2d72c42f 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -20,6 +20,7 @@ #include "openvino/op/reshape.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/variadic_split.hpp" +#include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" @@ -59,10 +60,23 @@ using ov::pass::pattern::op::Or; ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { MATCHER_SCOPE(ScaleDownSingleLayer); + auto is_single_matmul = [](const Output& output) { + auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); + auto input = matmul->get_input_node_shared_ptr(0); + size_t user_matmul_count = 0; + for (const auto& u : input->get_users()) { + auto matmul_user = std::dynamic_pointer_cast(u); + if (!matmul_user) + continue; + user_matmul_count++; + } + return user_matmul_count == 1; + }; + auto activation_m = any_input(); auto weights_m = any_input(); auto convolution_m = wrap_type({activation_m, weights_m}); - auto matmul_m = wrap_type({activation_m, weights_m}); + auto matmul_m = wrap_type({activation_m, weights_m}, is_single_matmul); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; @@ -139,6 +153,98 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } +ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers(float scale_factor) { + MATCHER_SCOPE(ScaleDownMultipleLayers); + + auto is_mutiple_matmuls = [](const Output& output) { + auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); + auto input = matmul->get_input_node_shared_ptr(0); + size_t user_matmul_count = 0; + for (const auto& u : input->get_users()) { + auto matmul_user = std::dynamic_pointer_cast(u); + if (!matmul_user) + continue; + user_matmul_count++; + } + return !ov::is_type(input) && + input->get_users().size() > 1 && + input->get_users().size() == user_matmul_count; + }; + + auto activation_m = any_input(); + auto weights_m = any_input(); + auto scaled_op_m = wrap_type({activation_m, weights_m}, is_mutiple_matmuls); + + ov::Shape scale_const_shape = {1}; + std::vector scale_down_value = {1.f / scale_factor}; + std::shared_ptr scale_down_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + std::shared_ptr scale_down_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); + std::vector scale_up_value = {scale_factor}; + std::shared_ptr scale_up_const_f16 = + std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f32 = + std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto scaled_op = std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); + if (!scaled_op || 
transformation_callback(scaled_op)) + return false; + + auto input_node = scaled_op->get_input_node_shared_ptr(0); + auto scale_down = std::make_shared( + input_node, (input_node->get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(scaled_op, scale_down); + + for (const auto& u : input_node->get_users()) { + auto matmul_user = std::dynamic_pointer_cast(u); + if (matmul_user) { + matmul_user->input(0).replace_source_output(scale_down); + auto child = matmul_user->get_output_target_inputs(0).begin()->get_node(); + if (matmul_user->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { + auto add = child->shared_from_this(); + auto target_inputs = add->get_output_target_inputs(0); + auto scale_down_bias = std::make_shared( + add->input(1).get_source_output(), + (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(add, scale_down_bias); + add->input(1).replace_source_output(scale_down_bias->output(0)); + + auto scale_up = register_new_node( + add->output(0), + (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); + scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(matmul_user, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } + } else { + auto target_inputs = matmul_user->get_output_target_inputs(0); + auto scale_up = register_new_node( + matmul_user->output(0), + (matmul_user->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 + : scale_up_const_f16); + scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(matmul_user, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } + } + } + } + return true; + }; + + auto m = std::make_shared(scaled_op_m, "ScaleDownMultipleLayers"); + this->register_matcher(m, callback); +} + + // MulMulAddTransformation makes the target pattern easier to merge with following nodes.
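// Editorial sketch, not part of the patch: the rewrite below relies on the scalar
// identity a*ca + b*cb == (a*(ca/cb) + b)*cb (valid whenever cb != 0), so only one
// scalar Multiply survives on the output path. A minimal standalone check with
// illustrative values:
#include <cassert>
#include <cmath>
int main() {
    const float a = 0.5f, b = -2.f, ca = 8.f, cb = 4.f;
    const float fused = a * ca + b * cb;               // Add(Mul(a, ca), Mul(b, cb)) == -4
    const float rewritten = (a * (ca / cb) + b) * cb;  // Mul(Add(Mul(a, ca/cb), b), cb) == -4
    assert(std::fabs(fused - rewritten) < 1e-6f);
    return 0;
}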
// // input_a const_a input_b const_b input_a (const_a/const_b) @@ -582,9 +688,10 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.set_per_pass_validation(false); manager.register_pass(m_scale_factor); - manager.register_pass(); + manager.register_pass(m_scale_factor); + manager.register_pass(); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); @@ -595,6 +702,7 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.run_passes(f); From 2336387a849cf23559aa5038df27849532d145f2 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 11 Nov 2024 22:55:28 +0900 Subject: [PATCH 22/64] updated ScaleDownMultipleLayers --- .../activations_scaling.cpp | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 9e5c3a2d72c42f..bf060bb371920f 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -60,9 +60,11 @@ using ov::pass::pattern::op::Or; ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { MATCHER_SCOPE(ScaleDownSingleLayer); - auto is_single_matmul = [](const Output& output) { + auto are_not_mutiple_matmuls_connected = [](const Output& output) { auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); auto input = matmul->get_input_node_shared_ptr(0); + if (input->get_output_size() > 1) + return true; size_t user_matmul_count = 0; for (const auto& u : input->get_users()) { auto matmul_user = std::dynamic_pointer_cast(u); @@ -76,7 +78,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto activation_m = any_input(); auto weights_m = any_input(); auto convolution_m = wrap_type({activation_m, weights_m}); - auto matmul_m = wrap_type({activation_m, weights_m}, is_single_matmul); + auto matmul_m = wrap_type({activation_m, weights_m}, are_not_mutiple_matmuls_connected); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; @@ -153,12 +155,21 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } +// scale_down +// | +// input ==> input +// / \ / \_ +// MatMul_a MatMul_b MatMul_a MatMul_b +// | | +// scale_up scale_up ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers(float scale_factor) { MATCHER_SCOPE(ScaleDownMultipleLayers); - auto is_mutiple_matmuls = [](const Output& output) { + auto are_mutiple_matmuls_connected = [](const Output& output) { auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); auto input = matmul->get_input_node_shared_ptr(0); + if (input->get_output_size() > 1) + return false; size_t user_matmul_count = 0; for (const auto& u : input->get_users()) { auto matmul_user = std::dynamic_pointer_cast(u); @@ -166,14 +177,13 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( continue; user_matmul_count++; } - return !ov::is_type(input) && - input->get_users().size() > 1 && + 
return !ov::is_type(input) && input->get_users().size() > 1 && input->get_users().size() == user_matmul_count; }; auto activation_m = any_input(); auto weights_m = any_input(); - auto scaled_op_m = wrap_type({activation_m, weights_m}, is_mutiple_matmuls); + auto scaled_op_m = wrap_type({activation_m, weights_m}, are_mutiple_matmuls_connected); ov::Shape scale_const_shape = {1}; std::vector scale_down_value = {1.f / scale_factor}; @@ -190,18 +200,20 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - auto scaled_op = std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); + auto scaled_op + = std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); if (!scaled_op || transformation_callback(scaled_op)) return false; - auto input_node = scaled_op->get_input_node_shared_ptr(0); + auto input_node = scaled_op->input(0).get_source_output(); auto scale_down = std::make_shared( - input_node, (input_node->get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + input_node, + (input_node.get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); - for (const auto& u : input_node->get_users()) { - auto matmul_user = std::dynamic_pointer_cast(u); + for (const auto& u : input_node.get_target_inputs()) { + auto matmul_user = std::dynamic_pointer_cast(u.get_node()->shared_from_this()); if (matmul_user) { matmul_user->input(0).replace_source_output(scale_down); auto child = matmul_user->get_output_target_inputs(0).begin()->get_node(); @@ -210,14 +222,16 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( auto target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(1).get_source_output(), - (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 + : scale_down_const_f16); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); add->input(1).replace_source_output(scale_down_bias->output(0)); auto scale_up = register_new_node( add->output(0), - (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); + (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 + : scale_up_const_f16); scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(matmul_user, scale_up); for (auto& in : target_inputs) { @@ -228,7 +242,7 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( auto scale_up = register_new_node( matmul_user->output(0), (matmul_user->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 - : scale_up_const_f16); + : scale_up_const_f16); scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(matmul_user, scale_up); for (auto& in : target_inputs) { in.replace_source_output(scale_up); @@ -244,7 +258,6 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( this->register_matcher(m, callback); } - // MulMulAddTransformation makes the target pattern easier to merge with following nodes. // // input_a const_a input_b const_b input_a (const_a/const_b) From bf71bf5be33ca1d08f72263b0388e3c39d479011 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 11 Nov 2024 23:06:30 +0900 Subject: [PATCH 23/64] updated code style --- .../common_optimizations/activations_scaling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index bf060bb371920f..c633a8d7a428e0 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -200,8 +200,8 @@ ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers( ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - auto scaled_op - = std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); + auto scaled_op = + std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); if (!scaled_op || transformation_callback(scaled_op)) return false; From 579f57e5625d43b90e36ae86136069cb28126ff9 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 12 Nov 2024 15:49:50 +0900 Subject: [PATCH 24/64] reading ACTIVATIONS_SCALE_FACTOR from rt_info --- src/inference/src/dev/core_impl.cpp | 13 +++++++++++++ src/inference/src/dev/core_impl.hpp | 3 +++ 2 files changed, 16 insertions(+) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index e0e2fb109dc642..a7dc786fd7e4bc 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -776,6 +776,7 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< ov::AnyMap config_with_batch = config; // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); + apply_rt_info(model_, config_with_batch); auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); @@ -810,6 +811,7 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< ov::AnyMap config_with_batch = config; // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); + apply_rt_info(model_, config_with_batch); auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); @@ -1134,6 +1136,17 @@ std::shared_ptr ov::CoreImpl::apply_auto_batching(const std::sh return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch); } +void ov::CoreImpl::apply_rt_info(const std::shared_ptr& model, + ov::AnyMap& config) const { + if
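// Editorial note, not part of the patch: apply_rt_info() gives the model's own
// rt_info a chance to supply the scale factor when the caller did not pass the
// property explicitly. A model could carry such a default like this (a sketch;
// the 8.0f value is an assumption):
//   model->set_rt_info(8.0f, "runtime_options", "ACTIVATIONS_SCALE_FACTOR");
// The config.find() guard below keeps an explicitly passed property winning
// over the rt_info default.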
(model->has_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"})) { + if (config.find("ACTIVATIONS_SCALE_FACTOR") == config.end()) { + const auto activations_scale_factor = + model->get_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"}); + config.insert(ov::hint::activations_scale_factor(activations_scale_factor)); + } + } +} + void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& properties) { OPENVINO_ASSERT(device_name.find("HETERO:") != 0, "set_property is supported only for HETERO itself (without devices). " diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 85417175c22556..80973f2ee335a4 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -251,6 +251,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& model, + ov::AnyMap& config) const; + /* * @brief Register plugins according to the build configuration */ From 6ba03b2c23420260b0556e319372e9de1bb27774 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 21 Nov 2024 03:26:49 +0900 Subject: [PATCH 25/64] updated to use LPT --- .../low_precision_transformations/src/add.cpp | 2 +- .../src/multiply_partial.cpp | 10 +- .../tests/convolution_qdq_transformation.cpp | 2 +- .../activations_scaling.hpp | 6 +- .../activations_scaling.cpp | 92 +++++++++++-------- .../activations_scaling_test.cpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 35 ++++++- 7 files changed, 101 insertions(+), 48 deletions(-) diff --git a/src/common/low_precision_transformations/src/add.cpp b/src/common/low_precision_transformations/src/add.cpp index 1ba6f6598be247..e8c0380336362b 100644 --- a/src/common/low_precision_transformations/src/add.cpp +++ b/src/common/low_precision_transformations/src/add.cpp @@ -215,7 +215,7 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt newMultiplyFullPathValues); newAddOrSubtract = std::make_shared>( - std::vector{element::f32, element::f32}, std::vector{ element::f32 }, + std::vector{element::f32, element::f32}, std::vector{ add->get_output_element_type(0) }, ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(), ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get()); newMultiply = std::make_shared>( diff --git a/src/common/low_precision_transformations/src/multiply_partial.cpp b/src/common/low_precision_transformations/src/multiply_partial.cpp index c0760d4b1f1c01..14671d75346c3f 100644 --- a/src/common/low_precision_transformations/src/multiply_partial.cpp +++ b/src/common/low_precision_transformations/src/multiply_partial.cpp @@ -133,24 +133,24 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2) - // after : Y = (SC1' * (X1 - SH1)) * (X2) , where : + // after : Y = ((X1 - SH1) * X2) * SC1' , where : // SC1' = SC1 * SC2 auto newMultiplyValuesFullPath = fold(multiplyValuesEmptyPath, multiplyValuesFullPath); OutputVector inputs{ {}, {} }; - inputs[emptyPathIndex] = dequantizationEmptyPath.data; + inputs[emptyPathIndex] = newMultiplyValuesFullPath; ov::Output parent0 = dequantizationFullPath.subtract == nullptr ? (dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) : dequantizationFullPath.subtract; inputs[fullPathIndex] = - parent0.get_node()->get_output_element_type(0) == newMultiplyValuesFullPath->get_output_element_type(0) ? 
- std::make_shared(parent0, newMultiplyValuesFullPath) : + parent0.get_node()->get_output_element_type(0) == dequantizationEmptyPath.data.get_node()->get_output_element_type(0) ? + std::make_shared(parent0, dequantizationEmptyPath.data) : std::make_shared>( std::vector{element::f32, element::f32}, std::vector{element::f32}, ov::op::TemporaryReplaceOutputType(parent0, element::f32).get(), - ov::op::TemporaryReplaceOutputType(newMultiplyValuesFullPath, element::f32).get()); + ov::op::TemporaryReplaceOutputType(dequantizationEmptyPath.data, element::f32).get()); newMultiply = std::make_shared>( std::vector{element::f32, element::f32}, diff --git a/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp b/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp index 971a512365e201..22d86242da5e70 100644 --- a/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp +++ b/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp @@ -506,7 +506,7 @@ const std::vector testValues = { { {0.03f}, element::f32, {}, false } }, { std::vector{ 2.f }, ov::element::i8}, - {}, + {}, ov::element::f32, {} }, diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index a5c694d671c7f1..1f6aef647e0ecc 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -37,17 +37,19 @@ class TRANSFORMATIONS_API ConcatTransformation; class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: OPENVINO_RTTI("ActivationsScaling", "0"); - explicit ActivationsScaling(float scale_factor) : m_scale_factor(scale_factor) {} + explicit ActivationsScaling(float scale_factor, ov::element::Type scaled_prec) + : m_scale_factor(scale_factor), m_scaled_prec(scaled_prec) {} bool run_on_model(const std::shared_ptr& model) override; private: float m_scale_factor = 0.f; + ov::element::Type m_scaled_prec = element::f16; }; class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ScaleDownSingleLayer", "0"); - ScaleDownSingleLayer(float scale_factor); + ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec); }; class ov::pass::activations_scaling::ScaleDownMultipleLayers : public ov::pass::MatcherPass { diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index c633a8d7a428e0..85384b8ef1fde4 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -7,10 +7,12 @@ #include #include "itt.hpp" +#include "low_precision/network_helper.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/convolution.hpp" #include "openvino/op/divide.hpp" #include "openvino/op/group_normalization.hpp" @@ -40,12 +42,7 @@ const auto is_scalar_node = [](const ov::Output& output) -> bool { }; const auto is_non_const_node = [](const ov::Output& 
output) -> bool { - auto node = std::dynamic_pointer_cast(output.get_node_shared_ptr()); - if (node) { - return false; - } else { - return true; - } + return !ov::is_type(output.get_node()); }; } // namespace @@ -57,7 +54,7 @@ using ov::pass::pattern::op::Or; // Conv/MatMul // ==> // Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor) -ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor) { +ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec) { MATCHER_SCOPE(ScaleDownSingleLayer); auto are_not_mutiple_matmuls_connected = [](const Output& output) { @@ -78,7 +75,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto activation_m = any_input(); auto weights_m = any_input(); auto convolution_m = wrap_type({activation_m, weights_m}); - auto matmul_m = wrap_type({activation_m, weights_m}, are_not_mutiple_matmuls_connected); + auto matmul_m = wrap_type({activation_m, weights_m});//, are_not_mutiple_matmuls_connected); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; @@ -114,40 +111,44 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float (scaled_op->input(0).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); - scaled_op->input(0).replace_source_output(scale_down->output(0)); + + auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); + scaled_op->input(0).replace_source_output(convert_prec0->output(0)); + auto convert_prec1 = std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); + scaled_op->input(1).replace_source_output(convert_prec1->output(0)); + + auto output_prec = scaled_op->output(0).get_element_type(); + std::set> target_inputs; + std::shared_ptr runtime_scaled_op; auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { auto add = child->shared_from_this(); - auto target_inputs = add->get_output_target_inputs(0); + target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(1).get_source_output(), (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); - add->input(1).replace_source_output(scale_down_bias->output(0)); - - auto scale_up = register_new_node( - add->output(0), - (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); - scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); - ov::copy_runtime_info(scaled_op, scale_up); - for (auto& in : target_inputs) { - in.replace_source_output(scale_up); - } + auto convert_bias_prec = std::make_shared(scale_down_bias->output(0), scaled_prec); + add->input(1).replace_source_output(convert_bias_prec->output(0)); + runtime_scaled_op = std::make_shared(add->output(0), output_prec); } else { - auto target_inputs = scaled_op->get_output_target_inputs(0); - auto scale_up = register_new_node( - scaled_op->output(0), - (scaled_op->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 - : scale_up_const_f16); - scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); - ov::copy_runtime_info(scaled_op, scale_up); - for (auto& in : target_inputs) { - in.replace_source_output(scale_up); - } + target_inputs = scaled_op->get_output_target_inputs(0); + runtime_scaled_op = std::make_shared(scaled_op->output(0), output_prec); + } + + auto scale_up = register_new_node( + runtime_scaled_op->output(0), + (runtime_scaled_op->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); + scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(scaled_op, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); } + scaled_op->revalidate_and_infer_types(); + return true; }; @@ -355,7 +356,7 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform OPENVINO_ASSERT(pattern_map.count(norm_m)); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); if (transformation_callback(norm)) { return false; @@ -364,7 +365,17 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform if (mul && norm) { size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; - norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); + // norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); + auto activation = mul->get_input_source_output(activation_index); + if (ov::is_type(activation.get_node())) + activation = activation.get_node()->get_input_source_output(0); + auto newGroupNorm = std::make_shared>( + ov::op::v12::GroupNormalization(activation, + norm->get_input_source_output(1), norm->get_input_source_output(2), + norm->get_num_groups(), norm->get_epsilon()), + norm->get_output_element_type(0)); + ov::copy_runtime_info(norm, newGroupNorm); + ov::replace_node(norm, newGroupNorm); return true; } return false; @@ -399,7 +410,7 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { OPENVINO_ASSERT(pattern_map.count(norm_m)); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); if (transformation_callback(norm)) { return false; @@ -408,7 +419,16 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { if (mul && norm) { size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 
0 : 1; - norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); + // norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); + auto activation = mul->get_input_source_output(activation_index); + if (ov::is_type(activation.get_node())) + activation = activation.get_node()->get_input_source_output(0); + auto newMVN = std::make_shared>( + ov::op::v6::MVN(activation, norm->get_input_source_output(1), + norm->get_normalize_variance(), norm->get_eps(), norm->get_eps_mode()), + norm->get_output_element_type(0)); + ov::copy_runtime_info(norm, newMVN); + ov::replace_node(norm, newMVN); return true; } return false; @@ -536,7 +556,7 @@ ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { this->register_matcher(m, callback); } -// MulMulAddTransformation makes the target pattern to be easy to be merged with other nodes. +// MulMulMulTransformation makes the target pattern easier to merge with other nodes. // // input_a const_a input_b const_b input_a input_b // \ / \ / \ / @@ -700,7 +720,7 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr ov::pass::Manager manager(get_pass_config(), "ActivationsScaling"); manager.set_per_pass_validation(false); - manager.register_pass(m_scale_factor); + manager.register_pass(m_scale_factor, m_scaled_prec); manager.register_pass(m_scale_factor); manager.register_pass(); manager.register_pass(); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 4bee50ddb6e5d1..430a3a3332c384 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -42,7 +42,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(scale_factor); + manager.register_pass(scale_factor, ov::element::f16); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 4d8c2f07af3273..7ce33ce477b6bd 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -24,6 +24,7 @@ #include "low_precision/low_precision.hpp" #include "low_precision/mat_mul.hpp" #include "low_precision/multiply_to_group_convolution.hpp" +#include "low_precision/mvn.hpp" #include "low_precision/network_helper.hpp" #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" @@ -892,12 +893,42 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.run_passes(func); } + float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); + if (activations_scale_factor > 0.f) { + OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "TransformationsPipeline::apply::activations_scaling"); + using namespace ov::pass::low_precision; + + auto supportedPrecisions = std::vector({}); + auto perTensorQuantization = std::vector({}); + + ov::element::Type scaled_precision = element::f16; + std::cout << "scale_factor: " << activations_scale_factor << std::endl; + +
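// Editorial sketch, not part of the patch: why the pipeline scales around f16 ops.
// Running x -> MatMul directly in f16 can overflow (f16 max is ~65504), while
// (x * 1/s) -> MatMul -> (* s) computes the same value with in-range
// intermediates. Illustrative numbers, assuming s = 256:
static void scaling_range_demo() {
    const float x = 60000.f, w = 2.f, s = 256.f;  // x * w = 120000 would overflow f16
    const float scaled_down = (x / s) * w;        // 468.75, comfortably inside f16 range
    const float scaled_up = scaled_down * s;      // 120000 recovered by the scale_up Multiply
    (void)scaled_up;
}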
ov::pass::Manager manager("GPU:ActivationsScaling"); + manager.set_per_pass_validation(false); + + auto pass_config = manager.get_pass_config(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + + auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); + manager.register_pass(activations_scale_factor, scaled_precision); + auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); + lpt_pass->add_main(); + lpt_pass->add_main(); + manager.run_passes(func); + } + { ov::pass::Manager manager("GPU:PostLPT"); manager.set_per_pass_validation(false); - manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); - // Other ops support eltwise fusions const std::vector allowed_data_movement_ops = { ov::op::v1::Reshape::get_type_info_static(), From 1b7646225ae65ce7342f6eae4f9aa258236348ab Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 26 Nov 2024 19:01:24 +0900 Subject: [PATCH 26/64] fixed for flux.1 dynamic model --- .../activations_scaling.hpp | 8 +-- .../activations_scaling.cpp | 32 ++++++--- .../activations_scaling_test.cpp | 2 +- src/plugins/intel_gpu/src/plugin/graph.cpp | 11 +-- src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 + .../src/plugin/transformations_pipeline.cpp | 70 ++++++++++--------- 6 files changed, 75 insertions(+), 49 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 1f6aef647e0ecc..78e511bd409271 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -24,7 +24,7 @@ class TRANSFORMATIONS_API SplitTransformation; class TRANSFORMATIONS_API ReshapeTransformation; class TRANSFORMATIONS_API MulMulMulTransformation; class TRANSFORMATIONS_API MulMVNTransformation; -class TRANSFORMATIONS_API ConcatTransformation; +class TRANSFORMATIONS_API MulConcatTransformation; } // namespace activations_scaling } // namespace pass @@ -94,8 +94,8 @@ class ov::pass::activations_scaling::MulMVNTransformation : public ov::pass::Mat MulMVNTransformation(); }; -class ov::pass::activations_scaling::ConcatTransformation : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ConcatTransformation", "0"); - ConcatTransformation(); + OPENVINO_RTTI("MulConcatTransformation", "0"); + MulConcatTransformation(); }; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 85384b8ef1fde4..10df4b0be650bd 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -122,16 +122,32 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float std::shared_ptr runtime_scaled_op; auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); - if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { + bool 
has_bias = false; + size_t bias_index = 1; + { + if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { + bias_index = (child->get_input_node_shared_ptr(0) == scaled_op) ? 1 : 0; + const auto& bias_pshape = child->get_input_partial_shape(bias_index); + if (bias_pshape.is_static()) { + const auto& bias_shape = bias_pshape.get_shape(); + const bool per_channel = std::count_if(bias_shape.begin(), bias_shape.end(), [](size_t x) { return x > 1; }) == 1; + if (ov::shape_size(bias_shape) == 1 || per_channel) { + has_bias = true; + } + } + } + } + + if (has_bias) { auto add = child->shared_from_this(); target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( - add->input(1).get_source_output(), - (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + add->input(bias_index).get_source_output(), + (add->input(bias_index).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); auto convert_bias_prec = std::make_shared(scale_down_bias->output(0), scaled_prec); - add->input(1).replace_source_output(convert_bias_prec->output(0)); + add->input(bias_index).replace_source_output(convert_bias_prec->output(0)); runtime_scaled_op = std::make_shared(add->output(0), output_prec); } else { target_inputs = scaled_op->get_output_target_inputs(0); @@ -640,8 +656,8 @@ ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation( // | const_c // | / // Multiply -ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { - MATCHER_SCOPE(ConcatTransformation); +ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation() { + MATCHER_SCOPE(MulConcatTransformation); auto concat_m = wrap_type(); @@ -707,7 +723,7 @@ ov::pass::activations_scaling::ConcatTransformation::ConcatTransformation() { return false; }; - auto m = std::make_shared(concat_m, "ConcatTransformation"); + auto m = std::make_shared(concat_m, "MulConcatTransformation"); this->register_matcher(m, callback); } @@ -731,7 +747,7 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.register_pass(); manager.register_pass(); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 430a3a3332c384..44ee2b0c8cfb8b 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -254,7 +254,7 @@ TEST_F(TransformationTestsF, ConcatTransformationTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); + manager.register_pass(); } { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index c3d74feffb5599..76c12a63221d5d 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -593,11 +593,12 @@ void Graph::update_profiling_info() { 
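// Editorial note (assumption, not stated in the patch): after these changes a
// primitive can be reported as executed without an attached profiling event, so
// the hunks below guard the event pointer before asking it for profiling info,
// e.g. `if (executedID.second != nullptr) { /* query get_profiling_info() */ }`.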
perfMap[executedID.first].first = executedID.first; pcIter = perfMap.find(executedID.first); auto& perfCount = pcIter->second.second; + if (executedID.second != nullptr) { + cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; - cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; - - collectTimings(cldnnInfo, perfCount); - perfCount.num++; + collectTimings(cldnnInfo, perfCount); + perfCount.num++; + } } } } @@ -722,6 +723,8 @@ std::vector Graph::get_profiling_info() const { if ((!existInProfiling || (existInProfiling && perfIter->second.first.length() == 0)) && executedPrimitives.find(primId) != executedPrimitives.end()) { auto event = executedPrimitives.at(primId); + if (event == nullptr) + continue; cldnn::instrumentation::profiling_info cldnnInfo{primId, event->get_profiling_info()}; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 5650f5a66a2ae6..6e7178f8385813 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -126,6 +126,7 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name() + "_" + "transformed_func"; ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); + ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); } return cloned_model; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 7ce33ce477b6bd..45077523f3f707 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -16,6 +16,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" #include "low_precision/add.hpp" +#include "low_precision/concat.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" #include "low_precision/fold_convert.hpp" @@ -893,42 +894,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.run_passes(func); } - float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); - if (activations_scale_factor > 0.f) { + { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "TransformationsPipeline::apply::activations_scaling"); - using namespace ov::pass::low_precision; - - auto supportedPrecisions = std::vector({}); - auto perTensorQuantization = std::vector({}); - - ov::element::Type scaled_precision = element::f16; - std::cout << "scale_factor: " << activations_scale_factor << std::endl; - ov::pass::Manager manager("GPU:ActivationsScaling"); manager.set_per_pass_validation(false); - auto pass_config = manager.get_pass_config(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - - auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); - manager.register_pass(activations_scale_factor, scaled_precision); - auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); - lpt_pass->add_main(); - lpt_pass->add_main(); - manager.run_passes(func); - } - - { - 
ov::pass::Manager manager("GPU:PostLPT"); - manager.set_per_pass_validation(false); - // Other ops support eltwise fusions const std::vector allowed_data_movement_ops = { ov::op::v1::Reshape::get_type_info_static(), @@ -947,6 +917,42 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. manager.register_pass(); + float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); + ov::element::Type scaled_precision = element::f16; + std::cout << "scale_factor: " << activations_scale_factor << std::endl; + + if (activations_scale_factor > 0.f) { + using namespace ov::pass::low_precision; + + auto supportedPrecisions = std::vector({}); + auto perTensorQuantization = std::vector({}); + + auto pass_config = manager.get_pass_config(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + + manager.register_pass(activations_scale_factor, scaled_precision); + auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); + auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); + lpt_pass->add_main(); + lpt_pass->add_main(); + lpt_pass->add_main(); + } + + manager.run_passes(func); + } + + { + ov::pass::Manager manager("GPU:PostLPT"); + manager.set_per_pass_validation(false); + manager.register_pass(); manager.register_pass(); manager.register_pass(); From b645bff7d92e7e757df5ee02149f952c45d89fb2 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 26 Nov 2024 19:12:01 +0900 Subject: [PATCH 27/64] fix merging faults --- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 45077523f3f707..202b4797ff2d61 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -991,16 +991,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto pass_config = manager.get_pass_config(); - pass_config->set_callback([=](const_node_ptr& root) -> bool { - if (!root->get_input_node_ptr(0)->get_input_partial_shape(0).is_static()) { - return false; - } - const auto& gamma_shape = root->get_input_node_ptr(0)->get_input_partial_shape(0).to_shape(); - const int32_t vec_size = 8; - return static_cast((gamma_shape.back() / vec_size)) > static_cast(device_info.max_work_group_size); - }); - - manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); From 2bd0ed49861006aa8285e1231eb0e8e01961ccb9 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 28 Nov 2024 20:44:15 +0900 Subject: [PATCH 28/64] fixes for flux.1 --- .../tests/convolution_qdq_transformation.cpp | 2 +- .../activations_scaling.cpp | 45 +++++++++++++------ .../cl_kernels/permute_f_y_axes.cl | 2 +- .../transformations/clamp_fp16_output.cpp | 7 +-- .../src/plugin/transformations_pipeline.cpp | 2 +- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp b/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp index 22d86242da5e70..971a512365e201 100644 --- 
a/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp +++ b/src/common/low_precision_transformations/tests/convolution_qdq_transformation.cpp @@ -506,7 +506,7 @@ const std::vector testValues = { { {0.03f}, element::f32, {}, false } }, { std::vector{ 2.f }, ov::element::i8}, - {}, + {}, ov::element::f32, {} }, diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 10df4b0be650bd..7a42e93da1a9af 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -103,6 +103,14 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (pattern_map.count(matmul_m)) scaled_op = std::dynamic_pointer_cast(pattern_map.at(matmul_m).get_node_shared_ptr()); + auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); + if (scaled_op->get_output_target_inputs(0).size() == 1 && + ov::is_type(child_node) && + ov::fp16_compression_is_disabled(child_node->shared_from_this()) && + ov::pass::constant_folding_is_disabled(child_node->shared_from_this())) { + return false; + } + if (transformation_callback(scaled_op)) return false; @@ -121,13 +129,12 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float std::set> target_inputs; std::shared_ptr runtime_scaled_op; - auto child = scaled_op->get_output_target_inputs(0).begin()->get_node(); bool has_bias = false; size_t bias_index = 1; { - if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { - bias_index = (child->get_input_node_shared_ptr(0) == scaled_op) ? 1 : 0; - const auto& bias_pshape = child->get_input_partial_shape(bias_index); + if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child_node)) { + bias_index = (child_node->get_input_node_shared_ptr(0) == scaled_op) ? 1 : 0; + const auto& bias_pshape = child_node->get_input_partial_shape(bias_index); if (bias_pshape.is_static()) { const auto& bias_shape = bias_pshape.get_shape(); const bool per_channel = std::count_if(bias_shape.begin(), bias_shape.end(), [](size_t x) { return x > 1; }) == 1; @@ -139,7 +146,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float } if (has_bias) { - auto add = child->shared_from_this(); + auto add = child_node->shared_from_this(); target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(bias_index).get_source_output(), @@ -673,7 +680,8 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( } // check if all inputs are Multiply with scalar operand - ov::Output last_dep_const; + ov::Output last_dep_const = {}; + ov::element::Type last_dep_const_type = ov::element::undefined; for (auto& input : concat->inputs()) { auto dep_node = std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); @@ -689,9 +697,11 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( } last_dep_const = dep_const0 ? 
dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); - if (!is_scalar_node(last_dep_const)) { + if (!is_scalar_node(last_dep_const)) return false; - } + if (last_dep_const_type != ov::element::undefined && last_dep_const_type != last_dep_const.get_element_type()) + return false; + last_dep_const_type = last_dep_const.get_element_type(); } auto target_inputs = concat->get_output_target_inputs(0); @@ -702,17 +712,26 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( size_t const_index = ov::is_type(dep_input0) ? 0 : 1; size_t activation_index = ov::is_type(dep_input0) ? 1 : 0; - auto new_mul = register_new_node( - dep_node->input(activation_index).get_source_output(), - ov::op::util::eltwise_fold(dep_node->input(const_index).get_source_output(), - last_dep_const)); + auto dep_type = dep_node->get_output_element_type(0); + auto new_mul = std::make_shared>( + std::vector{dep_type, dep_type}, + std::vector{dep_type}, + ov::op::TemporaryReplaceOutputType(dep_node->input(activation_index).get_source_output(), dep_type).get(), + ov::op::TemporaryReplaceOutputType(ov::op::util::eltwise_fold( + dep_node->input(const_index).get_source_output(), + last_dep_const), dep_type).get()); new_mul->set_friendly_name(dep_node->get_friendly_name() + "_c"); ov::copy_runtime_info(dep_node, new_mul); input.replace_source_output(new_mul); } - auto new_mul = register_new_node(concat, last_dep_const); + auto concat_type = concat->get_output_element_type(0); + auto new_mul = std::make_shared>( + std::vector{concat_type, concat_type}, + std::vector{concat_type}, + ov::op::TemporaryReplaceOutputType(concat->output(0), concat_type).get(), + ov::op::TemporaryReplaceOutputType(last_dep_const, concat_type).get()); new_mul->set_friendly_name(concat->get_friendly_name() + "_c"); ov::copy_runtime_info(concat, new_mul); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl index 3aafc2c727b345..9f74654bd45a65 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl @@ -40,7 +40,7 @@ KERNEL (permute_f_y_axes)( OUT_VEC_TYPE result; IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]); FUSED_OPS_VEC; - result = FUSED_OPS_RESULT_VEC; + result = TO_OUT_VEC_TYPE(FUSED_OPS_RESULT_VEC); #else IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]); OUT_VEC_TYPE result = TO_OUT_VEC_TYPE(ACTIVATION(res, ACTIVATION_PARAMS)); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp b/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp index ba779eb0f52ca9..91f46b1cd6e2dd 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp @@ -47,17 +47,14 @@ ClampFP16Output::ClampFP16Output() { } auto matmul = pattern_map.at(matmul_m).get_node_shared_ptr(); - auto target_inputs = matmul->get_output_target_inputs(0); auto min = static_cast(std::numeric_limits::lowest()); auto max = static_cast(std::numeric_limits::max()); - auto clamp = std::make_shared(matmul, min, max); + auto clamp = std::make_shared(softmax->get_input_source_output(0), min, max); clamp->set_friendly_name(matmul->get_friendly_name() + "/ClampFP16Output"); ov::copy_runtime_info({matmul, softmax}, clamp); - 
for (auto& in : target_inputs) { - in.replace_source_output(clamp); - } + softmax->input(0).replace_source_output(clamp); return true; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 202b4797ff2d61..683f958303b4c8 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -395,7 +395,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const int32_t vec_size = 8; return static_cast((gamma_shape.back() / vec_size)) > static_cast(device_info.max_work_group_size); }); - manager.register_pass(false); + // manager.register_pass(false); const bool keep_precision_sensitive_in_fp32_1 = true; const bool convert_input_output_precision = false; From a97d23920206795d9eb736a020d061df2ef5a0d4 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Fri, 29 Nov 2024 22:10:02 +0900 Subject: [PATCH 29/64] update not to add redundant Convert --- .../rt_info/dequantization_node.hpp | 2 +- .../activations_scaling.cpp | 54 +++++++++++++------ .../rt_info/dequantization_node.cpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 11 ++++ 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/common/transformations/include/transformations/rt_info/dequantization_node.hpp b/src/common/transformations/include/transformations/rt_info/dequantization_node.hpp index d9cf3589391b6d..fcf0e06158ad98 100644 --- a/src/common/transformations/include/transformations/rt_info/dequantization_node.hpp +++ b/src/common/transformations/include/transformations/rt_info/dequantization_node.hpp @@ -12,7 +12,7 @@ namespace ov { TRANSFORMATIONS_API void mark_as_dequantization_node(const std::shared_ptr& node); -TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr& node); +TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr& node); /** * @ingroup ov_runtime_attr_api diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 7a42e93da1a9af..6e97cddb56dfa3 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -103,16 +103,22 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (pattern_map.count(matmul_m)) scaled_op = std::dynamic_pointer_cast(pattern_map.at(matmul_m).get_node_shared_ptr()); + if (transformation_callback(scaled_op)) + return false; + + bool keep_precision = false; + std::shared_ptr output_of_scaled_op = scaled_op; auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child_node) && ov::fp16_compression_is_disabled(child_node->shared_from_this()) && ov::pass::constant_folding_is_disabled(child_node->shared_from_this())) { - return false; + output_of_scaled_op = std::dynamic_pointer_cast(child_node->shared_from_this()); + child_node = output_of_scaled_op->get_output_target_inputs(0).begin()->get_node(); + keep_precision = true; } - if (transformation_callback(scaled_op)) - return false; + auto output_prec = output_of_scaled_op->output(0).get_element_type(); auto scale_down = std::make_shared( scaled_op->input(0).get_source_output(), @@ -120,12 +126,19 @@ 
ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); - auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); - scaled_op->input(0).replace_source_output(convert_prec0->output(0)); - auto convert_prec1 = std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); - scaled_op->input(1).replace_source_output(convert_prec1->output(0)); + if (scale_down->output(0).get_element_type() != scaled_prec && !keep_precision) { + auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); + scaled_op->input(0).replace_source_output(convert_prec0->output(0)); + } else { + scaled_op->input(0).replace_source_output(scale_down->output(0)); + } + if (scaled_op->input(1).get_element_type() != scaled_prec && !keep_precision) { + auto convert_prec1 = std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); + scaled_op->input(1).replace_source_output(convert_prec1->output(0)); + } + + scaled_op->revalidate_and_infer_types(); - auto output_prec = scaled_op->output(0).get_element_type(); std::set> target_inputs; std::shared_ptr runtime_scaled_op; @@ -153,12 +166,25 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float (add->input(bias_index).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); - auto convert_bias_prec = std::make_shared(scale_down_bias->output(0), scaled_prec); - add->input(bias_index).replace_source_output(convert_bias_prec->output(0)); - runtime_scaled_op = std::make_shared(add->output(0), output_prec); + if (scale_down_bias->output(0).get_element_type() != scaled_prec && !keep_precision) { + auto convert_bias_prec = std::make_shared(scale_down_bias->output(0), scaled_prec); + add->input(bias_index).replace_source_output(convert_bias_prec->output(0)); + } else { + add->input(bias_index).replace_source_output(scale_down_bias->output(0)); + } + add->revalidate_and_infer_types(); + if (add->output(0).get_element_type() != output_prec && !keep_precision) { + runtime_scaled_op = std::make_shared(add->output(0), output_prec); + } else { + runtime_scaled_op = add; + } } else { - target_inputs = scaled_op->get_output_target_inputs(0); - runtime_scaled_op = std::make_shared(scaled_op->output(0), output_prec); + target_inputs = output_of_scaled_op->get_output_target_inputs(0); + if (output_of_scaled_op->output(0).get_element_type() != output_prec && !keep_precision) { + runtime_scaled_op = std::make_shared(output_of_scaled_op->output(0), output_prec); + } else { + runtime_scaled_op = output_of_scaled_op; + } } auto scale_up = register_new_node( @@ -170,8 +196,6 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float in.replace_source_output(scale_up); } - scaled_op->revalidate_and_infer_types(); - return true; }; diff --git a/src/common/transformations/src/transformations/rt_info/dequantization_node.cpp b/src/common/transformations/src/transformations/rt_info/dequantization_node.cpp index 4694de56ae62d3..91ebe224fcc576 100644 --- a/src/common/transformations/src/transformations/rt_info/dequantization_node.cpp +++ b/src/common/transformations/src/transformations/rt_info/dequantization_node.cpp @@ -9,7 +9,7 @@ void ov::mark_as_dequantization_node(const std::shared_ptr& node) { 
rt_info[DequantizationNode::get_type_info_static()] = DequantizationNode(); } -bool ov::is_dequantization_node(const std::shared_ptr& node) { +bool ov::is_dequantization_node(const std::shared_ptr& node) { const auto& rt_info = node->get_rt_info(); return rt_info.find(DequantizationNode::get_type_info_static()) != rt_info.end(); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 683f958303b4c8..65c2fdb158a997 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -164,6 +164,7 @@ #include "transformations/opset_conversions/convert_opset2_to_opset1.hpp" #include "transformations/opset_conversions/convert_opset3_to_opset2.hpp" #include "transformations/resolve_names_collisions.hpp" +#include "transformations/rt_info/dequantization_node.hpp" #include "transformations/rt_info/fused_names_attribute.hpp" #include "transformations/rt_info/keep_const_precision.hpp" #include "transformations/smart_reshape/matmul_sr.hpp" @@ -938,6 +939,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); + pass_config->set_callback( + [](const std::shared_ptr &node) -> bool { + return ov::is_dequantization_node(node); + }); + + pass_config->set_callback( + [](const std::shared_ptr &node) -> bool { + return (ov::is_dequantization_node(node) || ov::is_type(node)); + }); + manager.register_pass(activations_scale_factor, scaled_precision); auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); From bfd017897b844f1152be0daa10068386a9852cab Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Fri, 29 Nov 2024 22:43:37 +0900 Subject: [PATCH 30/64] updated apply_rt_info --- src/inference/src/dev/core_impl.cpp | 19 +++++++++++-------- src/inference/src/dev/core_impl.hpp | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index a7dc786fd7e4bc..cbea8ba5268755 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -776,10 +776,10 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< ov::AnyMap config_with_batch = config; // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); - apply_rt_info(model_, config_with_batch); auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); + apply_rt_info(plugin, model_, parsed._config); ov::SoPtr res; // will consume ov::cache_dir if plugin not support it auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; @@ -811,10 +811,10 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< ov::AnyMap config_with_batch = config; // if auto-batching is applicable, the below function will patch the device name and config accordingly: auto model = apply_auto_batching(model_, deviceName, config_with_batch); - apply_rt_info(model_, config_with_batch); auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); + 
apply_rt_info(plugin, model_, parsed._config); ov::SoPtr res; // will consume ov::cache_dir if plugin not support it auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; @@ -1136,13 +1136,16 @@ std::shared_ptr ov::CoreImpl::apply_auto_batching(const std::sh return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch); } -void ov::CoreImpl::apply_rt_info(const std::shared_ptr& model, +void ov::CoreImpl::apply_rt_info(const ov::Plugin& plugin, + const std::shared_ptr& model, ov::AnyMap& config) const { - if (model->has_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"})) { - if (config.find("ACTIVATIONS_SCALE_FACTOR") == config.end()) { - const auto activations_scale_factor = - model->get_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"}); - config.insert(ov::hint::activations_scale_factor(activations_scale_factor)); + if (util::contains(plugin.get_property(ov::supported_properties), ov::hint::activations_scale_factor)) { + if (model->has_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"})) { + if (config.find("ACTIVATIONS_SCALE_FACTOR") == config.end()) { + const auto activations_scale_factor = + model->get_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"}); + config.insert(ov::hint::activations_scale_factor(activations_scale_factor)); + } } } } diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 80973f2ee335a4..7e60c434bf3e57 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -251,7 +251,8 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& model, + void apply_rt_info(const ov::Plugin& plugin, + const std::shared_ptr& model, ov::AnyMap& config) const; /* From a8f2945225baeacb3b58d75901e677999be207cf Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 2 Dec 2024 20:08:22 +0900 Subject: [PATCH 31/64] added a new ScaleDownFusion pass --- .../activations_scaling.hpp | 21 ++- .../activations_scaling.cpp | 163 +++++++----------- .../activations_scaling_test.cpp | 49 ++++++ .../src/plugin/transformations_pipeline.cpp | 1 + 4 files changed, 128 insertions(+), 106 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 78e511bd409271..ac8ac063d96f32 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -16,8 +16,21 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { +TRANSFORMATIONS_API void mark_as_scale_down_node(const std::shared_ptr& node); + +TRANSFORMATIONS_API bool is_scale_down_node(const std::shared_ptr& node); + +class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute { +public: + OPENVINO_RTTI("scale_down_node", "0"); + + bool is_copyable() const override { + return false; + } +}; + class TRANSFORMATIONS_API ScaleDownSingleLayer; -class TRANSFORMATIONS_API ScaleDownMultipleLayers; +class TRANSFORMATIONS_API ScaleDownFusion; class TRANSFORMATIONS_API MulGroupNormTransformation; class TRANSFORMATIONS_API MulMulAddTransformation; class TRANSFORMATIONS_API SplitTransformation; @@ -52,10 +65,10 @@ class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::Mat ScaleDownSingleLayer(float scale_factor, ov::element::Type 
scaled_prec); }; -class ov::pass::activations_scaling::ScaleDownMultipleLayers : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ScaleDownMultipleLayers", "0"); - ScaleDownMultipleLayers(float scale_factor); + OPENVINO_RTTI("ScaleDownFusion", "0"); + ScaleDownFusion(float scale_factor); }; class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pass::MatcherPass { diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 6e97cddb56dfa3..9e63727701838c 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -46,6 +46,16 @@ const auto is_non_const_node = [](const ov::Output& output) -> bool { }; } // namespace +void ov::pass::activations_scaling::mark_as_scale_down_node(const std::shared_ptr& node) { + auto& rt_info = node->get_rt_info(); + rt_info[ScaleDownNode::get_type_info_static()] = ScaleDownNode(); +} + +bool ov::pass::activations_scaling::is_scale_down_node(const std::shared_ptr& node) { + const auto& rt_info = node->get_rt_info(); + return rt_info.find(ScaleDownNode::get_type_info_static()) != rt_info.end(); +} + using namespace ov::pass::activations_scaling; using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; @@ -57,25 +67,10 @@ using ov::pass::pattern::op::Or; ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec) { MATCHER_SCOPE(ScaleDownSingleLayer); - auto are_not_mutiple_matmuls_connected = [](const Output& output) { - auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); - auto input = matmul->get_input_node_shared_ptr(0); - if (input->get_output_size() > 1) - return true; - size_t user_matmul_count = 0; - for (const auto& u : input->get_users()) { - auto matmul_user = std::dynamic_pointer_cast(u); - if (!matmul_user) - continue; - user_matmul_count++; - } - return user_matmul_count == 1; - }; - auto activation_m = any_input(); auto weights_m = any_input(); auto convolution_m = wrap_type({activation_m, weights_m}); - auto matmul_m = wrap_type({activation_m, weights_m});//, are_not_mutiple_matmuls_connected); + auto matmul_m = wrap_type({activation_m, weights_m}); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); ov::Shape scale_const_shape = {1}; @@ -125,6 +120,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float (scaled_op->input(0).get_element_type() == ov::element::f32) ? 
scale_down_const_f32 : scale_down_const_f16); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); + mark_as_scale_down_node(scale_down); if (scale_down->output(0).get_element_type() != scaled_prec && !keep_precision) { auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); @@ -203,106 +199,69 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } -// scale_down -// | -// input ==> input -// / \ / \_ -// MatMul_a MatMul_b MatMul_a MatMul_b -// | | -// scale_up scale_up -ov::pass::activations_scaling::ScaleDownMultipleLayers::ScaleDownMultipleLayers(float scale_factor) { - MATCHER_SCOPE(ScaleDownMultipleLayers); - - auto are_mutiple_matmuls_connected = [](const Output& output) { - auto matmul = std::dynamic_pointer_cast(output.get_node_shared_ptr()); - auto input = matmul->get_input_node_shared_ptr(0); - if (input->get_output_size() > 1) - return false; - size_t user_matmul_count = 0; - for (const auto& u : input->get_users()) { - auto matmul_user = std::dynamic_pointer_cast(u); - if (!matmul_user) - continue; - user_matmul_count++; - } - return !ov::is_type(input) && input->get_users().size() > 1 && - input->get_users().size() == user_matmul_count; +// input Mul_c +// / \ ==> | +// Mul_a Mul_b input +ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_factor) { + MATCHER_SCOPE(ScaleDownFusion); + + const auto is_scale_down_mul = [](const ov::Output& output) -> bool { + return is_scale_down_node(output.get_node_shared_ptr()); }; auto activation_m = any_input(); - auto weights_m = any_input(); - auto scaled_op_m = wrap_type({activation_m, weights_m}, are_mutiple_matmuls_connected); - - ov::Shape scale_const_shape = {1}; - std::vector scale_down_value = {1.f / scale_factor}; - std::shared_ptr scale_down_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); - std::shared_ptr scale_down_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); - std::vector scale_up_value = {scale_factor}; - std::shared_ptr scale_up_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); - std::shared_ptr scale_up_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, scale_const_m}, is_scale_down_mul); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - auto scaled_op = - std::dynamic_pointer_cast(pattern_map.at(scaled_op_m).get_node_shared_ptr()); - if (!scaled_op || transformation_callback(scaled_op)) + auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto parent = mul->get_input_node_shared_ptr(0); + if (parent->get_output_size() > 1) return false; - auto input_node = scaled_op->input(0).get_source_output(); - auto scale_down = std::make_shared( - input_node, - (input_node.get_element_type() == ov::element::f32) ? 
scale_down_const_f32 : scale_down_const_f16); - scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); - ov::copy_runtime_info(scaled_op, scale_down); + auto children = parent->get_users(); + size_t num_scaled_down_nodes = 0; + for (const auto& child : children) { + if (!is_scale_down_node(child)) + return false; + num_scaled_down_nodes += 1; + } - for (const auto& u : input_node.get_target_inputs()) { - auto matmul_user = std::dynamic_pointer_cast(u.get_node()->shared_from_this()); - if (matmul_user) { - matmul_user->input(0).replace_source_output(scale_down); - auto child = matmul_user->get_output_target_inputs(0).begin()->get_node(); - if (matmul_user->get_output_target_inputs(0).size() == 1 && ov::is_type(child)) { - auto add = child->shared_from_this(); - auto target_inputs = add->get_output_target_inputs(0); - auto scale_down_bias = std::make_shared( - add->input(1).get_source_output(), - (add->input(1).get_element_type() == ov::element::f32) ? scale_down_const_f32 - : scale_down_const_f16); - scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); - ov::copy_runtime_info(add, scale_down_bias); - add->input(1).replace_source_output(scale_down_bias->output(0)); - - auto scale_up = register_new_node( - add->output(0), - (add->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 - : scale_up_const_f16); - scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); - ov::copy_runtime_info(matmul_user, scale_up); - for (auto& in : target_inputs) { - in.replace_source_output(scale_up); - } - } else { - auto target_inputs = matmul_user->get_output_target_inputs(0); - auto scale_up = register_new_node( - matmul_user->output(0), - (matmul_user->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 - : scale_up_const_f16); - scale_up->set_friendly_name(matmul_user->get_friendly_name() + "_scale_up"); - ov::copy_runtime_info(matmul_user, scale_up); - for (auto& in : target_inputs) { - in.replace_source_output(scale_up); - } - } + if (num_scaled_down_nodes < 2) + return false; + + if (transformation_callback(mul)) + return false; + + if (!ov::is_type(parent) && + !ov::is_type(parent) && + !ov::is_type(parent) && + !ov::is_type(parent)) { + return false; + } + + ov::Shape scale_const_shape = {1}; + std::vector scale_down_value = {1.f / scale_factor}; + std::shared_ptr scale_down_const = + std::make_shared(parent->input(0).get_element_type(), scale_const_shape, scale_down_value); + + auto new_scale_down = std::make_shared(parent->input(0).get_source_output(), scale_down_const); + new_scale_down->set_friendly_name(parent->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(parent, new_scale_down); + parent->input(0).replace_source_output(new_scale_down->output(0)); + + for (const auto& child : children) { + for (auto& target : child->get_output_target_inputs(0)) { + target.replace_source_output(parent->output(0)); } } + return true; }; - auto m = std::make_shared(scaled_op_m, "ScaleDownMultipleLayers"); + auto m = std::make_shared(mul_m, "ScaleDownFusion"); this->register_matcher(m, callback); } @@ -780,7 +739,7 @@ bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr manager.set_per_pass_validation(false); manager.register_pass(m_scale_factor, m_scaled_prec); - manager.register_pass(m_scale_factor); + manager.register_pass(m_scale_factor); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 44ee2b0c8cfb8b..4c11dd024cbb72 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -64,6 +64,55 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { } } +TEST_F(TransformationTestsF, ScaleDownFusionTest) { + float scale_factor = 128.f; + { + ov::Shape scale_const_shape = {1}; + std::vector scale_down_value = {1.f / scale_factor}; + std::shared_ptr scale_down_const = + std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto shape_pre = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 3, 256}); + auto reshape_pre = std::make_shared(input, shape_pre, true); + auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); + + auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); + ov::pass::activations_scaling::mark_as_scale_down_node(scale_down0); + auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); + auto result0 = std::make_shared(reshape_post0); + + auto scale_down1 = std::make_shared(reshape_pre->output(0), scale_down_const); + ov::pass::activations_scaling::mark_as_scale_down_node(scale_down1); + auto reshape_post1 = std::make_shared(scale_down1, shape_post, true); + auto result1 = std::make_shared(reshape_post1); + + model = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + ov::Shape scale_const_shape = {1}; + std::vector 
scale_down_value = {1.f / scale_factor}; + std::shared_ptr scale_down_const = + std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + + auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); + auto new_scale_down = std::make_shared(input->output(0), scale_down_const); + + auto shape_pre = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 3, 256}); + auto reshape_pre = std::make_shared(new_scale_down, shape_pre, true); + auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); + + auto reshape_post0 = std::make_shared(reshape_pre, shape_post, true); + auto result0 = std::make_shared(reshape_post0); + + auto reshape_post1 = std::make_shared(reshape_pre, shape_post, true); + auto result1 = std::make_shared(reshape_post1); + + model_ref = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); + } +} + TEST_F(TransformationTestsF, MulMulAddTransformationTest) { { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 65c2fdb158a997..feedbaf0450a2a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -950,6 +950,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }); manager.register_pass(activations_scale_factor, scaled_precision); + manager.register_pass(activations_scale_factor); auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); From a9ae892b318e4b4f94440b238f72a5caeb4792f9 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 2 Dec 2024 22:36:17 +0900 Subject: [PATCH 32/64] added a new param useDefaultTransformation for activations scaling --- .../low_precision/layer_transformation.hpp | 9 +++++++-- .../low_precision_transformations/src/add.cpp | 2 +- .../src/layer_transformation.cpp | 1 + .../src/multiply_partial.cpp | 18 ++++++++++++------ .../src/plugin/transformations_pipeline.cpp | 2 +- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp index 952cb3e468a17b..558e26aeb56097 100644 --- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp @@ -252,11 +252,13 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass element::Type deqPrecision = element::f32, const std::vector defaultPrecisions = { ov::element::u8, ov::element::i8 }, - const bool reshapeIgnorePerTensorQuantizationCheck = false) : + const bool reshapeIgnorePerTensorQuantizationCheck = false, + const bool useDefaultTransformation = true) : updatePrecisions(updatePrecisions), deqPrecision(deqPrecision), defaultPrecisions(defaultPrecisions), - reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck) {} + reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck), + useDefaultTransformation(useDefaultTransformation) {} Params& setUpdatePrecisions(const bool updatePrecisions) { 
this->updatePrecisions = updatePrecisions;
@@ -281,6 +283,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     std::vector defaultPrecisions;
     // to support GPU workaround to keep Reshape and MatMul in FP32
     bool reshapeIgnorePerTensorQuantizationCheck;
+    // for MultiplyPartialTransformation to support Activations Scaling
+    bool useDefaultTransformation;
 };

 class PrecisionDetails {
@@ -352,6 +356,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     element::Type deqPrecision;
     std::vector defaultPrecisions;
     bool reshapeIgnorePerTensorQuantizationCheck;
+    bool useDefaultTransformation;

     static constexpr char originalLayerPostfix[] = "_original";
     TransformationContext* context;
diff --git a/src/common/low_precision_transformations/src/add.cpp b/src/common/low_precision_transformations/src/add.cpp
index e8c0380336362b..1ba6f6598be247 100644
--- a/src/common/low_precision_transformations/src/add.cpp
+++ b/src/common/low_precision_transformations/src/add.cpp
@@ -215,7 +215,7 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt
             newMultiplyFullPathValues);

         newAddOrSubtract = std::make_shared>(
-            std::vector{element::f32, element::f32}, std::vector{ add->get_output_element_type(0) },
+            std::vector{element::f32, element::f32}, std::vector{ element::f32 },
             ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(),
             ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get());
         newMultiply = std::make_shared>(
diff --git a/src/common/low_precision_transformations/src/layer_transformation.cpp b/src/common/low_precision_transformations/src/layer_transformation.cpp
index 4ec573c0f2a6ea..70d88743cb34ec 100644
--- a/src/common/low_precision_transformations/src/layer_transformation.cpp
+++ b/src/common/low_precision_transformations/src/layer_transformation.cpp
@@ -45,6 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) :
     deqPrecision(params.deqPrecision),
     defaultPrecisions(params.defaultPrecisions),
     reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
+    useDefaultTransformation(params.useDefaultTransformation),
     context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {
diff --git a/src/common/low_precision_transformations/src/multiply_partial.cpp b/src/common/low_precision_transformations/src/multiply_partial.cpp
index 14671d75346c3f..f09b0ed866f420 100644
--- a/src/common/low_precision_transformations/src/multiply_partial.cpp
+++ b/src/common/low_precision_transformations/src/multiply_partial.cpp
@@ -133,24 +133,30 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov

         // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
-        // after : Y = ((X1 - SH1) * X2) * SC1' , where :
-        //         SC1' = SC1 * SC2
+        // if useDefaultTransformation = true
+        // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
+        //         SC1' = SC1 * SC2
+        // else
+        // after : Y = ((X1 - SH1) * X2) * SC1' , where :
+        //         SC1' = SC1 * SC2
         auto newMultiplyValuesFullPath = fold(multiplyValuesEmptyPath, multiplyValuesFullPath);
         OutputVector inputs{ {}, {} };
-        inputs[emptyPathIndex] = newMultiplyValuesFullPath;
+        inputs[emptyPathIndex] = useDefaultTransformation ? dequantizationEmptyPath.data : newMultiplyValuesFullPath;
+        auto input_for_fullPath = useDefaultTransformation ? newMultiplyValuesFullPath :
+                                                             dequantizationEmptyPath.data.get_node_shared_ptr();

         ov::Output parent0 = dequantizationFullPath.subtract == nullptr ?
(dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) : dequantizationFullPath.subtract; inputs[fullPathIndex] = - parent0.get_node()->get_output_element_type(0) == dequantizationEmptyPath.data.get_node()->get_output_element_type(0) ? - std::make_shared(parent0, dequantizationEmptyPath.data) : + parent0.get_node()->get_output_element_type(0) == input_for_fullPath->get_output_element_type(0) ? + std::make_shared(parent0, input_for_fullPath) : std::make_shared>( std::vector{element::f32, element::f32}, std::vector{element::f32}, ov::op::TemporaryReplaceOutputType(parent0, element::f32).get(), - ov::op::TemporaryReplaceOutputType(dequantizationEmptyPath.data, element::f32).get()); + ov::op::TemporaryReplaceOutputType(input_for_fullPath, element::f32).get()); newMultiply = std::make_shared>( std::vector{element::f32, element::f32}, diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index feedbaf0450a2a..6983a77f4483b7 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -951,7 +951,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(activations_scale_factor, scaled_precision); manager.register_pass(activations_scale_factor); - auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true); + auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); lpt_pass->add_main(); From ed36d41b634f783b97f625594bc405f61a529ae8 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 2 Dec 2024 22:48:57 +0900 Subject: [PATCH 33/64] update code style --- .../activations_scaling.hpp | 3 +- .../activations_scaling.cpp | 66 ++++++++++++------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index ac8ac063d96f32..316f4e40a721ea 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -51,7 +51,8 @@ class ov::pass::ActivationsScaling : public ov::pass::ModelPass { public: OPENVINO_RTTI("ActivationsScaling", "0"); explicit ActivationsScaling(float scale_factor, ov::element::Type scaled_prec) - : m_scale_factor(scale_factor), m_scaled_prec(scaled_prec) {} + : m_scale_factor(scale_factor), + m_scaled_prec(scaled_prec) {} bool run_on_model(const std::shared_ptr& model) override; private: diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 9e63727701838c..29e3cdeb400470 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -64,7 +64,8 @@ using ov::pass::pattern::op::Or; // Conv/MatMul // ==> // Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor) 
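// The identity behind this rewrite, as a standalone sketch (illustrative only, not part of
// this patch): for a linear op f and any scale s, f(x / s) * s == f(x), so the inserted
// Multiply pair leaves results unchanged while keeping intermediate activations small
// enough to stay inside f16 range. (A bias input breaks pure linearity, which is why the
// pass scales the bias of a following Add separately.)
//
//     #include <cassert>
//     #include <cmath>
//
//     float linear_op(float x) { return 0.5f * x; }  // stands in for Conv/MatMul
//
//     int main() {
//         const float scale_factor = 8.f;  // hypothetical value; the pass takes it as a parameter
//         const float x = 3.f;
//         // scale_down -> op -> scale_up, exactly the pattern shown above
//         float y = linear_op(x / scale_factor) * scale_factor;
//         assert(std::fabs(y - linear_op(x)) < 1e-6f);  // matches the unscaled graph
//         return 0;
//     }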
-ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec) { +ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor, + ov::element::Type scaled_prec) { MATCHER_SCOPE(ScaleDownSingleLayer); auto activation_m = any_input(); @@ -104,8 +105,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float bool keep_precision = false; std::shared_ptr output_of_scaled_op = scaled_op; auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); - if (scaled_op->get_output_target_inputs(0).size() == 1 && - ov::is_type(child_node) && + if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child_node) && ov::fp16_compression_is_disabled(child_node->shared_from_this()) && ov::pass::constant_folding_is_disabled(child_node->shared_from_this())) { output_of_scaled_op = std::dynamic_pointer_cast(child_node->shared_from_this()); @@ -129,7 +129,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scaled_op->input(0).replace_source_output(scale_down->output(0)); } if (scaled_op->input(1).get_element_type() != scaled_prec && !keep_precision) { - auto convert_prec1 = std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); + auto convert_prec1 = + std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); scaled_op->input(1).replace_source_output(convert_prec1->output(0)); } @@ -146,7 +147,9 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float const auto& bias_pshape = child_node->get_input_partial_shape(bias_index); if (bias_pshape.is_static()) { const auto& bias_shape = bias_pshape.get_shape(); - const bool per_channel = std::count_if(bias_shape.begin(), bias_shape.end(), [](size_t x) { return x > 1; }) == 1; + const bool per_channel = std::count_if(bias_shape.begin(), bias_shape.end(), [](size_t x) { + return x > 1; + }) == 1; if (ov::shape_size(bias_shape) == 1 || per_channel) { has_bias = true; } @@ -159,7 +162,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(bias_index).get_source_output(), - (add->input(bias_index).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + (add->input(bias_index).get_element_type() == ov::element::f32) ? scale_down_const_f32 + : scale_down_const_f16); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); if (scale_down_bias->output(0).get_element_type() != scaled_prec && !keep_precision) { @@ -185,7 +189,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto scale_up = register_new_node( runtime_scaled_op->output(0), - (runtime_scaled_op->output(0).get_element_type() == ov::element::f32) ? scale_up_const_f32 : scale_up_const_f16); + (runtime_scaled_op->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 + : scale_up_const_f16); scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(scaled_op, scale_up); for (auto& in : target_inputs) { @@ -235,19 +240,20 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_fact if (transformation_callback(mul)) return false; - if (!ov::is_type(parent) && - !ov::is_type(parent) && - !ov::is_type(parent) && - !ov::is_type(parent)) { + if (!ov::is_type(parent) && !ov::is_type(parent) && + !ov::is_type(parent) && !ov::is_type(parent)) { return false; } ov::Shape scale_const_shape = {1}; std::vector scale_down_value = {1.f / scale_factor}; std::shared_ptr scale_down_const = - std::make_shared(parent->input(0).get_element_type(), scale_const_shape, scale_down_value); + std::make_shared(parent->input(0).get_element_type(), + scale_const_shape, + scale_down_value); - auto new_scale_down = std::make_shared(parent->input(0).get_source_output(), scale_down_const); + auto new_scale_down = + std::make_shared(parent->input(0).get_source_output(), scale_down_const); new_scale_down->set_friendly_name(parent->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(parent, new_scale_down); parent->input(0).replace_source_output(new_scale_down->output(0)); @@ -362,7 +368,8 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform OPENVINO_ASSERT(pattern_map.count(norm_m)); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + auto norm + = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); if (transformation_callback(norm)) { return false; @@ -377,8 +384,10 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform activation = activation.get_node()->get_input_source_output(0); auto newGroupNorm = std::make_shared>( ov::op::v12::GroupNormalization(activation, - norm->get_input_source_output(1), norm->get_input_source_output(2), - norm->get_num_groups(), norm->get_epsilon()), + norm->get_input_source_output(1), + norm->get_input_source_output(2), + norm->get_num_groups(), + norm->get_epsilon()), norm->get_output_element_type(0)); ov::copy_runtime_info(norm, newGroupNorm); ov::replace_node(norm, newGroupNorm); @@ -429,10 +438,13 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { auto activation = mul->get_input_source_output(activation_index); if (ov::is_type(activation.get_node())) activation = activation.get_node()->get_input_source_output(0); - auto newMVN = std::make_shared>( - ov::op::v6::MVN(activation, norm->get_input_source_output(1), - norm->get_normalize_variance(), norm->get_eps(), norm->get_eps_mode()), - norm->get_output_element_type(0)); + auto newMVN + = std::make_shared>(ov::op::v6::MVN(activation, + norm->get_input_source_output(1), + norm->get_normalize_variance(), + norm->get_eps(), + norm->get_eps_mode()), + norm->get_output_element_type(0)); ov::copy_runtime_info(norm, newMVN); ov::replace_node(norm, newMVN); return true; @@ -682,7 +694,8 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( dep_const0 ? 
dep_node->input(0).get_source_output() : dep_node->input(1).get_source_output(); if (!is_scalar_node(last_dep_const)) return false; - if (last_dep_const_type != ov::element::undefined && last_dep_const_type != last_dep_const.get_element_type()) + if (last_dep_const_type != ov::element::undefined && + last_dep_const_type != last_dep_const.get_element_type()) return false; last_dep_const_type = last_dep_const.get_element_type(); } @@ -699,10 +712,13 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( auto new_mul = std::make_shared>( std::vector{dep_type, dep_type}, std::vector{dep_type}, - ov::op::TemporaryReplaceOutputType(dep_node->input(activation_index).get_source_output(), dep_type).get(), - ov::op::TemporaryReplaceOutputType(ov::op::util::eltwise_fold( - dep_node->input(const_index).get_source_output(), - last_dep_const), dep_type).get()); + ov::op::TemporaryReplaceOutputType(dep_node->input(activation_index).get_source_output(), dep_type) + .get(), + ov::op::TemporaryReplaceOutputType( + ov::op::util::eltwise_fold(dep_node->input(const_index).get_source_output(), + last_dep_const), + dep_type) + .get()); new_mul->set_friendly_name(dep_node->get_friendly_name() + "_c"); ov::copy_runtime_info(dep_node, new_mul); From 7cd14fb8a0f35326d9fa8184da35ca18e936fba3 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 2 Dec 2024 22:55:12 +0900 Subject: [PATCH 34/64] update code style --- .../activations_scaling.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 29e3cdeb400470..03ea140420655b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -106,7 +106,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float std::shared_ptr output_of_scaled_op = scaled_op; auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child_node) && - ov::fp16_compression_is_disabled(child_node->shared_from_this()) && + ov::fp16_compression_is_disabled(child_node->shared_from_this()) && ov::pass::constant_folding_is_disabled(child_node->shared_from_this())) { output_of_scaled_op = std::dynamic_pointer_cast(child_node->shared_from_this()); child_node = output_of_scaled_op->get_output_target_inputs(0).begin()->get_node(); @@ -368,8 +368,8 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform OPENVINO_ASSERT(pattern_map.count(norm_m)); auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm - = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); + auto norm = + std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); if (transformation_callback(norm)) { return false; @@ -438,13 +438,13 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { auto activation = mul->get_input_source_output(activation_index); if (ov::is_type(activation.get_node())) activation = activation.get_node()->get_input_source_output(0); - auto newMVN - = std::make_shared>(ov::op::v6::MVN(activation, - norm->get_input_source_output(1), - norm->get_normalize_variance(), - norm->get_eps(), - 
norm->get_eps_mode()), - norm->get_output_element_type(0)); + auto newMVN = + std::make_shared>(ov::op::v6::MVN(activation, + norm->get_input_source_output(1), + norm->get_normalize_variance(), + norm->get_eps(), + norm->get_eps_mode()), + norm->get_output_element_type(0)); ov::copy_runtime_info(norm, newMVN); ov::replace_node(norm, newMVN); return true; From 92a1239dc90285ae49ab85ee2ca8df8d505f0bf2 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 01:11:58 +0900 Subject: [PATCH 35/64] updated clamp_fp16 tests --- .../transformations/clamp_fp16_output_test.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp index b5cb1671c455b5..380f2ee95507c4 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp @@ -68,12 +68,12 @@ TEST_F(TransformationTestsF, ClampFp16OutputTest2) { auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); auto input2 = std::make_shared(ov::element::f16, ov::Shape{ 1, 2, 2 }); auto matmul = std::make_shared(input1, input2, true, false); + auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 3, 4 }); + auto reshape = std::make_shared(matmul, target_shape, false); auto min = static_cast(std::numeric_limits::lowest()); auto max = static_cast(std::numeric_limits::max()); - auto clamp = std::make_shared(matmul, min, max); - auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 3, 4 }); - auto reshape = std::make_shared(clamp, target_shape, false); - auto softmax = std::make_shared(reshape, 1); + auto clamp = std::make_shared(reshape, min, max); + auto softmax = std::make_shared(clamp, 1); model_ref = std::make_shared(ov::NodeVector{ softmax }, ov::ParameterVector{ input1, input2 }); } @@ -128,12 +128,12 @@ TEST_F(TransformationTestsF, ClampFp16OutputTest5) { auto input1 = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); auto input2 = std::make_shared(ov::element::f16, ov::Shape{ 1, 2, 2 }); auto matmul = std::make_shared(input1, input2, true, false); + auto data = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); + auto add = std::make_shared(matmul, data); auto min = static_cast(std::numeric_limits::lowest()); auto max = static_cast(std::numeric_limits::max()); - auto clamp = std::make_shared(matmul, min, max); - auto data = std::make_shared(ov::element::f16, ov::Shape{ 3, 2, 2 }); - auto add = std::make_shared(clamp, data); - auto softmax = std::make_shared(add, 1); + auto clamp = std::make_shared(add, min, max); + auto softmax = std::make_shared(clamp, 1); model_ref = std::make_shared(ov::NodeVector{ softmax }, ov::ParameterVector{ input1, input2, data }); } From 4a9bca691fcf1276dd71116a435e8c208b3418a1 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 01:34:56 +0900 Subject: [PATCH 36/64] code cleanup --- .../activations_scaling.hpp | 40 --- .../activations_scaling.cpp | 287 ------------------ .../activations_scaling_test.cpp | 123 -------- src/plugins/intel_gpu/src/plugin/graph.cpp | 11 +- src/plugins/intel_gpu/src/plugin/plugin.cpp | 1 - .../src/plugin/transformations_pipeline.cpp | 3 +- 6 files changed, 5 insertions(+), 460 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp 
b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 316f4e40a721ea..15b8bb61081e65 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -32,10 +32,6 @@ class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute { class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API ScaleDownFusion; class TRANSFORMATIONS_API MulGroupNormTransformation; -class TRANSFORMATIONS_API MulMulAddTransformation; -class TRANSFORMATIONS_API SplitTransformation; -class TRANSFORMATIONS_API ReshapeTransformation; -class TRANSFORMATIONS_API MulMulMulTransformation; class TRANSFORMATIONS_API MulMVNTransformation; class TRANSFORMATIONS_API MulConcatTransformation; @@ -47,18 +43,6 @@ class TRANSFORMATIONS_API MulConcatTransformation; // This feature is controlled by ov::hint::activations_scale_factor. // For example, when this property is set as 16, activations are divided by 16. // If ov::hint::activations_scale_factor is less than zero, it is disabled. -class ov::pass::ActivationsScaling : public ov::pass::ModelPass { -public: - OPENVINO_RTTI("ActivationsScaling", "0"); - explicit ActivationsScaling(float scale_factor, ov::element::Type scaled_prec) - : m_scale_factor(scale_factor), - m_scaled_prec(scaled_prec) {} - bool run_on_model(const std::shared_ptr& model) override; - -private: - float m_scale_factor = 0.f; - ov::element::Type m_scaled_prec = element::f16; -}; class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: @@ -78,30 +62,6 @@ class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pas MulGroupNormTransformation(); }; -class ov::pass::activations_scaling::MulMulAddTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("MulMulAddTransformation", "0"); - MulMulAddTransformation(); -}; - -class ov::pass::activations_scaling::SplitTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("SplitTransformation", "0"); - SplitTransformation(); -}; - -class ov::pass::activations_scaling::ReshapeTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("ReshapeTransformation", "0"); - ReshapeTransformation(); -}; - -class ov::pass::activations_scaling::MulMulMulTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("MulMulMulTransformation", "0"); - MulMulMulTransformation(); -}; - class ov::pass::activations_scaling::MulMVNTransformation : public ov::pass::MatcherPass { public: OPENVINO_RTTI("MulMVNTransformation", "0"); diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 03ea140420655b..b4f3c4abed9ec5 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -271,77 +271,6 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_fact this->register_matcher(m, callback); } -// MulMulAddTransformation makes the target pattern to be easy to be merged with followig nodes. 
-// -// input_a const_a input_b const_b input_a (const_a/const_b) -// \ / \ / \ / -// Multiply_a Multiply_b ==> Multiply_a_mma input_b -// \ / \ / -// \ / Add const_b -// \ / | / -// Add Multiply_b_mma -// -// (input_a * const_a) + (input_b * const_b) ==> ((input_a * (const_a / const_b)) + input_b) * const_b -ov::pass::activations_scaling::MulMulAddTransformation::MulMulAddTransformation() { - MATCHER_SCOPE(MulMulAddTransformation); - - auto activation0_m = any_input(is_non_const_node); - auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul0_m = wrap_type({activation0_m, scale_const0_m}); - - auto activation1_m = any_input(is_non_const_node); - auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul1_m = wrap_type({activation1_m, scale_const1_m}); - - auto add_m = wrap_type({mul0_m, mul1_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(mul0_m)); - OPENVINO_ASSERT(pattern_map.count(mul1_m)); - OPENVINO_ASSERT(pattern_map.count(add_m)); - - auto add = std::dynamic_pointer_cast(pattern_map.at(add_m).get_node_shared_ptr()); - - if (transformation_callback(add)) { - return false; - } - auto target_inputs = add->get_output_target_inputs(0); - - auto mul0 = add->get_input_source_output(0).get_node_shared_ptr(); - auto mul1 = add->get_input_source_output(1).get_node_shared_ptr(); - - size_t const0_index = ov::is_type(mul0->get_input_source_output(1).get_node()) ? 1 : 0; - size_t const1_index = ov::is_type(mul1->get_input_source_output(1).get_node()) ? 1 : 0; - - auto scale_const0 = mul0->get_input_source_output(const0_index).get_node_shared_ptr(); - auto scale_const1 = mul1->get_input_source_output(const1_index).get_node_shared_ptr(); - - auto new_mul0 = register_new_node( - mul0->get_input_source_output((const0_index == 0) ? 1 : 0), - ov::op::util::eltwise_fold(scale_const0, scale_const1)); - new_mul0->set_friendly_name(mul0->get_friendly_name() + "_mma"); - ov::copy_runtime_info(mul0, new_mul0); - - add->input(0).replace_source_output(new_mul0); - add->input(1).replace_source_output(mul1->get_input_source_output((const1_index == 0) ? 1 : 0)); - - auto new_mul1 = register_new_node(add, scale_const1); - new_mul1->set_friendly_name(mul1->get_friendly_name() + "_mma"); - ov::copy_runtime_info(mul1, new_mul1); - - for (auto& in : target_inputs) { - in.replace_source_output(new_mul1); - } - - return true; - }; - - auto m = std::make_shared(add_m, "MulMulAddTransformation"); - this->register_matcher(m, callback); -} - // GroupNormalization has the following property. 
// // GroupNorm(input * const_a) = GroupNorm(input) @@ -456,191 +385,6 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { this->register_matcher(m, callback); } -// input const input -// \ / | -// Multiply ==> VariadicSplit -// | const / | const \ const -// VariadicSplit | / | / \ / -// / | \ Multiply_a Multiply_b Multiply_c -// output_a output_b output_c | | | -// output_a output_b output_c -ov::pass::activations_scaling::SplitTransformation::SplitTransformation() { - MATCHER_SCOPE(SplitTransformation); - - auto activation_m = any_input(is_non_const_node); - auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, scale_const_m}); - auto axis_m = any_input(); - auto split_length_m = any_input(); - auto split_m = wrap_type({mul_m, axis_m, split_length_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(mul_m)); - OPENVINO_ASSERT(pattern_map.count(split_m)); - - auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto split = - std::dynamic_pointer_cast(pattern_map.at(split_m).get_node_shared_ptr()); - - if (transformation_callback(split)) { - return false; - } - - if (mul && split) { - size_t num_split_outputs = split->get_output_size(); - - std::vector>> target_inputs; - target_inputs.resize(num_split_outputs); - for (size_t i = 0; i < num_split_outputs; i++) { - target_inputs[i] = split->get_output_target_inputs(i); - } - - size_t activation_index = - ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; - size_t const_index = (activation_index == 1) ? 0 : 1; - split->input(0).replace_source_output(mul->input(activation_index).get_source_output()); - - for (size_t i = 0; i < num_split_outputs; i++) { - auto new_mul = register_new_node(split->output(i), - mul->input(const_index).get_source_output()); - new_mul->set_friendly_name(mul->get_friendly_name() + "_" + std::to_string(i)); - ov::copy_runtime_info(mul, new_mul); - - for (auto& in : target_inputs[i]) { - in.replace_source_output(new_mul); - } - } - - return true; - } - return false; - }; - - auto m = std::make_shared(split_m, "SplitTransformation"); - this->register_matcher(m, callback); -} - -// input const input -// \ / | -// Multiply ==> Reshape const -// | | / -// Reshape Multiply -ov::pass::activations_scaling::ReshapeTransformation::ReshapeTransformation() { - MATCHER_SCOPE(ReshapeTransformation); - - auto activation_m = any_input(is_non_const_node); - auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, scale_const_m}); - auto axes_m = any_input(); - auto reshape_m = wrap_type({mul_m, axes_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(mul_m)); - OPENVINO_ASSERT(pattern_map.count(reshape_m)); - - auto scale_const = - std::dynamic_pointer_cast(pattern_map.at(scale_const_m).get_node_shared_ptr()); - auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto reshape = std::dynamic_pointer_cast(pattern_map.at(reshape_m).get_node_shared_ptr()); - - if (transformation_callback(reshape)) { - return false; - } - - if (scale_const && mul && reshape) { - auto target_inputs = reshape->get_output_target_inputs(0); - size_t activation_index = - ov::is_type(mul->get_input_source_output(1).get_node()) ? 
0 : 1; - reshape->input(0).replace_source_output(mul->input(activation_index).get_source_output()); - - auto new_mul = register_new_node(reshape, scale_const); - new_mul->set_friendly_name(mul->get_friendly_name() + "_r"); - ov::copy_runtime_info(mul, new_mul); - - for (auto& in : target_inputs) { - in.replace_source_output(new_mul); - } - - return true; - } - return false; - }; - - auto m = std::make_shared(reshape_m, "ReshapeTransformation"); - this->register_matcher(m, callback); -} - -// MulMulMulTransformation makes the target pattern to be easy to be merged with other nodes. -// -// input_a const_a input_b const_b input_a input_b -// \ / \ / \ / -// Multiply_a Multiply_b ==> Multiply_c (const_a * const_b) -// \ / \ / -// \ / Multiply_c_mmm -// \ / -// Multiply_c -// -// (input_a * const_a) * (input_b * const_b) ==> (input_a * input_b) * (const_a * const_b) -ov::pass::activations_scaling::MulMulMulTransformation::MulMulMulTransformation() { - MATCHER_SCOPE(MulMulMulTransformation); - - auto activation0_m = any_input(is_non_const_node); - auto scale_const0_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul0_m = wrap_type({activation0_m, scale_const0_m}); - - auto activation1_m = any_input(is_non_const_node); - auto scale_const1_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul1_m = wrap_type({activation1_m, scale_const1_m}); - - auto mul2_m = wrap_type({mul0_m, mul1_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - OPENVINO_ASSERT(pattern_map.count(mul0_m)); - OPENVINO_ASSERT(pattern_map.count(mul1_m)); - OPENVINO_ASSERT(pattern_map.count(mul2_m)); - - auto mul2 = std::dynamic_pointer_cast(pattern_map.at(mul2_m).get_node_shared_ptr()); - - if (transformation_callback(mul2)) { - return false; - } - auto target_inputs = mul2->get_output_target_inputs(0); - - auto mul0 = mul2->get_input_source_output(0).get_node_shared_ptr(); - auto mul1 = mul2->get_input_source_output(1).get_node_shared_ptr(); - - size_t const0_index = ov::is_type(mul0->get_input_source_output(1).get_node()) ? 1 : 0; - size_t const1_index = ov::is_type(mul1->get_input_source_output(1).get_node()) ? 1 : 0; - - auto scale_const0 = mul0->get_input_source_output(const0_index).get_node_shared_ptr(); - auto scale_const1 = mul1->get_input_source_output(const1_index).get_node_shared_ptr(); - - mul2->input(0).replace_source_output(mul0->get_input_source_output((const0_index == 0) ? 1 : 0)); - mul2->input(1).replace_source_output(mul1->get_input_source_output((const1_index == 0) ? 
1 : 0)); - - auto new_mul = register_new_node( - mul2, - ov::op::util::eltwise_fold(scale_const0, scale_const1)); - new_mul->set_friendly_name(mul2->get_friendly_name() + "_mmm"); - ov::copy_runtime_info(mul2, new_mul); - - for (auto& in : target_inputs) { - in.replace_source_output(new_mul); - } - - return true; - }; - - auto m = std::make_shared(mul2_m, "MulMulMulTransformation"); - this->register_matcher(m, callback); -} - // input_a const_a input_b const_b input_c const_c // \ / \ / \ / // Multiply_a Multiply_b Multiply_c @@ -744,34 +488,3 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( auto m = std::make_shared(concat_m, "MulConcatTransformation"); this->register_matcher(m, callback); } - -bool ov::pass::ActivationsScaling::run_on_model(const std::shared_ptr& f) { - RUN_ON_FUNCTION_SCOPE(ActivationsScaling); - - if (m_scale_factor <= 0.f) - return false; - - ov::pass::Manager manager(get_pass_config(), "ActivationsScaling"); - manager.set_per_pass_validation(false); - - manager.register_pass(m_scale_factor, m_scaled_prec); - manager.register_pass(m_scale_factor); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - - manager.run_passes(f); - - return true; -} diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 4c11dd024cbb72..877408d5606c0f 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -113,36 +113,6 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { } } -TEST_F(TransformationTestsF, MulMulAddTransformationTest) { - { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul0 = std::make_shared(input0, scale_const_0); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul1 = std::make_shared(input1, scale_const_1); - auto add = std::make_shared(mul0, mul1); - auto convert = std::make_shared(add, ov::element::f32); - auto result = std::make_shared(convert); - - model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); - } - { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto scale_const_0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul0 = std::make_shared(input0, scale_const_0); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto add = std::make_shared(mul0, input1); - auto scale_const_1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul1 = std::make_shared(add, scale_const_1); - auto convert = std::make_shared(mul1, ov::element::f32); - auto result = std::make_shared(convert); - - model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - } -} - 
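The MulMulAdd and MulMulMul rewrites being dropped above rely on two scalar-factoring identities. A minimal standalone check in plain C++ (no OpenVINO dependency; the operand values are arbitrary illustrations):

#include <cassert>
#include <cmath>

int main() {
    const float a = 0.75f, b = -1.25f, c1 = 10.f, c2 = 40.f;
    // MulMulAdd:  (a*c1) + (b*c2) == ((a*(c1/c2)) + b) * c2
    assert(std::fabs((a * c1 + b * c2) - ((a * (c1 / c2) + b) * c2)) < 1e-3f);
    // MulMulMul:  (a*c1) * (b*c2) == (a*b) * (c1*c2)
    assert(std::fabs((a * c1) * (b * c2) - (a * b) * (c1 * c2)) < 1e-2f);
    return 0;
}

Both rewrites only move scalars around the graph, so they are exact up to floating-point rounding; the epsilons above merely absorb that rounding.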
TEST_F(TransformationTestsF, MulGroupNormTransformationTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -197,99 +167,6 @@ TEST_F(TransformationTestsF, MulMVNTransformationTest) { } } -TEST_F(TransformationTestsF, SplitTransformationTest) { - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul = std::make_shared(input, scale_const); - auto axis = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {0}); - auto split_length = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); - auto split = std::make_shared(mul, axis, split_length); - auto convert0 = std::make_shared(split->output(0), ov::element::f32); - auto result0 = std::make_shared(convert0); - auto convert1 = std::make_shared(split->output(1), ov::element::f32); - auto result1 = std::make_shared(convert1); - auto convert2 = std::make_shared(split->output(2), ov::element::f32); - auto result2 = std::make_shared(convert2); - - model = std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); - manager.register_pass(); - } - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto axis = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {0}); - auto split_length = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); - auto split = std::make_shared(input, axis, split_length); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul0 = std::make_shared(split->output(0), scale_const); - auto convert0 = std::make_shared(mul0, ov::element::f32); - auto result0 = std::make_shared(convert0); - auto mul1 = std::make_shared(split->output(1), scale_const); - auto convert1 = std::make_shared(mul1, ov::element::f32); - auto result1 = std::make_shared(convert1); - auto mul2 = std::make_shared(split->output(2), scale_const); - auto convert2 = std::make_shared(mul2, ov::element::f32); - auto result2 = std::make_shared(convert2); - - model_ref = - std::make_shared(ov::ResultVector{result0, result1, result2}, ov::ParameterVector{input}); - } -} - -TEST_F(TransformationTestsF, ReshapeTransformationTest) { - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul = std::make_shared(input, scale_const); - auto shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 1, -1}); - auto reshape = std::make_shared(mul, shape, true); - auto convert = std::make_shared(reshape, ov::element::f32); - auto result = std::make_shared(convert); - - model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); - } - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 1, -1}); - auto reshape = std::make_shared(input, shape, true); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul = std::make_shared(reshape, scale_const); - auto convert = std::make_shared(mul, ov::element::f32); - auto result = std::make_shared(convert); - - model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - } -} - -TEST_F(TransformationTestsF, MulMulMulTransformationTest) { 
- { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul0 = std::make_shared(input0, scale_const0); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul1 = std::make_shared(input1, scale_const1); - auto mul2 = std::make_shared(mul0, mul1); - auto convert = std::make_shared(mul2, ov::element::f32); - auto result = std::make_shared(convert); - - model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); - } - { - auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto mul = std::make_shared(input0, input1); - auto new_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto new_mul = std::make_shared(mul, new_scale_const); - auto convert = std::make_shared(new_mul, ov::element::f32); - auto result = std::make_shared(convert); - - model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - } -} - TEST_F(TransformationTestsF, ConcatTransformationTest) { { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index 76c12a63221d5d..c3d74feffb5599 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -593,12 +593,11 @@ void Graph::update_profiling_info() { perfMap[executedID.first].first = executedID.first; pcIter = perfMap.find(executedID.first); auto& perfCount = pcIter->second.second; - if (executedID.second != nullptr) { - cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; - collectTimings(cldnnInfo, perfCount); - perfCount.num++; - } + cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; + + collectTimings(cldnnInfo, perfCount); + perfCount.num++; } } } @@ -723,8 +722,6 @@ std::vector Graph::get_profiling_info() const { if ((!existInProfiling || (existInProfiling && perfIter->second.first.length() == 0)) && executedPrimitives.find(primId) != executedPrimitives.end()) { auto event = executedPrimitives.at(primId); - if (event == nullptr) - continue; cldnn::instrumentation::profiling_info cldnnInfo{primId, event->get_profiling_info()}; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 6e7178f8385813..5650f5a66a2ae6 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -126,7 +126,6 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + cloned_model->get_name() + "_" + "transformed_func"; ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); - ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(cloned_model); } return cloned_model; } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 6983a77f4483b7..3c34e00f479548 100644 --- 
a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -396,7 +396,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const int32_t vec_size = 8; return static_cast((gamma_shape.back() / vec_size)) > static_cast(device_info.max_work_group_size); }); - // manager.register_pass(false); + manager.register_pass(false); const bool keep_precision_sensitive_in_fp32_1 = true; const bool convert_input_output_precision = false; @@ -920,7 +920,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); ov::element::Type scaled_precision = element::f16; - std::cout << "scale_factor: " << activations_scale_factor << std::endl; if (activations_scale_factor > 0.f) { using namespace ov::pass::low_precision; From 88b0d993eef27b0e90c30191ab40abe18bd14957 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 16:20:26 +0900 Subject: [PATCH 37/64] code cleanup --- .../activations_scaling.cpp | 54 ++++++++----------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index b4f3c4abed9ec5..92f067d3ea3e2d 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -74,23 +74,12 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto matmul_m = wrap_type({activation_m, weights_m}); auto scaled_op_m = std::make_shared(OutputVector{convolution_m, matmul_m}); - ov::Shape scale_const_shape = {1}; - std::vector scale_down_value = {1.f / scale_factor}; - std::shared_ptr scale_down_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); - std::shared_ptr scale_down_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); - std::vector scale_up_value = {scale_factor}; - std::shared_ptr scale_up_const_f16 = - std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); - std::shared_ptr scale_up_const_f32 = - std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m)); + // scale_down and scale_up layers will be added around scaled_op std::shared_ptr scaled_op = nullptr; if (pattern_map.count(convolution_m)) @@ -102,6 +91,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (transformation_callback(scaled_op)) return false; + // in the case of decompressed_to_f32 nodes, scale_up layer will be added after Convert node. 
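+    // (when such a Convert follows the scaled op, output_of_scaled_op is moved to
+    //  that Convert and keep_precision is set, so no extra precision casts are added)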
bool keep_precision = false; std::shared_ptr output_of_scaled_op = scaled_op; auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); @@ -113,11 +103,15 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float keep_precision = true; } + const ov::Shape scale_shape = {}; + const std::vector scale_down_value = {1.f / scale_factor}; + const std::vector scale_up_value = {scale_factor}; auto output_prec = output_of_scaled_op->output(0).get_element_type(); + // adding a scale_down layer before the target node auto scale_down = std::make_shared( scaled_op->input(0).get_source_output(), - (scaled_op->input(0).get_element_type() == ov::element::f32) ? scale_down_const_f32 : scale_down_const_f16); + std::make_shared(scaled_op->input(0).get_element_type(), scale_shape, scale_down_value)); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); mark_as_scale_down_node(scale_down); @@ -137,8 +131,9 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scaled_op->revalidate_and_infer_types(); std::set> target_inputs; - std::shared_ptr runtime_scaled_op; + // If the target node has a bias layer, scale_up layer will be added after the bias layer. + // So, we need to scale_down the bias layer too. bool has_bias = false; size_t bias_index = 1; { @@ -162,8 +157,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(bias_index).get_source_output(), - (add->input(bias_index).get_element_type() == ov::element::f32) ? scale_down_const_f32 - : scale_down_const_f16); + std::make_shared(add->input(bias_index).get_element_type(), scale_shape, scale_down_value)); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); if (scale_down_bias->output(0).get_element_type() != scaled_prec && !keep_precision) { @@ -174,23 +168,22 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float } add->revalidate_and_infer_types(); if (add->output(0).get_element_type() != output_prec && !keep_precision) { - runtime_scaled_op = std::make_shared(add->output(0), output_prec); + output_of_scaled_op = std::make_shared(add->output(0), output_prec); } else { - runtime_scaled_op = add; + output_of_scaled_op = std::dynamic_pointer_cast(add); } } else { target_inputs = output_of_scaled_op->get_output_target_inputs(0); if (output_of_scaled_op->output(0).get_element_type() != output_prec && !keep_precision) { - runtime_scaled_op = std::make_shared(output_of_scaled_op->output(0), output_prec); + output_of_scaled_op = std::make_shared(output_of_scaled_op->output(0), output_prec); } else { - runtime_scaled_op = output_of_scaled_op; + output_of_scaled_op = output_of_scaled_op; } } auto scale_up = register_new_node( - runtime_scaled_op->output(0), - (runtime_scaled_op->output(0).get_element_type() == ov::element::f32) ? 
scale_up_const_f32 - : scale_up_const_f16); + output_of_scaled_op->output(0), + std::make_shared(output_of_scaled_op->output(0).get_element_type(), scale_shape, scale_up_value)); scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(scaled_op, scale_up); for (auto& in : target_inputs) { @@ -204,6 +197,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } +// ScaleDownFusion merges multiple scale_down layers into one. +// // input Mul_c // / \ ==> | // Mul_a Mul_b input @@ -245,15 +240,12 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_fact return false; } - ov::Shape scale_const_shape = {1}; + ov::Shape scale_shape = {}; std::vector scale_down_value = {1.f / scale_factor}; - std::shared_ptr scale_down_const = - std::make_shared(parent->input(0).get_element_type(), - scale_const_shape, - scale_down_value); - auto new_scale_down = - std::make_shared(parent->input(0).get_source_output(), scale_down_const); + auto new_scale_down = std::make_shared( + parent->input(0).get_source_output(), + std::make_shared(parent->input(0).get_element_type(), scale_shape, scale_down_value)); new_scale_down->set_friendly_name(parent->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(parent, new_scale_down); parent->input(0).replace_source_output(new_scale_down->output(0)); @@ -307,7 +299,6 @@ ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransform if (mul && norm) { size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; - // norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); auto activation = mul->get_input_source_output(activation_index); if (ov::is_type(activation.get_node())) activation = activation.get_node()->get_input_source_output(0); @@ -363,7 +354,6 @@ ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { if (mul && norm) { size_t activation_index = ov::is_type(mul->get_input_source_output(1).get_node()) ? 
0 : 1; - // norm->input(0).replace_source_output(mul->get_input_source_output(activation_index)); auto activation = mul->get_input_source_output(activation_index); if (ov::is_type(activation.get_node())) activation = activation.get_node()->get_input_source_output(0); From 4ddbf27add1c67839585ce5d56fc2b78329cbd0d Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 16:26:46 +0900 Subject: [PATCH 38/64] update code style --- .../common_optimizations/activations_scaling.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 92f067d3ea3e2d..e575ea6a6a7b9a 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -111,7 +111,9 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float // adding a scale_down layer before the target node auto scale_down = std::make_shared( scaled_op->input(0).get_source_output(), - std::make_shared(scaled_op->input(0).get_element_type(), scale_shape, scale_down_value)); + std::make_shared(scaled_op->input(0).get_element_type(), + scale_shape, + scale_down_value)); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); mark_as_scale_down_node(scale_down); @@ -157,7 +159,9 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float target_inputs = add->get_output_target_inputs(0); auto scale_down_bias = std::make_shared( add->input(bias_index).get_source_output(), - std::make_shared(add->input(bias_index).get_element_type(), scale_shape, scale_down_value)); + std::make_shared(add->input(bias_index).get_element_type(), + scale_shape, + scale_down_value)); scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(add, scale_down_bias); if (scale_down_bias->output(0).get_element_type() != scaled_prec && !keep_precision) { @@ -175,7 +179,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float } else { target_inputs = output_of_scaled_op->get_output_target_inputs(0); if (output_of_scaled_op->output(0).get_element_type() != output_prec && !keep_precision) { - output_of_scaled_op = std::make_shared(output_of_scaled_op->output(0), output_prec); + output_of_scaled_op = + std::make_shared(output_of_scaled_op->output(0), output_prec); } else { output_of_scaled_op = output_of_scaled_op; } @@ -183,7 +188,9 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto scale_up = register_new_node( output_of_scaled_op->output(0), - std::make_shared(output_of_scaled_op->output(0).get_element_type(), scale_shape, scale_up_value)); + std::make_shared(output_of_scaled_op->output(0).get_element_type(), + scale_shape, + scale_up_value)); scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(scaled_op, scale_up); for (auto& in : target_inputs) { From 21897c7c2ca702999c004b06caeff2ae60839b43 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 16:34:04 +0900 Subject: [PATCH 39/64] remove redundant code --- .../common_optimizations/activations_scaling.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index e575ea6a6a7b9a..2a276c1d9eab8e 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -181,8 +181,6 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (output_of_scaled_op->output(0).get_element_type() != output_prec && !keep_precision) { output_of_scaled_op = std::make_shared(output_of_scaled_op->output(0), output_prec); - } else { - output_of_scaled_op = output_of_scaled_op; } } From f219faa489211031929ef2ca051f2bd6cbfd7a3f Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 3 Dec 2024 17:07:58 +0900 Subject: [PATCH 40/64] updated activations scaling tests --- .../common_optimizations/activations_scaling_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 877408d5606c0f..a7fc3934e908e7 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -47,7 +47,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3, 3, 3, 3}, {1}); - auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {1.f / scale_factor}); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{}, {1.f / scale_factor}); auto scale_down = std::make_shared(input, scale_down_const); auto conv = std::make_shared(scale_down, weights_const, @@ -55,7 +55,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { CoordinateDiff{}, CoordinateDiff{}, Strides{}); - auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {scale_factor}); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{}, {scale_factor}); auto scale_up = std::make_shared(conv, scale_up_const); auto convert = std::make_shared(scale_up, ov::element::f32); auto result = std::make_shared(convert); @@ -67,7 +67,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { TEST_F(TransformationTestsF, ScaleDownFusionTest) { float scale_factor = 128.f; { - ov::Shape scale_const_shape = {1}; + ov::Shape scale_const_shape = {}; std::vector scale_down_value = {1.f / scale_factor}; std::shared_ptr scale_down_const = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); @@ -91,7 +91,7 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { manager.register_pass(scale_factor); } { - ov::Shape scale_const_shape = {1}; + ov::Shape scale_const_shape = {}; std::vector scale_down_value = {1.f / scale_factor}; std::shared_ptr scale_down_const = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); From e11baf4f800ff97c269f96ec8a9d50305ad5a721 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 4 Dec 2024 12:28:22 +0900 Subject: [PATCH 41/64] updated ScaleDownFusion --- .../activations_scaling.hpp | 2 +- .../activations_scaling.cpp | 34 ++++++------------- 
.../src/plugin/transformations_pipeline.cpp | 2 +- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 15b8bb61081e65..0c2b1b76547bd2 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -53,7 +53,7 @@ class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::Mat class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherPass { public: OPENVINO_RTTI("ScaleDownFusion", "0"); - ScaleDownFusion(float scale_factor); + ScaleDownFusion(); }; class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pass::MatcherPass { diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 2a276c1d9eab8e..84bf7f319ee2cc 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -204,10 +204,12 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float // ScaleDownFusion merges multiple scale_down layers into one. // -// input Mul_c +// input input // / \ ==> | -// Mul_a Mul_b input -ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_factor) { +// Mul_a Mul_b Mul_a +// | | / | +// op_a op_b op_a op_b +ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { MATCHER_SCOPE(ScaleDownFusion); const auto is_scale_down_mul = [](const ov::Output& output) -> bool { @@ -229,9 +231,8 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_fact auto children = parent->get_users(); size_t num_scaled_down_nodes = 0; for (const auto& child : children) { - if (!is_scale_down_node(child)) - return false; - num_scaled_down_nodes += 1; + if (is_scale_down_node(child)) + num_scaled_down_nodes += 1; } if (num_scaled_down_nodes < 2) @@ -240,24 +241,11 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion(float scale_fact if (transformation_callback(mul)) return false; - if (!ov::is_type(parent) && !ov::is_type(parent) && - !ov::is_type(parent) && !ov::is_type(parent)) { - return false; - } - - ov::Shape scale_shape = {}; - std::vector scale_down_value = {1.f / scale_factor}; - - auto new_scale_down = std::make_shared( - parent->input(0).get_source_output(), - std::make_shared(parent->input(0).get_element_type(), scale_shape, scale_down_value)); - new_scale_down->set_friendly_name(parent->get_friendly_name() + "_scale_down"); - ov::copy_runtime_info(parent, new_scale_down); - parent->input(0).replace_source_output(new_scale_down->output(0)); - for (const auto& child : children) { - for (auto& target : child->get_output_target_inputs(0)) { - target.replace_source_output(parent->output(0)); + if (is_scale_down_node(child)) { + for (auto& target : child->get_output_target_inputs(0)) { + target.replace_source_output(mul->output(0)); + } } } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 3c34e00f479548..19abf6d9358274 100644 --- 
a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -949,7 +949,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }); manager.register_pass(activations_scale_factor, scaled_precision); - manager.register_pass(activations_scale_factor); + manager.register_pass(); auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); From 56219346c6ee6a66e872e3cead1071d5970e23b6 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 4 Dec 2024 14:02:38 +0900 Subject: [PATCH 42/64] fixed ScaleDownFusionTest --- .../activations_scaling_test.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index a7fc3934e908e7..b1eb7e17c8e78a 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -88,7 +88,7 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { auto result1 = std::make_shared(reshape_post1); model = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); - manager.register_pass(scale_factor); + manager.register_pass(); } { ov::Shape scale_const_shape = {}; @@ -97,16 +97,16 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto new_scale_down = std::make_shared(input->output(0), scale_down_const); - auto shape_pre = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 3, 256}); - auto reshape_pre = std::make_shared(new_scale_down, shape_pre, true); + auto reshape_pre = std::make_shared(input, shape_pre, true); auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); - auto reshape_post0 = std::make_shared(reshape_pre, shape_post, true); + auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); + ov::pass::activations_scaling::mark_as_scale_down_node(scale_down0); + auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); auto result0 = std::make_shared(reshape_post0); - auto reshape_post1 = std::make_shared(reshape_pre, shape_post, true); + auto reshape_post1 = std::make_shared(scale_down0, shape_post, true); auto result1 = std::make_shared(reshape_post1); model_ref = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); From 421989dfe3d375be69c5dbf673ecd1571cf19615 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 9 Dec 2024 03:22:07 +0900 Subject: [PATCH 43/64] added MulNormTransformation and NormMulTransformation --- .../activations_scaling.hpp | 22 +- .../activations_scaling.cpp | 200 ++++++++++-------- .../src/plugin/transformations_pipeline.cpp | 16 +- 3 files changed, 130 insertions(+), 108 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 0c2b1b76547bd2..cdad964f28d56c 100644 --- 
a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -31,9 +31,9 @@ class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute { class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API ScaleDownFusion; -class TRANSFORMATIONS_API MulGroupNormTransformation; -class TRANSFORMATIONS_API MulMVNTransformation; +class TRANSFORMATIONS_API MulNormTransformation; class TRANSFORMATIONS_API MulConcatTransformation; +class TRANSFORMATIONS_API NormMulTransformation; } // namespace activations_scaling } // namespace pass @@ -56,16 +56,10 @@ class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherP ScaleDownFusion(); }; -class ov::pass::activations_scaling::MulGroupNormTransformation : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulNormTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulGroupNormTransformation", "0"); - MulGroupNormTransformation(); -}; - -class ov::pass::activations_scaling::MulMVNTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("MulMVNTransformation", "0"); - MulMVNTransformation(); + OPENVINO_RTTI("MulNormTransformation", "0"); + MulNormTransformation(); }; class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { @@ -73,3 +67,9 @@ class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass:: OPENVINO_RTTI("MulConcatTransformation", "0"); MulConcatTransformation(); }; + +class ov::pass::activations_scaling::NormMulTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("NormMulTransformation", "0"); + NormMulTransformation(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 84bf7f319ee2cc..b489fe3f96701c 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -24,10 +24,12 @@ #include "openvino/op/variadic_split.hpp" #include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/common_optimizations/lin_op_sequence_fusion.hpp" #include "transformations/utils/utils.hpp" +#include "ov_ops/rms.hpp" namespace { const auto is_scalar_node = [](const ov::Output& output) -> bool { @@ -256,115 +258,79 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { this->register_matcher(m, callback); } -// GroupNormalization has the following property. +// Normalization has the following property. // -// GroupNorm(input * const_a) = GroupNorm(input) +// Norm(input * const_a) = Norm(input) // -// So, we can skip Multiply that is connected to GroupNormalization. +// So, we can skip Multiply that is connected to Normalization. 
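// (the same invariance holds for MVN, RMS and GroupNormalization, all of which are matched below)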
// -// input --> Multiply --> GroupNormalization +// input --> Multiply --> Normalization // ==> -// input --> GroupNormalization -ov::pass::activations_scaling::MulGroupNormTransformation::MulGroupNormTransformation() { - MATCHER_SCOPE(MulGroupNormTransformation); +// input --> Normalization +ov::pass::activations_scaling::MulNormTransformation::MulNormTransformation() { + MATCHER_SCOPE(MulNormTransformation); auto activation_m = any_input(is_non_const_node); + auto convert_m = ov::pass::pattern::optional(activation_m); auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, scale_const_m}); - auto norm_scale_m = any_input(); - auto norm_bias_m = any_input(); - auto norm_m = wrap_type({mul_m, norm_scale_m, norm_bias_m}); + auto mul_m = wrap_type({convert_m, scale_const_m}); + auto mvn_m = wrap_type({mul_m, any_input()}); + auto rms_m = wrap_type({mul_m, any_input()}); + auto group_norm_m = wrap_type({mul_m, any_input(), any_input()}); + auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - OPENVINO_ASSERT(pattern_map.count(mul_m)); - OPENVINO_ASSERT(pattern_map.count(norm_m)); - - auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm = - std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); - - if (transformation_callback(norm)) { + if (transformation_callback(m.get_match_root())) { return false; } - if (mul && norm) { - size_t activation_index = - ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; - auto activation = mul->get_input_source_output(activation_index); - if (ov::is_type(activation.get_node())) - activation = activation.get_node()->get_input_source_output(0); - auto newGroupNorm = std::make_shared>( - ov::op::v12::GroupNormalization(activation, - norm->get_input_source_output(1), - norm->get_input_source_output(2), - norm->get_num_groups(), - norm->get_epsilon()), - norm->get_output_element_type(0)); - ov::copy_runtime_info(norm, newGroupNorm); - ov::replace_node(norm, newGroupNorm); - return true; - } - return false; - }; - - auto m = std::make_shared(norm_m, "MulGroupNormTransformation"); - this->register_matcher(m, callback); -} - -// MVN has the following property. -// -// MVN(input * const_a) = MVN(input) -// -// So, we can skip Multiply that is connected to MVN. 
-// -// input --> Multiply --> MVN -// ==> -// input --> MVN -ov::pass::activations_scaling::MulMVNTransformation::MulMVNTransformation() { - MATCHER_SCOPE(MulMVNTransformation); - - auto activation_m = any_input(is_non_const_node); - auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, scale_const_m}); - auto norm_axes_m = any_input(); - auto norm_m = wrap_type({mul_m, norm_axes_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); + auto activation = pattern_map.at(activation_m); + auto norm = pattern_map.at(norm_m).get_node_shared_ptr(); - OPENVINO_ASSERT(pattern_map.count(mul_m)); - OPENVINO_ASSERT(pattern_map.count(norm_m)); - - auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); - auto norm = std::dynamic_pointer_cast(pattern_map.at(norm_m).get_node_shared_ptr()); - - if (transformation_callback(norm)) { - return false; + OutputVector new_inputs = {activation}; + for (size_t i = 1; i < norm->get_input_size(); ++i) { + new_inputs.push_back(norm->input(i).get_source_output()); } - if (mul && norm) { - size_t activation_index = - ov::is_type(mul->get_input_source_output(1).get_node()) ? 0 : 1; - auto activation = mul->get_input_source_output(activation_index); - if (ov::is_type(activation.get_node())) - activation = activation.get_node()->get_input_source_output(0); - auto newMVN = - std::make_shared>(ov::op::v6::MVN(activation, - norm->get_input_source_output(1), - norm->get_normalize_variance(), - norm->get_eps(), - norm->get_eps_mode()), - norm->get_output_element_type(0)); - ov::copy_runtime_info(norm, newMVN); - ov::replace_node(norm, newMVN); - return true; + std::shared_ptr new_norm; + if (pattern_map.count(mvn_m)) { + auto mvn = std::dynamic_pointer_cast(pattern_map.at(mvn_m).get_node_shared_ptr()); + new_norm = + std::make_shared>(ov::op::v6::MVN(new_inputs[0], + new_inputs[1], + mvn->get_normalize_variance(), + mvn->get_eps(), + mvn->get_eps_mode()), + mvn->get_output_element_type(0)); + + } else if (pattern_map.count(rms_m)) { + auto rms = std::dynamic_pointer_cast(pattern_map.at(rms_m).get_node_shared_ptr()); + new_norm = + std::make_shared>(ov::op::internal::RMS(new_inputs[0], + new_inputs[1], + rms->get_epsilon(), + rms->get_output_element_type(0)), + rms->get_output_element_type(0)); + } else { + auto group_norm = std::dynamic_pointer_cast(pattern_map.at(group_norm_m).get_node_shared_ptr()); + new_norm = std::make_shared>( + ov::op::v12::GroupNormalization(new_inputs[0], + new_inputs[1], + new_inputs[2], + group_norm->get_num_groups(), + group_norm->get_epsilon()), + group_norm->get_output_element_type(0)); } - return false; + new_norm->set_friendly_name(norm->get_friendly_name()); + ov::copy_runtime_info(norm, new_norm); + ov::replace_node(norm, new_norm); + + return true; }; - auto m = std::make_shared(norm_m, "MulMVNTransformation"); + auto m = std::make_shared(norm_m, "MulNormTransformation"); this->register_matcher(m, callback); } @@ -471,3 +437,59 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( auto m = std::make_shared(concat_m, "MulConcatTransformation"); this->register_matcher(m, callback); } + +// input input +// / \ | +// RMS Mul ==> Mul (expect to be fused into the input layer) +// | | / \_ +// op_a op_b RMS op_b +// | +// op_a +ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { + MATCHER_SCOPE(NormMulTransformation); + + auto mvn_m = 
wrap_type({any_input(), any_input()}); + auto rms_m = wrap_type({any_input(), any_input()}); + auto group_norm_m = wrap_type({any_input(), any_input(), any_input()}); + auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto norm = pattern_map.at(norm_m).get_node_shared_ptr(); + + auto parent_output = norm->get_input_source_output(0); + if (parent_output.get_target_inputs().size() != 2) + return false; + + ov::Node *mul = nullptr; + for (auto& child : parent_output.get_target_inputs()) { + if (child == norm->input(0)) + continue; + mul = child.get_node(); + } + + if (!ov::is_type(mul)) + return false; + + ov::Output const_input; + for (auto input : mul->input_values()) { + if (input == parent_output) + continue; + const_input = input; + } + + if (!is_scalar_node(const_input)) + return false; + + norm->input(0).replace_source_output(mul->output(0)); + return false; + }; + + auto m = std::make_shared(norm_m, "NormMulTransformation"); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 19abf6d9358274..1eaccf625d3e79 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -899,6 +899,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "TransformationsPipeline::apply::activations_scaling"); ov::pass::Manager manager("GPU:ActivationsScaling"); manager.set_per_pass_validation(false); + auto pass_config = manager.get_pass_config(); // Other ops support eltwise fusions const std::vector allowed_data_movement_ops = { @@ -918,6 +919,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // not working properly. manager.register_pass(); + manager.register_pass(true); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); ov::element::Type scaled_precision = element::f16; @@ -927,7 +933,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto supportedPrecisions = std::vector({}); auto perTensorQuantization = std::vector({}); - auto pass_config = manager.get_pass_config(); pass_config->disable(); pass_config->disable(); pass_config->disable(); @@ -952,9 +957,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); - lpt_pass->add_main(); - lpt_pass->add_main(); + lpt_pass->add_main(); lpt_pass->add_main(); + manager.register_pass(); } manager.run_passes(func); @@ -1027,11 +1032,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const size_t zp_pad_size = device_info.supports_immad ? 
16 : 32; manager.register_pass(zp_pad_size, device_info.supports_immad); - manager.register_pass(true); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - manager.register_pass(); manager.register_pass(); From 3e68e298a1f20e3c46804ae56450ab5885d4544d Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 9 Dec 2024 03:26:48 +0900 Subject: [PATCH 44/64] removed apply_rt_info --- src/inference/src/dev/core_impl.cpp | 16 ---------------- src/inference/src/dev/core_impl.hpp | 4 ---- 2 files changed, 20 deletions(-) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index cbea8ba5268755..e0e2fb109dc642 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -779,7 +779,6 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); - apply_rt_info(plugin, model_, parsed._config); ov::SoPtr res; // will consume ov::cache_dir if plugin not support it auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; @@ -814,7 +813,6 @@ ov::SoPtr ov::CoreImpl::compile_model(const std::shared_ptr< auto parsed = parseDeviceNameIntoConfig(deviceName, coreConfig, config_with_batch, is_proxy_device(deviceName)); auto plugin = get_plugin(parsed._deviceName); - apply_rt_info(plugin, model_, parsed._config); ov::SoPtr res; // will consume ov::cache_dir if plugin not support it auto cacheManager = parsed._core_config.get_cache_config_for_device(plugin, parsed._config)._cacheManager; @@ -1136,20 +1134,6 @@ std::shared_ptr ov::CoreImpl::apply_auto_batching(const std::sh return ov::details::apply_batch_affinity(model, deviceNameWithoutBatch); } -void ov::CoreImpl::apply_rt_info(const ov::Plugin& plugin, - const std::shared_ptr& model, - ov::AnyMap& config) const { - if (util::contains(plugin.get_property(ov::supported_properties), ov::hint::activations_scale_factor)) { - if (model->has_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"})) { - if (config.find("ACTIVATIONS_SCALE_FACTOR") == config.end()) { - const auto activations_scale_factor = - model->get_rt_info({"runtime_options", "ACTIVATIONS_SCALE_FACTOR"}); - config.insert(ov::hint::activations_scale_factor(activations_scale_factor)); - } - } - } -} - void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& properties) { OPENVINO_ASSERT(device_name.find("HETERO:") != 0, "set_property is supported only for HETERO itself (without devices). 
" diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 7e60c434bf3e57..85417175c22556 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -251,10 +251,6 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this& model, - ov::AnyMap& config) const; - /* * @brief Register plugins according to the build configuration */ From ce3a738d4f4470f709c047bf33b2febaeb7f03fa Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 9 Dec 2024 03:42:18 +0900 Subject: [PATCH 45/64] updated activations scaling unit tests --- .../activations_scaling_test.cpp | 30 ++----------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index b1eb7e17c8e78a..7f89461b8994a3 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -113,7 +113,7 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { } } -TEST_F(TransformationTestsF, MulGroupNormTransformationTest) { +TEST_F(TransformationTestsF, MulNormTransformationTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -126,7 +126,7 @@ TEST_F(TransformationTestsF, MulGroupNormTransformationTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -141,32 +141,6 @@ TEST_F(TransformationTestsF, MulGroupNormTransformationTest) { } } -TEST_F(TransformationTestsF, MulMVNTransformationTest) { - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 224, 224}); - auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); - auto mul = std::make_shared(input, scale_const); - auto norm_axes_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); - auto mvn = - std::make_shared(mul, norm_axes_const, true, 0.01f, ov::op::MVNEpsMode::INSIDE_SQRT); - auto convert = std::make_shared(mvn, ov::element::f32); - auto result = std::make_shared(convert); - - model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); - } - { - auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 224, 224}); - auto norm_axes_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 2, 3}); - auto mvn = - std::make_shared(input, norm_axes_const, true, 0.01f, ov::op::MVNEpsMode::INSIDE_SQRT); - auto convert = std::make_shared(mvn, ov::element::f32); - auto result = std::make_shared(convert); - - model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - } -} - TEST_F(TransformationTestsF, ConcatTransformationTest) { { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); From accce377216c3e420d5a2e546a5f95ffd1a1b671 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 9 Dec 2024 03:51:55 +0900 Subject: [PATCH 46/64] updated code style --- .../activations_scaling.cpp | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git 
a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index b489fe3f96701c..4e66e49e642bfd 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -27,9 +27,9 @@ #include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/rms.hpp" #include "transformations/common_optimizations/lin_op_sequence_fusion.hpp" #include "transformations/utils/utils.hpp" -#include "ov_ops/rms.hpp" namespace { const auto is_scalar_node = [](const ov::Output& output) -> bool { @@ -297,7 +297,7 @@ ov::pass::activations_scaling::MulNormTransformation::MulNormTransformation() { std::shared_ptr new_norm; if (pattern_map.count(mvn_m)) { auto mvn = std::dynamic_pointer_cast(pattern_map.at(mvn_m).get_node_shared_ptr()); - new_norm = + new_norm = std::make_shared>(ov::op::v6::MVN(new_inputs[0], new_inputs[1], mvn->get_normalize_variance(), @@ -307,21 +307,22 @@ ov::pass::activations_scaling::MulNormTransformation::MulNormTransformation() { } else if (pattern_map.count(rms_m)) { auto rms = std::dynamic_pointer_cast(pattern_map.at(rms_m).get_node_shared_ptr()); - new_norm = - std::make_shared>(ov::op::internal::RMS(new_inputs[0], - new_inputs[1], - rms->get_epsilon(), - rms->get_output_element_type(0)), - rms->get_output_element_type(0)); + new_norm = std::make_shared>( + ov::op::internal::RMS(new_inputs[0], + new_inputs[1], + rms->get_epsilon(), + rms->get_output_element_type(0)), + rms->get_output_element_type(0)); } else { - auto group_norm = std::dynamic_pointer_cast(pattern_map.at(group_norm_m).get_node_shared_ptr()); + auto group_norm = std::dynamic_pointer_cast( + pattern_map.at(group_norm_m).get_node_shared_ptr()); new_norm = std::make_shared>( - ov::op::v12::GroupNormalization(new_inputs[0], - new_inputs[1], - new_inputs[2], - group_norm->get_num_groups(), - group_norm->get_epsilon()), - group_norm->get_output_element_type(0)); + ov::op::v12::GroupNormalization(new_inputs[0], + new_inputs[1], + new_inputs[2], + group_norm->get_num_groups(), + group_norm->get_epsilon()), + group_norm->get_output_element_type(0)); } new_norm->set_friendly_name(norm->get_friendly_name()); ov::copy_runtime_info(norm, new_norm); @@ -466,7 +467,7 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { if (parent_output.get_target_inputs().size() != 2) return false; - ov::Node *mul = nullptr; + ov::Node* mul = nullptr; for (auto& child : parent_output.get_target_inputs()) { if (child == norm->input(0)) continue; From b2b56e493d1ae3b41b0f8529b870da357d6c71cb Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 10 Dec 2024 21:17:27 +0900 Subject: [PATCH 47/64] updated AddTransformation to use output_type instead of fp32 --- .../low_precision_transformations/src/add.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/common/low_precision_transformations/src/add.cpp b/src/common/low_precision_transformations/src/add.cpp index 1ba6f6598be247..7fa283089bef0b 100644 --- a/src/common/low_precision_transformations/src/add.cpp +++ b/src/common/low_precision_transformations/src/add.cpp @@ -214,14 +214,15 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt 
newSubtractFullPathValues), newMultiplyFullPathValues); + auto output_type = useDefaultTransformation ? element::f32 : add->get_output_element_type(0); newAddOrSubtract = std::make_shared>( - std::vector{element::f32, element::f32}, std::vector{ element::f32 }, - ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(), - ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get()); + std::vector{output_type, output_type}, std::vector{ output_type }, + ov::op::TemporaryReplaceOutputType(inputs[0], output_type).get(), + ov::op::TemporaryReplaceOutputType(inputs[1], output_type).get()); newMultiply = std::make_shared>( - std::vector{element::f32, element::f32}, std::vector{ add->get_output_element_type(0) }, - ov::op::TemporaryReplaceOutputType(newAddOrSubtract, element::f32).get(), - ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, element::f32).get()); + std::vector{output_type, output_type}, std::vector{ add->get_output_element_type(0) }, + ov::op::TemporaryReplaceOutputType(newAddOrSubtract, output_type).get(), + ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, output_type).get()); NetworkHelper::insertDequantizationAfter(add, newMultiply, newAddOrSubtract); NetworkHelper::copyInfo(add, newAddOrSubtract); From 1a81ad26c2e451fa4eb0e67e48c2f82c598d1578 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 10 Dec 2024 21:25:38 +0900 Subject: [PATCH 48/64] added a new EliminateMultiplyX1 pass --- .../activations_scaling.hpp | 15 +++-- .../activations_scaling.cpp | 60 ++++++++++++++++--- .../activations_scaling_test.cpp | 4 +- .../src/plugin/transformations_pipeline.cpp | 3 +- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index cdad964f28d56c..7f072f79e8a838 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -31,9 +31,10 @@ class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute { class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API ScaleDownFusion; -class TRANSFORMATIONS_API MulNormTransformation; +class TRANSFORMATIONS_API EliminateMultiplyNorm; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API NormMulTransformation; +class TRANSFORMATIONS_API EliminateMultiplyX1; } // namespace activations_scaling } // namespace pass @@ -56,10 +57,10 @@ class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherP ScaleDownFusion(); }; -class ov::pass::activations_scaling::MulNormTransformation : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::EliminateMultiplyNorm : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulNormTransformation", "0"); - MulNormTransformation(); + OPENVINO_RTTI("EliminateMultiplyNorm", "0"); + EliminateMultiplyNorm(); }; class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { @@ -73,3 +74,9 @@ class ov::pass::activations_scaling::NormMulTransformation : public ov::pass::Ma OPENVINO_RTTI("NormMulTransformation", "0"); NormMulTransformation(); }; + +class ov::pass::activations_scaling::EliminateMultiplyX1 : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("EliminateMultiplyX1", "0"); + EliminateMultiplyX1(); +}; diff --git 
a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 4e66e49e642bfd..eb43dc30edcefb 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -267,8 +267,8 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { // input --> Multiply --> Normalization // ==> // input --> Normalization -ov::pass::activations_scaling::MulNormTransformation::MulNormTransformation() { - MATCHER_SCOPE(MulNormTransformation); +ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { + MATCHER_SCOPE(EliminateMultiplyNorm); auto activation_m = any_input(is_non_const_node); auto convert_m = ov::pass::pattern::optional(activation_m); @@ -331,7 +331,7 @@ ov::pass::activations_scaling::MulNormTransformation::MulNormTransformation() { return true; }; - auto m = std::make_shared(norm_m, "MulNormTransformation"); + auto m = std::make_shared(norm_m, "EliminateMultiplyNorm"); this->register_matcher(m, callback); } @@ -441,18 +441,19 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( // input input // / \ | -// RMS Mul ==> Mul (expect to be fused into the input layer) +// Norm Mul ==> Mul (expect to be fused into the input layer) // | | / \_ -// op_a op_b RMS op_b +// op_a op_b Norm op_b // | -// op_a +// op_a ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { MATCHER_SCOPE(NormMulTransformation); auto mvn_m = wrap_type({any_input(), any_input()}); auto rms_m = wrap_type({any_input(), any_input()}); auto group_norm_m = wrap_type({any_input(), any_input(), any_input()}); - auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m}); + auto shape_of_m = wrap_type({any_input()}); + auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m, shape_of_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); @@ -494,3 +495,48 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { auto m = std::make_shared(norm_m, "NormMulTransformation"); this->register_matcher(m, callback); } + +ov::pass::activations_scaling::EliminateMultiplyX1::EliminateMultiplyX1() { + MATCHER_SCOPE(EliminateMultiplyX1); + + auto activation_m = any_input(is_non_const_node); + auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, mul_const_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto mul_const = std::dynamic_pointer_cast( + pattern_map.at(mul_const_m).get_node_shared_ptr()); + + float const_value = 0.f; + if (mul_const->get_element_type() == ov::element::f16) { + const_value = std::stof(mul_const->get_data_ptr()->to_string()); + } else if (mul_const->get_element_type() == ov::element::f32) { + const_value = *mul_const->get_data_ptr(); + } else { + return false; + } + + if (const_value != 1.f) + return false; + + auto activation = m.get_match_root()->get_input_source_output(0); + if (ov::is_type(m.get_match_root()->get_input_source_output(0).get_node())) + activation = m.get_match_root()->get_input_source_output(1); + + auto target_inputs = 
m.get_match_root()->get_output_target_inputs(0); + for (auto& in : target_inputs) { + in.replace_source_output(activation); + } + + return true; + }; + + auto m = std::make_shared(mul_m, "EliminateMultiplyX1"); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 7f89461b8994a3..55184446766592 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -113,7 +113,7 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { } } -TEST_F(TransformationTestsF, MulNormTransformationTest) { +TEST_F(TransformationTestsF, EliminateMultiplyNormTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -126,7 +126,7 @@ TEST_F(TransformationTestsF, MulNormTransformationTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 1eaccf625d3e79..b90bdce9950246 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -957,9 +957,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); - lpt_pass->add_main(); + lpt_pass->add_main(); lpt_pass->add_main(); manager.register_pass(); + manager.register_pass(); } manager.run_passes(func); From e05b39856c5d3283e8f2b50a0aeda3b4c98e1e69 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 10 Dec 2024 21:37:29 +0900 Subject: [PATCH 49/64] update code style --- .../common_optimizations/activations_scaling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index eb43dc30edcefb..849d16f7b593e3 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -510,8 +510,8 @@ ov::pass::activations_scaling::EliminateMultiplyX1::EliminateMultiplyX1() { return false; } - auto mul_const = std::dynamic_pointer_cast( - pattern_map.at(mul_const_m).get_node_shared_ptr()); + auto mul_const = + std::dynamic_pointer_cast(pattern_map.at(mul_const_m).get_node_shared_ptr()); float const_value = 0.f; if (mul_const->get_element_type() == ov::element::f16) { From f4abb206aa022e3acc808a334c30bef179f28bcd Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 16 Dec 2024 22:40:36 +0900 Subject: [PATCH 50/64] added a new MulMulTransformation --- .../activations_scaling.hpp | 7 +++ .../activations_scaling.cpp | 61 +++++++++++++++++-- .../src/plugin/transformations_pipeline.cpp 
| 10 +++ 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 7f072f79e8a838..4df8bb15c9366d 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -35,6 +35,7 @@ class TRANSFORMATIONS_API EliminateMultiplyNorm; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API NormMulTransformation; class TRANSFORMATIONS_API EliminateMultiplyX1; +class TRANSFORMATIONS_API MulMulTransformation; } // namespace activations_scaling } // namespace pass @@ -80,3 +81,9 @@ class ov::pass::activations_scaling::EliminateMultiplyX1 : public ov::pass::Matc OPENVINO_RTTI("EliminateMultiplyX1", "0"); EliminateMultiplyX1(); }; + +class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulMulTransformation", "0"); + MulMulTransformation(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 849d16f7b593e3..69ece01efcbb2a 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -277,7 +277,8 @@ ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { auto mvn_m = wrap_type({mul_m, any_input()}); auto rms_m = wrap_type({mul_m, any_input()}); auto group_norm_m = wrap_type({mul_m, any_input(), any_input()}); - auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m}); + auto shape_of_m = wrap_type({mul_m}); + auto norm_m = std::make_shared(OutputVector{mvn_m, rms_m, group_norm_m, shape_of_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); @@ -313,7 +314,7 @@ ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { rms->get_epsilon(), rms->get_output_element_type(0)), rms->get_output_element_type(0)); - } else { + } else if (pattern_map.count(group_norm_m)) { auto group_norm = std::dynamic_pointer_cast( pattern_map.at(group_norm_m).get_node_shared_ptr()); new_norm = std::make_shared>( @@ -323,6 +324,10 @@ ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { group_norm->get_num_groups(), group_norm->get_epsilon()), group_norm->get_output_element_type(0)); + } else if (pattern_map.count(shape_of_m)) { + auto shape_of = std::dynamic_pointer_cast( + pattern_map.at(shape_of_m).get_node_shared_ptr()); + new_norm = std::make_shared(new_inputs[0], shape_of->get_output_type()); } new_norm->set_friendly_name(norm->get_friendly_name()); ov::copy_runtime_info(norm, new_norm); @@ -485,11 +490,11 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { const_input = input; } - if (!is_scalar_node(const_input)) + if (!is_scalar_node(const_input) || !ov::is_type(const_input.get_node())) return false; norm->input(0).replace_source_output(mul->output(0)); - return false; + return true; }; auto m = std::make_shared(norm_m, "NormMulTransformation"); @@ -540,3 +545,51 @@ 
ov::pass::activations_scaling::EliminateMultiplyX1::EliminateMultiplyX1() { auto m = std::make_shared(mul_m, "EliminateMultiplyX1"); this->register_matcher(m, callback); } + +// input_b scalar input_a input_b +// \ / \ / +// input_a Mul_b ==> Mul_a' scalar +// \ / \ / +// Mul_a Mul_b' (expect to be merged with Mul_a') +ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { + MATCHER_SCOPE(MulMulTransformation); + + auto activation_b_m = any_input(is_non_const_node); + auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_b_m = wrap_type({activation_b_m, mul_const_m}); + auto activation_a_m = any_input(is_non_const_node); + auto mul_a_m = wrap_type({activation_a_m, mul_b_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto mul_a = pattern_map.at(mul_a_m).get_node_shared_ptr(); + auto mul_b = pattern_map.at(mul_b_m).get_node_shared_ptr(); + auto output_type = mul_a->get_output_element_type(0); + + auto new_mul_a = std::make_shared>( + std::vector{output_type, output_type}, std::vector{ output_type }, + ov::op::TemporaryReplaceOutputType(pattern_map.at(activation_a_m), output_type).get(), + ov::op::TemporaryReplaceOutputType(pattern_map.at(activation_b_m), output_type).get()); + new_mul_a->set_friendly_name(mul_a->get_friendly_name() + "_mm"); + ov::copy_runtime_info(mul_a, new_mul_a); + + auto new_mul_b = std::make_shared>( + std::vector{output_type, output_type}, std::vector{ output_type }, + ov::op::TemporaryReplaceOutputType(new_mul_a->output(0), output_type).get(), + ov::op::TemporaryReplaceOutputType(pattern_map.at(mul_const_m), output_type).get()); + new_mul_b->set_friendly_name(mul_b->get_friendly_name() + "_mm"); + ov::copy_runtime_info(mul_b, new_mul_b); + + ov::replace_node(mul_a, new_mul_b); + + return true; + }; + + auto m = std::make_shared(mul_a_m, "MulMulTransformation"); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index b90bdce9950246..17051becbdc5a2 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -16,6 +16,7 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" #include "low_precision/add.hpp" +#include "low_precision/clamp.hpp" #include "low_precision/concat.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" @@ -30,9 +31,12 @@ #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" #include "low_precision/recurrent_cell.hpp" +#include "low_precision/reshape.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/strided_slice.hpp" #include "low_precision/transpose.hpp" +#include "low_precision/unsqueeze.hpp" +#include "low_precision/variadic_split.hpp" #include "openvino/core/deprecated.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/validation_util.hpp" @@ -942,6 +946,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + 
pass_config->disable(); pass_config->set_callback( [](const std::shared_ptr &node) -> bool { @@ -959,6 +968,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); lpt_pass->add_main(); + lpt_pass->add_main(); manager.register_pass(); manager.register_pass(); } From 5da2ccde7f17fb8a080ec0803e680615ff64076c Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 18 Dec 2024 03:59:46 +0900 Subject: [PATCH 51/64] added MulDownTransformation --- .../activations_scaling.hpp | 14 +- .../activations_scaling.cpp | 142 +++++++----------- .../src/plugin/transformations_pipeline.cpp | 14 +- 3 files changed, 73 insertions(+), 97 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 4df8bb15c9366d..7ad8e32933bc7c 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -34,8 +34,8 @@ class TRANSFORMATIONS_API ScaleDownFusion; class TRANSFORMATIONS_API EliminateMultiplyNorm; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API NormMulTransformation; -class TRANSFORMATIONS_API EliminateMultiplyX1; class TRANSFORMATIONS_API MulMulTransformation; +class TRANSFORMATIONS_API MulDownTransformation; } // namespace activations_scaling } // namespace pass @@ -76,14 +76,14 @@ class ov::pass::activations_scaling::NormMulTransformation : public ov::pass::Ma NormMulTransformation(); }; -class ov::pass::activations_scaling::EliminateMultiplyX1 : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("EliminateMultiplyX1", "0"); - EliminateMultiplyX1(); -}; - class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass { public: OPENVINO_RTTI("MulMulTransformation", "0"); MulMulTransformation(); }; + +class ov::pass::activations_scaling::MulDownTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MulDownTransformation", "0"); + MulDownTransformation(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 69ece01efcbb2a..301c2050a1d6ec 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -290,48 +290,7 @@ ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { auto activation = pattern_map.at(activation_m); auto norm = pattern_map.at(norm_m).get_node_shared_ptr(); - OutputVector new_inputs = {activation}; - for (size_t i = 1; i < norm->get_input_size(); ++i) { - new_inputs.push_back(norm->input(i).get_source_output()); - } - - std::shared_ptr new_norm; - if (pattern_map.count(mvn_m)) { - auto mvn = std::dynamic_pointer_cast(pattern_map.at(mvn_m).get_node_shared_ptr()); - new_norm = - std::make_shared>(ov::op::v6::MVN(new_inputs[0], - new_inputs[1], - mvn->get_normalize_variance(), - mvn->get_eps(), - mvn->get_eps_mode()), - mvn->get_output_element_type(0)); - - } else if (pattern_map.count(rms_m)) { - auto rms = 
std::dynamic_pointer_cast(pattern_map.at(rms_m).get_node_shared_ptr()); - new_norm = std::make_shared>( - ov::op::internal::RMS(new_inputs[0], - new_inputs[1], - rms->get_epsilon(), - rms->get_output_element_type(0)), - rms->get_output_element_type(0)); - } else if (pattern_map.count(group_norm_m)) { - auto group_norm = std::dynamic_pointer_cast( - pattern_map.at(group_norm_m).get_node_shared_ptr()); - new_norm = std::make_shared>( - ov::op::v12::GroupNormalization(new_inputs[0], - new_inputs[1], - new_inputs[2], - group_norm->get_num_groups(), - group_norm->get_epsilon()), - group_norm->get_output_element_type(0)); - } else if (pattern_map.count(shape_of_m)) { - auto shape_of = std::dynamic_pointer_cast( - pattern_map.at(shape_of_m).get_node_shared_ptr()); - new_norm = std::make_shared(new_inputs[0], shape_of->get_output_type()); - } - new_norm->set_friendly_name(norm->get_friendly_name()); - ov::copy_runtime_info(norm, new_norm); - ov::replace_node(norm, new_norm); + norm->input(0).replace_source_output(activation); return true; }; @@ -501,51 +460,6 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { this->register_matcher(m, callback); } -ov::pass::activations_scaling::EliminateMultiplyX1::EliminateMultiplyX1() { - MATCHER_SCOPE(EliminateMultiplyX1); - - auto activation_m = any_input(is_non_const_node); - auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, mul_const_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - if (transformation_callback(m.get_match_root())) { - return false; - } - - auto mul_const = - std::dynamic_pointer_cast(pattern_map.at(mul_const_m).get_node_shared_ptr()); - - float const_value = 0.f; - if (mul_const->get_element_type() == ov::element::f16) { - const_value = std::stof(mul_const->get_data_ptr()->to_string()); - } else if (mul_const->get_element_type() == ov::element::f32) { - const_value = *mul_const->get_data_ptr(); - } else { - return false; - } - - if (const_value != 1.f) - return false; - - auto activation = m.get_match_root()->get_input_source_output(0); - if (ov::is_type(m.get_match_root()->get_input_source_output(0).get_node())) - activation = m.get_match_root()->get_input_source_output(1); - - auto target_inputs = m.get_match_root()->get_output_target_inputs(0); - for (auto& in : target_inputs) { - in.replace_source_output(activation); - } - - return true; - }; - - auto m = std::make_shared(mul_m, "EliminateMultiplyX1"); - this->register_matcher(m, callback); -} - // input_b scalar input_a input_b // \ / \ / // input_a Mul_b ==> Mul_a' scalar @@ -572,14 +486,16 @@ ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { auto output_type = mul_a->get_output_element_type(0); auto new_mul_a = std::make_shared>( - std::vector{output_type, output_type}, std::vector{ output_type }, + std::vector{output_type, output_type}, + std::vector{output_type}, ov::op::TemporaryReplaceOutputType(pattern_map.at(activation_a_m), output_type).get(), ov::op::TemporaryReplaceOutputType(pattern_map.at(activation_b_m), output_type).get()); new_mul_a->set_friendly_name(mul_a->get_friendly_name() + "_mm"); ov::copy_runtime_info(mul_a, new_mul_a); auto new_mul_b = std::make_shared>( - std::vector{output_type, output_type}, std::vector{ output_type }, + std::vector{output_type, output_type}, + std::vector{output_type}, ov::op::TemporaryReplaceOutputType(new_mul_a->output(0), output_type).get(), 
ov::op::TemporaryReplaceOutputType(pattern_map.at(mul_const_m), output_type).get()); new_mul_b->set_friendly_name(mul_b->get_friendly_name() + "_mm"); @@ -593,3 +509,51 @@ ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { auto m = std::make_shared(mul_a_m, "MulMulTransformation"); this->register_matcher(m, callback); } + +// input scalar input +// \ / | +// Mul [Reshape, +// | ==> Transpose] scalar +// [Reshape, \ / +// Transpose] Mul +// | | +// output output +ov::pass::activations_scaling::MulDownTransformation::MulDownTransformation() { + MATCHER_SCOPE(MulDownTransformation); + + auto activation_m = any_input(is_non_const_node); + auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, mul_const_m}); + auto reshape_m = pattern::wrap_type({ mul_m, any_input() }); + auto transpose_m = pattern::wrap_type({ mul_m, any_input() }); + auto matcher_m = std::make_shared(OutputVector{reshape_m, transpose_m}); + + ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + if (transformation_callback(m.get_match_root())) { + return false; + } + + auto activation = pattern_map.at(activation_m); + auto mul_const = pattern_map.at(mul_const_m); + auto mul = pattern_map.at(mul_m).get_node_shared_ptr(); + auto op = pattern_map.at(matcher_m).get_node_shared_ptr(); + std::set> target_inputs = op->output(0).get_target_inputs(); + + op->input(0).replace_source_output(activation); + + auto new_mul = std::make_shared(op->output(0), mul_const); + new_mul->set_friendly_name(mul->get_friendly_name() + "_d"); + ov::copy_runtime_info(mul, new_mul); + + for (auto& in : target_inputs) { + in.replace_source_output(new_mul); + } + + return true; + }; + + auto m = std::make_shared(matcher_m, "MulDownTransformation"); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 17051becbdc5a2..65e5addab5a261 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -964,13 +964,25 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(activations_scale_factor, scaled_precision); manager.register_pass(); + + // Move down scalar-multiply layers as much as possible auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); lpt_pass->add_main(); lpt_pass->add_main(); + lpt_pass->add_main(); + + // Move up remained scalar-multiply layers + manager.register_pass(); manager.register_pass(); - manager.register_pass(); + + const std::vector allowed_data_movement_ops = { + ov::op::v1::Reshape::get_type_info_static(), + ov::op::v1::Transpose::get_type_info_static(), + }; + manager.register_pass(allowed_data_movement_ops); + manager.register_pass(); } manager.run_passes(func); From 74971f859e9d972092ab98e5d0e315ed9085120a Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 18 Dec 2024 15:48:51 +0900 Subject: [PATCH 52/64] fixed code style --- .../common_optimizations/activations_scaling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp 
b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 301c2050a1d6ec..1cb116342f0b99 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -524,8 +524,8 @@ ov::pass::activations_scaling::MulDownTransformation::MulDownTransformation() { auto activation_m = any_input(is_non_const_node); auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); auto mul_m = wrap_type({activation_m, mul_const_m}); - auto reshape_m = pattern::wrap_type({ mul_m, any_input() }); - auto transpose_m = pattern::wrap_type({ mul_m, any_input() }); + auto reshape_m = pattern::wrap_type({mul_m, any_input()}); + auto transpose_m = pattern::wrap_type({mul_m, any_input()}); auto matcher_m = std::make_shared(OutputVector{reshape_m, transpose_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { From 4626cdb649ea3521d11525ffa0313f6754ec58a0 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Mon, 23 Dec 2024 23:25:49 +0900 Subject: [PATCH 53/64] added a functional test --- .../src/plugin/transformations_pipeline.cpp | 20 +- .../dynamic/activations_scaling.cpp | 217 ++++++++++++++++++ 2 files changed, 227 insertions(+), 10 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 65e5addab5a261..e52cc9bcbcc021 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -291,6 +291,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const auto& defaultPrecisions = ov::pass::low_precision::precision_set::get_int8_support(); const ov::element::TypeVector supported_woq_types = {ov::element::u8, ov::element::i8, ov::element::u4, ov::element::i4}; bool enableInt8; + ov::element::Type infer_precision = ov::element::undefined; bool unroll_loop = config.get_property(ov::intel_gpu::enable_loop_unrolling); { ov::pass::Manager manager("Plugin:GPU"); @@ -337,7 +338,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }; // Add conversion from FP data types to infer precision if it's specified - auto infer_precision = config.get_property(ov::hint::inference_precision); + infer_precision = config.get_property(ov::hint::inference_precision); if (infer_precision != ov::element::undefined) { if (!fp_precision_supported(infer_precision)) infer_precision = fallback_precision; @@ -929,9 +930,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor); - ov::element::Type scaled_precision = element::f16; - if (activations_scale_factor > 0.f) { + if (activations_scale_factor > 0.f && infer_precision == ov::element::f16) { using namespace ov::pass::low_precision; auto supportedPrecisions = std::vector({}); @@ -947,10 +947,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); - pass_config->disable(); + // pass_config->disable(); + // pass_config->disable(); + // pass_config->disable(); + // pass_config->disable(); pass_config->set_callback( [](const 
std::shared_ptr &node) -> bool { @@ -962,16 +962,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return (ov::is_dequantization_node(node) || ov::is_type(node)); }); - manager.register_pass(activations_scale_factor, scaled_precision); + manager.register_pass(activations_scale_factor, infer_precision); manager.register_pass(); // Move down scalar-multiply layers as much as possible - auto params = LayerTransformation::Params(false, scaled_precision, {scaled_precision}, true, false); + auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); lpt_pass->add_main(); lpt_pass->add_main(); lpt_pass->add_main(); - lpt_pass->add_main(); + // lpt_pass->add_main(); // Move up remained scalar-multiply layers manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp new file mode 100644 index 00000000000000..64042c7ab0b9e9 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp @@ -0,0 +1,217 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/file_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/sqrt.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/reduce_mean.hpp" + +namespace { +using ov::test::InputShape; + +struct ShapeParams { + ShapeParams() = default; + ShapeParams(std::vector input_shape, std::vector weights_shapes) + : input_shape(std::move(input_shape)), + weights_shapes(std::move(weights_shapes)) {} + + std::vector input_shape; + std::vector weights_shapes; +}; + +using ActivationsScalingParams = std::tuple; // input precision + +class ActivationsScaling : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + ShapeParams shape_params; + ov::element::Type input_precision; + + std::tie(shape_params, input_precision) = obj.param; + + std::ostringstream result; + result << "IS=("; + for (const auto& shape : shape_params.input_shape) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + } + result << ")_TS="; + for (const auto& shape : shape_params.input_shape) { + result << "("; + if (!shape.second.empty()) { + auto itr = shape.second.begin(); + do { + result << ov::test::utils::vec2str(*itr); + } while (++itr != shape.second.end() && result << "_"); + } + result << ")_"; + } + result << "input_precision=" << input_precision; + return result.str(); + } + +protected: + // input + // / \_ + // MatMul MutMul + // | | + // Add Add + // | | + // Variadic Reshape + // Split | + // / \ Transpose + // Add \ | + // / \ | + // Reshape Reshape | + // | | | + // Transpose Transpose / + // | \ / / + // | Add / + // | / \ / + // Concat Concat + // \ / + // Add + // | + // Output + std::shared_ptr init_subgraph(const std::vector& input_shapes, + const std::vector& weights_shapes, + const 
ov::element::Type input_precision) { + ov::ParameterVector params{std::make_shared(input_precision, input_shapes[0])}; + + const auto weights_tensor0 = ov::test::utils::create_and_fill_tensor( + input_precision, weights_shapes[0], ov::test::utils::InputGenerateData(-1, 2, 1357, 1)); + auto weight0 = std::make_shared(weights_tensor0); + + const auto weights_tensor1 = ov::test::utils::create_and_fill_tensor( + input_precision, weights_shapes[1], ov::test::utils::InputGenerateData(-1, 2, 1357, 1)); + auto weight1 = std::make_shared(weights_tensor1); + + auto matmul0 = std::make_shared(params[0], weight0, false, false); + auto matmul1 = std::make_shared(params[0], weight1, false, false); + + std::vector bias_shape = {{weights_shapes[0][1]}, {weights_shapes[1][1]}}; + const auto bias_tensor0 = ov::test::utils::create_and_fill_tensor( + input_precision, bias_shape[0], ov::test::utils::InputGenerateData(-1, 2, 1357, 1)); + const auto bias_tensor1 = ov::test::utils::create_and_fill_tensor( + input_precision, bias_shape[1], ov::test::utils::InputGenerateData(-1, 2, 1357, 1)); + + auto bias0 = std::make_shared(bias_tensor0); + auto bias1 = std::make_shared(bias_tensor1); + + auto add0 = std::make_shared(matmul0, bias0); + auto add1 = std::make_shared(matmul1, bias1); + + int32_t last_size0 = static_cast(weights_shapes[0][1]); + auto axis0 = ov::op::v0::Constant::create(ov::element::i32, {1}, {2}); + auto split_lengths0 = ov::op::v0::Constant::create(ov::element::i32, {2}, {last_size0/2, last_size0/2}); + auto variadic_split0 = std::make_shared(add0, axis0, split_lengths0); + + ov::Shape bias_shape00 = {weights_shapes[0][1]/2}; + const auto bias_tensor00 = ov::test::utils::create_and_fill_tensor( + input_precision, bias_shape00, ov::test::utils::InputGenerateData(-1, 2, 1357, 1)); + auto bias00 = std::make_shared(bias_tensor00); + auto add00 = std::make_shared(variadic_split0->output(0), bias00); + + const std::vector reshape_target_shape = {0, -1, 16, (last_size0/32)}; + auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, + {reshape_target_shape.size()}, + reshape_target_shape); + + auto reshape0 = std::make_shared(add00, reshape_const, true); + auto reshape1 = std::make_shared(variadic_split0->output(1), reshape_const, true); + auto reshape2 = std::make_shared(add1, reshape_const, true); + + const std::vector transpose_order = {0, 2, 1, 3}; + auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, + {transpose_order.size()}, + transpose_order); + + auto transpose0 = std::make_shared(reshape0, transpose_const); + auto transpose1 = std::make_shared(reshape1, transpose_const); + auto transpose2 = std::make_shared(reshape2, transpose_const); + + auto add12 = std::make_shared(transpose1, transpose2); + + auto concat0 = std::make_shared(ov::NodeVector{transpose0, add12}, 3); + + auto concat1 = std::make_shared(ov::NodeVector{add12, transpose2}, 3); + + auto add = std::make_shared(concat0, concat1); + + return std::make_shared(ov::NodeVector{add}, params, "ActivationsScaling"); + } + + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_GPU; + + ShapeParams shape_params; + ov::element::Type input_precision; + + std::tie(shape_params, input_precision) = GetParam(); + + init_input_shapes(shape_params.input_shape); + + inType = outType = input_precision; + + if (input_precision == ov::element::f16) { + abs_threshold = 1.0f; + } else { + abs_threshold = 1e-4f; + } + + function = init_subgraph(inputDynamicShapes, shape_params.weights_shapes, input_precision); + } +}; 
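// A usage sketch (illustrative, not part of this test fixture): the whole
// feature is driven by a single runtime property and, per the
// `activations_scale_factor > 0.f && infer_precision == ov::element::f16`
// check in TransformationsPipeline::apply(), it only takes effect when the
// inference precision is f16. The model path and the factor value below are
// assumptions chosen for illustration:
//
//   ov::Core core;
//   auto model = core.read_model("model.xml");
//   auto compiled = core.compile_model(model,
//                                      "GPU",
//                                      ov::hint::inference_precision(ov::element::f16),
//                                      ov::hint::activations_scale_factor(8.0f));
//
// A factor of zero or less leaves the graph untouched.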
+ +TEST_P(ActivationsScaling, Inference) { + core->set_property(targetDevice, ov::hint::activations_scale_factor(4.3)); + run(); + ov::serialize(compiledModel.get_runtime_model(), "test.xml"); +} + +// TEST_P(ActivationsScaling, Inference_cached) { +// std::stringstream ss; +// ss << "gpu_model_cache_" << std::hash{}( +// std::string(::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name()) + +// std::string(::testing::UnitTest::GetInstance()->current_test_info()->name())); +// std::string cacheDirName = ss.str(); +// { +// ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); +// ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); +// ov::test::utils::removeDir(cacheDirName); +// core->set_property(ov::cache_dir(cacheDirName)); +// compile_model(); +// } +// { +// run(); +// ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); +// ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); +// ov::test::utils::removeDir(cacheDirName); +// } +// } + +const std::vector input_precisions = {ov::element::f16}; + +const std::vector input_shapes = { + {{{{-1, 32, 128}, {{1, 32, 128}}}}, {{128, 128}, {128, 64}}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_ActivationsScaling_basic, + ActivationsScaling, + ::testing::Combine(::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(input_precisions)), + ActivationsScaling::getTestCaseName); +} // namespace From 92f1427ac138e4625813195cd69c38ec704a78d3 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 24 Dec 2024 16:40:50 +0900 Subject: [PATCH 54/64] applied reviews --- .../activations_scaling.hpp | 30 +--- .../activations_scaling.cpp | 154 ++++++------------ .../activations_scaling_test.cpp | 7 +- .../src/plugin/transformations_pipeline.cpp | 5 +- 4 files changed, 61 insertions(+), 135 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 7ad8e32933bc7c..e6f96572cf1d00 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -16,26 +16,12 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { -TRANSFORMATIONS_API void mark_as_scale_down_node(const std::shared_ptr& node); - -TRANSFORMATIONS_API bool is_scale_down_node(const std::shared_ptr& node); - -class TRANSFORMATIONS_API ScaleDownNode : public RuntimeAttribute { -public: - OPENVINO_RTTI("scale_down_node", "0"); - - bool is_copyable() const override { - return false; - } -}; - class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API ScaleDownFusion; -class TRANSFORMATIONS_API EliminateMultiplyNorm; +class TRANSFORMATIONS_API EliminateMultiplyScalar; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API NormMulTransformation; class TRANSFORMATIONS_API MulMulTransformation; -class TRANSFORMATIONS_API MulDownTransformation; } // namespace activations_scaling } // namespace pass @@ -44,7 +30,7 @@ class TRANSFORMATIONS_API MulDownTransformation; // ActivationsScaling makes activation values smaller to prevent overflow due to the limited range of FP16 // This feature is controlled by ov::hint::activations_scale_factor. // For example, when this property is set as 16, activations are divided by 16. -// If ov::hint::activations_scale_factor is less than zero, it is disabled. 
+// If ov::hint::activations_scale_factor is less than or equal to zero, it is disabled. class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: @@ -58,10 +44,10 @@ class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherP ScaleDownFusion(); }; -class ov::pass::activations_scaling::EliminateMultiplyNorm : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::EliminateMultiplyScalar : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("EliminateMultiplyNorm", "0"); - EliminateMultiplyNorm(); + OPENVINO_RTTI("EliminateMultiplyScalar", "0"); + EliminateMultiplyScalar(); }; class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { @@ -81,9 +67,3 @@ class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::Mat OPENVINO_RTTI("MulMulTransformation", "0"); MulMulTransformation(); }; - -class ov::pass::activations_scaling::MulDownTransformation : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("MulDownTransformation", "0"); - MulDownTransformation(); -}; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 1cb116342f0b99..ddf32e494db665 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -34,13 +34,9 @@ namespace { const auto is_scalar_node = [](const ov::Output& output) -> bool { const auto shape = output.get_partial_shape(); - if (shape.is_dynamic() || shape.rank().is_dynamic()) + if (shape.is_dynamic()) return false; - if (std::all_of(shape.begin(), shape.end(), [](const ov::Dimension& dimension) { - return dimension == 1ul; - })) - return true; - return false; + return ov::shape_size(shape.to_shape()) == 1; }; const auto is_non_const_node = [](const ov::Output& output) -> bool { @@ -48,16 +44,6 @@ const auto is_non_const_node = [](const ov::Output& output) -> bool { }; } // namespace -void ov::pass::activations_scaling::mark_as_scale_down_node(const std::shared_ptr& node) { - auto& rt_info = node->get_rt_info(); - rt_info[ScaleDownNode::get_type_info_static()] = ScaleDownNode(); -} - -bool ov::pass::activations_scaling::is_scale_down_node(const std::shared_ptr& node) { - const auto& rt_info = node->get_rt_info(); - return rt_info.find(ScaleDownNode::get_type_info_static()) != rt_info.end(); -} - using namespace ov::pass::activations_scaling; using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; @@ -82,25 +68,25 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m)); // scale_down and scale_up layers will be added around scaled_op - std::shared_ptr scaled_op = nullptr; + std::shared_ptr scaled_op = nullptr; if (pattern_map.count(convolution_m)) - scaled_op = std::dynamic_pointer_cast(pattern_map.at(convolution_m).get_node_shared_ptr()); + scaled_op = pattern_map.at(convolution_m).get_node_shared_ptr(); if (pattern_map.count(matmul_m)) - scaled_op = std::dynamic_pointer_cast(pattern_map.at(matmul_m).get_node_shared_ptr()); + scaled_op = pattern_map.at(matmul_m).get_node_shared_ptr(); if (transformation_callback(scaled_op)) return false; // in the case of decompressed_to_f32 nodes, scale_up layer will be added after Convert node. 
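// Conceptually, the rewrite built below is:
//
//   output = Multiply( op_in_scaled_prec( Multiply(input, 1.f / scale_factor) ), scale_factor )
//
// i.e. the data input is scaled down in the original precision, the matched
// Convolution/MatMul runs in scaled_prec, and the result is scaled back up,
// so intermediate f16 activations stay inside the representable range while
// the result of these scale-linear ops is preserved. keep_precision below
// suppresses the extra Convert nodes for the decompressed_to_f32 case
// described above.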
bool keep_precision = false; - std::shared_ptr output_of_scaled_op = scaled_op; + std::shared_ptr output_of_scaled_op = scaled_op; auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node(); if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type(child_node) && ov::fp16_compression_is_disabled(child_node->shared_from_this()) && ov::pass::constant_folding_is_disabled(child_node->shared_from_this())) { - output_of_scaled_op = std::dynamic_pointer_cast(child_node->shared_from_this()); + output_of_scaled_op = child_node->shared_from_this(); child_node = output_of_scaled_op->get_output_target_inputs(0).begin()->get_node(); keep_precision = true; } @@ -118,7 +104,6 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_down_value)); scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); ov::copy_runtime_info(scaled_op, scale_down); - mark_as_scale_down_node(scale_down); if (scale_down->output(0).get_element_type() != scaled_prec && !keep_precision) { auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); @@ -176,7 +161,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (add->output(0).get_element_type() != output_prec && !keep_precision) { output_of_scaled_op = std::make_shared(add->output(0), output_prec); } else { - output_of_scaled_op = std::dynamic_pointer_cast(add); + output_of_scaled_op = add; } } else { target_inputs = output_of_scaled_op->get_output_target_inputs(0); @@ -209,49 +194,64 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float // input input // / \ ==> | // Mul_a Mul_b Mul_a -// | | / | -// op_a op_b op_a op_b +// | | / \_ +// op_a op_b op_a op_b ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { MATCHER_SCOPE(ScaleDownFusion); - const auto is_scale_down_mul = [](const ov::Output& output) -> bool { - return is_scale_down_node(output.get_node_shared_ptr()); + const auto get_const_value = [](const std::shared_ptr& node, + float& const_value) -> bool { + auto const_node = ov::as_type_ptr(node); + if (const_node == nullptr) + return false; + + if (const_node->get_element_type() == ov::element::f16) { + const_value = std::stof(const_node->get_data_ptr()->to_string()); + } else if (const_node->get_element_type() == ov::element::f32) { + const_value = *const_node->get_data_ptr(); + } else { + return false; + } + return true; }; auto activation_m = any_input(); - auto scale_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, scale_const_m}, is_scale_down_mul); + auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); + auto mul_m = wrap_type({activation_m, mul_const_m}); ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_m).get_node_shared_ptr()); + auto mul = pattern_map.at(mul_m).get_node_shared_ptr(); auto parent = mul->get_input_node_shared_ptr(0); - if (parent->get_output_size() > 1) + if (parent->get_output_size() > 1 || parent->get_users().size() == 1) return false; - auto children = parent->get_users(); - size_t num_scaled_down_nodes = 0; - for (const auto& child : children) { - if (is_scale_down_node(child)) - num_scaled_down_nodes += 1; - } - - if (num_scaled_down_nodes < 2) + if (transformation_callback(mul)) return false; - if (transformation_callback(mul)) + float mul_const_value = 0.f; + if 
(!get_const_value(pattern_map.at(mul_const_m).get_node_shared_ptr(), mul_const_value)) return false; + size_t num_fused_mul_nodes = 0; + auto children = parent->get_users(); for (const auto& child : children) { - if (is_scale_down_node(child)) { - for (auto& target : child->get_output_target_inputs(0)) { - target.replace_source_output(mul->output(0)); + if (child == mul) + continue; + if (ov::is_type(child)) { + float mul_const_value2 = 0.f; + if (get_const_value(child->input(0).get_source_output().get_node_shared_ptr(), mul_const_value2) || + get_const_value(child->input(1).get_source_output().get_node_shared_ptr(), mul_const_value2)) { + if (mul_const_value == mul_const_value2) { + ov::replace_output_update_name(child->output(0), mul->output(0)); + num_fused_mul_nodes += 1; + } } } } - return true; + return (num_fused_mul_nodes > 0); }; auto m = std::make_shared(mul_m, "ScaleDownFusion"); @@ -267,8 +267,8 @@ ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { // input --> Multiply --> Normalization // ==> // input --> Normalization -ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { - MATCHER_SCOPE(EliminateMultiplyNorm); +ov::pass::activations_scaling::EliminateMultiplyScalar::EliminateMultiplyScalar() { + MATCHER_SCOPE(EliminateMultiplyScalar); auto activation_m = any_input(is_non_const_node); auto convert_m = ov::pass::pattern::optional(activation_m); @@ -295,7 +295,7 @@ ov::pass::activations_scaling::EliminateMultiplyNorm::EliminateMultiplyNorm() { return true; }; - auto m = std::make_shared(norm_m, "EliminateMultiplyNorm"); + auto m = std::make_shared(norm_m, "EliminateMultiplyScalar"); this->register_matcher(m, callback); } @@ -326,9 +326,9 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( OPENVINO_ASSERT(pattern_map.count(concat_m)); - auto concat = std::dynamic_pointer_cast(pattern_map.at(concat_m).get_node_shared_ptr()); + auto concat = pattern_map.at(concat_m).get_node_shared_ptr(); - if (transformation_callback(concat_m)) { + if (transformation_callback(concat)) { return false; } @@ -337,13 +337,13 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::element::Type last_dep_const_type = ov::element::undefined; for (auto& input : concat->inputs()) { auto dep_node = - std::dynamic_pointer_cast(input.get_source_output().get_node_shared_ptr()); + ov::as_type_ptr(input.get_source_output().get_node_shared_ptr()); if (!dep_node) { return false; } - auto dep_const0 = std::dynamic_pointer_cast( + auto dep_const0 = ov::as_type_ptr( dep_node->input(0).get_source_output().get_node_shared_ptr()); - auto dep_const1 = std::dynamic_pointer_cast( + auto dep_const1 = ov::as_type_ptr( dep_node->input(1).get_source_output().get_node_shared_ptr()); if (!dep_const0 && !dep_const1) { return false; @@ -449,7 +449,7 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { const_input = input; } - if (!is_scalar_node(const_input) || !ov::is_type(const_input.get_node())) + if (!is_scalar_node(const_input) || is_non_const_node(const_input)) return false; norm->input(0).replace_source_output(mul->output(0)); @@ -509,51 +509,3 @@ ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { auto m = std::make_shared(mul_a_m, "MulMulTransformation"); this->register_matcher(m, callback); } - -// input scalar input -// \ / | -// Mul [Reshape, -// | ==> Transpose] scalar -// [Reshape, \ / -// Transpose] Mul -// | | -// output output 
-ov::pass::activations_scaling::MulDownTransformation::MulDownTransformation() { - MATCHER_SCOPE(MulDownTransformation); - - auto activation_m = any_input(is_non_const_node); - auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, mul_const_m}); - auto reshape_m = pattern::wrap_type({mul_m, any_input()}); - auto transpose_m = pattern::wrap_type({mul_m, any_input()}); - auto matcher_m = std::make_shared(OutputVector{reshape_m, transpose_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - if (transformation_callback(m.get_match_root())) { - return false; - } - - auto activation = pattern_map.at(activation_m); - auto mul_const = pattern_map.at(mul_const_m); - auto mul = pattern_map.at(mul_m).get_node_shared_ptr(); - auto op = pattern_map.at(matcher_m).get_node_shared_ptr(); - std::set> target_inputs = op->output(0).get_target_inputs(); - - op->input(0).replace_source_output(activation); - - auto new_mul = std::make_shared(op->output(0), mul_const); - new_mul->set_friendly_name(mul->get_friendly_name() + "_d"); - ov::copy_runtime_info(mul, new_mul); - - for (auto& in : target_inputs) { - in.replace_source_output(new_mul); - } - - return true; - }; - - auto m = std::make_shared(matcher_m, "MulDownTransformation"); - this->register_matcher(m, callback); -} diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 55184446766592..6e43d9a85e8f0b 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -78,12 +78,10 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); - ov::pass::activations_scaling::mark_as_scale_down_node(scale_down0); auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); auto result0 = std::make_shared(reshape_post0); auto scale_down1 = std::make_shared(reshape_pre->output(0), scale_down_const); - ov::pass::activations_scaling::mark_as_scale_down_node(scale_down1); auto reshape_post1 = std::make_shared(scale_down1, shape_post, true); auto result1 = std::make_shared(reshape_post1); @@ -102,7 +100,6 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); - ov::pass::activations_scaling::mark_as_scale_down_node(scale_down0); auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); auto result0 = std::make_shared(reshape_post0); @@ -113,7 +110,7 @@ TEST_F(TransformationTestsF, ScaleDownFusionTest) { } } -TEST_F(TransformationTestsF, EliminateMultiplyNormTest) { +TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -126,7 +123,7 @@ TEST_F(TransformationTestsF, EliminateMultiplyNormTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + 
manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index e52cc9bcbcc021..f8fb56f04fb9c1 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -949,8 +949,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); // pass_config->disable(); // pass_config->disable(); - // pass_config->disable(); - // pass_config->disable(); pass_config->set_callback( [](const std::shared_ptr &node) -> bool { @@ -968,10 +966,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // Move down scalar-multiply layers as much as possible auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); - lpt_pass->add_main(); + lpt_pass->add_main(); lpt_pass->add_main(); lpt_pass->add_main(); - // lpt_pass->add_main(); // Move up remained scalar-multiply layers manager.register_pass(); From f389d3408dcc0dc2f1752c9d7ebbeabadbf4a1a0 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Tue, 24 Dec 2024 17:28:19 +0900 Subject: [PATCH 55/64] merged master --- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index f8fb56f04fb9c1..7e99d2216c1657 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -1019,12 +1019,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); - - if (config.get_property(ov::enable_static_scaling)) { - float scale_factor = func->get_rt_info().count("scale_factor") ? func->get_rt_info("scale_factor") : 0.f; - manager.register_pass(); - } - manager.register_pass(); auto pass_config = manager.get_pass_config(); manager.register_pass(); From ef31f2c50fc90f628c0a8f7dd3ab9070cb3df1c3 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 2 Jan 2025 12:52:26 +0900 Subject: [PATCH 56/64] applied reviews --- .../src/multiply_partial.cpp | 11 +-- .../activations_scaling.hpp | 17 ++-- .../activations_scaling.cpp | 80 ++----------------- .../activations_scaling_test.cpp | 46 ----------- .../src/plugin/transformations_pipeline.cpp | 5 +- .../dynamic/activations_scaling.cpp | 40 +++++----- 6 files changed, 38 insertions(+), 161 deletions(-) diff --git a/src/common/low_precision_transformations/src/multiply_partial.cpp b/src/common/low_precision_transformations/src/multiply_partial.cpp index f09b0ed866f420..102dde7f1cd65c 100644 --- a/src/common/low_precision_transformations/src/multiply_partial.cpp +++ b/src/common/low_precision_transformations/src/multiply_partial.cpp @@ -79,16 +79,17 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov auto constParent = multiply->input_value(multiplyBranch.first == 0 ? 1 : 0); auto multiplyParentParent = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second); auto multiplyParentConst = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second == 0 ? 1 : 0); + auto input_data_type = useDefaultTransformation ? 
element::f32 : multiply->get_output_element_type(0); newMultiply = std::make_shared>( - std::vector{ element::f32, element::f32 }, + std::vector{ input_data_type, input_data_type }, std::vector{ multiply->get_output_element_type(0) }, - ov::op::TemporaryReplaceOutputType(multiplyParentParent, element::f32).get(), + ov::op::TemporaryReplaceOutputType(multiplyParentParent, input_data_type).get(), ov::op::TemporaryReplaceOutputType( fold( - foldConvert(multiplyParentConst, element::f32), - foldConvert(constParent, element::f32)), - element::f32).get()); + foldConvert(multiplyParentConst, input_data_type), + foldConvert(constParent, input_data_type)), + input_data_type).get()); NetworkHelper::copyInfo(multiplyParent.get_node_shared_ptr(), newMultiply); NetworkHelper::copyInfo(multiply, newMultiply); diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index e6f96572cf1d00..295666fdec4222 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -17,7 +17,6 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { class TRANSFORMATIONS_API ScaleDownSingleLayer; -class TRANSFORMATIONS_API ScaleDownFusion; class TRANSFORMATIONS_API EliminateMultiplyScalar; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API NormMulTransformation; @@ -34,36 +33,30 @@ class TRANSFORMATIONS_API MulMulTransformation; class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("ScaleDownSingleLayer", "0"); + OPENVINO_MATCHER_PASS_RTTI("ScaleDownSingleLayer", "0"); ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec); }; -class ov::pass::activations_scaling::ScaleDownFusion : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("ScaleDownFusion", "0"); - ScaleDownFusion(); -}; - class ov::pass::activations_scaling::EliminateMultiplyScalar : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("EliminateMultiplyScalar", "0"); + OPENVINO_MATCHER_PASS_RTTI("EliminateMultiplyScalar", "0"); EliminateMultiplyScalar(); }; class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulConcatTransformation", "0"); + OPENVINO_MATCHER_PASS_RTTI("MulConcatTransformation", "0"); MulConcatTransformation(); }; class ov::pass::activations_scaling::NormMulTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("NormMulTransformation", "0"); + OPENVINO_MATCHER_PASS_RTTI("NormMulTransformation", "0"); NormMulTransformation(); }; class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass { public: - OPENVINO_RTTI("MulMulTransformation", "0"); + OPENVINO_MATCHER_PASS_RTTI("MulMulTransformation", "0"); MulMulTransformation(); }; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index ddf32e494db665..ba7fa82c389c5d 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -189,75 +189,6 @@ 
ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } -// ScaleDownFusion merges multiple scale_down layers into one. -// -// input input -// / \ ==> | -// Mul_a Mul_b Mul_a -// | | / \_ -// op_a op_b op_a op_b -ov::pass::activations_scaling::ScaleDownFusion::ScaleDownFusion() { - MATCHER_SCOPE(ScaleDownFusion); - - const auto get_const_value = [](const std::shared_ptr& node, - float& const_value) -> bool { - auto const_node = ov::as_type_ptr(node); - if (const_node == nullptr) - return false; - - if (const_node->get_element_type() == ov::element::f16) { - const_value = std::stof(const_node->get_data_ptr()->to_string()); - } else if (const_node->get_element_type() == ov::element::f32) { - const_value = *const_node->get_data_ptr(); - } else { - return false; - } - return true; - }; - - auto activation_m = any_input(); - auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); - auto mul_m = wrap_type({activation_m, mul_const_m}); - - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - const auto& pattern_map = m.get_pattern_value_map(); - - auto mul = pattern_map.at(mul_m).get_node_shared_ptr(); - auto parent = mul->get_input_node_shared_ptr(0); - if (parent->get_output_size() > 1 || parent->get_users().size() == 1) - return false; - - if (transformation_callback(mul)) - return false; - - float mul_const_value = 0.f; - if (!get_const_value(pattern_map.at(mul_const_m).get_node_shared_ptr(), mul_const_value)) - return false; - - size_t num_fused_mul_nodes = 0; - auto children = parent->get_users(); - for (const auto& child : children) { - if (child == mul) - continue; - if (ov::is_type(child)) { - float mul_const_value2 = 0.f; - if (get_const_value(child->input(0).get_source_output().get_node_shared_ptr(), mul_const_value2) || - get_const_value(child->input(1).get_source_output().get_node_shared_ptr(), mul_const_value2)) { - if (mul_const_value == mul_const_value2) { - ov::replace_output_update_name(child->output(0), mul->output(0)); - num_fused_mul_nodes += 1; - } - } - } - } - - return (num_fused_mul_nodes > 0); - }; - - auto m = std::make_shared(mul_m, "ScaleDownFusion"); - this->register_matcher(m, callback); -} - // Normalization has the following property. 
// // Norm(input * const_a) = Norm(input) @@ -336,15 +267,14 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::Output last_dep_const = {}; ov::element::Type last_dep_const_type = ov::element::undefined; for (auto& input : concat->inputs()) { - auto dep_node = - ov::as_type_ptr(input.get_source_output().get_node_shared_ptr()); + auto dep_node = ov::as_type_ptr(input.get_source_output().get_node_shared_ptr()); if (!dep_node) { return false; } - auto dep_const0 = ov::as_type_ptr( - dep_node->input(0).get_source_output().get_node_shared_ptr()); - auto dep_const1 = ov::as_type_ptr( - dep_node->input(1).get_source_output().get_node_shared_ptr()); + auto dep_const0 = + ov::as_type_ptr(dep_node->input(0).get_source_output().get_node_shared_ptr()); + auto dep_const1 = + ov::as_type_ptr(dep_node->input(1).get_source_output().get_node_shared_ptr()); if (!dep_const0 && !dep_const1) { return false; } diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 6e43d9a85e8f0b..095e5ed66ba98e 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -64,52 +64,6 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { } } -TEST_F(TransformationTestsF, ScaleDownFusionTest) { - float scale_factor = 128.f; - { - ov::Shape scale_const_shape = {}; - std::vector scale_down_value = {1.f / scale_factor}; - std::shared_ptr scale_down_const = - std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); - - auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto shape_pre = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 3, 256}); - auto reshape_pre = std::make_shared(input, shape_pre, true); - auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); - - auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); - auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); - auto result0 = std::make_shared(reshape_post0); - - auto scale_down1 = std::make_shared(reshape_pre->output(0), scale_down_const); - auto reshape_post1 = std::make_shared(scale_down1, shape_post, true); - auto result1 = std::make_shared(reshape_post1); - - model = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); - manager.register_pass(); - } - { - ov::Shape scale_const_shape = {}; - std::vector scale_down_value = {1.f / scale_factor}; - std::shared_ptr scale_down_const = - std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); - - auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); - auto shape_pre = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {1, 3, 256}); - auto reshape_pre = std::make_shared(input, shape_pre, true); - auto shape_post = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {1, 3, 16, 16}); - - auto scale_down0 = std::make_shared(reshape_pre->output(0), scale_down_const); - auto reshape_post0 = std::make_shared(scale_down0, shape_post, true); - auto result0 = std::make_shared(reshape_post0); - - auto reshape_post1 = std::make_shared(scale_down0, shape_post, true); - auto result1 = std::make_shared(reshape_post1); - - model_ref = std::make_shared(ov::ResultVector{result0, result1}, 
ov::ParameterVector{input}); - } -} - TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 7e99d2216c1657..55808477fbc2be 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -102,6 +102,7 @@ #include "transformations/common_optimizations/activations_scaling.hpp" #include "transformations/common_optimizations/softmax_fusion.hpp" #include "transformations/common_optimizations/glu_fusion.hpp" +#include "transformations/common_optimizations/shared_ops_optimization.hpp" #include "transformations/common_optimizations/transpose_sinking.hpp" #include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp" #include "transformations/common_optimizations/wrap_interpolate_into_transposes.hpp" @@ -947,8 +948,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); - // pass_config->disable(); - // pass_config->disable(); pass_config->set_callback( [](const std::shared_ptr &node) -> bool { @@ -961,7 +960,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { }); manager.register_pass(activations_scale_factor, infer_precision); - manager.register_pass(); + manager.register_pass(); // Move down scalar-multiply layers as much as possible auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp index 64042c7ab0b9e9..3cef97f9ffc105 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp @@ -182,26 +182,26 @@ TEST_P(ActivationsScaling, Inference) { ov::serialize(compiledModel.get_runtime_model(), "test.xml"); } -// TEST_P(ActivationsScaling, Inference_cached) { -// std::stringstream ss; -// ss << "gpu_model_cache_" << std::hash{}( -// std::string(::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name()) + -// std::string(::testing::UnitTest::GetInstance()->current_test_info()->name())); -// std::string cacheDirName = ss.str(); -// { -// ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); -// ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); -// ov::test::utils::removeDir(cacheDirName); -// core->set_property(ov::cache_dir(cacheDirName)); -// compile_model(); -// } -// { -// run(); -// ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); -// ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); -// ov::test::utils::removeDir(cacheDirName); -// } -// } +TEST_P(ActivationsScaling, Inference_cached) { + std::stringstream ss; + ss << "gpu_model_cache_" << std::hash{}( + std::string(::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name()) + + std::string(::testing::UnitTest::GetInstance()->current_test_info()->name())); + std::string cacheDirName = ss.str(); + { + ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); + ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); + ov::test::utils::removeDir(cacheDirName); + 
core->set_property(ov::cache_dir(cacheDirName)); + compile_model(); + } + { + run(); + ov::test::utils::removeFilesWithExt(cacheDirName, "blob"); + ov::test::utils::removeFilesWithExt(cacheDirName, "cl_cache"); + ov::test::utils::removeDir(cacheDirName); + } +} const std::vector input_precisions = {ov::element::f16}; From 9a99eea7818e2fd22c6c7d95e5536a67cc30ac13 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 8 Jan 2025 10:27:27 +0900 Subject: [PATCH 57/64] updated to preserve the original output precision --- .../src/network_helper.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/common/low_precision_transformations/src/network_helper.cpp b/src/common/low_precision_transformations/src/network_helper.cpp index 01bdadf59e35c8..c49a1f9be83760 100644 --- a/src/common/low_precision_transformations/src/network_helper.cpp +++ b/src/common/low_precision_transformations/src/network_helper.cpp @@ -218,7 +218,6 @@ std::shared_ptr NetworkHelper::swapMultiplyAndAdd(std::shared_ptrinput_value(multiplyInputBranch); auto a = as_type_ptr(multiply->get_input_node_shared_ptr(multiplyInputBranch == 0 ? 1 : 0)); auto b = as_type_ptr(addAfterMultiply->get_input_node_shared_ptr(multiplyBranch == 0 ? 1 : 0)); std::shared_ptr bDivA; @@ -263,15 +262,15 @@ std::shared_ptr NetworkHelper::swapMultiplyAndAdd(std::shared_ptr(foldConvert(bDivA->output(0), a->get_element_type())); } - OutputVector inputs{ {}, {} }; - inputs[0] = x; - inputs[1] = bDivA->output(0); - + const auto& add_input = multiply->input_value(multiplyInputBranch); + // Note: precision is copied to a separate variable intentionally, + // since TemporaryReplaceOutputType replaces add_input's precision, whereas we need to set the original precision on newAdd's output + const auto add_output_precision = add_input.get_element_type(); std::shared_ptr newAdd = std::make_shared>( std::vector{element::f32, element::f32}, - std::vector{ x.get_element_type() }, - ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(), - ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get()); + std::vector{ add_output_precision }, + ov::op::TemporaryReplaceOutputType(add_input, element::f32).get(), + ov::op::TemporaryReplaceOutputType(bDivA, element::f32).get()); copyInfo(addAfterMultiply, newAdd); auto newMultiply = std::make_shared>( From dfd322567d9473acb59546beddf0b0ce53365d56 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 8 Jan 2025 23:13:48 +0900 Subject: [PATCH 58/64] updated per reviews --- .../activations_scaling.cpp | 65 ++++++++----------- .../src/plugin/transformations_pipeline.cpp | 1 - 2 files changed, 27 insertions(+), 39 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index ba7fa82c389c5d..530f14bd397caf 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -65,7 +65,28 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m)); + OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m), "Not found any Convolution or MatMul layer"); + + 
auto insert_scale_down_layer = [&scale_factor, &scaled_prec](std::shared_ptr& node, + const size_t input_idx, + const bool keep_precision) { + const std::vector scale_down_value = {1.f / scale_factor}; + + auto scale_down_layer = std::make_shared( + node->input(input_idx).get_source_output(), + std::make_shared(node->input(input_idx).get_element_type(), + ov::Shape(), + scale_down_value)); + scale_down_layer->set_friendly_name(node->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(node, scale_down_layer); + + if (scale_down_layer->output(0).get_element_type() != scaled_prec && !keep_precision) { + auto convert_prec = std::make_shared(scale_down_layer->output(0), scaled_prec); + node->input(input_idx).replace_source_output(convert_prec->output(0)); + } else { + node->input(input_idx).replace_source_output(scale_down_layer->output(0)); + } + }; // scale_down and scale_up layers will be added around scaled_op std::shared_ptr scaled_op = nullptr; @@ -91,26 +112,11 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float keep_precision = true; } - const ov::Shape scale_shape = {}; - const std::vector scale_down_value = {1.f / scale_factor}; const std::vector scale_up_value = {scale_factor}; auto output_prec = output_of_scaled_op->output(0).get_element_type(); // adding a scale_down layer before the target node - auto scale_down = std::make_shared( - scaled_op->input(0).get_source_output(), - std::make_shared(scaled_op->input(0).get_element_type(), - scale_shape, - scale_down_value)); - scale_down->set_friendly_name(scaled_op->get_friendly_name() + "_scale_down"); - ov::copy_runtime_info(scaled_op, scale_down); - - if (scale_down->output(0).get_element_type() != scaled_prec && !keep_precision) { - auto convert_prec0 = std::make_shared(scale_down->output(0), scaled_prec); - scaled_op->input(0).replace_source_output(convert_prec0->output(0)); - } else { - scaled_op->input(0).replace_source_output(scale_down->output(0)); - } + insert_scale_down_layer(scaled_op, 0, keep_precision); if (scaled_op->input(1).get_element_type() != scaled_prec && !keep_precision) { auto convert_prec1 = std::make_shared(scaled_op->input(1).get_source_output(), scaled_prec); @@ -144,19 +150,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float if (has_bias) { auto add = child_node->shared_from_this(); target_inputs = add->get_output_target_inputs(0); - auto scale_down_bias = std::make_shared( - add->input(bias_index).get_source_output(), - std::make_shared(add->input(bias_index).get_element_type(), - scale_shape, - scale_down_value)); - scale_down_bias->set_friendly_name(add->get_friendly_name() + "_scale_down"); - ov::copy_runtime_info(add, scale_down_bias); - if (scale_down_bias->output(0).get_element_type() != scaled_prec && !keep_precision) { - auto convert_bias_prec = std::make_shared(scale_down_bias->output(0), scaled_prec); - add->input(bias_index).replace_source_output(convert_bias_prec->output(0)); - } else { - add->input(bias_index).replace_source_output(scale_down_bias->output(0)); - } + insert_scale_down_layer(add, bias_index, keep_precision); add->revalidate_and_infer_types(); if (add->output(0).get_element_type() != output_prec && !keep_precision) { output_of_scaled_op = std::make_shared(add->output(0), output_prec); @@ -174,7 +168,7 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float auto scale_up = register_new_node( output_of_scaled_op->output(0), 
std::make_shared(output_of_scaled_op->output(0).get_element_type(), - scale_shape, + ov::Shape(), scale_up_value)); scale_up->set_friendly_name(scaled_op->get_friendly_name() + "_scale_up"); ov::copy_runtime_info(scaled_op, scale_up); @@ -255,7 +249,7 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - OPENVINO_ASSERT(pattern_map.count(concat_m)); + OPENVINO_ASSERT(pattern_map.count(concat_m), "Not found any Concat layer"); auto concat = pattern_map.at(concat_m).get_node_shared_ptr(); @@ -288,8 +282,6 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( last_dep_const_type = last_dep_const.get_element_type(); } - auto target_inputs = concat->get_output_target_inputs(0); - for (auto& input : concat->inputs()) { auto dep_node = input.get_source_output().get_node_shared_ptr(); auto dep_input0 = dep_node->input(0).get_source_output().get_node(); @@ -321,10 +313,7 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::op::TemporaryReplaceOutputType(last_dep_const, concat_type).get()); new_mul->set_friendly_name(concat->get_friendly_name() + "_c"); ov::copy_runtime_info(concat, new_mul); - - for (auto& in : target_inputs) { - in.replace_source_output(new_mul); - } + ov::replace_output_update_name(concat->output(0), new_mul->output(0)); return false; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 55808477fbc2be..d58fdab4c6c78e 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -947,7 +947,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); - pass_config->disable(); pass_config->set_callback( [](const std::shared_ptr &node) -> bool { From 4d76ffb75d4d985103b7782ce2ced126b2203c44 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Wed, 8 Jan 2025 23:14:30 +0900 Subject: [PATCH 59/64] reverted to apply activations_scale_factor from rt_info --- src/plugins/intel_gpu/src/runtime/execution_config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 0372050657f018..c5397103fe3795 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -275,8 +275,8 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { void ExecutionConfig::apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info) { if (!info.supports_immad) { apply_rt_info_property(ov::hint::kv_cache_precision, rt_info); - apply_rt_info_property(ov::hint::activations_scale_factor, rt_info); } + apply_rt_info_property(ov::hint::activations_scale_factor, rt_info); apply_rt_info_property(ov::hint::dynamic_quantization_group_size, rt_info); } From 07136d29b1e0f01dc11df967e0a350e7fa3bfe51 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 9 Jan 2025 03:31:50 +0900 Subject: [PATCH 60/64] added MulMulTransformationTest --- .../activations_scaling.cpp | 3 ++- .../activations_scaling_test.cpp | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git 
a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 530f14bd397caf..c00add14baf3f1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -65,7 +65,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m), "Not found any Convolution or MatMul layer"); + OPENVINO_ASSERT(pattern_map.count(convolution_m) || pattern_map.count(matmul_m), + "Not found any Convolution or MatMul layer"); auto insert_scale_down_layer = [&scale_factor, &scaled_prec](std::shared_ptr& node, const size_t input_idx, diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 095e5ed66ba98e..d734a37e318d96 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -23,6 +23,7 @@ #include "openvino/op/variadic_split.hpp" #include "openvino/pass/manager.hpp" #include "transformations/utils/utils.hpp" +#include "low_precision/multiply_partial.hpp" using namespace ov; using namespace testing; @@ -123,3 +124,29 @@ TEST_F(TransformationTestsF, ConcatTransformationTest) { model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); } } + +TEST_F(TransformationTestsF, MulMulTransformationTest) { + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul0 = std::make_shared(input0, scale_const0); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto mul1 = std::make_shared(input1, mul0); + auto convert = std::make_shared(mul1, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + manager.register_pass(); + } + { + auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto mul0 = std::make_shared(input0, input1); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul1 = std::make_shared(mul0, scale_const0); + auto convert = std::make_shared(mul1, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); + } +} From 40a72a4404476469e632817544c9c98188592834 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 9 Jan 2025 19:29:53 +0900 Subject: [PATCH 61/64] updated MulShareTransformation --- .../activations_scaling.hpp | 8 ++-- .../activations_scaling.cpp | 38 +++++++++---------- .../activations_scaling_test.cpp | 34 +++++++++++++++-- .../src/plugin/transformations_pipeline.cpp | 6 +-- 4 files changed, 54 insertions(+), 32 deletions(-) diff --git 
a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index 295666fdec4222..a7d19091a260b8 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -19,7 +19,7 @@ namespace activations_scaling { class TRANSFORMATIONS_API ScaleDownSingleLayer; class TRANSFORMATIONS_API EliminateMultiplyScalar; class TRANSFORMATIONS_API MulConcatTransformation; -class TRANSFORMATIONS_API NormMulTransformation; +class TRANSFORMATIONS_API MulShareTransformation; class TRANSFORMATIONS_API MulMulTransformation; } // namespace activations_scaling @@ -49,10 +49,10 @@ class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass:: MulConcatTransformation(); }; -class ov::pass::activations_scaling::NormMulTransformation : public ov::pass::MatcherPass { +class ov::pass::activations_scaling::MulShareTransformation : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("NormMulTransformation", "0"); - NormMulTransformation(); + OPENVINO_MATCHER_PASS_RTTI("MulShareTransformation", "0"); + MulShareTransformation(); }; class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass { diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index c00add14baf3f1..cc9cf9740d5d53 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -316,7 +316,7 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::copy_runtime_info(concat, new_mul); ov::replace_output_update_name(concat->output(0), new_mul->output(0)); - return false; + return true; }; auto m = std::make_shared(concat_m, "MulConcatTransformation"); @@ -330,8 +330,8 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( // op_a op_b Norm op_b // | // op_a -ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { - MATCHER_SCOPE(NormMulTransformation); +ov::pass::activations_scaling::MulShareTransformation::MulShareTransformation() { + MATCHER_SCOPE(MulShareTransformation); auto mvn_m = wrap_type({any_input(), any_input()}); auto rms_m = wrap_type({any_input(), any_input()}); @@ -349,34 +349,32 @@ ov::pass::activations_scaling::NormMulTransformation::NormMulTransformation() { auto norm = pattern_map.at(norm_m).get_node_shared_ptr(); auto parent_output = norm->get_input_source_output(0); - if (parent_output.get_target_inputs().size() != 2) + if (parent_output.get_target_inputs().size() == 1) return false; - ov::Node* mul = nullptr; for (auto& child : parent_output.get_target_inputs()) { if (child == norm->input(0)) continue; - mul = child.get_node(); - } - if (!ov::is_type(mul)) - return false; + if (ov::is_type(child.get_node())) { + ov::Output const_input; + for (auto input : child.get_node()->input_values()) { + if (input == parent_output) + continue; + const_input = input; + } - ov::Output const_input; - for (auto input : mul->input_values()) { - if (input == parent_output) - continue; - const_input = input; + if (is_scalar_node(const_input) && 
!is_non_const_node(const_input)) { + norm->input(0).replace_source_output(child.get_node()->output(0)); + return true; + } + } } - if (!is_scalar_node(const_input) || is_non_const_node(const_input)) - return false; - - norm->input(0).replace_source_output(mul->output(0)); - return true; + return false; }; - auto m = std::make_shared(norm_m, "NormMulTransformation"); + auto m = std::make_shared(norm_m, "ScalarMulShareTransformation"); this->register_matcher(m, callback); } diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index d734a37e318d96..7f628b62e6874d 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -20,10 +20,10 @@ #include "openvino/op/mvn.hpp" #include "openvino/op/parameter.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" #include "openvino/op/variadic_split.hpp" #include "openvino/pass/manager.hpp" #include "transformations/utils/utils.hpp" -#include "low_precision/multiply_partial.hpp" using namespace ov; using namespace testing; @@ -110,10 +110,10 @@ TEST_F(TransformationTestsF, ConcatTransformationTest) { } { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {1}); auto mul0 = std::make_shared(input0, scale_const0); auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); - auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto scale_const1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {1}); auto mul1 = std::make_shared(input1, scale_const1); auto concat = std::make_shared(OutputVector{mul0, mul1}, 0); auto new_scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -150,3 +150,31 @@ TEST_F(TransformationTestsF, MulMulTransformationTest) { model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); } } + +TEST_F(TransformationTestsF, MulShareTransformationTest) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto shape_of = std::make_shared(input); + auto convert0 = std::make_shared(shape_of, ov::element::f32); + auto result0 = std::make_shared(convert0); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(input, scale_const); + auto convert1 = std::make_shared(mul, ov::element::f32); + auto result1 = std::make_shared(convert1); + + model = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); + auto mul = std::make_shared(input, scale_const); + auto shape_of = std::make_shared(mul); + auto convert0 = std::make_shared(shape_of, ov::element::f32); + auto result0 = std::make_shared(convert0); + auto convert1 = std::make_shared(mul, ov::element::f32); + auto result1 = std::make_shared(convert1); + + model_ref = std::make_shared(ov::ResultVector{result0, result1}, ov::ParameterVector{input}); + } +} diff 
--git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index d58fdab4c6c78e..4d652d65229497 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -16,7 +16,6 @@ #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" #include "low_precision/add.hpp" -#include "low_precision/clamp.hpp" #include "low_precision/concat.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" @@ -31,12 +30,9 @@ #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" #include "low_precision/recurrent_cell.hpp" -#include "low_precision/reshape.hpp" #include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/strided_slice.hpp" #include "low_precision/transpose.hpp" -#include "low_precision/unsqueeze.hpp" -#include "low_precision/variadic_split.hpp" #include "openvino/core/deprecated.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/validation_util.hpp" @@ -970,7 +966,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // Move up remained scalar-multiply layers manager.register_pass(); - manager.register_pass(); + manager.register_pass(); const std::vector allowed_data_movement_ops = { ov::op::v1::Reshape::get_type_info_static(), From 422c1c06d5611b74246ac2608503d051fa61be17 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Thu, 9 Jan 2025 22:34:47 +0900 Subject: [PATCH 62/64] updated scaling tests --- .../common_optimizations/activations_scaling.cpp | 7 ++++++- .../activations_scaling_test.cpp | 9 +++++++-- .../subgraph_tests/dynamic/activations_scaling.cpp | 13 ++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index cc9cf9740d5d53..0113502f2497e1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -283,6 +283,8 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( last_dep_const_type = last_dep_const.get_element_type(); } + auto target_inputs = concat->get_output_target_inputs(0); + for (auto& input : concat->inputs()) { auto dep_node = input.get_source_output().get_node_shared_ptr(); auto dep_input0 = dep_node->input(0).get_source_output().get_node(); @@ -314,7 +316,10 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( ov::op::TemporaryReplaceOutputType(last_dep_const, concat_type).get()); new_mul->set_friendly_name(concat->get_friendly_name() + "_c"); ov::copy_runtime_info(concat, new_mul); - ov::replace_output_update_name(concat->output(0), new_mul->output(0)); + + for (auto& in : target_inputs) { + in.replace_source_output(new_mul); + } return true; }; diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index 7f628b62e6874d..b460cf1cb2e709 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ 
b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -39,7 +39,9 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { CoordinateDiff{}, CoordinateDiff{}, Strides{}); - auto convert = std::make_shared(conv, ov::element::f32); + auto bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 3, 1, 1}, {2.3f}); + auto add = std::make_shared(conv, bias_const); + auto convert = std::make_shared(add, ov::element::f32); auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); @@ -56,8 +58,11 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { CoordinateDiff{}, CoordinateDiff{}, Strides{}); + auto bias_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 3, 1, 1}, {2.3f}); + auto scale_down_bias = std::make_shared(bias_const, scale_down_const); + auto add = std::make_shared(conv, scale_down_bias); auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{}, {scale_factor}); - auto scale_up = std::make_shared(conv, scale_up_const); + auto scale_up = std::make_shared(add, scale_up_const); auto convert = std::make_shared(scale_up, ov::element::f32); auto result = std::make_shared(convert); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp index 3cef97f9ffc105..5a60d6eb06c519 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp @@ -174,12 +174,23 @@ class ActivationsScaling : public testing::WithParamInterfaceinput(0).get_element_type(); + auto runtime_model = compiledModel.get_runtime_model(); + + for (auto& op : runtime_model->get_ordered_ops()) { + ASSERT_EQ(op->output(0).get_element_type(), input_precision) + << "expected output precision is " << input_precision << " , while actual is " << op->output(0).get_element_type(); + } + } }; TEST_P(ActivationsScaling, Inference) { core->set_property(targetDevice, ov::hint::activations_scale_factor(4.3)); run(); - ov::serialize(compiledModel.get_runtime_model(), "test.xml"); } TEST_P(ActivationsScaling, Inference_cached) { From 192604185d7d2a8458745c00b4d8a02bd53989c5 Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Fri, 10 Jan 2025 03:02:03 +0900 Subject: [PATCH 63/64] applied reviews --- .../low_precision/layer_transformation.hpp | 10 ++-- .../low_precision_transformations/src/add.cpp | 2 +- .../src/layer_transformation.cpp | 2 +- .../src/multiply_partial.cpp | 10 ++-- .../activations_scaling.hpp | 58 ++++++++++++++++--- .../activations_scaling.cpp | 54 ++--------------- .../activations_scaling_test.cpp | 8 +-- .../src/plugin/transformations_pipeline.cpp | 4 +- .../dynamic/activations_scaling.cpp | 4 +- 9 files changed, 76 insertions(+), 76 deletions(-) diff --git a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp index 558e26aeb56097..f2b2de6ed4e792 100644 --- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp @@ -253,12 +253,12 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass const std::vector defaultPrecisions = { ov::element::u8, 
ov::element::i8 }, const bool reshapeIgnorePerTensorQuantizationCheck = false, - const bool useDefaultTransformation = true) : + const bool scalingMode = false) : updatePrecisions(updatePrecisions), deqPrecision(deqPrecision), defaultPrecisions(defaultPrecisions), reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck), - useDefaultTransformation(useDefaultTransformation) {} + scalingMode(scalingMode) {} Params& setUpdatePrecisions(const bool updatePrecisions) { this->updatePrecisions = updatePrecisions; @@ -283,8 +283,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass std::vector defaultPrecisions; // to support GPU workarround to keep Reshape and MatMul in FP32 bool reshapeIgnorePerTensorQuantizationCheck; - // for MultiplyPartialTransformation to support Activations Scaling - bool useDefaultTransformation; + // to support Activations Scaling + bool scalingMode; }; class PrecisionDetails { @@ -356,7 +356,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass element::Type deqPrecision; std::vector defaultPrecisions; bool reshapeIgnorePerTensorQuantizationCheck; - bool useDefaultTransformation; + bool scalingMode; static constexpr char originalLayerPostfix[] = "_original"; TransformationContext* context; diff --git a/src/common/low_precision_transformations/src/add.cpp b/src/common/low_precision_transformations/src/add.cpp index 7fa283089bef0b..0c9f727c18b4ad 100644 --- a/src/common/low_precision_transformations/src/add.cpp +++ b/src/common/low_precision_transformations/src/add.cpp @@ -214,7 +214,7 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt newSubtractFullPathValues), newMultiplyFullPathValues); - auto output_type = useDefaultTransformation ? element::f32 : add->get_output_element_type(0); + auto output_type = scalingMode ? add->get_output_element_type(0) : element::f32; newAddOrSubtract = std::make_shared>( std::vector{output_type, output_type}, std::vector{ output_type }, ov::op::TemporaryReplaceOutputType(inputs[0], output_type).get(), diff --git a/src/common/low_precision_transformations/src/layer_transformation.cpp b/src/common/low_precision_transformations/src/layer_transformation.cpp index 70d88743cb34ec..3679f6d027abad 100644 --- a/src/common/low_precision_transformations/src/layer_transformation.cpp +++ b/src/common/low_precision_transformations/src/layer_transformation.cpp @@ -45,7 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) : deqPrecision(params.deqPrecision), defaultPrecisions(params.defaultPrecisions), reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck), - useDefaultTransformation(params.useDefaultTransformation), + scalingMode(params.scalingMode), context(nullptr) {} void LayerTransformation::setContext(TransformationContext* context) noexcept { diff --git a/src/common/low_precision_transformations/src/multiply_partial.cpp b/src/common/low_precision_transformations/src/multiply_partial.cpp index 102dde7f1cd65c..ce9ef0816147b8 100644 --- a/src/common/low_precision_transformations/src/multiply_partial.cpp +++ b/src/common/low_precision_transformations/src/multiply_partial.cpp @@ -79,7 +79,7 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov auto constParent = multiply->input_value(multiplyBranch.first == 0 ? 
1 : 0); auto multiplyParentParent = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second); auto multiplyParentConst = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second == 0 ? 1 : 0); - auto input_data_type = useDefaultTransformation ? element::f32 : multiply->get_output_element_type(0); + auto input_data_type = scalingMode ? multiply->get_output_element_type(0) : element::f32; newMultiply = std::make_shared>( std::vector{ input_data_type, input_data_type }, @@ -134,7 +134,7 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2) - // if useDefaultTransformation = true + // if scalingMode == false // after : Y = (SC1' * (X1 - SH1)) * (X2) , where : // SC1' = SC1 * SC2 // else @@ -142,9 +142,9 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov // SC1' = SC1 * SC2 auto newMultiplyValuesFullPath = fold(multiplyValuesEmptyPath, multiplyValuesFullPath); OutputVector inputs{ {}, {} }; - inputs[emptyPathIndex] = useDefaultTransformation ? dequantizationEmptyPath.data : newMultiplyValuesFullPath; - auto input_for_fullPath = useDefaultTransformation ? newMultiplyValuesFullPath : - dequantizationEmptyPath.data.get_node_shared_ptr(); + inputs[emptyPathIndex] = scalingMode ? newMultiplyValuesFullPath : dequantizationEmptyPath.data; + auto input_for_fullPath = scalingMode ? dequantizationEmptyPath.data.get_node_shared_ptr() : + newMultiplyValuesFullPath; ov::Output parent0 = dequantizationFullPath.subtract == nullptr ? (dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) : diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp index a7d19091a260b8..d8c96a1df542af 100644 --- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp @@ -17,10 +17,10 @@ class TRANSFORMATIONS_API ActivationsScaling; namespace activations_scaling { class TRANSFORMATIONS_API ScaleDownSingleLayer; -class TRANSFORMATIONS_API EliminateMultiplyScalar; +class TRANSFORMATIONS_API EliminateScalarMul; class TRANSFORMATIONS_API MulConcatTransformation; class TRANSFORMATIONS_API MulShareTransformation; -class TRANSFORMATIONS_API MulMulTransformation; +class TRANSFORMATIONS_API MoveDownScalarMul; } // namespace activations_scaling } // namespace pass @@ -31,32 +31,74 @@ class TRANSFORMATIONS_API MulMulTransformation; // For example, when this property is set as 16, activations are divided by 16. // If ov::hint::activations_scale_factor is less than or equal to zero, it is disabled. +// Add scale_down and scale_up layers around Convolution and MatMul nodes +// Conv/MatMul +// ==> +// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor) class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("ScaleDownSingleLayer", "0"); ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec); }; -class ov::pass::activations_scaling::EliminateMultiplyScalar : public ov::pass::MatcherPass { +// Normalization and ShapeOf have the following property. 
+// +// Norm(input * const_a) = Norm(input) +// +// So, we can skip Multiply that is connected to Normalization and ShapeOf. +// +// input --> Multiply --> Normalization/ShapeOf +// ==> +// input --> Normalization/ShapeOf +class ov::pass::activations_scaling::EliminateScalarMul : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("EliminateMultiplyScalar", "0"); - EliminateMultiplyScalar(); + OPENVINO_MATCHER_PASS_RTTI("EliminateScalarMul", "0"); + EliminateScalarMul(); }; +// input_a const_a input_b const_b input_c const_c +// \ / \ / \ / +// Multiply_a Multiply_b Multiply_c +// \ | / +// \ | / +// ---------- Concat ------------ +// ==> +// (const_a (const_b (const_c +// input_a /const_c) input_b /const_c) input_c /const_c) +// \ / \ / \ / +// Multiply_a Multiply_b Multiply_c +// \ | / +// \ | / +// ---------- Concat ------------ +// | const_c +// | / +// Multiply class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("MulConcatTransformation", "0"); MulConcatTransformation(); }; +// input input +// / \ | +// Norm Mul ==> Mul (expect to be fused into the input layer) +// | | / \_ +// op_a op_b Norm op_b +// | +// op_a class ov::pass::activations_scaling::MulShareTransformation : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("MulShareTransformation", "0"); MulShareTransformation(); }; -class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass { +// input_b scalar input_a input_b +// \ / \ / +// input_a Mul_b ==> Mul_a' scalar +// \ / \ / +// Mul_a Mul_b' (expect to be merged with Mul_a') +class ov::pass::activations_scaling::MoveDownScalarMul : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("MulMulTransformation", "0"); - MulMulTransformation(); + OPENVINO_MATCHER_PASS_RTTI("MoveDownScalarMul", "0"); + MoveDownScalarMul(); }; diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp index 0113502f2497e1..7fd1a5a237fa3b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp @@ -48,10 +48,6 @@ using namespace ov::pass::activations_scaling; using namespace ov::pass::pattern; using ov::pass::pattern::op::Or; -// Add scale_down and scale_up layers around Convolution and MatMul nodes -// Conv/MatMul -// ==> -// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor) ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec) { MATCHER_SCOPE(ScaleDownSingleLayer); @@ -184,17 +180,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float this->register_matcher(m, callback); } -// Normalization has the following property. -// -// Norm(input * const_a) = Norm(input) -// -// So, we can skip Multiply that is connected to Normalization. 
-// -// input --> Multiply --> Normalization -// ==> -// input --> Normalization -ov::pass::activations_scaling::EliminateMultiplyScalar::EliminateMultiplyScalar() { - MATCHER_SCOPE(EliminateMultiplyScalar); +ov::pass::activations_scaling::EliminateScalarMul::EliminateScalarMul() { + MATCHER_SCOPE(EliminateScalarMul); auto activation_m = any_input(is_non_const_node); auto convert_m = ov::pass::pattern::optional(activation_m); @@ -221,27 +208,10 @@ ov::pass::activations_scaling::EliminateMultiplyScalar::EliminateMultiplyScalar( return true; }; - auto m = std::make_shared(norm_m, "EliminateMultiplyScalar"); + auto m = std::make_shared(norm_m, "EliminateScalarMul"); this->register_matcher(m, callback); } -// input_a const_a input_b const_b input_c const_c -// \ / \ / \ / -// Multiply_a Multiply_b Multiply_c -// \ | / -// \ | / -// ---------- Concat ------------ -// ==> -// (const_a (const_b (const_c -// input_a /const_c) input_b /const_c) input_c /const_c) -// \ / \ / \ / -// Multiply_a Multiply_b Multiply_c -// \ | / -// \ | / -// ---------- Concat ------------ -// | const_c -// | / -// Multiply ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation() { MATCHER_SCOPE(MulConcatTransformation); @@ -328,13 +298,6 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation( this->register_matcher(m, callback); } -// input input -// / \ | -// Norm Mul ==> Mul (expect to be fused into the input layer) -// | | / \_ -// op_a op_b Norm op_b -// | -// op_a ov::pass::activations_scaling::MulShareTransformation::MulShareTransformation() { MATCHER_SCOPE(MulShareTransformation); @@ -383,13 +346,8 @@ ov::pass::activations_scaling::MulShareTransformation::MulShareTransformation() this->register_matcher(m, callback); } -// input_b scalar input_a input_b -// \ / \ / -// input_a Mul_b ==> Mul_a' scalar -// \ / \ / -// Mul_a Mul_b' (expect to be merged with Mul_a') -ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { - MATCHER_SCOPE(MulMulTransformation); +ov::pass::activations_scaling::MoveDownScalarMul::MoveDownScalarMul() { + MATCHER_SCOPE(MoveDownScalarMul); auto activation_b_m = any_input(is_non_const_node); auto mul_const_m = ov::pass::pattern::wrap_type(is_scalar_node); @@ -429,6 +387,6 @@ ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() { return true; }; - auto m = std::make_shared(mul_a_m, "MulMulTransformation"); + auto m = std::make_shared(mul_a_m, "MoveDownScalarMul"); this->register_matcher(m, callback); } diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp index b460cf1cb2e709..a8797b588c31cf 100644 --- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp +++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp @@ -70,7 +70,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) { } } -TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) { +TEST_F(TransformationTestsF, EliminateScalarMulTest) { { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -83,7 +83,7 @@ TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); - manager.register_pass(); + 
manager.register_pass(); } { auto input = std::make_shared(ov::element::f16, ov::PartialShape{1, 3, 16, 16}); @@ -130,7 +130,7 @@ TEST_F(TransformationTestsF, ConcatTransformationTest) { } } -TEST_F(TransformationTestsF, MulMulTransformationTest) { +TEST_F(TransformationTestsF, MoveDownScalarMulTest) { { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10}); @@ -141,7 +141,7 @@ TEST_F(TransformationTestsF, MulMulTransformationTest) { auto result = std::make_shared(convert); model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input0, input1}); - manager.register_pass(); + manager.register_pass(); } { auto input0 = std::make_shared(ov::element::f16, ov::PartialShape{6, 12, 10, 24}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 4d652d65229497..cb78e3d51b1409 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -960,9 +960,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // Move down scalar-multiply layers as much as possible auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false); auto lpt_pass = manager.register_pass(supportedPrecisions, perTensorQuantization, params); - lpt_pass->add_main(); + lpt_pass->add_main(); lpt_pass->add_main(); - lpt_pass->add_main(); + lpt_pass->add_main(); // Move up remained scalar-multiply layers manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp index 5a60d6eb06c519..0b315dc088af45 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp @@ -145,9 +145,9 @@ class ActivationsScaling : public testing::WithParamInterface(transpose1, transpose2); - auto concat0 = std::make_shared(ov::NodeVector{transpose0, add12}, 3); + auto concat0 = std::make_shared(ov::NodeVector{transpose0, add12}, 0); - auto concat1 = std::make_shared(ov::NodeVector{add12, transpose2}, 3); + auto concat1 = std::make_shared(ov::NodeVector{add12, transpose2}, 0); auto add = std::make_shared(concat0, concat1); From ca1cf6f7a7775110ac7220397e3b7e181afe725d Mon Sep 17 00:00:00 2001 From: "Kim, Eddy" Date: Fri, 10 Jan 2025 11:43:45 +0900 Subject: [PATCH 64/64] set scalingMode = true --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index cb78e3d51b1409..48015d7a66ab81 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -958,7 +958,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); // Move down scalar-multiply layers as much as possible - auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false); + auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, true); auto lpt_pass = manager.register_pass(supportedPrecisions, 
perTensorQuantization, params); lpt_pass->add_main<ov::pass::activations_scaling::EliminateScalarMul>(); lpt_pass->add_main<ov::pass::activations_scaling::MulConcatTransformation>();
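
A minimal usage sketch for the ov::hint::activations_scale_factor property that the functional tests in this series set via core->set_property(). The model path below is hypothetical, the GPU target matches the plugin these patches modify, and per the pipeline changes a factor <= 0 is expected to leave the graph untouched.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // "model.xml" is a placeholder; any IR with fp16-sensitive MatMul/Convolution layers applies.
    auto model = core.read_model("model.xml");
    // A positive factor enables ScaleDownSingleLayer and the companion scaling passes.
    // The value may also come from the model's rt_info ("activations_scale_factor"),
    // which ExecutionConfig::apply_rt_info() reads as shown in patch 59.
    auto compiled = core.compile_model(model, "GPU", ov::hint::activations_scale_factor(8.0f));
    auto request = compiled.create_infer_request();
    return 0;
}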
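
The algebraic identity behind ScaleDownSingleLayer, shown as a scalar sketch independent of OpenVINO: ((x / s) * w) * s equals x * w in real arithmetic, and the scaled-down intermediate is what stays inside fp16 range. The sample values are illustrative only; plain float is used here and the fp16 behaviour is described in comments.

#include <cstdio>

int main() {
    const float scale_factor = 8.0f;     // plays the role of the pass's scale_up constant
    const float x = 300.0f, w = 250.0f;  // x * w = 75000, above the fp16 max of 65504
    float direct = x * w;                // this product would saturate to +inf in fp16
    // scale_down (1/s), multiply by the weight, then scale_up (s):
    // the intermediate (x / 8) * w = 9375 is well inside fp16 range.
    float scaled = ((x * (1.0f / scale_factor)) * w) * scale_factor;
    std::printf("direct=%f scaled=%f\n", direct, scaled);  // equal in real arithmetic
    return 0;
}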
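
A numeric check of the property that EliminateScalarMul and MulShareTransformation rely on: normalization of a tensor is invariant under a positive per-tensor scalar multiply, Norm(input * c) == Norm(input), up to the epsilon term. Plain C++ with no OpenVINO dependency; the values and the layer-norm helper are illustrative only.

#include <cmath>
#include <cstdio>
#include <vector>

// Per-tensor layer normalization: subtract the mean, divide by sqrt(var + eps).
static std::vector<float> layer_norm(const std::vector<float>& x, float eps = 1e-5f) {
    const float n = static_cast<float>(x.size());
    float mean = 0.f, var = 0.f;
    for (float v : x) mean += v;
    mean /= n;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= n;
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = (x[i] - mean) / std::sqrt(var + eps);
    return y;
}

int main() {
    std::vector<float> x = {0.5f, -1.25f, 3.0f, 0.75f};
    std::vector<float> scaled = x;
    for (float& v : scaled) v *= 256.f;  // positive per-tensor scalar multiply
    auto a = layer_norm(x), b = layer_norm(scaled);
    for (size_t i = 0; i < a.size(); ++i)
        std::printf("%f vs %f\n", a[i], b[i]);  // matches up to eps effects
    return 0;
}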