diff --git a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp
index 558e26aeb56097..f2b2de6ed4e792 100644
--- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp
+++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp
@@ -253,12 +253,12 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
             const std::vector<element::Type> defaultPrecisions = { ov::element::u8, ov::element::i8 },
             const bool reshapeIgnorePerTensorQuantizationCheck = false,
-            const bool useDefaultTransformation = true) :
+            const bool scalingMode = false) :
             updatePrecisions(updatePrecisions),
             deqPrecision(deqPrecision),
             defaultPrecisions(defaultPrecisions),
             reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck),
-            useDefaultTransformation(useDefaultTransformation) {}
+            scalingMode(scalingMode) {}

         Params& setUpdatePrecisions(const bool updatePrecisions) {
             this->updatePrecisions = updatePrecisions;
@@ -283,8 +283,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
         std::vector<element::Type> defaultPrecisions;
         // to support GPU workarround to keep Reshape and MatMul in FP32
         bool reshapeIgnorePerTensorQuantizationCheck;
-        // for MultiplyPartialTransformation to support Activations Scaling
-        bool useDefaultTransformation;
+        // to support Activations Scaling
+        bool scalingMode;
     };

     class PrecisionDetails {
@@ -356,7 +356,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     element::Type deqPrecision;
     std::vector<element::Type> defaultPrecisions;
     bool reshapeIgnorePerTensorQuantizationCheck;
-    bool useDefaultTransformation;
+    bool scalingMode;

     static constexpr char originalLayerPostfix[] = "_original";
     TransformationContext* context;
diff --git a/src/common/low_precision_transformations/src/add.cpp b/src/common/low_precision_transformations/src/add.cpp
index 7fa283089bef0b..0c9f727c18b4ad 100644
--- a/src/common/low_precision_transformations/src/add.cpp
+++ b/src/common/low_precision_transformations/src/add.cpp
@@ -214,7 +214,7 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt
                 newSubtractFullPathValues),
             newMultiplyFullPathValues);

-        auto output_type = useDefaultTransformation ? element::f32 : add->get_output_element_type(0);
+        auto output_type = scalingMode ? add->get_output_element_type(0) : element::f32;
         newAddOrSubtract = std::make_shared<ov::op::TypeRelaxed<opset1::Add>>(
             std::vector<element::Type>{output_type, output_type}, std::vector<element::Type>{ output_type },
             ov::op::TemporaryReplaceOutputType(inputs[0], output_type).get(),
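Note on the rename: the flag's polarity is inverted, so useDefaultTransformation == true corresponds to scalingMode == false and both ternaries above keep their old behavior. A minimal standalone sketch of the selection rule (not part of the patch; select_output_type is a hypothetical helper):

    // With scalingMode == false the relaxed Add is built in f32 (default LPT behavior);
    // with scalingMode == true it keeps the node's original output type (e.g. f16),
    // which is what the activations-scaling pipeline relies on.
    #include <openvino/core/type/element_type.hpp>

    ov::element::Type select_output_type(bool scalingMode, const ov::element::Type& original_type) {
        // mirrors: scalingMode ? add->get_output_element_type(0) : element::f32
        return scalingMode ? original_type : ov::element::f32;
    }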
diff --git a/src/common/low_precision_transformations/src/layer_transformation.cpp b/src/common/low_precision_transformations/src/layer_transformation.cpp
index 70d88743cb34ec..3679f6d027abad 100644
--- a/src/common/low_precision_transformations/src/layer_transformation.cpp
+++ b/src/common/low_precision_transformations/src/layer_transformation.cpp
@@ -45,7 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) :
    deqPrecision(params.deqPrecision),
    defaultPrecisions(params.defaultPrecisions),
    reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
-    useDefaultTransformation(params.useDefaultTransformation),
+    scalingMode(params.scalingMode),
    context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {
diff --git a/src/common/low_precision_transformations/src/multiply_partial.cpp b/src/common/low_precision_transformations/src/multiply_partial.cpp
index 102dde7f1cd65c..ce9ef0816147b8 100644
--- a/src/common/low_precision_transformations/src/multiply_partial.cpp
+++ b/src/common/low_precision_transformations/src/multiply_partial.cpp
@@ -79,7 +79,7 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov
         auto constParent = multiply->input_value(multiplyBranch.first == 0 ? 1 : 0);
         auto multiplyParentParent = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second);
         auto multiplyParentConst = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second == 0 ? 1 : 0);
-        auto input_data_type = useDefaultTransformation ? element::f32 : multiply->get_output_element_type(0);
+        auto input_data_type = scalingMode ? multiply->get_output_element_type(0) : element::f32;

         newMultiply = std::make_shared<ov::op::TypeRelaxed<opset1::Multiply>>(
             std::vector<element::Type>{ input_data_type, input_data_type },
@@ -134,7 +134,7 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov

         // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
-        // if useDefaultTransformation = true
+        // if scalingMode == false
         // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
         //         SC1' = SC1 * SC2
         // else
@@ -142,9 +142,9 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov
         //         SC1' = SC1 * SC2
         auto newMultiplyValuesFullPath = fold<opset1::Multiply>(multiplyValuesEmptyPath, multiplyValuesFullPath);
         OutputVector inputs{ {}, {} };
-        inputs[emptyPathIndex] = useDefaultTransformation ? dequantizationEmptyPath.data : newMultiplyValuesFullPath;
-        auto input_for_fullPath = useDefaultTransformation ? newMultiplyValuesFullPath :
-                                                             dequantizationEmptyPath.data.get_node_shared_ptr();
+        inputs[emptyPathIndex] = scalingMode ? newMultiplyValuesFullPath : dequantizationEmptyPath.data;
+        auto input_for_fullPath = scalingMode ? dequantizationEmptyPath.data.get_node_shared_ptr() :
+                                                newMultiplyValuesFullPath;

         ov::Output<Node> parent0 = dequantizationFullPath.subtract == nullptr ?
             (dequantizationFullPath.convert == nullptr ?
                 dequantizationFullPath.data : dequantizationFullPath.convert) :
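The branch swap is easiest to sanity-check against the comment block above; a standalone check (plain C++, not part of the patch) that the constant folding both modes rely on preserves the math:

    // Y = (SC1 * (X1 - SH1)) * (SC2 * X2) must equal ((SC1 * SC2) * (X1 - SH1)) * X2,
    // i.e. the scalar SC2 can be folded into SC1 (SC1' = SC1 * SC2) regardless of
    // which input ends up carrying the folded constant.
    #include <cassert>
    #include <cmath>

    int main() {
        const float X1 = 3.5f, SH1 = 0.5f, SC1 = 0.25f, SC2 = 8.0f, X2 = -2.0f;
        const float before = (SC1 * (X1 - SH1)) * (SC2 * X2);
        const float after = ((SC1 * SC2) * (X1 - SH1)) * X2;  // SC1' = SC1 * SC2
        assert(std::fabs(before - after) < 1e-6f);
        return 0;
    }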
diff --git a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp
index a7d19091a260b8..d8c96a1df542af 100644
--- a/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp
+++ b/src/common/transformations/include/transformations/common_optimizations/activations_scaling.hpp
@@ -17,10 +17,10 @@ class TRANSFORMATIONS_API ActivationsScaling;
 namespace activations_scaling {

 class TRANSFORMATIONS_API ScaleDownSingleLayer;
-class TRANSFORMATIONS_API EliminateMultiplyScalar;
+class TRANSFORMATIONS_API EliminateScalarMul;
 class TRANSFORMATIONS_API MulConcatTransformation;
 class TRANSFORMATIONS_API MulShareTransformation;
-class TRANSFORMATIONS_API MulMulTransformation;
+class TRANSFORMATIONS_API MoveDownScalarMul;

 }  // namespace activations_scaling
 }  // namespace pass
@@ -31,32 +31,74 @@ class TRANSFORMATIONS_API MulMulTransformation;
 // For example, when this property is set as 16, activations are divided by 16.
 // If ov::hint::activations_scale_factor is less than or equal to zero, it is disabled.

+// Add scale_down and scale_up layers around Convolution and MatMul nodes
+// Conv/MatMul
+//    ==>
+// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor)
 class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass {
 public:
     OPENVINO_MATCHER_PASS_RTTI("ScaleDownSingleLayer", "0");
     ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec);
 };

-class ov::pass::activations_scaling::EliminateMultiplyScalar : public ov::pass::MatcherPass {
+// Normalization and ShapeOf have the following property.
+//
+// Norm(input * const_a) = Norm(input)
+//
+// So, we can skip Multiply that is connected to Normalization and ShapeOf.
+//
+// input --> Multiply --> Normalization/ShapeOf
+//    ==>
+// input --> Normalization/ShapeOf
+class ov::pass::activations_scaling::EliminateScalarMul : public ov::pass::MatcherPass {
 public:
-    OPENVINO_MATCHER_PASS_RTTI("EliminateMultiplyScalar", "0");
-    EliminateMultiplyScalar();
+    OPENVINO_MATCHER_PASS_RTTI("EliminateScalarMul", "0");
+    EliminateScalarMul();
 };

+//          input_a   const_a   input_b   const_b   input_c   const_c
+//             \        /          \        /          \        /
+//             Multiply_a          Multiply_b          Multiply_c
+//                  \                   |                   /
+//                   \                  |                  /
+//                    ---------- Concat ------------
+//    ==>
+//                  (const_a           (const_b           (const_c
+//          input_a  /const_c) input_b  /const_c) input_c  /const_c)
+//             \        /          \        /          \        /
+//             Multiply_a          Multiply_b          Multiply_c
+//                  \                   |                   /
+//                   \                  |                  /
+//                    ---------- Concat ------------
+//                                      |    const_c
+//                                      |    /
+//                                    Multiply
 class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass {
 public:
     OPENVINO_MATCHER_PASS_RTTI("MulConcatTransformation", "0");
     MulConcatTransformation();
 };

+//        input                input
+//        /   \                  |
+//     Norm    Mul    ==>       Mul (expect to be fused into the input layer)
+//      |       |              /   \_
+//     op_a    op_b         Norm    op_b
+//                            |
+//                          op_a
 class ov::pass::activations_scaling::MulShareTransformation : public ov::pass::MatcherPass {
 public:
     OPENVINO_MATCHER_PASS_RTTI("MulShareTransformation", "0");
     MulShareTransformation();
 };

-class ov::pass::activations_scaling::MulMulTransformation : public ov::pass::MatcherPass {
+//        input_b   scalar          input_a   input_b
+//           \       /                 \       /
+// input_a    Mul_b         ==>         Mul_a'   scalar
+//     \       /                           \       /
+//      Mul_a                               Mul_b' (expect to be merged with Mul_a')
+class ov::pass::activations_scaling::MoveDownScalarMul : public ov::pass::MatcherPass {
 public:
-    OPENVINO_MATCHER_PASS_RTTI("MulMulTransformation", "0");
-    MulMulTransformation();
+    OPENVINO_MATCHER_PASS_RTTI("MoveDownScalarMul", "0");
+    MoveDownScalarMul();
 };
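For intuition on the Norm(input * const_a) = Norm(input) property claimed above, a standalone illustration (plain C++, not part of the patch; mvn is a hypothetical stand-in for the normalization ops the pass actually matches), assuming the scale is positive, which holds for the factors ScaleDownSingleLayer introduces:

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Mean-variance normalization over the whole vector.
    static std::vector<float> mvn(const std::vector<float>& x) {
        float mean = 0.f, var = 0.f;
        for (float v : x) mean += v;
        mean /= x.size();
        for (float v : x) var += (v - mean) * (v - mean);
        var /= x.size();
        std::vector<float> y;
        for (float v : x) y.push_back((v - mean) / std::sqrt(var + 1e-9f));
        return y;
    }

    int main() {
        const std::vector<float> input = {1.f, 2.f, 4.f, 8.f};
        std::vector<float> scaled;
        for (float v : input) scaled.push_back(v * 16.f);  // the scalar Multiply
        const auto a = mvn(input);
        const auto b = mvn(scaled);
        for (size_t i = 0; i < a.size(); ++i)
            assert(std::fabs(a[i] - b[i]) < 1e-3f);  // Norm(input * c) == Norm(input)
        return 0;
    }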
diff --git a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp
index 0113502f2497e1..7fd1a5a237fa3b 100644
--- a/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp
+++ b/src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp
@@ -48,10 +48,6 @@ using namespace ov::pass::activations_scaling;
 using namespace ov::pass::pattern;
 using ov::pass::pattern::op::Or;

-// Add scale_down and scale_up layers around Convolution and MatMul nodes
-// Conv/MatMul
-//    ==>
-// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor)
 ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_factor,
                                                                           ov::element::Type scaled_prec) {
     MATCHER_SCOPE(ScaleDownSingleLayer);
@@ -184,17 +180,8 @@ ov::pass::activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float
     this->register_matcher(m, callback);
 }

-// Normalization has the following property.
-//
-// Norm(input * const_a) = Norm(input)
-//
-// So, we can skip Multiply that is connected to Normalization.
-//
-// input --> Multiply --> Normalization
-//    ==>
-// input --> Normalization
-ov::pass::activations_scaling::EliminateMultiplyScalar::EliminateMultiplyScalar() {
-    MATCHER_SCOPE(EliminateMultiplyScalar);
+ov::pass::activations_scaling::EliminateScalarMul::EliminateScalarMul() {
+    MATCHER_SCOPE(EliminateScalarMul);

     auto activation_m = any_input(is_non_const_node);
     auto convert_m = ov::pass::pattern::optional<ov::op::v0::Convert>(activation_m);
@@ -221,27 +208,10 @@ ov::pass::activations_scaling::EliminateMultiplyScalar::EliminateMultiplyScalar(
         return true;
     };

-    auto m = std::make_shared<ov::pass::pattern::Matcher>(norm_m, "EliminateMultiplyScalar");
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(norm_m, "EliminateScalarMul");
     this->register_matcher(m, callback);
 }

-//          input_a   const_a   input_b   const_b   input_c   const_c
-//             \        /          \        /          \        /
-//             Multiply_a          Multiply_b          Multiply_c
-//                  \                   |                   /
-//                   \                  |                  /
-//                    ---------- Concat ------------
-//    ==>
-//                  (const_a           (const_b           (const_c
-//          input_a  /const_c) input_b  /const_c) input_c  /const_c)
-//             \        /          \        /          \        /
-//             Multiply_a          Multiply_b          Multiply_c
-//                  \                   |                   /
-//                   \                  |                  /
-//                    ---------- Concat ------------
-//                                      |    const_c
-//                                      |    /
-//                                    Multiply
 ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation() {
     MATCHER_SCOPE(MulConcatTransformation);
@@ -328,13 +298,6 @@ ov::pass::activations_scaling::MulConcatTransformation::MulConcatTransformation(
     this->register_matcher(m, callback);
 }

-//        input                input
-//        /   \                  |
-//     Norm    Mul    ==>       Mul (expect to be fused into the input layer)
-//      |       |              /   \_
-//     op_a    op_b         Norm    op_b
-//                            |
-//                          op_a
 ov::pass::activations_scaling::MulShareTransformation::MulShareTransformation() {
     MATCHER_SCOPE(MulShareTransformation);
@@ -383,13 +346,8 @@ ov::pass::activations_scaling::MulShareTransformation::MulShareTransformation()
     this->register_matcher(m, callback);
 }

-//        input_b   scalar          input_a   input_b
-//           \       /                 \       /
-// input_a    Mul_b         ==>         Mul_a'   scalar
-//     \       /                           \       /
-//      Mul_a                               Mul_b' (expect to be merged with Mul_a')
-ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() {
-    MATCHER_SCOPE(MulMulTransformation);
+ov::pass::activations_scaling::MoveDownScalarMul::MoveDownScalarMul() {
+    MATCHER_SCOPE(MoveDownScalarMul);

     auto activation_b_m = any_input(is_non_const_node);
     auto mul_const_m = ov::pass::pattern::wrap_type<ov::op::v0::Constant>(is_scalar_node);
@@ -429,6 +387,6 @@ ov::pass::activations_scaling::MulMulTransformation::MulMulTransformation() {
         return true;
     };

-    auto m = std::make_shared<ov::pass::pattern::Matcher>(mul_a_m, "MulMulTransformation");
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(mul_a_m, "MoveDownScalarMul");
     this->register_matcher(m, callback);
 }
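MoveDownScalarMul is plain reassociation of an elementwise multiply chain; a standalone check (plain C++, not part of the patch) of the rewrite shown in the diagram moved to the header:

    // input_a * (input_b * scalar) == (input_a * input_b) * scalar,
    // so the scalar operand of Mul_b can be pushed below Mul_a (Mul_a' / Mul_b' above).
    #include <cassert>
    #include <cmath>

    int main() {
        const float input_a = 1.5f, input_b = -4.25f, scalar = 16.f;
        const float before = input_a * (input_b * scalar);  // Mul_a(input_a, Mul_b(input_b, scalar))
        const float after = (input_a * input_b) * scalar;   // Mul_b'(Mul_a'(input_a, input_b), scalar)
        assert(std::fabs(before - after) < 1e-4f);
        return 0;
    }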
diff --git a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp
index b460cf1cb2e709..a8797b588c31cf 100644
--- a/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp
+++ b/src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp
@@ -70,7 +70,7 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) {
     }
 }

-TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) {
+TEST_F(TransformationTestsF, EliminateScalarMulTest) {
     {
         auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{1, 3, 16, 16});
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10});
@@ -83,7 +83,7 @@ TEST_F(TransformationTestsF, EliminateMultiplyScalarTest) {
         auto result = std::make_shared<ov::op::v0::Result>(convert);

         model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input});
-        manager.register_pass<ov::pass::activations_scaling::EliminateMultiplyScalar>();
+        manager.register_pass<ov::pass::activations_scaling::EliminateScalarMul>();
     }
     {
         auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{1, 3, 16, 16});
@@ -130,7 +130,7 @@ TEST_F(TransformationTestsF, ConcatTransformationTest) {
     }
 }

-TEST_F(TransformationTestsF, MulMulTransformationTest) {
+TEST_F(TransformationTestsF, MoveDownScalarMulTest) {
     {
         auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{6, 12, 10, 24});
         auto scale_const0 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1}, {10});
@@ -141,7 +141,7 @@ TEST_F(TransformationTestsF, MulMulTransformationTest) {
         auto result = std::make_shared<ov::op::v0::Result>(convert);

         model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input0, input1});
-        manager.register_pass<ov::pass::activations_scaling::MulMulTransformation>();
+        manager.register_pass<ov::pass::activations_scaling::MoveDownScalarMul>();
     }
     {
         auto input0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{6, 12, 10, 24});
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 4d652d65229497..cb78e3d51b1409 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -960,9 +960,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             // Move down scalar-multiply layers as much as possible
-            auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, false);
+            auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, true);
             auto lpt_pass = manager.register_pass<ov::pass::low_precision::LowPrecision>(supportedPrecisions, perTensorQuantization, params);
-            lpt_pass->add_main<ov::pass::activations_scaling::EliminateMultiplyScalar>();
+            lpt_pass->add_main<ov::pass::activations_scaling::EliminateScalarMul>();
             lpt_pass->add_main<ov::pass::activations_scaling::MulConcatTransformation>();
-            lpt_pass->add_main<ov::pass::activations_scaling::MulMulTransformation>();
+            lpt_pass->add_main<ov::pass::activations_scaling::MoveDownScalarMul>();

             // Move up remained scalar-multiply layers
             manager.register_pass<ov::pass::activations_scaling::MulShareTransformation>();
diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp
index 5a60d6eb06c519..0b315dc088af45 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/activations_scaling.cpp
@@ -145,9 +145,9 @@ class ActivationsScaling : public testing::WithParamInterface
         auto add12 = std::make_shared<ov::op::v1::Add>(transpose1, transpose2);

-        auto concat0 = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{transpose0, add12}, 3);
+        auto concat0 = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{transpose0, add12}, 0);

-        auto concat1 = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{add12, transpose2}, 3);
+        auto concat1 = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{add12, transpose2}, 0);

         auto add = std::make_shared<ov::op::v1::Add>(concat0, concat1);
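For reference, this is how the feature is driven from user code, assuming the ov::hint::activations_scale_factor property wiring described in the header comment above (a sketch; model path and factor are placeholders):

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");
        // Activations are divided by the factor before Conv/MatMul and scaled back up
        // afterwards; a value <= 0 leaves the feature disabled.
        auto compiled = core.compile_model(model,
                                           "GPU",
                                           ov::hint::inference_precision(ov::element::f16),
                                           ov::hint::activations_scale_factor(16.f));
        return 0;
    }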