diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 379d7b3b64a222..4c51ee2bacc04e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -47,6 +47,7 @@ struct program { friend class post_optimize_weights; // to be removed when possible friend class prepare_primitive_fusing_through; // to be removed when possible friend class reorder_transfer; // to be removed when possible + friend class reshape_transfer; // to be removed when possible friend class fuse_constant_transposes; // to be removed when possible friend class program_wrapper; // this class is intended to extend the interface of program for // the usage within tests_core_internal project only diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp new file mode 100644 index 00000000000000..90faed7160310a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reshape_transfer.cpp @@ -0,0 +1,141 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "pass_manager.h" +#include "permute_inst.h" +#include "program_helpers.h" +#include "reorder_inst.h" +#include "reshape_inst.h" + +using namespace cldnn; + +void reshape_transfer::run(program& p) { + // (reorder) + reshape + transpose + // sink reshape for further possible optimization + auto is_suitable_permute = [](cldnn::program_node* node) { + return node->get_users().size() == 1 && node->is_dynamic() == false && + node->get_output_layout().get_rank() == 4; + }; + + auto is_suitable_reshape = [](cldnn::program_node* node) -> bool { + if (node->get_users().size() != 1 || node->is_dynamic()) + return false; + auto& input_lay = node->get_input_layout(0); + auto& output_lay = node->get_output_layout(); + if 
(input_lay.compatible(output_lay)) + return true; + return false; + }; + + std::function is_suitable_parent; + is_suitable_parent = [&is_suitable_parent](const cldnn::program_node* node) -> bool { + if (node->get_users().size() != 1 || node->is_dynamic()) + return false; + for (size_t idx = 0; idx < node->get_dependencies().size(); idx++) { + auto& input = node->get_dependency(idx); + if (!input.is_in_data_flow() || input.is_constant()) + continue; + if (node->is_type() || input.is_type()) { + return true; + } else if (input.is_type() && input.get_dependency(1).is_constant()) { + return is_suitable_parent(&input); + } else if (input.is_type()) { + return is_suitable_parent(&input); + } + return false; + } + return false; + }; + + auto update_order = [](std::vector original_order, cldnn::program_node* reshape) { + if (!reshape) + return original_order; + // Example. For this sequence, there is Reshape node which merges 2 consecutive dims into one + // order must be updated like permute is done before reshape + // [1,3,4,6] -> Reshape[1,3,24,1]-> permute(0,2,1) -> [1,24,3,1] + // updated order must be (0,2,3,1): + // dim with index=2 is split into 2 parts: 2 and 3 + const auto& reshape_in_shape = reshape->get_input_layout().get_dims(); + const auto& reshape_out_shape = reshape->get_output_layout().get_dims(); + auto transformed_order = original_order; + ov::Shape new_shape(transformed_order.size()); + const uint16_t merge_dim_idx = [&]() { + for (uint16_t i = 0; i < reshape_in_shape.size(); ++i) { + if (reshape_in_shape[i] != reshape_out_shape[i]) + return i; + } + OPENVINO_THROW("same input/output for reshape node"); + }(); + auto insertIt = transformed_order.end(); + for (auto it = transformed_order.begin(); it != transformed_order.end(); ++it) { + auto& elem = *it; + if (elem > merge_dim_idx) { + elem++; + } else if (elem == merge_dim_idx) { + insertIt = it + 1; + } + } + transformed_order.insert(insertIt, merge_dim_idx + 1); + // remove invalid orders + if 
(transformed_order.size() > reshape_out_shape.size()) { + transformed_order.erase( + std::remove_if(transformed_order.begin(), transformed_order.end(), [&](uint16_t& order) { + return order >= reshape_out_shape.size(); + }), transformed_order.end()); + } + return transformed_order; + }; + + auto itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) { + auto& node = *itr++; + if (!node->is_type()) + continue; + auto& transpose_node = node->as(); + if (!is_suitable_permute(&transpose_node)) + continue; + auto& child_node = transpose_node; + auto parent_node = child_node.get_dependency_with_port(0).first; + cldnn::program_node* inter_node; + if (parent_node->is_type()) { + inter_node = parent_node; + if (!is_suitable_reshape(inter_node)) { + continue; + } + parent_node = inter_node->get_dependency_with_port(0).first; + } else { + continue; + } + + if (!is_suitable_parent(parent_node)) { + continue; + } + reshape_node* reshape_node = nullptr; + if (inter_node && inter_node->is_type()) + reshape_node = &(inter_node->as()); + + auto transpose_order = update_order(transpose_node.get_permute_order(), reshape_node); + auto new_permute = + std::make_shared(transpose_node.id() + "_reordered", parent_node->id(), transpose_order); + auto& new_permute_node = p.get_or_create(new_permute); + if (new_permute_node.as().is_rotating_except_batch()) { + auto next_node = transpose_node.get_users().front(); + auto new_reshape_tensor = transpose_node.get_output_layout().get_tensor(); + p.move_node(*reshape_node, *node, *next_node); + // replace the permute node and reshape node + auto new_reshape = + std::make_shared(reshape_node->id() + "_sinked", new_permute_node.id(), new_reshape_tensor); + auto& new_reshape_node = p.get_or_create(new_reshape); + + p.replace(transpose_node, new_permute_node); + p.replace(*reshape_node, new_reshape_node); + new_permute_node.recalc_output_layout(false); + new_reshape_node.recalc_output_layout(false); + } else { + 
p.remove_if_dangling(new_permute_node); + } + } +} diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 490076a37f788e..60832e05856fed 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -366,6 +366,14 @@ class reorder_transfer : public base_pass { void run(program& p) override; }; +class reshape_transfer : public base_pass { +public: + reshape_transfer() : base_pass("reshape_transfer") {} + +private: + void run(program& p) override; +}; + class mark_runtime_skippable_nodes : public base_pass { public: mark_runtime_skippable_nodes() : base_pass("mark_runtime_skippable_nodes") {} diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index bdffb9c4980722..d57df3f7d33c53 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -569,6 +569,7 @@ void program::pre_optimize_graph(bool is_internal) { apply_opt_pass(); + apply_opt_pass(); #ifdef GPU_DEBUG_CONFIG GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) { #else diff --git a/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp b/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp new file mode 100644 index 00000000000000..261d41c93ed37b --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/passes/reorder_reshape_permute.cpp @@ -0,0 +1,372 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "convolution_inst.h" +#include "intel_gpu/graph/program.hpp" +#include "permute_inst.h" +#include "program_wrapper.h" +#include "reshape_inst.h" +#include "test_utils.h" + +using namespace cldnn; +using namespace ::tests; + +TEST(opt_reorder_reshape_permute, no_reshape) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; 
+ auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + tests::set_random_values(input); + tests::set_random_values(weight); + + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); + topology.add(permute("permute_inter", input_info("reorder_inter"), {0, 2, 3, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + + network net(prog); + + net.set_input_data("input", input); + auto output = net.execute(); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + // reorder node is removed in primitive fusing + // later permute is optimized after convolution in selected preferred formats, e.g conv + permute + auto optimzed_nodes = net.get_program()->get_optimized(); + auto it = + std::find_if(std::begin(optimzed_nodes), std::end(optimzed_nodes), [&](cldnn::program::optimized_info& oi) { + return oi.first == "reorder_inter"; + }); + ASSERT_NE(it, optimzed_nodes.end()); + auto permute_inst = net.get_primitive("permute_inter"); + if (net.get_primitive("convolution")->get_impl()->is_onednn()) { + ASSERT_TRUE(permute_inst->can_be_optimized()); + } + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, 
get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} + +TEST(opt_reorder_reshape_permute, no_reorder) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; + auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + tests::set_random_values(input); + tests::set_random_values(weight); + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add( + reshape("reshape_inter", input_info("convolution"), false, {1, 3, 24, 1}, ov::PartialShape{1, 3, 24, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + + network net(prog); + + net.set_input_data("input", input); + auto output = net.execute(); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network 
ref_network(engine, topology, ref_config); + auto& processing_order = prog->get_processing_order(); + auto reshape_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("reshape_inter")); + size_t reshape_dist = std::distance(processing_order.begin(), reshape_node); + + auto permute_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("permute_inter")); + size_t permute_dist = std::distance(processing_order.begin(), permute_node); + ASSERT_TRUE(reshape_dist > permute_dist); + // select preferred formats, conv + permute + auto permute_inst = net.get_primitive("permute_inter"); + if (net.get_primitive("convolution")->get_impl()->is_onednn()) { + ASSERT_TRUE(permute_inst->can_be_optimized()); + } + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} + +TEST(opt_reorder_reshape_permute, no_reorder_no_reshape) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; + auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + tests::set_random_values(input); + tests::set_random_values(weight); + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", 
input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(permute("permute_inter", input_info("convolution"), {0, 2, 3, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + + network net(prog); + + net.set_input_data("input", input); + auto output = net.execute(); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + // select preferred formats, conv + permute + auto permute_inst = net.get_primitive("permute_inter"); + if (net.get_primitive("convolution")->get_impl()->is_onednn()) { + ASSERT_TRUE(permute_inst->can_be_optimized()); + } + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} + +TEST(opt_reorder_reshape_permute, cutomized_net_yolov6_alike) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; + auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); 
+ tests::set_random_values(input); + tests::set_random_values(weight); + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); + topology.add( + reshape("reshape_inter", input_info("reorder_inter"), false, {1, 3, 24, 1}, ov::PartialShape{1, 3, 24, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(false)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + network net(prog); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + + net.set_input_data("input", input); + auto output = net.execute(); + auto optimzed_nodes = net.get_program()->get_optimized(); + auto it = + std::find_if(std::begin(optimzed_nodes), std::end(optimzed_nodes), [&](cldnn::program::optimized_info& oi) { + return oi.first == "reorder_inter"; + }); + ASSERT_NE(it, optimzed_nodes.end()); + auto permute_inst = net.get_primitive("permute_inter"); + if (net.get_primitive("convolution")->get_impl()->is_onednn()) { + ASSERT_TRUE(permute_inst->can_be_optimized()); + } + auto reshape_inst = net.get_primitive("reshape_inter"); + ASSERT_TRUE(reshape_inst->can_be_optimized()); + + auto& processing_order = prog->get_processing_order(); + + auto reshape_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("reshape_inter")); + size_t reshape_dist = 
std::distance(processing_order.begin(), reshape_node); + + auto permute_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("permute_inter")); + size_t permute_dist = std::distance(processing_order.begin(), permute_node); + ASSERT_TRUE(reshape_dist > permute_dist); + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} + +TEST(opt_reorder_reshape_permute, cutomized_net_yolov6_alike_4d) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; + auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + tests::set_random_values(input); + tests::set_random_values(weight); + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); + topology.add( + reshape("reshape_inter", input_info("reorder_inter"), false, {1, 3, 24, 1}, ov::PartialShape{1, 3, 24, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1, 3})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); 
+ ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(false)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + network net(prog); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + + net.set_input_data("input", input); + auto output = net.execute(); + auto optimzed_nodes = net.get_program()->get_optimized(); + auto it = + std::find_if(std::begin(optimzed_nodes), std::end(optimzed_nodes), [&](cldnn::program::optimized_info& oi) { + return oi.first == "reorder_inter"; + }); + ASSERT_NE(it, optimzed_nodes.end()); + auto permute_inst = net.get_primitive("permute_inter"); + if (net.get_primitive("convolution")->get_impl()->is_onednn()) { + ASSERT_TRUE(permute_inst->can_be_optimized()); + } + auto reshape_inst = net.get_primitive("reshape_inter"); + ASSERT_TRUE(reshape_inst->can_be_optimized()); + + auto& processing_order = prog->get_processing_order(); + + auto reshape_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("reshape_inter")); + size_t reshape_dist = std::distance(processing_order.begin(), reshape_node); + + auto permute_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("permute_inter")); + size_t permute_dist = std::distance(processing_order.begin(), permute_node); + ASSERT_TRUE(reshape_dist > permute_dist); + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < 
out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} + +TEST(opt_reorder_reshape_permute, not_sinking_reshape) { + auto& engine = get_test_engine(); + auto in_layout = layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}; + auto input = engine.allocate_memory(layout{ov::PartialShape({1, 2, 4, 6}), data_types::f16, format::bfyx}); + auto weight = engine.allocate_memory(layout{ov::PartialShape({3, 2, 1, 1}), data_types::f16, format::bfyx}); + tests::set_random_values(input); + tests::set_random_values(weight); + topology topology; + topology.add(input_layout("input", in_layout)); + topology.add(data("weight", weight)); + topology.add( + convolution("convolution", input_info("input"), "weight", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false)); + topology.add(reorder("reorder_inter", input_info("convolution"), format::bfyx, data_types::f16)); + topology.add( + reshape("reshape_inter", input_info("reorder_inter"), false, {1, 18, 4, 1}, ov::PartialShape{1, 18, 4, 1})); + topology.add(permute("permute_inter", input_info("reshape_inter"), {0, 2, 1})); + topology.add(softmax("softmax", input_info("permute_inter"), 1)); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(false)); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config); + network net(prog); + + ExecutionConfig ref_config = get_test_default_config(engine); + ref_config.set_property(ov::intel_gpu::optimize_data(false)); + cldnn::network ref_network(engine, topology, ref_config); + + net.set_input_data("input", input); + auto output = net.execute(); + auto permute_inst = net.get_primitive("permute_inter"); + ASSERT_FALSE(permute_inst->can_be_optimized()); + + auto& processing_order = prog->get_processing_order(); + + auto 
reshape_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("reshape_inter")); + size_t reshape_dist = std::distance(processing_order.begin(), reshape_node); + + auto permute_node = std::find(processing_order.begin(), processing_order.end(), &prog->get_node("permute_inter")); + size_t permute_dist = std::distance(processing_order.begin(), permute_node); + ASSERT_TRUE(reshape_dist < permute_dist); + auto out_mem = output.at("softmax").get_memory(); + mem_lock lock(out_mem, get_test_stream()); + + ref_network.set_input_data("input", input); + auto ref_output = ref_network.execute(); + + auto ref_out_mem = ref_output.at("softmax").get_memory(); + mem_lock lock_ref(ref_out_mem, get_test_stream()); + auto tolerance = default_tolerance(ref_out_mem->get_layout().data_type); + for (size_t i = 0; i < out_mem->count(); i++) { + ASSERT_NEAR(lock[i],lock_ref[i], tolerance) + << "\ntolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << lock_ref[i] + << "\nopt[i] = " << lock[i]; + } +} \ No newline at end of file