Skip to content

Commit

Permalink
[GPU] Fix reset output issue (#27695)
Browse files Browse the repository at this point in the history
### Details:
 - *Fix invalid output memory usage by resetting output memory.*
- *At the previous iteration, the gather primitive's output memory referred
to its input memory because the primitive was optimized out. At the next
iteration the primitive transitioned to executed, but it kept using the input
memory as its output without reallocating, which caused memory corruption.*
- *Reset the output memory when the primitive's status changes from
optimized / skipped to executed.*

![image](https://github.com/user-attachments/assets/7fa53f2b-80ec-4fa2-8e4d-b3df0f052c51)


### Tickets:
 - *154591*
  • Loading branch information
ahnyoung-paul authored Dec 2, 2024
1 parent 4c8c290 commit cfe2a59
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// concat buffer fusing for dynamic shape is adaptively applied at runtime. So we need to build dynamic impl at build time.
if (impl_param.can_be_optimized() &&
!((impl_param.is_type<concatenation>() ||
impl_param.is_type<strided_slice>() ||
impl_param.is_type<crop>() ||
impl_param.runtime_skippable()) && impl_param.is_dynamic())) {
return make_unique<ImplType>(kernel_selector::kernel_data{});
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ class primitive_inst {
bool use_async_compilation();
// if primitive_inst doesn't replace its impl with a new impl (static impl with an optimized kernel, or a dynamic impl), return false
void update_impl(bool use_async_compilation);
void realloc_if_needed();
void realloc_if_needed(bool prev_execution_skipped = false);

cldnn::network::ptr get_unfused_subgraph();

Expand Down Expand Up @@ -481,6 +481,8 @@ class primitive_inst {
return false;
}

void clear_output_memory();

// This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
// but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
// and store mapping onto original perf_clounter_key for further data analysis and dumps
Expand Down
66 changes: 36 additions & 30 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,12 @@ bool primitive_inst::all_dependencies_cpu_impl() const {
return check_all_deps_cpu(this);
}

void primitive_inst::realloc_if_needed() {
// Drops the primitive's first output allocation and its recorded capacity so
// that the output is reallocated on the next execution. Used when a primitive
// transitions from optimized/skipped (output aliasing input memory) to
// executed, where keeping the aliased buffer would corrupt memory.
void primitive_inst::clear_output_memory() {
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
}

void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
Expand Down Expand Up @@ -738,21 +743,15 @@ void primitive_inst::realloc_if_needed() {

// Clear out memory if was previously reused, but now primitive can't be optimized
if (!_node->is_type<concatenation>() && (_node->is_runtime_skippable() || _node->is_type<crop>())) {
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory;
reset_user_output_memory = [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr input_mem_ptr) {
auto curr_output_memory_ptr = curr_inst->output_memory_ptr(0);
if (curr_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *input_mem_ptr))) {
if (curr_inst->mem_allocated()) {
get_network().get_memory_pool().release_memory(curr_inst->_outputs[0].get(),
curr_inst->get_node().get_unique_id(), curr_inst->id(), get_network_id());
_mem_allocated = false;
}
curr_inst->_outputs[0] = nullptr;
curr_inst->_max_output_layout_count[0] = 0;
for (auto& user_inst : curr_inst->get_user_insts()) {
reset_user_output_memory(user_inst, input_mem_ptr);
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory
= [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr target_mem_ptr) {
for (auto& user_inst : curr_inst->get_user_insts()) {
auto curr_output_memory_ptr = user_inst->output_memory_ptr(0);
if (user_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *target_mem_ptr))) {
user_inst->clear_output_memory();
reset_user_output_memory(user_inst, target_mem_ptr);
}
}
};
Expand All @@ -766,9 +765,7 @@ void primitive_inst::realloc_if_needed() {
// * iter1: node1(skipped) -> node2(skipped) -> node3(executed)
if (_outputs[0] && dep_memory_ptr(0)
&& !_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
}
reset_user_output_memory(this, dep_memory_ptr(0));
}
return;
} else if (_outputs[0] && dep_memory_ptr(0) &&
Expand All @@ -778,16 +775,22 @@ void primitive_inst::realloc_if_needed() {
get_node().get_unique_id(), id(), get_network_id());
_mem_allocated = false;
}
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
clear_output_memory();
// Check users recursively and if the users is can_be_optimized && runtime_skippable
// && output_memory of user is same as current input memory,
// then reset the users output memory too.
// Ex.
// * iter0: node1(skipped) -> node2(skipped) -> node3(skipped)
// * iter1: node1(executed) -> node2(skipped) -> node3(executed)
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
reset_user_output_memory(this, dep_memory_ptr(0));
} else {
// when this inst was not executed at the previous iteration,
// reset output memory because the current output memory is invalid.
if (prev_execution_skipped) {
if (_outputs[0]) {
reset_user_output_memory(this, _outputs[0]);
}
clear_output_memory();
}
}
}
Expand Down Expand Up @@ -1389,7 +1392,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() {
void primitive_inst::do_runtime_skip_gather() {
// Check pattern
if (!get_node().is_type<gather>()
|| !get_node().can_be_optimized()
|| !get_node().is_runtime_skippable()
|| _impl_params->has_fused_primitives()
|| _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type
|| get_node().get_dependency(1).is_constant() || get_node().get_dependency(1).is_type<data>())
Expand Down Expand Up @@ -1461,7 +1464,6 @@ void primitive_inst::do_runtime_skip_permute() {
// Check pattern
if (!get_node().is_type<permute>()
|| is_output()
|| !get_node().can_be_optimized()
|| !get_node().is_runtime_skippable()
|| _impl_params->has_fused_primitives()
|| _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type)
Expand Down Expand Up @@ -1501,7 +1503,7 @@ void primitive_inst::do_runtime_skip_permute() {
void primitive_inst::do_runtime_skip_strided_slice() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_strided_slice: " + id()));
// Check pattern
if (!get_node().is_type<strided_slice>() || !get_node().can_be_optimized())
if (!get_node().is_type<strided_slice>() || !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_strided_slice] " << id() << " : check optimizability" << std::endl;
Expand All @@ -1525,7 +1527,7 @@ void primitive_inst::do_runtime_skip_strided_slice() {
void primitive_inst::do_runtime_skip_broadcast() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id()));
// Check pattern
if (!get_node().is_type<broadcast>() || !get_node().can_be_optimized())
if (!get_node().is_type<broadcast>() || !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl;
Expand Down Expand Up @@ -1634,7 +1636,7 @@ void primitive_inst::do_runtime_skip_scatter_update() {
if (!(get_node().is_type<scatter_update>()
|| get_node().is_type<scatter_elements_update>()
|| get_node().is_type<scatter_nd_update>())
|| !get_node().can_be_optimized())
|| !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_scatter_update] " << id() << " : check optimizability" << std::endl;
Expand Down Expand Up @@ -1780,6 +1782,10 @@ void primitive_inst::prepare_primitive() {
}
GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl;

// If it is optimized out or skipped for zero dimension at the previous iteration,
// Set this flag true to reset output memory in realloc_if_needed.
const bool prev_execution_skipped = can_be_optimized()
|| (_impl_params->output_layouts[0].is_static() && _impl_params->output_layouts[0].count() == 0);
const auto orig_outputs = _outputs;
if ((is_dynamic() || _node->is_in_shape_of_subgraph()) && !has_inner_networks()) {
do_runtime_in_place_concat();
Expand Down Expand Up @@ -1839,7 +1845,7 @@ void primitive_inst::prepare_primitive() {
update_impl(can_use_async_compilation);
if (get_flag(ExecutionFlags::IMPL_CHANGED)) {
update_weights();
realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}
}

Expand All @@ -1848,7 +1854,7 @@ void primitive_inst::prepare_primitive() {
if (_node->is_type<paged_attention>() && !get_flag(ExecutionFlags::IMPL_CHANGED) && _impl->requires_update(*this, *_impl_params)) {
_impl->update(*this, *_impl_params);

realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}

OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/data.hpp>

#include "gather_inst.h"
#include "program_wrapper.h"

#include <cmath>
#include <algorithm>

using namespace cldnn;
using namespace ::tests;

namespace skip_gather_tests {
// Expected per-iteration execution state of the gather primitive under test.
enum execution_status {
optimized = 0,  // optimized out: gather's output aliases its input buffer
skipped = 1,    // runtime-skipped: zero-element output, nothing executed
executed = 2    // actually executed: gather owns a distinct output buffer
};

// One inference iteration: concrete input shapes plus the execution state
// the gather primitive is expected to end up in for those shapes.
struct gather_iter_params {
ov::PartialShape input1_shape;   // data input shape
ov::PartialShape input2_shape;   // indices input shape (before squeeze)
execution_status expected_status;
};


// A full test scenario: a sequence of iterations run on one network
// (exercising status transitions), and the gather axis to use.
struct skip_gather_params {
std::vector<gather_iter_params> input_data;
int axis;
};

// Value-parameterized fixture; parameters supplied via INSTANTIATE_TEST_SUITE_P.
class skip_gather_at_runtime_test : public testing::TestWithParam<skip_gather_params> {};

// Runs a gather network over several iterations whose input shapes drive the
// primitive through optimized / skipped / executed states, and asserts that
// the output memory matches the expected state each time (regression test for
// stale aliased output memory after an optimized->executed transition).
TEST_P(skip_gather_at_runtime_test, runtime_skip) {
    auto p = GetParam();
    auto& engine = get_test_engine();
    auto axis = p.axis;

    // Dynamic layouts take their rank from the first iteration's shapes;
    // every iteration in a scenario must keep the same ranks.
    auto input1_rank = p.input_data[0].input1_shape.size();
    auto input1_layout_dynamic = layout {ov::PartialShape::dynamic(input1_rank), data_types::f16, format::get_default_format(input1_rank)};
    auto input2_rank = p.input_data[0].input2_shape.size();
    auto input2_layout_dynamic = layout {ov::PartialShape::dynamic(input2_rank), data_types::f16, format::get_default_format(input2_rank)};

    // Fix: "input2" previously used input1_layout_dynamic, which has the wrong
    // rank whenever the two inputs' ranks differ; input2_layout_dynamic was
    // constructed but never used.
    topology topology(input_layout("input1", input1_layout_dynamic),
                      input_layout("input2", input2_layout_dynamic),
                      reshape("squeeze", input_info("input2"), false, {-1}, {-1}, reshape::reshape_mode::base),
                      gather("gather",
                             input_info("input1"),
                             input_info("squeeze"),
                             axis,
                             p.input_data[0].input1_shape.size(),
                             ov::Shape{},
                             0,
                             true),
                      reorder("reorder", input_info("gather"), format::get_default_format(input1_rank), data_types::f32));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    config.set_property(ov::intel_gpu::optimize_data(true));

    network network(engine, topology, config);
    auto gather_inst = network.get_primitive("gather");
    for (auto in_shape_data : p.input_data) {
        auto input1_static_layout = layout {in_shape_data.input1_shape, data_types::f16, format::get_default_format(input1_rank)};
        auto input1_mem = engine.allocate_memory(input1_static_layout);
        network.set_input_data("input1", input1_mem);

        // A zero-element buffer cannot be allocated directly, so allocate a
        // one-element buffer and reinterpret it with the empty static layout.
        auto input2_static_layout = layout {in_shape_data.input2_shape, data_types::f16, format::get_default_format(input2_rank)};
        auto input2_unit_static_layout = layout {ov::PartialShape{1}, data_types::f16, format::get_default_format(input2_rank)};
        auto input2_mem = (input2_static_layout.count() == 0) ? engine.allocate_memory(input2_unit_static_layout)
                                                              : engine.allocate_memory(input2_static_layout);
        if (input2_static_layout.count() == 0)
            input2_mem = engine.reinterpret_buffer(*input2_mem, input2_static_layout);
        network.set_input_data("input2", input2_mem);

        auto outputs = network.execute();
        if (in_shape_data.expected_status == execution_status::executed) {
            // Executed: gather must own a distinct output buffer, not alias its input.
            ASSERT_FALSE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0)));
            ASSERT_FALSE(gather_inst->can_be_optimized());
        } else if (in_shape_data.expected_status == execution_status::optimized) {
            // Optimized out: output memory aliases the input buffer.
            ASSERT_TRUE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0)));
            ASSERT_TRUE(gather_inst->can_be_optimized());
        } else {
            // Skipped: zero-element output layout.
            ASSERT_TRUE(gather_inst->get_output_layout(0).count() == 0);
        }
    }
}

// Each scenario is a sequence of (input1_shape, input2_shape, expected status)
// iterations run on the same network, exercising the status transitions the
// fix targets:
//   1) optimized -> skipped  -> executed  (axis 1)
//   2) executed  -> skipped  -> executed  (axis 1)
//   3) optimized -> optimized -> executed (axis 1)
INSTANTIATE_TEST_SUITE_P(smoke, skip_gather_at_runtime_test,
testing::ValuesIn(std::vector<skip_gather_params> {
{{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1},
{{{ov::PartialShape{1,2,8}, ov::PartialShape{1,1}, execution_status::executed},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1},
{{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1}
}));
}  // namespace skip_gather_tests

0 comments on commit cfe2a59

Please sign in to comment.