Skip to content

Commit

Permalink
[GPU] Fix reset output issue (#27695)
Browse files Browse the repository at this point in the history
### Details:
 - *Fix invalid output memory usage by resetting output memory.*
- *At the previous iteration, the gather primitive's output memory referred
to its input memory because the primitive was optimized out. At the next
iteration the primitive transitioned to executed, but it kept using the input
memory as its output without reallocating, which caused memory corruption.*
- *Reset the output memory when the primitive's status changes from
optimized / skipped to executed.*

![image](https://github.com/user-attachments/assets/7fa53f2b-80ec-4fa2-8e4d-b3df0f052c51)


### Tickets:
 - *154591*
  • Loading branch information
ahnyoung-paul authored Dec 2, 2024
1 parent 4c8c290 commit cfe2a59
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// concat buffer fusing for dynamic shape is adaptively applied at runtime. So we need to build dynamic impl at build time.
if (impl_param.can_be_optimized() &&
!((impl_param.is_type<concatenation>() ||
impl_param.is_type<strided_slice>() ||
impl_param.is_type<crop>() ||
impl_param.runtime_skippable()) && impl_param.is_dynamic())) {
return make_unique<ImplType>(kernel_selector::kernel_data{});
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ class primitive_inst {
bool use_async_compilation();
// if primitive_inst doesn't replace its impl with a new impl (static impl with an optimized kernel, or a dynamic impl), return false
void update_impl(bool use_async_compilation);
void realloc_if_needed();
void realloc_if_needed(bool prev_execution_skipped = false);

cldnn::network::ptr get_unfused_subgraph();

Expand Down Expand Up @@ -481,6 +481,8 @@ class primitive_inst {
return false;
}

void clear_output_memory();

// This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
// but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
// and store mapping onto original perf_clounter_key for further data analysis and dumps
Expand Down
66 changes: 36 additions & 30 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,12 @@ bool primitive_inst::all_dependencies_cpu_impl() const {
return check_all_deps_cpu(this);
}

void primitive_inst::realloc_if_needed() {
// Drops the primitive's first output allocation and its recorded capacity so
// that the output is reallocated on the next execution. Used when a primitive
// transitions from optimized/skipped (output aliasing input memory) to
// executed, where keeping the aliased buffer would corrupt memory.
void primitive_inst::clear_output_memory() {
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
}

void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
Expand Down Expand Up @@ -738,21 +743,15 @@ void primitive_inst::realloc_if_needed() {

// Clear out memory if was previously reused, but now primitive can't be optimized
if (!_node->is_type<concatenation>() && (_node->is_runtime_skippable() || _node->is_type<crop>())) {
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory;
reset_user_output_memory = [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr input_mem_ptr) {
auto curr_output_memory_ptr = curr_inst->output_memory_ptr(0);
if (curr_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *input_mem_ptr))) {
if (curr_inst->mem_allocated()) {
get_network().get_memory_pool().release_memory(curr_inst->_outputs[0].get(),
curr_inst->get_node().get_unique_id(), curr_inst->id(), get_network_id());
_mem_allocated = false;
}
curr_inst->_outputs[0] = nullptr;
curr_inst->_max_output_layout_count[0] = 0;
for (auto& user_inst : curr_inst->get_user_insts()) {
reset_user_output_memory(user_inst, input_mem_ptr);
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory
= [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr target_mem_ptr) {
for (auto& user_inst : curr_inst->get_user_insts()) {
auto curr_output_memory_ptr = user_inst->output_memory_ptr(0);
if (user_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *target_mem_ptr))) {
user_inst->clear_output_memory();
reset_user_output_memory(user_inst, target_mem_ptr);
}
}
};
Expand All @@ -766,9 +765,7 @@ void primitive_inst::realloc_if_needed() {
// * iter1: node1(skipped) -> node2(skipped) -> node3(executed)
if (_outputs[0] && dep_memory_ptr(0)
&& !_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
}
reset_user_output_memory(this, dep_memory_ptr(0));
}
return;
} else if (_outputs[0] && dep_memory_ptr(0) &&
Expand All @@ -778,16 +775,22 @@ void primitive_inst::realloc_if_needed() {
get_node().get_unique_id(), id(), get_network_id());
_mem_allocated = false;
}
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
clear_output_memory();
// Check users recursively and if the users is can_be_optimized && runtime_skippable
// && output_memory of user is same as current input memory,
// then reset the users output memory too.
// Ex.
// * iter0: node1(skipped) -> node2(skipped) -> node3(skipped)
// * iter1: node1(executed) -> node2(skipped) -> node3(executed)
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
reset_user_output_memory(this, dep_memory_ptr(0));
} else {
// when this inst was not executed at the previous iteration,
// reset output memory because the current output memory is invalid.
if (prev_execution_skipped) {
if (_outputs[0]) {
reset_user_output_memory(this, _outputs[0]);
}
clear_output_memory();
}
}
}
Expand Down Expand Up @@ -1389,7 +1392,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() {
void primitive_inst::do_runtime_skip_gather() {
// Check pattern
if (!get_node().is_type<gather>()
|| !get_node().can_be_optimized()
|| !get_node().is_runtime_skippable()
|| _impl_params->has_fused_primitives()
|| _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type
|| get_node().get_dependency(1).is_constant() || get_node().get_dependency(1).is_type<data>())
Expand Down Expand Up @@ -1461,7 +1464,6 @@ void primitive_inst::do_runtime_skip_permute() {
// Check pattern
if (!get_node().is_type<permute>()
|| is_output()
|| !get_node().can_be_optimized()
|| !get_node().is_runtime_skippable()
|| _impl_params->has_fused_primitives()
|| _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type)
Expand Down Expand Up @@ -1501,7 +1503,7 @@ void primitive_inst::do_runtime_skip_permute() {
void primitive_inst::do_runtime_skip_strided_slice() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_strided_slice: " + id()));
// Check pattern
if (!get_node().is_type<strided_slice>() || !get_node().can_be_optimized())
if (!get_node().is_type<strided_slice>() || !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_strided_slice] " << id() << " : check optimizability" << std::endl;
Expand All @@ -1525,7 +1527,7 @@ void primitive_inst::do_runtime_skip_strided_slice() {
void primitive_inst::do_runtime_skip_broadcast() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id()));
// Check pattern
if (!get_node().is_type<broadcast>() || !get_node().can_be_optimized())
if (!get_node().is_type<broadcast>() || !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl;
Expand Down Expand Up @@ -1634,7 +1636,7 @@ void primitive_inst::do_runtime_skip_scatter_update() {
if (!(get_node().is_type<scatter_update>()
|| get_node().is_type<scatter_elements_update>()
|| get_node().is_type<scatter_nd_update>())
|| !get_node().can_be_optimized())
|| !get_node().is_runtime_skippable())
return;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_scatter_update] " << id() << " : check optimizability" << std::endl;
Expand Down Expand Up @@ -1780,6 +1782,10 @@ void primitive_inst::prepare_primitive() {
}
GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl;

// If it is optimized out or skipped for zero dimension at the previous iteration,
// Set this flag true to reset output memory in realloc_if_needed.
const bool prev_execution_skipped = can_be_optimized()
|| (_impl_params->output_layouts[0].is_static() && _impl_params->output_layouts[0].count() == 0);
const auto orig_outputs = _outputs;
if ((is_dynamic() || _node->is_in_shape_of_subgraph()) && !has_inner_networks()) {
do_runtime_in_place_concat();
Expand Down Expand Up @@ -1839,7 +1845,7 @@ void primitive_inst::prepare_primitive() {
update_impl(can_use_async_compilation);
if (get_flag(ExecutionFlags::IMPL_CHANGED)) {
update_weights();
realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}
}

Expand All @@ -1848,7 +1854,7 @@ void primitive_inst::prepare_primitive() {
if (_node->is_type<paged_attention>() && !get_flag(ExecutionFlags::IMPL_CHANGED) && _impl->requires_update(*this, *_impl_params)) {
_impl->update(*this, *_impl_params);

realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}

OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/data.hpp>

#include "gather_inst.h"
#include "program_wrapper.h"

#include <cmath>
#include <algorithm>

using namespace cldnn;
using namespace ::tests;

namespace skip_gather_tests {
// Expected per-iteration execution state of the gather primitive under test.
enum execution_status {
optimized = 0,  // optimized out: gather's output aliases its input buffer
skipped = 1,    // runtime-skipped: zero-element output, nothing executed
executed = 2    // actually executed: gather owns a distinct output buffer
};

// One inference iteration: concrete input shapes plus the execution state
// the gather primitive is expected to end up in for those shapes.
struct gather_iter_params {
ov::PartialShape input1_shape;   // data input shape
ov::PartialShape input2_shape;   // indices input shape (before squeeze)
execution_status expected_status;
};


// A full test scenario: a sequence of iterations run on one network
// (exercising status transitions), and the gather axis to use.
struct skip_gather_params {
std::vector<gather_iter_params> input_data;
int axis;
};

// Value-parameterized fixture; parameters supplied via INSTANTIATE_TEST_SUITE_P.
class skip_gather_at_runtime_test : public testing::TestWithParam<skip_gather_params> {};

// Runs a gather network over several iterations whose input shapes drive the
// primitive through optimized / skipped / executed states, and asserts that
// the output memory matches the expected state each time (regression test for
// stale aliased output memory after an optimized->executed transition).
TEST_P(skip_gather_at_runtime_test, runtime_skip) {
    auto p = GetParam();
    auto& engine = get_test_engine();
    auto axis = p.axis;

    // Dynamic layouts take their rank from the first iteration's shapes;
    // every iteration in a scenario must keep the same ranks.
    auto input1_rank = p.input_data[0].input1_shape.size();
    auto input1_layout_dynamic = layout {ov::PartialShape::dynamic(input1_rank), data_types::f16, format::get_default_format(input1_rank)};
    auto input2_rank = p.input_data[0].input2_shape.size();
    auto input2_layout_dynamic = layout {ov::PartialShape::dynamic(input2_rank), data_types::f16, format::get_default_format(input2_rank)};

    // Fix: "input2" previously used input1_layout_dynamic, which has the wrong
    // rank whenever the two inputs' ranks differ; input2_layout_dynamic was
    // constructed but never used.
    topology topology(input_layout("input1", input1_layout_dynamic),
                      input_layout("input2", input2_layout_dynamic),
                      reshape("squeeze", input_info("input2"), false, {-1}, {-1}, reshape::reshape_mode::base),
                      gather("gather",
                             input_info("input1"),
                             input_info("squeeze"),
                             axis,
                             p.input_data[0].input1_shape.size(),
                             ov::Shape{},
                             0,
                             true),
                      reorder("reorder", input_info("gather"), format::get_default_format(input1_rank), data_types::f32));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    config.set_property(ov::intel_gpu::optimize_data(true));

    network network(engine, topology, config);
    auto gather_inst = network.get_primitive("gather");
    for (auto in_shape_data : p.input_data) {
        auto input1_static_layout = layout {in_shape_data.input1_shape, data_types::f16, format::get_default_format(input1_rank)};
        auto input1_mem = engine.allocate_memory(input1_static_layout);
        network.set_input_data("input1", input1_mem);

        // A zero-element buffer cannot be allocated directly, so allocate a
        // one-element buffer and reinterpret it with the empty static layout.
        auto input2_static_layout = layout {in_shape_data.input2_shape, data_types::f16, format::get_default_format(input2_rank)};
        auto input2_unit_static_layout = layout {ov::PartialShape{1}, data_types::f16, format::get_default_format(input2_rank)};
        auto input2_mem = (input2_static_layout.count() == 0) ? engine.allocate_memory(input2_unit_static_layout)
                                                              : engine.allocate_memory(input2_static_layout);
        if (input2_static_layout.count() == 0)
            input2_mem = engine.reinterpret_buffer(*input2_mem, input2_static_layout);
        network.set_input_data("input2", input2_mem);

        auto outputs = network.execute();
        if (in_shape_data.expected_status == execution_status::executed) {
            // Executed: gather must own a distinct output buffer, not alias its input.
            ASSERT_FALSE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0)));
            ASSERT_FALSE(gather_inst->can_be_optimized());
        } else if (in_shape_data.expected_status == execution_status::optimized) {
            // Optimized out: output memory aliases the input buffer.
            ASSERT_TRUE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0)));
            ASSERT_TRUE(gather_inst->can_be_optimized());
        } else {
            // Skipped: zero-element output layout.
            ASSERT_TRUE(gather_inst->get_output_layout(0).count() == 0);
        }
    }
}

// Each scenario is a sequence of (input1_shape, input2_shape, expected status)
// iterations run on the same network, exercising the status transitions the
// fix targets:
//   1) optimized -> skipped  -> executed  (axis 1)
//   2) executed  -> skipped  -> executed  (axis 1)
//   3) optimized -> optimized -> executed (axis 1)
INSTANTIATE_TEST_SUITE_P(smoke, skip_gather_at_runtime_test,
testing::ValuesIn(std::vector<skip_gather_params> {
{{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1},
{{{ov::PartialShape{1,2,8}, ov::PartialShape{1,1}, execution_status::executed},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1},
{{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1}
}));
}  // namespace skip_gather_tests

0 comments on commit cfe2a59

Please sign in to comment.