Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
replace base_ptr/width/height/surface_pitch
Browse files: browse the repository at this point in the history
  • Loading branch information
sunjiweiswift committed Aug 23, 2024
1 parent b259fba commit f7085fc
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 76 deletions.
11 changes: 5 additions & 6 deletions include/common/core/memory.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -477,16 +477,15 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
return ret.xetla_format<T>();
} else if constexpr (BlockWidth * sizeof(T) < sizeof(uint32_t)) {
constexpr auto scale_factor = sizeof(uint32_t) / sizeof(T);
xetla_vector<uint32_t, N> ret = __ESIMD_ENS::lsc_load_2d<
xetla_vector<uint32_t, N> ret = xetla_load_global<
uint32_t,
BlockWidth,
BlockHeight,
NBlocks,
Transposed,
Transformed,
gpu::xetla::detail::get_cache_hint(L1H),
gpu::xetla::detail::get_cache_hint(L2H),
N>(
L1H,
L2H>(
reinterpret_cast<const uint32_t*>(Ptr),
SurfaceWidth,
SurfaceHeight,
Expand All @@ -505,7 +504,7 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
Transformed,
gpu::xetla::detail::get_cache_hint(L1H),
gpu::xetla::detail::get_cache_hint(L2H),
N>(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
N>(Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y);
}
}

Expand Down Expand Up @@ -788,7 +787,7 @@ __XETLA_API void xetla_store_global(
BlockHeight,
gpu::xetla::detail::get_cache_hint(L1H),
gpu::xetla::detail::get_cache_hint(L2H)>(
Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y, Vals);
}
}
/// template <typename T, int N, int VS = 1, typename OffsetT,
Expand Down
62 changes: 27 additions & 35 deletions include/subgroup/tile/impl/load_xe.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -95,7 +95,7 @@ tile_load(tile_t& tile, payload_t& payload) {

static constexpr reg_layout reg_layout_ = tile_desc::register_layout;
static constexpr bool is_vnni_reverse =
payload_t::mem_dword_qword_transpose &&
payload_t::mem_transpose_dtype_less4bytes &&
((reg_layout_ == reg_layout::tiled) ||
(reg_layout_ == reg_layout::transpose_tiled));
static constexpr bool reg_transpose = tile_desc::reg_transpose;
Expand Down Expand Up @@ -180,6 +180,7 @@ tile_load(tile_t& tile, payload_t& payload) {
#pragma unroll
for (uint32_t i = 0; i < num_block_y; ++i) {
constexpr uint32_t load_block_elems = block_elems * arr_len;
int offset_y = i * block_size_y;
auto payload_row =
payload_2d.xetla_select<num_block_x, 1, 16, 1>(i * num_block_x, 0);
detail::reset_tile_desc_core<
Expand All @@ -191,6 +192,7 @@ tile_load(tile_t& tile, payload_t& payload) {
mem_transpose>(payload_row);
#pragma unroll
for (uint32_t j = 0; j < num_block_x; j += arr_len) {
uint32_t offset_x = j * block_size_x;
xetla_tdescriptor tdesc = payload_row.row(j);
auto reg_blk = tile.reg.xetla_select<load_block_elems, 1>(
(i * num_block_x + j) * block_elems);
Expand All @@ -201,35 +203,29 @@ tile_load(tile_t& tile, payload_t& payload) {
xetla_vector<dtype, tmp_size> reg_tmp;
#pragma unroll
for (uint32_t ii = 0; ii < block_size_y / ld_blk_size_y; ++ii) {
offset_y += ld_blk_size_y;
constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;

// reg_tmp.xetla_format<native_type_t<load_dtype>>() =
// xetla_tload_global<
// load_dtype,
// ld_blk_height * block_size_x * arr_len / scale_factor,
// L1,
// L2,
// trans,
// mem_transform,
// arch_tag>(tdesc);
reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
native_type_t<load_dtype>,
(mem_transpose ? ld_blk_size_y : block_size_x) / scale_factor,
(mem_transpose ? block_size_x : ld_blk_size_y),
(trans ? ld_blk_size_y : block_size_x) / scale_factor,
(trans ? block_size_x : ld_blk_size_y),
// block_size_x / scale_factor,
// ld_blk_size_y,
arr_len,
trans,
mem_transform,
L1,
L2>(
(native_type_t<load_dtype>*)::gpu::xetla::detail::
xetla_get_tensor_base_address(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
mem_transpose
// ? ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc)
? (payload.offset_x + offset_y / scale_factor)
: (payload.offset_x + offset_x / scale_factor),
payload.offset_y + (mem_transpose ? offset_x : offset_y));
if constexpr (reg_transpose && trans) {
reg_blk.xetla_select<load_elems, 1>(ii * load_elems)
.xetla_format<native_type_t<load_dtype>>() =
Expand All @@ -246,7 +242,6 @@ tile_load(tile_t& tile, payload_t& payload) {
} else {
reg_blk.xetla_select<tmp_size, 1>(ii * tmp_size) = reg_tmp;
}

if constexpr (mem_transpose) {
xetla_update_tdesc_offsetx(
tdesc.xetla_format<uint32_t>(), ld_blk_size_y / scale_factor);
Expand Down Expand Up @@ -284,11 +279,10 @@ tile_load(tile_t& tile, payload_t& payload) {
mem_transform,
L1,
L2>(
(native_type_t<load_dtype>*)::gpu::xetla::detail::
xetla_get_tensor_base_address(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
// xetla_tload_global<
Expand Down Expand Up @@ -345,11 +339,10 @@ tile_load(tile_t& tile, payload_t& payload) {
mem_transform,
L1,
L2>(
(native_type_t<load_dtype>*)::gpu::xetla::detail::
xetla_get_tensor_base_address(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
// xetla_tload_global<
Expand Down Expand Up @@ -412,11 +405,10 @@ tile_load(tile_t& tile, payload_t& payload) {
mem_transform,
L1,
L2>(
(native_type_t<load_dtype>*)::gpu::xetla::detail::
xetla_get_tensor_base_address(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
// xetla_tload_global<
Expand Down
47 changes: 26 additions & 21 deletions include/subgroup/tile/impl/payload_xe.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -78,11 +78,11 @@ struct mem_payload_t<
static constexpr bool mem_transform = (sizeof(dtype) <= 2) && !trans &&
(register_layout == reg_layout::vnni_tiled ||
register_layout == reg_layout::vnni_tiled_col_major);
static constexpr bool mem_dword_qword_transpose =
static constexpr bool mem_transpose_dtype_less4bytes =
(sizeof(dtype) < 4) && trans;

using mem_dtype = typename std::
conditional<mem_dword_qword_transpose, uint32_t, dtype>::type;
conditional_t<mem_transpose_dtype_less4bytes, uint32_t, dtype>;
static constexpr uint32_t scale_factor = sizeof(mem_dtype) / sizeof(dtype);
mem_dtype* base_ptr;
uint32_t surface_width;
Expand Down Expand Up @@ -111,8 +111,9 @@ struct mem_payload_t<
this->surface_height =
(mem_transpose ? mem_desc.shape.x : mem_desc.shape.y);
this->surface_pitch = mem_desc.shape.stride * sizeof(dtype);
this->offset_x = mem_desc.coord.x;
this->offset_y = mem_desc.coord.y;
this->offset_x =
(mem_transpose ? mem_desc.coord.y : mem_desc.coord.x) / scale_factor;
this->offset_y = mem_transpose ? mem_desc.coord.x : mem_desc.coord.y;

xetla_tdescriptor base_tdesc = mem_desc.get_tdesc();
int32_t offset = gpu::xetla::detail::xetla_get_tensor_offset_x(base_tdesc) /
Expand All @@ -130,13 +131,14 @@ struct mem_payload_t<
int32_t surface_offset_x = 0,
int32_t surface_offset_y = 0) {
this->base_ptr = (mem_dtype*)p;
this->surface_width = surface_width;
this->surface_width = surface_width * sizeof(dtype);
this->surface_height = surface_height;
this->surface_pitch = surface_pitch;
this->offset_x = surface_offset_x;
this->surface_pitch = surface_pitch * sizeof(dtype);
this->offset_x = surface_offset_x / scale_factor;
this->offset_y = surface_offset_y;

xetla_tdescriptor base_tdesc;

xetla_fill_tdesc(
base_tdesc.xetla_format<uint32_t>(),
p,
Expand All @@ -155,8 +157,9 @@ struct mem_payload_t<
this->surface_height =
(mem_transpose ? mem_desc.shape.x : mem_desc.shape.y);
this->surface_pitch = mem_desc.shape.stride * sizeof(dtype);
this->offset_x = mem_desc.coord.x;
this->offset_y = mem_desc.coord.y;
this->offset_x =
(mem_transpose ? mem_desc.coord.y : mem_desc.coord.x) / scale_factor;
this->offset_y = (mem_transpose ? mem_desc.coord.x : mem_desc.coord.y);

xetla_tdescriptor base_tdesc = mem_desc.get_tdesc();
int32_t offset = gpu::xetla::detail::xetla_get_tensor_offset_x(base_tdesc) /
Expand All @@ -181,11 +184,11 @@ struct mem_payload_t<
uint32_t surface_pitch,
int32_t surface_offset_x = 0,
int32_t surface_offset_y = 0) {
this->base_ptr = (mem_dtype)p;
this->surface_width = surface_width;
this->base_ptr = (mem_dtype*)p;
this->surface_width = surface_width * sizeof(dtype);
this->surface_height = surface_height;
this->surface_pitch = surface_pitch;
this->offset_x = surface_offset_x;
this->surface_pitch = surface_pitch * sizeof(dtype);
this->offset_x = surface_offset_x / scale_factor;
this->offset_y = surface_offset_y;

xetla_tdescriptor base_tdesc;
Expand Down Expand Up @@ -1708,11 +1711,12 @@ struct prefetch_payload_t<
reg_layout_>,
num_coop_sg_,
arch_tag_,
std::enable_if_t<(!arch_has_2d_load_store<arch_tag_>)&&(
((block_size_y_ != 1 || tile_size_y_ != 1) &&
mem_layout_ == mem_layout::row_major) ||
((block_size_x_ != 1 || tile_size_x_ != 1) &&
mem_layout_ == mem_layout::col_major))>> {
std::enable_if_t<
(!arch_has_2d_load_store<arch_tag_>) &&
(((block_size_y_ != 1 || tile_size_y_ != 1) &&
mem_layout_ == mem_layout::row_major) ||
((block_size_x_ != 1 || tile_size_x_ != 1) &&
mem_layout_ == mem_layout::col_major))>> {
using dtype = native_type_t<dtype_>;
using mem_desc_t =
mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
Expand Down Expand Up @@ -1968,9 +1972,10 @@ struct prefetch_payload_t<
reg_layout_>,
num_coop_sg_,
arch_tag_,
std::enable_if_t<(arch_has_2d_load_store<arch_tag_>)&&(
((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) ||
((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> {
std::enable_if_t<
(arch_has_2d_load_store<arch_tag_>) &&
(((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) ||
((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> {
using dtype = dtype_;
using mem_desc_t =
mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
Expand Down
67 changes: 53 additions & 14 deletions include/subgroup/tile/impl/store_xe.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -171,10 +171,10 @@ tile_store(tile_t& tile, payload_t& payload) {
st_block_size_y,
L1,
L2>(
(dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
st_blk);
Expand All @@ -194,12 +194,25 @@ tile_store(tile_t& tile, payload_t& payload) {
(block_size_x * arr_len - 1) | ((blk_remained_y - 1) << 8);
gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
xetla_tstore_global<
// xetla_tstore_global<
// dtype,
// blk_remained_elems,
// L1,
// L2,
// payload_t::arch_tag>(tdesc, st_blk);
xetla_store_global<
dtype,
blk_remained_elems,
block_size_x * arr_len,
blk_remained_y,
L1,
L2,
payload_t::arch_tag>(tdesc, st_blk);
L2>(
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
st_blk);
}
}
}
Expand Down Expand Up @@ -244,8 +257,21 @@ tile_store(tile_t& tile, payload_t& payload) {
remained_st_blk_size_y * block_size_x * arr_len;
auto st_blk =
combine_blk.xetla_select<store_elems, 1>(ii * store_elems);
xetla_tstore_global<dtype, store_elems, L1, L2, payload_t::arch_tag>(
tdesc, st_blk);
// xetla_tstore_global<dtype, store_elems, L1, L2, payload_t::arch_tag>(
// tdesc, st_blk);
xetla_store_global<
dtype,
block_size_x * arr_len,
remained_st_blk_size_y,
L1,
L2>(
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
st_blk);
xetla_update_tdesc_offsety(
tdesc.xetla_format<uint32_t>(), remained_st_blk_size_y);
}
Expand All @@ -263,12 +289,25 @@ tile_store(tile_t& tile, payload_t& payload) {
(block_size_x * arr_len - 1) | ((final_st_blk_size_y - 1) << 8);
gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
xetla_tstore_global<
// xetla_tstore_global<
// dtype,
// final_store_elems,
// L1,
// L2,
// payload_t::arch_tag>(tdesc, st_blk);
xetla_store_global<
dtype,
final_store_elems,
block_size_x * arr_len,
final_st_blk_size_y,
L1,
L2,
payload_t::arch_tag>(tdesc, st_blk);
L2>(
payload.base_ptr,
payload.surface_width,
payload.surface_height,
payload.surface_pitch,
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
st_blk);
}
}
}
Expand Down

0 comments on commit f7085fc

Please sign in to comment.