diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp index b9cfa2542..ed75cf935 100644 --- a/include/common/core/memory.hpp +++ b/include/common/core/memory.hpp @@ -477,16 +477,15 @@ __XETLA_API xetla_vector xetla_load_global( return ret.xetla_format(); } else if constexpr (BlockWidth * sizeof(T) < sizeof(uint32_t)) { constexpr auto scale_factor = sizeof(uint32_t) / sizeof(T); - xetla_vector ret = __ESIMD_ENS::lsc_load_2d< + xetla_vector ret = xetla_load_global< uint32_t, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed, - gpu::xetla::detail::get_cache_hint(L1H), - gpu::xetla::detail::get_cache_hint(L2H), - N>( + L1H, + L2H>( reinterpret_cast(Ptr), SurfaceWidth, SurfaceHeight, @@ -505,7 +504,7 @@ __XETLA_API xetla_vector xetla_load_global( Transformed, gpu::xetla::detail::get_cache_hint(L1H), gpu::xetla::detail::get_cache_hint(L2H), - N>(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y); + N>(Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y); } } @@ -788,7 +787,7 @@ __XETLA_API void xetla_store_global( BlockHeight, gpu::xetla::detail::get_cache_hint(L1H), gpu::xetla::detail::get_cache_hint(L2H)>( - Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals); + Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y, Vals); } } /// template (i * num_block_x, 0); detail::reset_tile_desc_core< @@ -191,6 +192,7 @@ tile_load(tile_t& tile, payload_t& payload) { mem_transpose>(payload_row); #pragma unroll for (uint32_t j = 0; j < num_block_x; j += arr_len) { + uint32_t offset_x = j * block_size_x; xetla_tdescriptor tdesc = payload_row.row(j); auto reg_blk = tile.reg.xetla_select( (i * num_block_x + j) * block_elems); @@ -201,21 +203,13 @@ tile_load(tile_t& tile, payload_t& payload) { xetla_vector reg_tmp; #pragma unroll for (uint32_t ii = 0; ii < block_size_y / ld_blk_size_y; ++ii) { + offset_y += ld_blk_size_y; constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len; - // reg_tmp.xetla_format>() = - // xetla_tload_global< - // load_dtype, - // ld_blk_height * block_size_x * arr_len / scale_factor, - // L1, - // L2, - // trans, - // mem_transform, - // arch_tag>(tdesc); reg_tmp.xetla_format>() = xetla_load_global< native_type_t, - (mem_transpose ? ld_blk_size_y : block_size_x) / scale_factor, - (mem_transpose ? block_size_x : ld_blk_size_y), + (trans ? ld_blk_size_y : block_size_x) / scale_factor, + (trans ? block_size_x : ld_blk_size_y), // block_size_x / scale_factor, // ld_blk_size_y, arr_len, @@ -223,13 +217,15 @@ tile_load(tile_t& tile, payload_t& payload) { mem_transform, L1, L2>( - (native_type_t*)::gpu::xetla::detail:: - xetla_get_tensor_base_address(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc)); + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, + mem_transpose + // ? ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc) + ? (payload.offset_x + offset_y / scale_factor) + : (payload.offset_x + offset_x / scale_factor), + payload.offset_y + (mem_transpose ? offset_x : offset_y)); if constexpr (reg_transpose && trans) { reg_blk.xetla_select(ii * load_elems) .xetla_format>() = @@ -246,7 +242,6 @@ tile_load(tile_t& tile, payload_t& payload) { } else { reg_blk.xetla_select(ii * tmp_size) = reg_tmp; } - if constexpr (mem_transpose) { xetla_update_tdesc_offsetx( tdesc.xetla_format(), ld_blk_size_y / scale_factor); @@ -284,11 +279,10 @@ tile_load(tile_t& tile, payload_t& payload) { mem_transform, L1, L2>( - (native_type_t*)::gpu::xetla::detail:: - xetla_get_tensor_base_address(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc), + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc)); // xetla_tload_global< @@ -345,11 +339,10 @@ tile_load(tile_t& tile, payload_t& payload) { mem_transform, L1, L2>( - (native_type_t*)::gpu::xetla::detail:: - xetla_get_tensor_base_address(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc), + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc)); // xetla_tload_global< @@ -412,11 +405,10 @@ tile_load(tile_t& tile, payload_t& payload) { mem_transform, L1, L2>( - (native_type_t*)::gpu::xetla::detail:: - xetla_get_tensor_base_address(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc), + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc)); // xetla_tload_global< diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp index ea47e9e15..408e79d18 100644 --- a/include/subgroup/tile/impl/payload_xe.hpp +++ b/include/subgroup/tile/impl/payload_xe.hpp @@ -78,11 +78,11 @@ struct mem_payload_t< static constexpr bool mem_transform = (sizeof(dtype) <= 2) && !trans && (register_layout == reg_layout::vnni_tiled || register_layout == reg_layout::vnni_tiled_col_major); - static constexpr bool mem_dword_qword_transpose = + static constexpr bool mem_transpose_dtype_less4bytes = (sizeof(dtype) < 4) && trans; using mem_dtype = typename std:: - conditional::type; + conditional_t; static constexpr uint32_t scale_factor = sizeof(mem_dtype) / sizeof(dtype); mem_dtype* base_ptr; uint32_t surface_width; @@ -111,8 +111,9 @@ struct mem_payload_t< this->surface_height = (mem_transpose ? mem_desc.shape.x : mem_desc.shape.y); this->surface_pitch = mem_desc.shape.stride * sizeof(dtype); - this->offset_x = mem_desc.coord.x; - this->offset_y = mem_desc.coord.y; + this->offset_x = + (mem_transpose ? mem_desc.coord.y : mem_desc.coord.x) / scale_factor; + this->offset_y = mem_transpose ? mem_desc.coord.x : mem_desc.coord.y; xetla_tdescriptor base_tdesc = mem_desc.get_tdesc(); int32_t offset = gpu::xetla::detail::xetla_get_tensor_offset_x(base_tdesc) / @@ -130,13 +131,14 @@ struct mem_payload_t< int32_t surface_offset_x = 0, int32_t surface_offset_y = 0) { this->base_ptr = (mem_dtype*)p; - this->surface_width = surface_width; + this->surface_width = surface_width * sizeof(dtype); this->surface_height = surface_height; - this->surface_pitch = surface_pitch; - this->offset_x = surface_offset_x; + this->surface_pitch = surface_pitch * sizeof(dtype); + this->offset_x = surface_offset_x / scale_factor; this->offset_y = surface_offset_y; xetla_tdescriptor base_tdesc; + xetla_fill_tdesc( base_tdesc.xetla_format(), p, @@ -155,8 +157,9 @@ struct mem_payload_t< this->surface_height = (mem_transpose ? mem_desc.shape.x : mem_desc.shape.y); this->surface_pitch = mem_desc.shape.stride * sizeof(dtype); - this->offset_x = mem_desc.coord.x; - this->offset_y = mem_desc.coord.y; + this->offset_x = + (mem_transpose ? mem_desc.coord.y : mem_desc.coord.x) / scale_factor; + this->offset_y = (mem_transpose ? mem_desc.coord.x : mem_desc.coord.y); xetla_tdescriptor base_tdesc = mem_desc.get_tdesc(); int32_t offset = gpu::xetla::detail::xetla_get_tensor_offset_x(base_tdesc) / @@ -181,11 +184,11 @@ struct mem_payload_t< uint32_t surface_pitch, int32_t surface_offset_x = 0, int32_t surface_offset_y = 0) { - this->base_ptr = (mem_dtype)p; - this->surface_width = surface_width; + this->base_ptr = (mem_dtype*)p; + this->surface_width = surface_width * sizeof(dtype); this->surface_height = surface_height; - this->surface_pitch = surface_pitch; - this->offset_x = surface_offset_x; + this->surface_pitch = surface_pitch * sizeof(dtype); + this->offset_x = surface_offset_x / scale_factor; this->offset_y = surface_offset_y; xetla_tdescriptor base_tdesc; @@ -1708,11 +1711,12 @@ struct prefetch_payload_t< reg_layout_>, num_coop_sg_, arch_tag_, - std::enable_if_t<(!arch_has_2d_load_store)&&( - ((block_size_y_ != 1 || tile_size_y_ != 1) && - mem_layout_ == mem_layout::row_major) || - ((block_size_x_ != 1 || tile_size_x_ != 1) && - mem_layout_ == mem_layout::col_major))>> { + std::enable_if_t< + (!arch_has_2d_load_store) && + (((block_size_y_ != 1 || tile_size_y_ != 1) && + mem_layout_ == mem_layout::row_major) || + ((block_size_x_ != 1 || tile_size_x_ != 1) && + mem_layout_ == mem_layout::col_major))>> { using dtype = native_type_t; using mem_desc_t = mem_desc_t; @@ -1968,9 +1972,10 @@ struct prefetch_payload_t< reg_layout_>, num_coop_sg_, arch_tag_, - std::enable_if_t<(arch_has_2d_load_store)&&( - ((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) || - ((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> { + std::enable_if_t< + (arch_has_2d_load_store) && + (((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) || + ((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> { using dtype = dtype_; using mem_desc_t = mem_desc_t; diff --git a/include/subgroup/tile/impl/store_xe.hpp b/include/subgroup/tile/impl/store_xe.hpp index 813052455..83f07653b 100644 --- a/include/subgroup/tile/impl/store_xe.hpp +++ b/include/subgroup/tile/impl/store_xe.hpp @@ -171,10 +171,10 @@ tile_store(tile_t& tile, payload_t& payload) { st_block_size_y, L1, L2>( - (dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc), - ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc), + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc), st_blk); @@ -194,12 +194,25 @@ tile_store(tile_t& tile, payload_t& payload) { (block_size_x * arr_len - 1) | ((blk_remained_y - 1) << 8); gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen( tdesc.xetla_format(), block_widthx_widthy_arrlen); - xetla_tstore_global< + // xetla_tstore_global< + // dtype, + // blk_remained_elems, + // L1, + // L2, + // payload_t::arch_tag>(tdesc, st_blk); + xetla_store_global< dtype, - blk_remained_elems, + block_size_x * arr_len, + blk_remained_y, L1, - L2, - payload_t::arch_tag>(tdesc, st_blk); + L2>( + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, + ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), + ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc), + st_blk); } } } @@ -244,8 +257,21 @@ tile_store(tile_t& tile, payload_t& payload) { remained_st_blk_size_y * block_size_x * arr_len; auto st_blk = combine_blk.xetla_select(ii * store_elems); - xetla_tstore_global( - tdesc, st_blk); + // xetla_tstore_global( + // tdesc, st_blk); + xetla_store_global< + dtype, + block_size_x * arr_len, + remained_st_blk_size_y, + L1, + L2>( + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, + ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), + ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc), + st_blk); xetla_update_tdesc_offsety( tdesc.xetla_format(), remained_st_blk_size_y); } @@ -263,12 +289,25 @@ tile_store(tile_t& tile, payload_t& payload) { (block_size_x * arr_len - 1) | ((final_st_blk_size_y - 1) << 8); gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen( tdesc.xetla_format(), block_widthx_widthy_arrlen); - xetla_tstore_global< + // xetla_tstore_global< + // dtype, + // final_store_elems, + // L1, + // L2, + // payload_t::arch_tag>(tdesc, st_blk); + xetla_store_global< dtype, - final_store_elems, + block_size_x * arr_len, + final_st_blk_size_y, L1, - L2, - payload_t::arch_tag>(tdesc, st_blk); + L2>( + payload.base_ptr, + payload.surface_width, + payload.surface_height, + payload.surface_pitch, + ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc), + ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc), + st_blk); } } }