From 304fc64080077287c23045c20d090054548370c6 Mon Sep 17 00:00:00 2001 From: Denis Yaroshevskiy Date: Wed, 24 Jan 2024 19:05:23 +0000 Subject: [PATCH] debug avx2 slide left issue (#1734) --- .../eve/detail/shuffle_v2/simd/x86/idxm.hpp | 49 +++++++++++++++++-- .../detail/shuffle_v2/simd/x86/shuffle_l3.hpp | 34 +++++-------- .../shuffle_v2/simd/x86/shuffle_l4_l5.hpp | 2 +- test/unit/api/regular/shuffle_v2/idxm.cpp | 17 ++++--- .../api/regular/shuffle_v2/slide_left_1.cpp | 9 ++-- 5 files changed, 72 insertions(+), 39 deletions(-) diff --git a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp index 389610b566..38f00cffb8 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp @@ -53,12 +53,13 @@ x86_permute2f128_one_reg_mask(std::span _idxs) constexpr int x86_blend_immediate_mask(std::span idxs, std::ptrdiff_t g) { - int r = 0; - int s = std::ssize(idxs); + int r = 0; + int s = std::ssize(idxs); int pos = 0; - for(auto i : idxs ) + for( auto i : idxs ) { - for (int j = 0; j != g; ++j) { + for( int j = 0; j != g; ++j ) + { // we_ < s if( i * g >= s ) { r |= 1 << pos; } ++pos; @@ -67,4 +68,44 @@ x86_blend_immediate_mask(std::span idxs, std::ptrdiff_t g) return r; } +template +constexpr auto +x86_pshuvb_pattern(const std::array& idxs); + +template +constexpr auto +x86_pshuvb_pattern(std::span idxs) +{ + if constexpr( G != 1 ) return x86_pshuvb_pattern<1>(expand_group(idxs)); + else + { + static_assert(N == 16 || N == 32 || N == 64); + using arr_t = std::array; + using res_t = std::optional; + + arr_t res = {}; + for( std::size_t i = 0; i != N; ++i ) + { + std::ptrdiff_t lb = i / 16 * 16; + std::ptrdiff_t ub = lb + 16; + if( idxs[i] < 0 ) + { + res[i] = 0xff; + continue; + } + if( idxs[i] < lb || idxs[i] > ub ) return res_t {}; + res[i] = idxs[i] - lb; + } + + return res_t {res}; + } +} + +template +constexpr auto +x86_pshuvb_pattern(const std::array& idxs) +{ + return x86_pshuvb_pattern(std::span(idxs)); +} + } diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp index 79e72b7c3c..80c7f9b561 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp @@ -14,21 +14,10 @@ template EVE_FORCEINLINE auto x86_pshuvb(pattern_t, wide x) { - if constexpr( N() == 16 ) - { - wide mask {I...}; - return _mm_shuffle_epi8(x, mask); - } - else if constexpr( N() == 32 ) - { - wide mask {I..., I...}; - return _mm256_shuffle_epi8(x, mask); - } - else - { - wide mask {I..., I..., I..., I...}; - return _mm512_shuffle_epi8(x, mask); - } + wide mask {I...}; + if constexpr( N() == 16 ) return _mm_shuffle_epi8(x, mask); + else if constexpr( N() == 32 ) return _mm256_shuffle_epi8(x, mask); + else return _mm512_shuffle_epi8(x, mask); } template @@ -37,16 +26,15 @@ shuffle_l3_x86_pshuvb(P, fixed, wide x) { if constexpr( current_api < ssse3 ) return no_matching_shuffle; else if constexpr( current_api == avx && P::reg_size == 32 ) return no_matching_shuffle; - else if constexpr( !P::repeated_16 ) return no_matching_shuffle; else { - constexpr auto no_we = idxm::replace_we(*P::repeated_16, 0xff); - constexpr auto no_na = idxm::replace_na(no_we, 0xff); - constexpr auto expanded = idxm::expand_group(no_na); - - using u8xN = wide>; - - return x86_pshuvb(idxm::to_pattern(), eve::bit_cast(x, eve::as {})); + constexpr auto pshuvb_pattern = idxm::x86_pshuvb_pattern(P::idxs); + if constexpr (!pshuvb_pattern) return no_matching_shuffle; + else + { + using u8xN = wide>; + return x86_pshuvb(idxm::to_pattern<*pshuvb_pattern>(), eve::bit_cast(x, eve::as {})); + } } } diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp index fa558db27e..f2b2726575 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp @@ -42,7 +42,7 @@ shuffle_l4_l5_x86_put_u64x2_in_position(P, fixed, wide x) if constexpr( P::reg_size < 32 ) return no; // there is nothing we can do for shorts on avx else if constexpr( P::reg_size == 32 && P::g_size <= 2 && current_api == avx ) return no; - else if constexpr( P::has_zeroes && current_api <= avx2 ) return no; + else if constexpr( P::has_zeroes && current_api < avx2 ) return no; else if constexpr( !P::shuffle_16_first ) return no; else { diff --git a/test/unit/api/regular/shuffle_v2/idxm.cpp b/test/unit/api/regular/shuffle_v2/idxm.cpp index 4765ddbdc4..f270172e40 100644 --- a/test/unit/api/regular/shuffle_v2/idxm.cpp +++ b/test/unit/api/regular/shuffle_v2/idxm.cpp @@ -192,11 +192,11 @@ TTS_CASE("is_repeating_pattern") no_test(std::array {na_, 0, we_, 3}); // 2 registers [0, 1, 2, 3] [4, 5, 6, 7] - yes_test(std::array{0, 4, 2, 6}, std::array{0, 4}); - yes_test(std::array{1, 4, 3, 6}, std::array{1, 4}); - yes_test(std::array{1, 4, 3, we_}, std::array{1, 4}); - yes_test(std::array{1, we_, 3, 6}, std::array{1, 4}); - yes_test(std::array{na_, 4, na_, 6}, std::array{na_, 4}); + yes_test(std::array {0, 4, 2, 6}, std::array {0, 4}); + yes_test(std::array {1, 4, 3, 6}, std::array {1, 4}); + yes_test(std::array {1, 4, 3, we_}, std::array {1, 4}); + yes_test(std::array {1, we_, 3, 6}, std::array {1, 4}); + yes_test(std::array {na_, 4, na_, 6}, std::array {na_, 4}); no_test(std::array {0, 4, 2, 7}); no_test(std::array {0, 3}); @@ -837,7 +837,8 @@ TTS_CASE("put bigger group in position") yes_test(std::array {3, 2, 0, 1}, eve::lane<2>, std::array {1, 0}, std::array {1, 0, 2, 3}); yes_test(std::array {3, 2, na_, 1}, eve::lane<2>, std::array {1, 0}, std::array {1, 0, na_, 3}); yes_test(std::array {3, 2, 3, 2}, eve::lane<2>, std::array {1, 1}, std::array {1, 0, 3, 2}); - yes_test(std::array {3, 2, na_, na_}, eve::lane<2>, std::array {1, we_}, std::array {1, 0, na_, na_}); + yes_test( + std::array {3, 2, na_, na_}, eve::lane<2>, std::array {1, we_}, std::array {1, 0, na_, na_}); yes_test(std::array {3, 2, 0, 1}, eve::lane<4>, std::array {0}, std::array {3, 2, 0, 1}); yes_test(std::array {3, 2, 0, 1}, eve::lane<4>, std::array {0}, std::array {3, 2, 0, 1}); yes_test(std::array {3, 2, 0, 1}, eve::lane<1>, std::array {3, 2, 0, 1}, std::array {0, 1, 2, 3}); @@ -845,6 +846,10 @@ TTS_CASE("put bigger group in position") eve::lane<2>, std::array {1, 3, 3, 0}, std::array {1, 0, 2, 3, 4, 5, 6, 7}); + yes_test(std::array {7, na_, na_, na_, na_, na_, na_, na_}, + eve::lane<4>, + std::array {1, we_}, + std::array {3, na_, na_, na_, na_, na_, na_, na_}); no_test(std::array {3, 0, 0, 1}, eve::lane<2>); }; diff --git a/test/unit/api/regular/shuffle_v2/slide_left_1.cpp b/test/unit/api/regular/shuffle_v2/slide_left_1.cpp index 640b631a78..3c05bc54db 100644 --- a/test/unit/api/regular/shuffle_v2/slide_left_1.cpp +++ b/test/unit/api/regular/shuffle_v2/slide_left_1.cpp @@ -34,11 +34,10 @@ TTS_CASE("Slide left 1, example") { TTS_CASE("Explicit") { using w_i = eve::wide>; w_i x{1, 2, 3, 4, 5, 6, 7, 8}; - constexpr auto na_ = eve::na_; - auto [y, l] = eve::shuffle_v2_core(x, eve::lane<4>, eve::pattern<0, na_>); - //auto y = eve::slide_left2(x, eve::index<4>); - //TTS_EQUAL(y, w_i({8, 0, 0, 0, 0, 0, 0, 0})); - TTS_EQUAL(l(), 2); + //constexpr auto na_ = eve::na_; + auto y = eve::slide_left2(x, eve::index<7>); + TTS_EQUAL(y, w_i({8, 0, 0, 0, 0, 0, 0, 0})); + //TTS_EQUAL(l(), 2); }; #endif