diff --git a/include/eve/detail/shuffle_v2/idxm.hpp b/include/eve/detail/shuffle_v2/idxm.hpp index 9cceabb84b..38dc406438 100644 --- a/include/eve/detail/shuffle_v2/idxm.hpp +++ b/include/eve/detail/shuffle_v2/idxm.hpp @@ -246,26 +246,30 @@ fix_indexes_to_fundamental(const std::array& p, } constexpr bool -shuffle_within_halves(std::span idxs) +shuffle_within_n(std::span idxs, std::ptrdiff_t n) { const std::ptrdiff_t ssize = std::ssize(idxs); - const std::ptrdiff_t half = ssize / 2; - - if( ssize % 2 ) return false; - - for( std::ptrdiff_t i = 0; i != half; ++i ) - { - if( idxs[i] >= half ) return false; - } - - for( std::ptrdiff_t i = half; i != ssize; ++i ) - { - if( 0 <= idxs[i] && idxs[i] < half ) return false; + if (ssize % n) return false; + + for (std::ptrdiff_t part = 0; part != ssize; part += n) { + std::ptrdiff_t ub = part + n; + for (std::ptrdiff_t i = part; i != ub; ++i) { + std::ptrdiff_t x = idxs[i]; + if (x < 0) continue; + if (x < part) return false; + if (x >= ub) return false; + } } return true; } +constexpr bool +shuffle_within_halves(std::span idxs) +{ + return shuffle_within_n(idxs, std::ssize(idxs) / 2); +} + template constexpr auto shuffle_halves_independetly(const std::array& p) @@ -849,6 +853,33 @@ is_slide_right(std::span idxs) return (m - idxs.data()) - *m; } +constexpr std::optional +is_slide_left_2(std::span idxs, std::ptrdiff_t reg_groups) +{ + if (idxs.empty()) return std::nullopt; + const auto *f = idxs.data(); + const auto *l = idxs.data() + idxs.size(); + const auto *start2 = l; + + while (*--start2 == we_); + + std::ptrdiff_t slide = 0; + + // start2 is not l by construction + if (*start2 >= reg_groups) { + slide = l - start2 + (*start2 - reg_groups); + } else { + start2 = l; + slide = 0; + } + + start2 = l - slide; + + if (!is_in_order_from({start2, l}, reg_groups)) return std::nullopt; + if (!is_in_order_from({f, start2}, slide)) return std::nullopt; + return slide; +} + template constexpr std::optional> slide_as_slide2_with_0(std::span idxs) diff --git a/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp b/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp index 93f8519830..33949bbe61 100644 --- a/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp +++ b/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp @@ -61,6 +61,7 @@ struct expanded_pattern_t : pattern_t static constexpr std::ptrdiff_t e_t_size = (int)sizeof(eve::element_type_t); static constexpr std::ptrdiff_t g_size = e_t_size * G; static constexpr std::ptrdiff_t reg_size = T::size() * e_t_size; + static constexpr std::ptrdiff_t reg_groups = T::size() / G; static constexpr std::ptrdiff_t out_reg_size = idxs.size() * g_size; static constexpr bool has_zeroes = idxm::has_zeroes(idxs); diff --git a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp index 530a016c3a..b2668aa9f4 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp @@ -75,8 +75,7 @@ x86_blend_immediate_mask(std::span idxs, std::ptrdiff_t g) } template -constexpr auto -x86_pshuvb_pattern(const std::array& idxs); +constexpr auto x86_pshuvb_pattern(const std::array& idxs); template constexpr auto @@ -89,6 +88,8 @@ x86_pshuvb_pattern(std::span idxs) using arr_t = std::array; using res_t = std::optional; + if( !shuffle_within_n(idxs, 16) ) return res_t {}; + arr_t res = {}; for( std::size_t i = 0; i != N; ++i ) { @@ -116,20 +117,49 @@ x86_pshuvb_pattern(const std::array& idxs) template constexpr std::optional -mm512_shuffle_i64x2_idx(std::array idxs) { - if constexpr (N == 2) return mm512_shuffle_i64x2_idx(expand_group<2>(idxs)); - else if constexpr ( N > 4 ) { +mm512_shuffle_i64x2_idx(std::array idxs) +{ + if constexpr( N == 2 ) return mm512_shuffle_i64x2_idx(expand_group<2>(idxs)); + else if constexpr( N > 4 ) + { auto upscaled = upscale_pattern(idxs); - if (upscaled) return mm512_shuffle_i64x2_idx(*upscaled); + if( upscaled ) return mm512_shuffle_i64x2_idx(*upscaled); else return std::nullopt; - } else { - if (idxs[0] >= 4 || idxs[1] >= 4) return std::nullopt; - if (idxs[2] < 4 || idxs[3] < 4) return std::nullopt; - idxs[2]-=4; - idxs[3]-=4; + } + else + { + if( idxs[0] >= 4 || idxs[1] >= 4 ) return std::nullopt; + if( idxs[2] < 4 || idxs[3] < 4 ) return std::nullopt; + idxs[2] -= 4; + idxs[3] -= 4; return idxs[0] | (idxs[1] << 2) | (idxs[2] << 4) | (idxs[3] << 6); } +} + +template +constexpr auto +slide_2_left_in_16_pattern(std::ptrdiff_t g_size, std::ptrdiff_t slide) +{ + std::ptrdiff_t n16 = 16 / g_size; + std::array res = {}; + + for( std::ptrdiff_t i = 0; i != N; i += n16 ) + { + for( std::ptrdiff_t j = 0; j != n16; ++j ) + { + std::ptrdiff_t full = i + j; + if( j < (n16 - slide) ) res[full] = full + slide; + else res[full] = full + N - (n16 - slide); + } + } + return res; +} +template +constexpr auto +slide_by_16_then_alignr(const std::array& idxs, std::ptrdiff_t g_size) +{ + return slide_by_16_then_alignr(std::span(idxs), g_size); } } diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp index 3ab9a5c272..12d2f45d59 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp @@ -420,7 +420,7 @@ shuffle_l2_x86_within_128x2_alignr(P, fixed, wide x, wide y) else if constexpr( current_api < ssse3 ) return no_matching_shuffle; else { - constexpr auto starts_from = idxm::is_in_order(*P::repeated_16); + constexpr auto starts_from = idxm::is_slide_left_2(*P::repeated_16, P::reg_groups); if constexpr( !starts_from ) return no_matching_shuffle; else diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp index c49698ce03..4d6558dd51 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp @@ -28,7 +28,7 @@ shuffle_l3_x86_pshuvb(P, fixed, wide x) else if constexpr( current_api == avx && P::reg_size == 32 ) return no_matching_shuffle; else { - constexpr auto pshuvb_pattern = idxm::x86_pshuvb_pattern(P::idxs); + constexpr auto pshuvb_pattern = idxm::x86_pshuvb_pattern(P::idxs); if constexpr( !pshuvb_pattern ) return no_matching_shuffle; else { diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp index c4815d53f1..abb1b5bb60 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp @@ -48,13 +48,34 @@ shuffle_l4_l5_x86_put_u64x2_in_position(P, fixed, wide x) { constexpr auto p0 = get<0>(*P::shuffle_16in16); constexpr auto p1 = get<1>(*P::shuffle_16in16); - auto [r0, l0] = shuffle_v2_core(x, eve::lane, idxm::to_pattern()); - auto [r1, l1] = shuffle_v2_core(r0, eve::lane, idxm::to_pattern()); + auto [r0, l0] = shuffle_v2_core(x, eve::lane, idxm::to_pattern()); + auto [r1, l1] = shuffle_v2_core(r0, eve::lane, idxm::to_pattern()); return kumi::tuple {r1, idxm::add_shuffle_levels(l0, l1)}; } } +template +EVE_FORCEINLINE auto +shuffle_l4_l5_x86_slide_less_than_16(P, fixed, wide x) +{ + constexpr auto no = kumi::tuple {no_matching_shuffle, eve::index<-1>}; + // Coudn't figure out how to generalize well + // only slide left for now + // No masking 0s on avx512 + if constexpr( current_api < avx2 ) return no; + else if constexpr ( constexpr auto slide = idxm::is_slide_left(P::idxs) ) + { + static_assert(G == 1, "verifying assumptions"); + constexpr auto alignr_p = idxm::slide_2_left_in_16_pattern(P::g_size, *slide); + + wide y = shuffle_l<2>(x, lane<16 / sizeof(T)>, pattern<1, na_>); + + return kumi::tuple{shuffle_l<2>(x, y, idxm::to_pattern()), index<4>}; + } + else return no; +} + template EVE_FORCEINLINE auto shuffle_l4_l5_(EVE_SUPPORTS(sse2_), P p, fixed g, wide x) @@ -65,7 +86,8 @@ requires(P::out_reg_size == P::reg_size) { return r; } - else if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); matched_shuffle(r))> ) + else if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); + matched_shuffle(r))> ) { return r; } @@ -74,6 +96,11 @@ requires(P::out_reg_size == P::reg_size) { return r; } + else if constexpr( auto r = shuffle_l4_l5_x86_slide_less_than_16(p, g, x); + matched_shuffle(r))> ) + { + return r; + } else return kumi::tuple {no_matching_shuffle, eve::index<-1>}; } @@ -82,11 +109,12 @@ EVE_FORCEINLINE auto shuffle_l4_l5_(EVE_SUPPORTS(avx512_), P p, fixed g, logical> x) requires(P::out_reg_size == P::reg_size) { - if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); matched_shuffle(r))> ) - { - return r; - } - else return kumi::tuple {no_matching_shuffle, eve::index<-1>}; + if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); + matched_shuffle(r))> ) + { + return r; + } + else return kumi::tuple {no_matching_shuffle, eve::index<-1>}; } } diff --git a/test/unit/api/regular/shuffle_v2/idxm.cpp b/test/unit/api/regular/shuffle_v2/idxm.cpp index 7cd11b736d..4df02a2fe3 100644 --- a/test/unit/api/regular/shuffle_v2/idxm.cpp +++ b/test/unit/api/regular/shuffle_v2/idxm.cpp @@ -156,6 +156,8 @@ TTS_CASE("shuffle_within_halves") auto in = to_idxs(_in); bool actual = eve::detail::idxm::shuffle_within_halves(in); TTS_EQUAL(expected, actual) << tts::as_string(in); + actual = eve::detail::idxm::shuffle_within_n(in, std::ssize(in) / 2); + TTS_EQUAL(expected, actual) << tts::as_string(in); }; test(std::array {0, 1, we_, we_}, true); @@ -166,6 +168,23 @@ TTS_CASE("shuffle_within_halves") test(std::array {3, we_, we_, we_}, false); }; +TTS_CASE("shuffle_within_n") +{ + auto test = [](auto _in, std::ptrdiff_t n, bool expected) + { + auto in = to_idxs(_in); + bool actual = eve::detail::idxm::shuffle_within_n(in, n); + TTS_EQUAL(expected, actual) << tts::as_string(in); + }; + + test(std::array {0, 1, 3, 2, 5, 4, 7, 6}, 2, true); + test(std::array {0, we_, 3, na_, 5, na_, 7, 6}, 2, true); + test(std::array {0, 3, 1, 2, 5, 4, 7, 6}, 2, false); + test(std::array {0, 1, 3, 2, 5, 4, 7, 0}, 2, false); + test(std::array {0, 1, 3, 2, 5, 4, 7, na_}, 2, true); + test(std::array {1, 2, 3, 4, 5, 6, 7, na_}, 4, false); +}; + TTS_CASE("shuffle_halves_independetly") { auto test = [](auto _in, auto _expected_lo, auto _expected_hi) @@ -780,6 +799,32 @@ TTS_CASE("is_slide_right") test(std::array {na_, na_, 1, 2}, -1); }; +TTS_CASE("is_slide_left2") +{ + auto test = [](auto _in, int reg_groups, int expected) + { + auto in = to_idxs(_in); + auto actual = eve::detail::idxm::is_slide_left_2(in, reg_groups).value_or(-1); + TTS_EQUAL(expected, actual) << tts::as_string(in); + }; + + test(std::array {0, 1, 2, 3}, 4, 0); // not slide 2, result is unimportant + test(std::array {1, 2, 3, 4}, 4, 1); + test(std::array {1, 2, 3, 16}, 16, 1); + test(std::array {1, we_, 3, 4}, 4, 1); + test(std::array {1, we_, 3, 16}, 16, 1); + test(std::array {2, 3, 4, we_}, 4, 2); + test(std::array {2, 3, 16, we_}, 16, 2); + // {2, 3, we_, we_} - is not an important case because it's not slide2 at all. + test(std::array {2, 3, we_, 17}, 16, 2); + test(std::array {3, 4, 5, 6}, 4, 3); + test(std::array {4, 5, 6, 7}, 4, 4); + test(std::array {1, 2, na_, 4}, 4, -1); + test(std::array {1, 2, 3, na_}, 4, -1); + test(std::array {2, we_, 3, 16}, 16, -1); +}; + + TTS_CASE("slide_as_slide2_with_0") { auto yes_test = [](auto _in, auto _expected)