From 19c43486800fc3656278a552ab41301bcc5e0dd3 Mon Sep 17 00:00:00 2001 From: Denis Yaroshevskiy Date: Fri, 18 Aug 2023 11:12:31 +0100 Subject: [PATCH] naming some shuffles (#1642) --- include/eve/detail/function/reduce.hpp | 4 +- .../simd/common/deinterleave_groups.hpp | 1 - include/eve/detail/shuffle_v2/idxm.hpp | 74 ++++++- .../shuffle_v2/native_shuffle_helpers.hpp | 12 ++ include/eve/detail/shuffle_v2/shuffle_l2.hpp | 4 + .../eve/detail/shuffle_v2/shuffle_l6_l7.hpp | 2 + .../eve/detail/shuffle_v2/shuffle_v2_fwd.hpp | 2 +- .../shuffle_v2/simd/arm/neon/shuffle_l2.hpp | 39 +++- .../shuffle_v2/simd/arm/neon/shuffle_l3.hpp | 24 +++ .../shuffle_v2/simd/arm/sve/shuffle_l2.hpp | 40 ++++ .../shuffle_v2/simd/common/shuffle_l6_l7.hpp | 41 ++++ .../simd/common/shuffle_l_fallback.hpp | 14 ++ .../detail/shuffle_v2/simd/ppc/shuffle_l3.hpp | 24 +++ .../eve/detail/shuffle_v2/simd/x86/idxm.hpp | 13 +- .../detail/shuffle_v2/simd/x86/shuffle_l2.hpp | 71 ++++--- .../detail/shuffle_v2/simd/x86/shuffle_l3.hpp | 28 +++ .../simd/x86/shuffle_l_fallback.hpp | 35 +++- include/eve/module/core.hpp | 8 +- .../find_optimized_shuffle_pattern.hpp | 32 ++- .../eve/module/core/named_shuffles/blend.hpp | 141 +++++++++++++ .../eve/module/core/named_shuffles/core.hpp | 11 + .../named_shuffles/named_shuffle_utils.hpp} | 30 ++- .../swap_adjacent.hpp} | 79 +------ include/eve/module/core/regular/core.hpp | 1 - .../eve/module/core/regular/has_equal_in.hpp | 2 +- .../eve/module/core/regular/impl/reduce.hpp | 2 +- .../eve/module/core/regular/impl/rotate.hpp | 4 +- .../arm/neon/deinterleave_groups_shuffle.hpp | 4 +- .../regular/impl/simd/arm/neon/reverse.hpp | 10 +- .../simd/arm/neon/swap_adjacent_groups.hpp | 64 ------ .../impl/simd/ppc/swap_adjacent_groups.hpp | 29 --- .../simd/x86/deinterleave_groups_shuffle.hpp | 4 +- .../core/regular/impl/simd/x86/reverse.hpp | 6 +- .../impl/simd/x86/swap_adjacent_groups.hpp | 194 ------------------ .../regular/impl/swap_adjacent_groups.hpp | 88 -------- test/doc/CMakeLists.txt | 17 +- test/doc/core/named_shuffles/blend.cpp | 22 ++ .../doc/core/named_shuffles/swap_adjacent.cpp | 15 ++ .../doc/core/regular/swap_adjacent_groups.cpp | 24 --- test/unit/api/regular/shuffle_v2/blend.cpp | 41 ++++ test/unit/api/regular/shuffle_v2/idxm.cpp | 44 ++++ .../regular/shuffle_v2/shuffle_v2_test.hpp | 124 +++++++---- .../regular/swizzle/swap_adjacent_groups.cpp | 47 ----- .../tuple/swizzle/swap_adjacent_groups.cpp | 51 ----- test/unit/internals/optimize_pattern.cpp | 48 ++--- 45 files changed, 865 insertions(+), 705 deletions(-) create mode 100644 include/eve/detail/shuffle_v2/simd/arm/sve/shuffle_l2.hpp create mode 100644 include/eve/detail/shuffle_v2/simd/common/shuffle_l6_l7.hpp create mode 100644 include/eve/module/core/named_shuffles/blend.hpp create mode 100644 include/eve/module/core/named_shuffles/core.hpp rename include/eve/{detail/shuffle_v2/named_shuffle.hpp => module/core/named_shuffles/named_shuffle_utils.hpp} (60%) rename include/eve/module/core/{regular/swap_adjacent_groups.hpp => named_shuffles/swap_adjacent.hpp} (53%) delete mode 100644 include/eve/module/core/regular/impl/simd/arm/neon/swap_adjacent_groups.hpp delete mode 100644 include/eve/module/core/regular/impl/simd/ppc/swap_adjacent_groups.hpp delete mode 100644 include/eve/module/core/regular/impl/simd/x86/swap_adjacent_groups.hpp delete mode 100644 include/eve/module/core/regular/impl/swap_adjacent_groups.hpp create mode 100644 test/doc/core/named_shuffles/blend.cpp create mode 100644 test/doc/core/named_shuffles/swap_adjacent.cpp delete mode 100644 test/doc/core/regular/swap_adjacent_groups.cpp create mode 100644 test/unit/api/regular/shuffle_v2/blend.cpp delete mode 100644 test/unit/api/regular/swizzle/swap_adjacent_groups.cpp delete mode 100644 test/unit/api/tuple/swizzle/swap_adjacent_groups.cpp diff --git a/include/eve/detail/function/reduce.hpp b/include/eve/detail/function/reduce.hpp index 04972b1f32..f7dcdeecfb 100644 --- a/include/eve/detail/function/reduce.hpp +++ b/include/eve/detail/function/reduce.hpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include namespace eve::detail @@ -29,7 +29,7 @@ namespace eve::detail return [&](std::index_sequence) mutable { - ((v = f(v,swap_adjacent_groups(v, fixed<(1<{} ))),...); + ((v = f(v,swap_adjacent(v, fixed<(1<{} ))),...); return v; }(std::make_index_sequence{}); } diff --git a/include/eve/detail/function/simd/common/deinterleave_groups.hpp b/include/eve/detail/function/simd/common/deinterleave_groups.hpp index 52e67d4c67..8cc420320e 100644 --- a/include/eve/detail/function/simd/common/deinterleave_groups.hpp +++ b/include/eve/detail/function/simd/common/deinterleave_groups.hpp @@ -9,7 +9,6 @@ #include #include -#include #include namespace eve::detail diff --git a/include/eve/detail/shuffle_v2/idxm.hpp b/include/eve/detail/shuffle_v2/idxm.hpp index db28d334ea..af785e7c2e 100644 --- a/include/eve/detail/shuffle_v2/idxm.hpp +++ b/include/eve/detail/shuffle_v2/idxm.hpp @@ -309,9 +309,7 @@ constexpr auto repeated_pattern_of_size = [] { std::optional> res; constexpr auto repeated = idxm::reduce_repeated_pattern_until_impl(); - if constexpr (repeated.size() == target) { - res = repeated; - } + if constexpr( repeated.size() == target ) { res = repeated; } return res; } }(); @@ -617,6 +615,56 @@ replace_na(const std::array& idxs, std::ptrdiff_t with) return replace_na(std::span(idxs), with); } +template +constexpr auto +just_second_shuffle(std::span idxs, std::ptrdiff_t with) +{ + std::array res = {}; + + auto s = (std::ptrdiff_t)N; + + for( std::size_t i = 0; i != N; ++i ) + { + auto in = idxs[i]; + if( 0 <= in && in < s ) res[i] = with; + else if ( in < 0) res[i] = in; + else res[i] = in - s; + } + return res; +} + +template +constexpr auto +just_second_shuffle(const std::array& idxs, std::ptrdiff_t with) +{ + return just_second_shuffle(std::span(idxs), with); +} + +template +constexpr auto +just_first_shuffle(std::span idxs, std::ptrdiff_t with) +{ + std::array res = {}; + + auto s = (std::ptrdiff_t)N; + + for( std::size_t i = 0; i != N; ++i ) + { + auto in = idxs[i]; + if( in >= s ) res[i] = with; + else res[i] = in; + } + + return res; +} + +template +constexpr auto +just_first_shuffle(const std::array& idxs, std::ptrdiff_t with) +{ + return just_first_shuffle(std::span(idxs), with); +} + constexpr bool is_blend(std::span idxs, std::ptrdiff_t cardinal) { @@ -814,6 +862,26 @@ split_to_groups(const std::array& idxs) return split_to_groups(std::span(idxs)); } +constexpr auto add_shuffle_levels(std::span ls) { + std::ptrdiff_t base = 0; + std::ptrdiff_t use_masks = 0; + + for (auto l : ls) { + base += l & (~1); + use_masks |= l & 1; + } + return base + use_masks; +} + +constexpr auto add_shuffle_levels(std::array ls) { + return add_shuffle_levels(std::span(ls)); +} + +template +constexpr auto add_shuffle_levels(eve::index_t... ) { + return index; +} + } // namespace eve::detail::idxm #if defined(EVE_INCLUDE_X86_HEADER) diff --git a/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp b/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp index 5145571fea..2d8d3f97c5 100644 --- a/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp +++ b/include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp @@ -52,9 +52,21 @@ struct expanded_pattern_t : pattern_t static constexpr auto repeated_8 = idxm::repeated_pattern_of_size<8 / g_size, I...>; static constexpr auto repeated_16 = idxm::repeated_pattern_of_size<16 / g_size, I...>; static constexpr auto repeated_32 = idxm::repeated_pattern_of_size<32 / g_size, I...>; + + static constexpr std::array xy_swapped = idxm::swap_xy(idxs, std::ssize(idxs)); }; template constexpr expanded_pattern_t expanded_pattern; +template +EVE_FORCEINLINE auto +shuffle_2_using_or(pattern_t, fixed g, T x, T y) +{ + constexpr std::array idxs{I...}; + auto [x_, xl] = shuffle_v2_core(x, g, idxm::to_pattern()); + auto [y_, yl] = shuffle_v2_core(y, g, idxm::to_pattern()); + return kumi::tuple{ x_ | y_, idxm::add_shuffle_levels(xl, yl, eve::index<2>) }; +} + } diff --git a/include/eve/detail/shuffle_v2/shuffle_l2.hpp b/include/eve/detail/shuffle_v2/shuffle_l2.hpp index 5048ac1542..cf10c959bb 100644 --- a/include/eve/detail/shuffle_v2/shuffle_l2.hpp +++ b/include/eve/detail/shuffle_v2/shuffle_l2.hpp @@ -32,3 +32,7 @@ EVE_CALLABLE_API(shuffle_l2_, shuffle_l2) #if defined(EVE_INCLUDE_ARM_HEADER) # include #endif + +#if defined(EVE_INCLUDE_SVE_HEADER) +# include +#endif diff --git a/include/eve/detail/shuffle_v2/shuffle_l6_l7.hpp b/include/eve/detail/shuffle_v2/shuffle_l6_l7.hpp index 34c46856cb..f87f8ad3e2 100644 --- a/include/eve/detail/shuffle_v2/shuffle_l6_l7.hpp +++ b/include/eve/detail/shuffle_v2/shuffle_l6_l7.hpp @@ -26,6 +26,8 @@ namespace detail EVE_CALLABLE_API(shuffle_l6_l7_, shuffle_l6_l7) } +#include + #if defined(EVE_INCLUDE_X86_HEADER) # include #endif diff --git a/include/eve/detail/shuffle_v2/shuffle_v2_fwd.hpp b/include/eve/detail/shuffle_v2/shuffle_v2_fwd.hpp index e49c35a8a2..774d9b50f0 100644 --- a/include/eve/detail/shuffle_v2/shuffle_v2_fwd.hpp +++ b/include/eve/detail/shuffle_v2/shuffle_v2_fwd.hpp @@ -86,7 +86,7 @@ namespace eve //! If to produce 4 results we need to apply it 4 times it doesn't become more complex, //! you just have more work to do. //! From a practicle stand point, we want to be able to say: -//! "swap_adjacent_groups is at most level 5", and not "5 * number of output registers" +//! "swap_adjacent is at most level 5", and not "5 * number of output registers" //! //! If it proves to be important to accumulate all shuffles, we will change it in the future. //! @} diff --git a/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l2.hpp b/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l2.hpp index 61011c31b0..107f8b2d55 100644 --- a/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l2.hpp +++ b/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l2.hpp @@ -65,10 +65,10 @@ vcopy_lane(eve::wide x, eve::index_t, eve::wide y, eve::index_t< } else { - if constexpr( sizeof(T) == 8 ) return vcopy_laneq_u64(x, To, y, From); - else if constexpr( sizeof(T) == 4 ) return vcopy_laneq_u32(x, To, y, From); - else if constexpr( sizeof(T) == 2 ) return vcopy_laneq_u16(x, To, y, From); - else return vcopy_laneq_u8(x, To, y, From); + if constexpr( sizeof(T) == 8 ) return vcopyq_laneq_u64(x, To, y, From); + else if constexpr( sizeof(T) == 4 ) return vcopyq_laneq_u32(x, To, y, From); + else if constexpr( sizeof(T) == 2 ) return vcopyq_laneq_u16(x, To, y, From); + else return vcopyq_laneq_u8(x, To, y, From); } } @@ -215,4 +215,35 @@ requires(P::out_reg_size == P::reg_size) else return no_matching_shuffle_t {}; } +template +EVE_FORCEINLINE auto +shuffle_l2_neon_copy_lane_other(P, fixed, wide x, wide y) +{ + constexpr auto to_from0 = idxm::is_just_setting_one_lane(P::idxs); + constexpr auto to_from1 = idxm::is_just_setting_one_lane(P::xy_swapped); + + if constexpr( current_api < asimd ) return no_matching_shuffle_t {}; + else if constexpr( to_from0 ) + { + return vcopy_lane(x, eve::index<(*to_from0)[0]>, y, eve::index<(*to_from0)[1] - N::value * G>); + } + else if constexpr( to_from1 ) + { + return vcopy_lane(y, eve::index<(*to_from1)[0]>, x, eve::index<(*to_from1)[1] - N::value * G>); + } + else return no_matching_shuffle_t {}; +} + +template +EVE_FORCEINLINE auto +shuffle_l2_(EVE_SUPPORTS(neon128_), P p, fixed g, wide x, wide y) +requires(P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l2_neon_copy_lane_other(p, g, x, y); matched_shuffle ) + { + return r; + } + else return no_matching_shuffle_t {}; +} + } diff --git a/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l3.hpp b/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l3.hpp index e6573f5778..80a858bb6a 100644 --- a/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l3.hpp +++ b/include/eve/detail/shuffle_v2/simd/arm/neon/shuffle_l3.hpp @@ -7,6 +7,8 @@ //================================================================================================== #pragma once +#include + namespace eve::detail { @@ -56,6 +58,19 @@ shuffle_l3_neon_tbl(P, fixed, wide x) } } + +template +EVE_FORCEINLINE auto +shuffle_l3_neon_bit_select(P, fixed, wide x, wide y) +{ + if constexpr ( !idxm::is_blend(P::idxs, N::value / G) ) return no_matching_shuffle_t{}; + else + { + eve::logical> m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + return eve::bit_select(m, y, x); + } +} + template EVE_FORCEINLINE auto shuffle_l3_(EVE_SUPPORTS(neon128_), P p, fixed g, wide x) @@ -66,4 +81,13 @@ shuffle_l3_(EVE_SUPPORTS(neon128_), P p, fixed g, wide x) else return no_matching_shuffle_t {}; } +template +EVE_FORCEINLINE auto +shuffle_l3_(EVE_SUPPORTS(neon128_), P p, fixed g, wide x, wide y) + requires(P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l3_neon_bit_select(p, g, x, y); matched_shuffle ) return r; + else return no_matching_shuffle_t {}; +} + } diff --git a/include/eve/detail/shuffle_v2/simd/arm/sve/shuffle_l2.hpp b/include/eve/detail/shuffle_v2/simd/arm/sve/shuffle_l2.hpp new file mode 100644 index 0000000000..52299cb105 --- /dev/null +++ b/include/eve/detail/shuffle_v2/simd/arm/sve/shuffle_l2.hpp @@ -0,0 +1,40 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include + +namespace eve::detail +{ + +template +EVE_FORCEINLINE auto +shuffle_l2_sve_blend(P, fixed, wide x, wide y) +{ + // here using idxs, not idxs2match, no zeroing blend on sve + if constexpr( !idxm::is_blend(P::idxs, N::value / G) ) return no_matching_shuffle; + else + { + eve::logical> m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + return wide{svsel(m, y, x)}; + } +} + +template +EVE_FORCEINLINE auto +shuffle_l2_(EVE_SUPPORTS(sve_), P p, fixed g, wide x, wide y) +requires(P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l2_sve_blend(p, g, x, y); matched_shuffle ) + { + return r; + } + else return no_matching_shuffle_t {}; +} + +} diff --git a/include/eve/detail/shuffle_v2/simd/common/shuffle_l6_l7.hpp b/include/eve/detail/shuffle_v2/simd/common/shuffle_l6_l7.hpp new file mode 100644 index 0000000000..fc94c37197 --- /dev/null +++ b/include/eve/detail/shuffle_v2/simd/common/shuffle_l6_l7.hpp @@ -0,0 +1,41 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +namespace eve::detail +{ + +template +EVE_FORCEINLINE auto +shuffle_l6_l7_blend_nonwide_logicals(P, fixed, T x, T y) +{ + if constexpr( !idxm::is_blend(P::idxs, T::size() / G) ) + { + return kumi::tuple {no_matching_shuffle, eve::index<-1>}; + } + else + { + T m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + return kumi::tuple {(x && !m) || (y && m), eve::index<6>}; + } +} + +// common non wide logical thinking +template +EVE_FORCEINLINE auto +shuffle_l6_l7_(EVE_SUPPORTS(cpu_), P p, fixed g, T x, T y) +{ + if constexpr( auto r = shuffle_l6_l7_blend_nonwide_logicals(p, g, x, y); + matched_shuffle(r))> ) + { + return r; + } + else return no_matching_shuffle; +} + +} diff --git a/include/eve/detail/shuffle_v2/simd/common/shuffle_l_fallback.hpp b/include/eve/detail/shuffle_v2/simd/common/shuffle_l_fallback.hpp index 43b0fe6955..f40aa2510c 100644 --- a/include/eve/detail/shuffle_v2/simd/common/shuffle_l_fallback.hpp +++ b/include/eve/detail/shuffle_v2/simd/common/shuffle_l_fallback.hpp @@ -20,4 +20,18 @@ requires(!abi_t::is_wide_logical) && requires { shuffle_v2_core(x.bits(), // to mask and from mask + 2 each return kumi::tuple {to_logical(shuffled), eve::index}; } + + +template +EVE_FORCEINLINE auto +shuffle_l_fallback_(EVE_SUPPORTS(cpu_), pattern_t p, fixed g, T x, T y) +{ + // sse2 has no blend. Other should try blend/no blend options. + if constexpr ( eve::current_api < eve::sse4_1 ) + { + return shuffle_2_using_or(p, g, x, y); + } + else return kumi::tuple{no_matching_shuffle, eve::index<-1>}; +} + } diff --git a/include/eve/detail/shuffle_v2/simd/ppc/shuffle_l3.hpp b/include/eve/detail/shuffle_v2/simd/ppc/shuffle_l3.hpp index 6221e03e1c..3d7ef25860 100644 --- a/include/eve/detail/shuffle_v2/simd/ppc/shuffle_l3.hpp +++ b/include/eve/detail/shuffle_v2/simd/ppc/shuffle_l3.hpp @@ -49,6 +49,30 @@ shuffle_l3_(EVE_SUPPORTS(vmx_), P p, fixed g, wide x) } } +template +EVE_FORCEINLINE auto +shuffle_l3_ppc_vec_sel(P, fixed, wide x, wide y) +{ + if constexpr( !idxm::is_blend(P::idxs, N::value / G) ) return no_matching_shuffle; + else + { + eve::logical> m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + return vec_sel(x.storage(), y.storage(), m.storage()); + } +} + +template +EVE_FORCEINLINE auto +shuffle_l3_(EVE_SUPPORTS(vmx_), P p, fixed g, wide x, wide y) +requires(P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l3_ppc_vec_sel(p, g, x, y); matched_shuffle ) + { + return r; + } + else return no_matching_shuffle_t {}; +} + } // return vec_perm(what.storage(), what.storage(), pattern.storage()); diff --git a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp index e7ab696cdb..389610b566 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/idxm.hpp @@ -51,15 +51,18 @@ x86_permute2f128_one_reg_mask(std::span _idxs) } constexpr int -x86_blend_immediate_mask(std::span idxs) +x86_blend_immediate_mask(std::span idxs, std::ptrdiff_t g) { int r = 0; int s = std::ssize(idxs); - for( int pos = 0; auto i : idxs ) + int pos = 0; + for(auto i : idxs ) { - // we_ < s - if( i >= s ) { r |= 1 << pos; } - ++pos; + for (int j = 0; j != g; ++j) { + // we_ < s + if( i * g >= s ) { r |= 1 << pos; } + ++pos; + } } return r; } diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp index 0fd349a2f7..9f3861fdac 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp @@ -7,6 +7,8 @@ //================================================================================================== #pragma once +#include + namespace eve::detail { @@ -100,6 +102,7 @@ EVE_FORCEINLINE auto shuffle_l2_x86_within_128_alignr(P, fixed, wide x) { if constexpr( current_api == avx && P::reg_size == 32 ) return no_matching_shuffle; + else if constexpr( current_api < ssse3 ) return no_matching_shuffle; else { constexpr auto rotation = idxm::is_rotate(*P::repeated_16); @@ -154,7 +157,7 @@ shuffle_l2_x86_128_insert_one_zero(P, fixed, wide x) { constexpr auto pos = eve::detail::idxm::is_just_setting_one_zero(P::idxs2match); if constexpr( !pos ) return no_matching_shuffle; - else if constexpr ( P::reg_size == 16 ) + else if constexpr( P::reg_size == 16 ) { constexpr int m = *pos; @@ -215,7 +218,7 @@ EVE_FORCEINLINE auto shuffle_l2_x86_u64x2(P p, fixed g, wide x) { if constexpr( P::g_size < 16 ) return no_matching_shuffle; - else if constexpr ( P::reg_size == 32) + else if constexpr( P::reg_size == 32 ) { constexpr int mm = idxm::x86_permute2f128_one_reg_mask(P::idxs); return _mm256_permute2f128_si256(x, x, mm); @@ -251,11 +254,10 @@ requires std::same_as, x86_128_> && (P::out_reg_size == 16) else return no_matching_shuffle; } - template EVE_FORCEINLINE auto shuffle_l2_(EVE_SUPPORTS(avx_), P p, fixed g, wide x) -requires (P::out_reg_size == P::reg_size) +requires(P::out_reg_size == P::reg_size) { if constexpr( auto r = shuffle_l2_x86_within_128(p, g, x); matched_shuffle ) { @@ -279,39 +281,56 @@ requires (P::out_reg_size == P::reg_size) template EVE_FORCEINLINE auto -shuffle_l2_(EVE_SUPPORTS(sse2_), P, fixed, wide x, wide y) -requires std::same_as, x86_128_> && (P::out_reg_size == P::reg_size) +shuffle_l2_x86_blend(P, fixed, wide x, wide y) { // Immediate blends are very good, even if are covered by other ops // https://stackoverflow.com/questions/76552874/how-should-i-chose-between-mm-move-sd-mm-shuffle-pd-mm-blend-pd // - // FIX-1617 - enable `_mm_blend_epi16` - if constexpr( eve::current_api >= eve::sse4_1 && sizeof(T) >= 4 - && idxm::is_blend(P::idxs2match, N::value / G) ) + + // here using idxs, not idxs2match, no zeroing blend on avx512 + if constexpr( !idxm::is_blend(P::idxs, N::value / G) ) return no_matching_shuffle; + else if constexpr( P::reg_size <= 32 && P::g_size >= 4 ) { - constexpr int m = idxm::x86_blend_immediate_mask(P::idxs2match); + constexpr int m = idxm::x86_blend_immediate_mask(P::idxs, G); - if constexpr( sizeof(T) == 8 ) - { - auto x_f64 = bit_cast(x, eve::as>> {}); - auto y_f64 = bit_cast(y, eve::as>> {}); - return _mm_blend_pd(x_f64, y_f64, m); - } - else if constexpr( sizeof(T) == 4 ) + if constexpr( eve::current_api >= eve::sse4_1 && P::g_size >= 8 ) { - auto x_f32 = bit_cast(x, eve::as>> {}); - auto y_f32 = bit_cast(y, eve::as>> {}); - return _mm_blend_ps(x_f32, y_f32, m); + auto x_f64 = bit_cast(x, eve::as> {}); + auto y_f64 = bit_cast(y, eve::as> {}); + + if constexpr( P::reg_size == 16 ) return _mm_blend_pd(x_f64, y_f64, m); + else return _mm256_blend_pd(x_f64, y_f64, m); } -#if 0 // FIX-1617 - enable `_mm_blend_epi16` - else + else if constexpr( eve::current_api >= eve::sse4_1 && P::g_size >= 4 ) { - // - return _mm_blend_epi16(x, y, m); + auto x_f32 = bit_cast(x, eve::as> {}); + auto y_f32 = bit_cast(y, eve::as> {}); + + if constexpr( P::reg_size == 16 ) return _mm_blend_ps(x_f32, y_f32, m); + else return _mm256_blend_ps(x_f32, y_f32, m); } -#endif + // FIX-1617 - enable `_mm_blend_epi16` + else return no_matching_shuffle; + } + else if constexpr( eve::current_api >= avx512 ) + { + // On avx512 we don't count logical masks + eve::logical> m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + return eve::if_else(m, y, x); + } + else return no_matching_shuffle; +} + +template +EVE_FORCEINLINE auto +shuffle_l2_(EVE_SUPPORTS(sse2_), P p, fixed g, wide x, wide y) +requires (P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l2_x86_blend(p, g, x, y); matched_shuffle ) + { + return r; } - else if constexpr( sizeof(T) == 8 ) + else if constexpr( sizeof(T) == 8 && P::reg_size == 16 ) { // half from x, half from y // No w/e or zeroes are possible here diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp index 23702ddb9c..b17eea971a 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp @@ -63,4 +63,32 @@ requires(P::out_reg_size == P::reg_size) else return no_matching_shuffle; } +template +EVE_FORCEINLINE auto +shuffle_l3_x86_blendv(P, fixed, wide x, wide y) +{ + // avx512 should not be considered here at all + if constexpr( current_api >= avx512 ) return no_matching_shuffle; + else if constexpr( current_api < eve::sse4_1 || (current_api == eve::avx && P::reg_size == 32) ) + { + return no_matching_shuffle; + } + else if constexpr( !idxm::is_blend(P::idxs, N::value / G) ) return no_matching_shuffle; + else + { + eve::logical> m([](int i, int size) { return P::idxs[i / G] >= size / G; }); + if constexpr( P::reg_size == 16 ) return _mm_blendv_epi8(x, y, m.bits()); + else return _mm256_blendv_epi8(x, y, m.bits()); + } +} + +template +EVE_FORCEINLINE auto +shuffle_l3_(EVE_SUPPORTS(sse2_), P p, fixed g, wide x, wide y) +requires(P::out_reg_size == P::reg_size) +{ + if constexpr( auto r = shuffle_l3_x86_blendv(p, g, x, y); matched_shuffle ) return r; + else return no_matching_shuffle; +} + } diff --git a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l_fallback.hpp b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l_fallback.hpp index 7b595d9fc1..e801bfeea6 100644 --- a/include/eve/detail/shuffle_v2/simd/x86/shuffle_l_fallback.hpp +++ b/include/eve/detail/shuffle_v2/simd/x86/shuffle_l_fallback.hpp @@ -33,9 +33,40 @@ requires std::same_as, x86_256_> && (P::out_reg_size == P::reg_size) x = wide {shuffled_lo, shuffled_hi}; - auto l = eve::index; + // 4 to extract + combine + return kumi::tuple {x, idxm::add_shuffle_levels(lo_l, hi_l, eve::index<4>)}; + } + else return kumi::tuple {no_matching_shuffle, eve::index<-1>}; +} + +template +EVE_FORCEINLINE auto +shuffle_l_fallback_(EVE_SUPPORTS(avx_), P p, fixed g, wide x, wide y) +requires std::same_as, x86_256_> && (P::out_reg_size == P::reg_size) + && (P::g_size <= 2) && (current_api == avx) +{ + auto x_01 = x.slice(); + auto y_01 = y.slice(); + auto p_01 = idxm::slice_pattern(p); + + auto x0 = get<0>(x_01); + auto x1 = get<1>(x_01); + auto y0 = get<0>(y_01); + auto y1 = get<1>(y_01); + auto p0 = get<0>(p_01); + auto p1 = get<1>(p_01); + + if constexpr( + requires { shuffle_v2_core(x0, x1, y0, y1, g, p0); } + && requires { shuffle_v2_core(x0, x1, y0, y1, g, p1); } ) + { + auto [shuffled0, l0] = shuffle_v2_core(x0, x1, y0, y1, g, p0); + auto [shuffled1, l1] = shuffle_v2_core(x0, x1, y0, y1, g, p1); + + x = wide {shuffled0, shuffled1}; - return kumi::tuple {x, l}; + // 4 to extract + combine + return kumi::tuple {x, idxm::add_shuffle_levels(l0, l1, eve::index<4>)}; } else return kumi::tuple {no_matching_shuffle, eve::index<-1>}; } diff --git a/include/eve/module/core.hpp b/include/eve/module/core.hpp index 77c26d3a52..4f981af7d4 100644 --- a/include/eve/module/core.hpp +++ b/include/eve/module/core.hpp @@ -88,14 +88,20 @@ //! @defgroup core_simd SIMD Specific Operations //! @ingroup core //! Proper SIMD operations as shuffling, splitting and merging SIMD vectors +//! +//! @defgroup core_named_shuffles Named Shuffles +//! @ingroup core +//! Functions that are just shuffles with a different api. +//! //! @} //================================================================================================== #include #include #include -#include #include +#include +#include #include #include #include diff --git a/include/eve/module/core/detail/generic/find_optimized_shuffle_pattern.hpp b/include/eve/module/core/detail/generic/find_optimized_shuffle_pattern.hpp index 2a4c8026e9..ec14bc6f99 100644 --- a/include/eve/module/core/detail/generic/find_optimized_shuffle_pattern.hpp +++ b/include/eve/module/core/detail/generic/find_optimized_shuffle_pattern.hpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -100,6 +100,34 @@ template struct bound } }; +// Part time migration to shuffle_v2 + +template +inline constexpr auto swap_adjacent_groups_pattern = fix_pattern( + [](auto i, auto) + { + if constexpr( G != N && G != 0 ) return (i + G) % (G * 2) + (G * 2) * (i / (G * 2)); + else return i; + }); + +template +inline constexpr auto is_swag = []() +{ + // List all possible swags for a current size + constexpr auto sz = sizeof...(I); + constexpr auto x = [](std::index_sequence) + { + return kumi::make_tuple(swap_adjacent_groups_pattern...); + } + (std::make_index_sequence {}); + + // Find the fitting one + constexpr auto idx = detail::find_index(pattern, x); + return fixed {}; +}(); + +// --------------------------------- + //================================================================================================ // Look to see if a given pattern is optimizable and returns the optimized function object //================================================================================================ @@ -115,7 +143,7 @@ find_optimized_shuffle_pattern() else if constexpr( is_identity ) return bound {identity_swizzle {}, p}; else if constexpr( is_swag != sz ) { - return bound {swap_adjacent_groups, is_swag}; + return bound {swap_adjacent, is_swag}; } else if constexpr( constexpr auto st = is_broadcast_group ) { diff --git a/include/eve/module/core/named_shuffles/blend.hpp b/include/eve/module/core/named_shuffles/blend.hpp new file mode 100644 index 0000000000..4fc9ff51ac --- /dev/null +++ b/include/eve/module/core/named_shuffles/blend.hpp @@ -0,0 +1,141 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include + +namespace eve +{ + +//================================================================================================ +//! @addtogroup core_named_shuffles +//! @{ +//! @var blend +//! @brief a named shuffle for mixing 2 registers together, without changing positions. +//! @note You might be looking for `eve::if_else_`. +//! +//! (TODO: support mixing more than 2 registers) +//! +//! Accepts a pattern with just 0 and 1 that indicates number of the element for the current +//! slot +//! +//! blend([0, 1, 2, 3], [4, 5, 6, 7], pattern<1, 0, 0, 1>) -> [4, 1, 2, 7]. +//! +//! As any named shuffle, allows to specify a group size. +//! +//! @code +//! template // (1) +//! T blend(T x, T y, fixed, pattern_t) +//! template +//! T blend(T x, T y, fixed, pattern_formula auto gen) +//! +//! template // (2) +//! T blend(T x, T y, pattern_t) +//! template +//! T blend(T x, T y, pattern_formula auto gen) +//! @endcode +//! +//! (2) calls (1) with G == 1 +//! (1) blends groups of elements from 2 registers, +//! I == 0 -> take a group from x +//! I == 1 -> take a group from y +//! +//! **Return value** +//! result of the shuffle +//! +//! @groupheader{Example} +//! +//! @godbolt{doc/core/named_shuffles/blend.cpp} +//! @} +//================================================================================================ +struct blend_t +{ + template + static constexpr auto pattern(eve::as, eve::as, eve::fixed, pattern_t) + { + static_assert(((0 <= I && I <= 1) && ...), "pattern for blend has to only contain 0 and 1"); + static_assert(pattern_t::size() * G == T::size(), "pattern has wrong number of elements"); + + return eve::fix_pattern( + [](int i, int size) + { + constexpr std::array idxs {I...}; + return idxs[i] * size + i % size; + }); + } + + template + static constexpr std::ptrdiff_t level(eve::as, eve::as, eve::fixed g, pattern_t p) + { + if constexpr( sizeof...(I) == 1 ) return 0; + else if constexpr( eve::has_aggregated_abi_v ) + { + using half_t = decltype(T {}.slice(lower_)); + + auto [p0, p1] = detail::idxm::slice_pattern::size() / 2>(p); + + return std::max(level(as {}, as {}, g, p0), + level(as {}, as {}, g, p1)); + } + + if( ((I == 0) && ...) ) return 0; + if( ((I == 1) && ...) ) return 0; + + if( current_api >= sve ) return logical_simd_value ? 6 : 2; + if( current_api >= avx512 ) return logical_simd_value ? 6 : 2; + if( current_api >= vmx ) return 3; + + const std::ptrdiff_t g_size = sizeof(element_type_t) * G; + const std::size_t reg_size = sizeof(element_type_t) * T::size(); + const std::size_t count_from_x = ((I == 0) + ...); + const std::size_t count_from_y = ((I == 1) + ...); + + if( current_api >= neon ) + { + if( current_api >= asimd && (count_from_x == 1 || count_from_y == 1) ) return 2; + return 3; + } + + if( current_api >= sse2 ) + { + if constexpr( current_api == avx && reg_size >= 32 && g_size <= 2 ) + { + using half_t = decltype(T {}.slice(lower_)); + auto [p0, p1] = detail::idxm::slice_pattern::size() / 2>(p); + auto l0 = level(as {}, as {}, g, p0); + auto l1 = level(as {}, as {}, g, p1); + return detail::idxm::add_shuffle_levels(std::array {l0, l1, 4}); + } + if( current_api >= sse4_1 ) return g_size >= 4 ? 2 : 3; + + if( g_size >= 8 ) return 2; + if( g_size == 2 && reg_size == 4 ) return 6; + return 7; + } + + return 2; + } + + template + static constexpr auto + pattern(eve::as tgt, eve::as, eve::fixed g, pattern_formula auto gen) + { + return pattern(tgt, tgt, g, fix_pattern(gen)); + } + + template + static constexpr auto level(eve::as tgt, eve::as, eve::fixed g, pattern_formula auto gen) + { + return level(tgt, tgt, g, fix_pattern(gen)); + } +}; + +inline constexpr auto blend = detail::named_shuffle_2 {}; + +} diff --git a/include/eve/module/core/named_shuffles/core.hpp b/include/eve/module/core/named_shuffles/core.hpp new file mode 100644 index 0000000000..5bf1cc06d0 --- /dev/null +++ b/include/eve/module/core/named_shuffles/core.hpp @@ -0,0 +1,11 @@ +//================================================================================================== +/** + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +**/ +//================================================================================================== +#pragma once + +#include +#include diff --git a/include/eve/detail/shuffle_v2/named_shuffle.hpp b/include/eve/module/core/named_shuffles/named_shuffle_utils.hpp similarity index 60% rename from include/eve/detail/shuffle_v2/named_shuffle.hpp rename to include/eve/module/core/named_shuffles/named_shuffle_utils.hpp index e84fdf9820..60769b816a 100644 --- a/include/eve/detail/shuffle_v2/named_shuffle.hpp +++ b/include/eve/module/core/named_shuffles/named_shuffle_utils.hpp @@ -5,6 +5,7 @@ SPDX-License-Identifier: BSL-1.0 **/ //================================================================================================== +#pragma once #include @@ -43,21 +44,36 @@ template struct named_shuffle_1 : Name { template EVE_FORCEINLINE auto operator()(T x, eve::fixed g, Args... args) const - requires requires { Name::pattern(eve::as {}, g, args...); } + requires requires { Name::pattern(eve::as {}, g, args...); } { - if constexpr (G <= 0) - { - static_assert(G > 0, "Group size <= 0 is not supported"); - } - else return shuffle_v2_core(x, g, Name::pattern(eve::as {}, g, args...)); + if constexpr( G <= 0 ) { static_assert(G > 0, "Group size <= 0 is not supported"); } + else return shuffle_v2(x, g, Name::pattern(eve::as {}, g, args...)); } template EVE_FORCEINLINE auto operator()(T x, Args... args) const - -> decltype(operator()(x, eve::lane<1>, args...)) + requires requires { Name::pattern(eve::as {}, eve::lane<1>, args...); } { return operator()(x, eve::lane<1>, args...); } }; +template struct named_shuffle_2 : Name +{ + template + EVE_FORCEINLINE auto operator()(T x, T y, eve::fixed g, Args... args) const + requires requires { Name::pattern(eve::as {}, eve::as {}, g, args...); } + { + if constexpr( G <= 0 ) { static_assert(G > 0, "Group size <= 0 is not supported"); } + else return shuffle_v2(x, y, g, Name::pattern(eve::as {}, eve::as {}, g, args...)); + } + + template + EVE_FORCEINLINE auto operator()(T x, T y, Args... args) const + requires requires { Name::pattern(eve::as {}, eve::as {}, eve::lane<1>, args...); } + { + return operator()(x, y, eve::lane<1>, args...); + } +}; + } diff --git a/include/eve/module/core/regular/swap_adjacent_groups.hpp b/include/eve/module/core/named_shuffles/swap_adjacent.hpp similarity index 53% rename from include/eve/module/core/regular/swap_adjacent_groups.hpp rename to include/eve/module/core/named_shuffles/swap_adjacent.hpp index 70eee4fd91..578211a2a7 100644 --- a/include/eve/module/core/regular/swap_adjacent_groups.hpp +++ b/include/eve/module/core/named_shuffles/swap_adjacent.hpp @@ -7,26 +7,27 @@ //================================================================================================== #pragma once -#include +#include #include namespace eve { //================================================================================================ -//! @addtogroup core +//! @addtogroup core_named_shuffles //! @{ //! @var swap_adjacent -//! @brief goes through all pairs of elements and them: [0, 1, 2, 3] => [1,0,3, 2] +//! @brief a named shuffle that goes all pairs of elements and swaps them: +//! [0, 1, 2, 3] => [1, 0, 3, 2] //! Can do that for group sizes instead of elements. -//! If group size is equal to the register size - that's a noop. +//! Group size has to be 0 < G < T::size() //! //! @code //! template -//! T swap_adjacent(T x, fixed const &) noexcept; // (1) +//! T swap_adjacent(T x, fixed) // (1) //! //! template -//! T swap_adjacent(T x) noexcept; // (2) +//! T swap_adjacent(T x) // (2) //! @endcode //! //! (2) calls (1) with G == 1; @@ -35,10 +36,10 @@ namespace eve //! **Return value** //! //! Return x where groups with contiguous groups of N elements swapped -//! for N = 0 or cardinal_v acts as noop. +//! //! @groupheader{Example} //! -//! @godbolt{doc/core/regular/swap_adjacent_groups.cpp} +//! @godbolt{doc/core/named_shuffles/swap_adjacent.cpp} //! @} //================================================================================================ struct swap_adjacent_t @@ -71,7 +72,7 @@ struct swap_adjacent_t std::ptrdiff_t half_l = level(eve::as{}, g); // since we are adding, we need to deal with aggregation if (reg_size > 32) return half_l; - return 2 * half_l + 4; + return detail::idxm::add_shuffle_levels({half_l, half_l, 4}); } if( current_api >= sse2 ) @@ -97,64 +98,4 @@ struct swap_adjacent_t inline constexpr auto swap_adjacent = detail::named_shuffle_1 {}; -//================================================================================================ -//! @addtogroup core -//! @{ -//! @var swap_adjacent_groups -//! @brief swap adjacent groups of elements of chosen number. -//! -//! **Defined in Header** -//! -//! @code -//! #include -//! @endcode -//! -//! @groupheader{Callable Signatures} -//! -//! @code -//! namespace eve -//! { -//! template -//! T swap_adjacent groups(T x, fixed const &) noexcept; -//! @endcode -//! -//! **Parameters** -//! -//! * `x` : [argument](@ref eve::integral_value). -//! * `N` : number of elements in group -//! -//! **Return value** -//! -//! Return x where groups with contiguous groups of N elements swapped -//! for N = 0 or cardinal_v acts as noop. -//! -//! @groupheader{Example} -//! -//! @godbolt{doc/core/regular/swap_adjacent_groups.cpp} -//! -//! @} -//================================================================================================ -EVE_MAKE_CALLABLE(swap_adjacent_groups_, swap_adjacent_groups); - -template -inline constexpr auto swap_adjacent_groups_pattern = fix_pattern( - [](auto i, auto) - { - if constexpr( G != N && G != 0 ) return (i + G) % (G * 2) + (G * 2) * (i / (G * 2)); - else return i; - }); } - -#include - -#if defined(EVE_INCLUDE_X86_HEADER) -# include -#endif - -#if defined(EVE_INCLUDE_ARM_HEADER) -# include -#endif - -#if defined(EVE_INCLUDE_POWERPC_HEADER) -# include -#endif diff --git a/include/eve/module/core/regular/core.hpp b/include/eve/module/core/regular/core.hpp index 5a7af0b5e1..fdd9b5e62f 100644 --- a/include/eve/module/core/regular/core.hpp +++ b/include/eve/module/core/regular/core.hpp @@ -211,7 +211,6 @@ #include #include #include -#include #include #include #include diff --git a/include/eve/module/core/regular/has_equal_in.hpp b/include/eve/module/core/regular/has_equal_in.hpp index 76b8a0197f..115be7506e 100644 --- a/include/eve/module/core/regular/has_equal_in.hpp +++ b/include/eve/module/core/regular/has_equal_in.hpp @@ -14,7 +14,7 @@ namespace eve { //================================================================================================ - //! @addtogroup core + //! @addtogroup core_simd //! @{ //! @var has_equal_in //! diff --git a/include/eve/module/core/regular/impl/reduce.hpp b/include/eve/module/core/regular/impl/reduce.hpp index fcb41189f5..4f23491021 100644 --- a/include/eve/module/core/regular/impl/reduce.hpp +++ b/include/eve/module/core/regular/impl/reduce.hpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include namespace eve::detail { diff --git a/include/eve/module/core/regular/impl/rotate.hpp b/include/eve/module/core/regular/impl/rotate.hpp index 863fad9041..15b2c97458 100644 --- a/include/eve/module/core/regular/impl/rotate.hpp +++ b/include/eve/module/core/regular/impl/rotate.hpp @@ -7,7 +7,7 @@ //================================================================================================== #pragma once -#include +#include #include namespace eve::detail @@ -24,7 +24,7 @@ EVE_FORCEINLINE T rotate_(EVE_SUPPORTS(cpu_), T x, index_t) requires (M <= T::size()) { if constexpr ( M == T::size() || M == 0 ) return x; - else if constexpr ( M == T::size() / 2 ) return swap_adjacent_groups(x, eve::lane); + else if constexpr ( M == T::size() / 2 ) return swap_adjacent(x, eve::lane); else if constexpr ( is_bundle_v ) { return T(kumi::map(rotate_lambda{}, x)); diff --git a/include/eve/module/core/regular/impl/simd/arm/neon/deinterleave_groups_shuffle.hpp b/include/eve/module/core/regular/impl/simd/arm/neon/deinterleave_groups_shuffle.hpp index c0001f62e3..231ed7e2b3 100644 --- a/include/eve/module/core/regular/impl/simd/arm/neon/deinterleave_groups_shuffle.hpp +++ b/include/eve/module/core/regular/impl/simd/arm/neon/deinterleave_groups_shuffle.hpp @@ -8,7 +8,7 @@ #pragma once #include -#include +#include namespace eve::detail { @@ -48,7 +48,7 @@ template else { // 1 rev instruction - auto swapped = swap_adjacent_groups(v, lane<1>); + auto swapped = swap_adjacent(v); // sizeof(T) == 4 ========================= if constexpr( c == category::float32x4 ) return vuzp1q_f32(v, swapped); diff --git a/include/eve/module/core/regular/impl/simd/arm/neon/reverse.hpp b/include/eve/module/core/regular/impl/simd/arm/neon/reverse.hpp index 82eb43f0b3..836dfbf056 100644 --- a/include/eve/module/core/regular/impl/simd/arm/neon/reverse.hpp +++ b/include/eve/module/core/regular/impl/simd/arm/neon/reverse.hpp @@ -8,7 +8,7 @@ #pragma once #include -#include +#include namespace eve::detail { @@ -23,10 +23,10 @@ reverse_(EVE_SUPPORTS(neon128_), wide v) noexcept requires arm_abi= 8 && one_instruction_basic_shuffle ) return reverse_(EVE_RETARGET(cpu_), v); else { - if constexpr( N() >= 16 ) v = eve::swap_adjacent_groups(v, lane<8>); - if constexpr( N() >= 8 ) v = eve::swap_adjacent_groups(v, lane<4>); - if constexpr( N() >= 4 ) v = eve::swap_adjacent_groups(v, lane<2>); - if constexpr( N() >= 2 ) v = eve::swap_adjacent_groups(v, lane<1>); + if constexpr( N() >= 16 ) v = eve::swap_adjacent(v, lane<8>); + if constexpr( N() >= 8 ) v = eve::swap_adjacent(v, lane<4>); + if constexpr( N() >= 4 ) v = eve::swap_adjacent(v, lane<2>); + if constexpr( N() >= 2 ) v = eve::swap_adjacent(v, lane<1>); return v; } } diff --git a/include/eve/module/core/regular/impl/simd/arm/neon/swap_adjacent_groups.hpp b/include/eve/module/core/regular/impl/simd/arm/neon/swap_adjacent_groups.hpp deleted file mode 100644 index f970845b29..0000000000 --- a/include/eve/module/core/regular/impl/simd/arm/neon/swap_adjacent_groups.hpp +++ /dev/null @@ -1,64 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include - -namespace eve::detail -{ -template - EVE_FORCEINLINE wide - swap_adjacent_groups_(EVE_SUPPORTS(neon128_), wide v, fixed) noexcept - requires(G <= N::value) - && arm_abi> -{ - using that_t = wide; - - if constexpr( G == N::value || G == 0) { return v; } - else - { - constexpr auto c = categorize(); - - if constexpr( G == 1 ) - { - if constexpr( c == category::int64x2 ) return vextq_s64(v, v, 1); - else if constexpr( c == category::uint64x2 ) return vextq_u64(v, v, 1); - else if constexpr( c == category::float64x1 ) return v; - else if constexpr( c == category::int64x1 ) return v; - else if constexpr( c == category::uint64x1 ) return v; - else if constexpr( c == category::float32x4 ) return vrev64q_f32(v); - else if constexpr( c == category::int32x4 ) return vrev64q_s32(v); - else if constexpr( c == category::uint32x4 ) return vrev64q_u32(v); - else if constexpr( c == category::float32x2 ) return vrev64_f32(v); - else if constexpr( c == category::int32x2 ) return vrev64_s32(v); - else if constexpr( c == category::uint32x2 ) return vrev64_u32(v); - else if constexpr( c == category::int16x8 ) return vrev32q_s16(v); - else if constexpr( c == category::uint16x8 ) return vrev32q_u16(v); - else if constexpr( c == category::int16x4 ) return vrev32_s16(v); - else if constexpr( c == category::uint16x4 ) return vrev32_u16(v); - else if constexpr( c == category::int8x16 ) return vrev16q_s8(v); - else if constexpr( c == category::uint8x16 ) return vrev16q_u8(v); - else if constexpr( c == category::int8x8 ) return vrev16_s8(v); - else if constexpr( c == category::uint8x8 ) return vrev16_u8(v); - else if constexpr( c == category::float64x2 ) - { - if constexpr( current_api >= asimd ) return vextq_f64(v, v, 1); - else return that_t(v.get(1), v.get(0)); - } - } - else - { - using up_t = upgrade_t; - auto const up = bit_cast(v, as>()); - return bit_cast(swap_adjacent_groups(up, fixed {}), as(v)); - } - } -} -} diff --git a/include/eve/module/core/regular/impl/simd/ppc/swap_adjacent_groups.hpp b/include/eve/module/core/regular/impl/simd/ppc/swap_adjacent_groups.hpp deleted file mode 100644 index eee94bcd94..0000000000 --- a/include/eve/module/core/regular/impl/simd/ppc/swap_adjacent_groups.hpp +++ /dev/null @@ -1,29 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include - -namespace eve::detail -{ -template - EVE_FORCEINLINE wide - swap_adjacent_groups_(EVE_SUPPORTS(vmx_), wide v, fixed) noexcept - requires(G <= N::value) - && ppc_abi> -{ - if constexpr( G == N::value || G == 0) { return v; } - else - { - /// TODO: Maybe there's better than just vec_perm ?? - return basic_shuffle(v, swap_adjacent_groups_pattern); - } -} -} diff --git a/include/eve/module/core/regular/impl/simd/x86/deinterleave_groups_shuffle.hpp b/include/eve/module/core/regular/impl/simd/x86/deinterleave_groups_shuffle.hpp index 4ea696f515..265f60db2c 100644 --- a/include/eve/module/core/regular/impl/simd/x86/deinterleave_groups_shuffle.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/deinterleave_groups_shuffle.hpp @@ -7,8 +7,8 @@ //================================================================================================== #pragma once +#include #include -#include namespace eve::detail { @@ -36,7 +36,7 @@ template return deinterleave_groups_shuffle_as_doubles(v, lane); else { - auto swapped = swap_adjacent_groups(v, lane); + auto swapped = swap_adjacent(v, lane); auto lo = _mm256_unpacklo_pd(v, swapped); auto hi = _mm256_unpackhi_pd(swapped, v); diff --git a/include/eve/module/core/regular/impl/simd/x86/reverse.hpp b/include/eve/module/core/regular/impl/simd/x86/reverse.hpp index db2cf91e1f..7f57ceee05 100644 --- a/include/eve/module/core/regular/impl/simd/x86/reverse.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/reverse.hpp @@ -8,7 +8,7 @@ #pragma once #include -#include +#include namespace eve::detail { @@ -84,8 +84,8 @@ EVE_FORCEINLINE wide { if constexpr( N() == 4 ) { - v = eve::swap_adjacent_groups(v, eve::lane<2>); - return eve::swap_adjacent_groups(v, eve::lane<1>); + v = eve::swap_adjacent(v, eve::lane<2>); + return eve::swap_adjacent(v); } else { diff --git a/include/eve/module/core/regular/impl/simd/x86/swap_adjacent_groups.hpp b/include/eve/module/core/regular/impl/simd/x86/swap_adjacent_groups.hpp deleted file mode 100644 index 3b2b05b232..0000000000 --- a/include/eve/module/core/regular/impl/simd/x86/swap_adjacent_groups.hpp +++ /dev/null @@ -1,194 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include -#include - -namespace eve::detail -{ -template - EVE_FORCEINLINE wide - swap_adjacent_groups_(EVE_SUPPORTS(sse2_), wide v, fixed) noexcept - requires(G <= N::value) - && x86_abi> -{ - constexpr auto sf4 = _MM_SHUFFLE(2, 3, 0, 1); - using that_t = wide; - - if constexpr( G == N::value || G == 0) { return v; } - else - { - constexpr auto size = G * sizeof(T); - - if constexpr( std::same_as, x86_128_> ) - { - if constexpr( size == 1 ) - { - if constexpr( current_api >= ssse3 ) - { - return _mm_shuffle_epi8( - v.storage(), _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - } - else - { - auto up = convert(v, as> {}); - auto r = swap_adjacent_groups(up, fixed {}); - - if constexpr( N::value < 16 ) - { - if constexpr( std::is_signed_v ) return bit_cast(_mm_packs_epi16(r, r), as(v)); - else return bit_cast(_mm_packus_epi16(r, r), as(v)); - } - else - { - auto [l, h] = r.slice(); - if constexpr( std::is_signed_v ) return _mm_packs_epi16(l, h); - else return _mm_packus_epi16(l, h); - } - } - } - else if constexpr( size == 2 ) - { - auto s = v.storage(); - s = _mm_shufflehi_epi16(s, sf4); - return _mm_shufflelo_epi16(s, sf4); - } - else if constexpr( size == 4 ) - { - if constexpr( std::same_as ) return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); - else - { - auto s = v.storage(); - s = _mm_shuffle_epi32(s, sf4); - return s; - } - } - else if constexpr( size == 8 ) - { - if constexpr( std::same_as ) return _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0, 1)); - else if constexpr( std::same_as ) - return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2)); - else - { - auto s = v.storage(); - s = _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2)); - return s; - } - } - } - else if constexpr( std::same_as, x86_256_> ) - { - if constexpr( size == 1 ) - { - if constexpr( current_api >= avx2 ) - { - auto const hm = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - return _mm256_shuffle_epi8(v, _mm256_set_m128i(hm, hm)); - } - else - { - auto [l, h] = v.slice(); - return that_t(swap_adjacent_groups(l, fixed {}), swap_adjacent_groups(h, fixed {})); - } - } - else if constexpr( size == 2 ) - { - if constexpr( current_api >= avx2 ) - { - v = _mm256_shufflehi_epi16(v, sf4); - return _mm256_shufflelo_epi16(v, sf4); - } - else - { - auto [l, h] = v.slice(); - return that_t(swap_adjacent_groups(l, fixed {}), swap_adjacent_groups(h, fixed {})); - } - } - else if constexpr( size == 4 ) - { - if constexpr( std::same_as ) - { - return _mm256_permutevar_ps(v, as_integer_t>(1, 0, 3, 2, 5, 4, 7, 6)); - } - else if constexpr( current_api >= avx2 ) { return _mm256_shuffle_epi32(v, sf4); } - else - { - auto sd = bit_cast(v, as> {}); - sd = swap_adjacent_groups(sd, fixed<1> {}); - return bit_cast(sd, as(v)); - } - } - else if constexpr( size == 8 ) - { - if constexpr( std::same_as ) - { - return _mm256_permutevar_pd(v, as_integer_t>(2, 0, 2, 0)); - } - else if constexpr( std::same_as ) - { - return _mm256_permutevar_ps(v, as_integer_t>(6, 7, 4, 5, 2, 3, 0, 1)); - } - else if constexpr( current_api >= avx2 ) { return _mm256_permute4x64_epi64(v, sf4); } - else - { - auto sd = bit_cast(v, as> {}); - sd = swap_adjacent_groups(sd, fixed<2> {}); - return bit_cast(sd, as(v)); - } - } - else if constexpr( size == 16 ) - { - if constexpr( std::same_as ) { return _mm256_permute2f128_pd(v, v, 0x21); } - else if constexpr( std::same_as ) - { - auto [l, h] = v.slice(); - return that_t(h, l); - } - else if constexpr( current_api >= avx2 ) - { - return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); - } - else - { - auto [l, h] = v.slice(); - return that_t(h, l); - } - } - } - else if constexpr( std::same_as, x86_512_> ) - { - // We have perfect shuffle so LET'S ROCK'N'ROLL - return basic_shuffle(v, swap_adjacent_groups_pattern); - } - } -} - -template - EVE_FORCEINLINE logical> - swap_adjacent_groups_(EVE_SUPPORTS(sse2_), logical> v, fixed) noexcept - requires(G <= N::value) - && x86_abi> -{ - if constexpr( G == N::value ) { return v; } - else if constexpr( !abi_t::is_wide_logical ) - { - // Reconstruct mask, swag then turn to mask again - auto const m = v.mask(); - auto const swag = swap_adjacent_groups(m, fixed {}); - return to_logical(swag); - } - else - { - // Use the common implementation - return swap_adjacent_groups_(EVE_RETARGET(cpu_), v, fixed {}); - } -} -} diff --git a/include/eve/module/core/regular/impl/swap_adjacent_groups.hpp b/include/eve/module/core/regular/impl/swap_adjacent_groups.hpp deleted file mode 100644 index 9deb87b3ba..0000000000 --- a/include/eve/module/core/regular/impl/swap_adjacent_groups.hpp +++ /dev/null @@ -1,88 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace eve::detail -{ -//============================================================================================== -// Classify a pattern as a swag -template -inline constexpr auto is_swag = []() -{ - // List all possible swags for a current size - constexpr auto sz = sizeof...(I); - constexpr auto x = [](std::index_sequence) - { - return kumi::make_tuple(swap_adjacent_groups_pattern...); - } - (std::make_index_sequence {}); - - // Find the fitting one - constexpr auto idx = detail::find_index(pattern, x); - return fixed {}; -}(); - -template -EVE_FORCEINLINE auto -swap_adjacent_groups_(EVE_SUPPORTS(cpu_), Wide v, fixed) noexcept requires(G <= Wide::size()) -{ - if constexpr( G == Wide::size() || G == 0) { return v; } - else if constexpr( has_aggregated_abi_v ) - { - if constexpr( G == Wide::size() / 2 ) - { - auto [l, h] = v.slice(); - return Wide {h, l}; - } - else - { - auto [l, h] = v.slice(); - return Wide {swap_adjacent_groups(l, fixed {}), swap_adjacent_groups(h, fixed {})}; - } - } - else if constexpr( is_bundle_v ) - { - return Wide( - kumi::map([&](T m) { return swap_adjacent_groups(m, fixed {}); }, v)); - } - else - { - // In this case, we don't have anything special to do so we just shuffle - return basic_shuffle(v, swap_adjacent_groups_pattern); - } -} - -template -EVE_FORCEINLINE auto -swap_adjacent_groups_(EVE_SUPPORTS(cpu_), logical v, fixed f) noexcept - requires(G <= Wide::size()) -{ - if constexpr( logical_value && Wide::abi_type::is_wide_logical ) - { - return bit_cast(swap_adjacent_groups(v.mask(), f), as(v)); - } - else - { - // Reconstruct mask, swag then turn to mask again - auto const m = v.mask(); - auto const bg = swap_adjacent_groups(m, f); - return to_logical(bg); - } -} -} diff --git a/test/doc/CMakeLists.txt b/test/doc/CMakeLists.txt index 2a0e34807f..cd868357cd 100644 --- a/test/doc/CMakeLists.txt +++ b/test/doc/CMakeLists.txt @@ -48,14 +48,15 @@ glob_unit("doc" ${doc_root} "quaternion/regular/*.cpp" ) ##================================================================================================== add_custom_target(doc.core.exe ) add_dependencies(doc.exe doc.core.exe ) -glob_unit("doc" ${doc_root} "core/constant/*.cpp" ) -glob_unit("doc" ${doc_root} "core/fuzzy/*.cpp" ) -glob_unit("doc" ${doc_root} "core/masked/*.cpp" ) -glob_unit("doc" ${doc_root} "core/pedantic/*.cpp" ) -glob_unit("doc" ${doc_root} "core/raw/*.cpp" ) -glob_unit("doc" ${doc_root} "core/regular/*.cpp" ) -glob_unit("doc" ${doc_root} "core/roundings/*.cpp" ) -glob_unit("doc" ${doc_root} "core/saturated/*.cpp" ) +glob_unit("doc" ${doc_root} "core/constant/*.cpp" ) +glob_unit("doc" ${doc_root} "core/fuzzy/*.cpp" ) +glob_unit("doc" ${doc_root} "core/masked/*.cpp" ) +glob_unit("doc" ${doc_root} "core/named_shuffles/*.cpp" ) +glob_unit("doc" ${doc_root} "core/pedantic/*.cpp" ) +glob_unit("doc" ${doc_root} "core/raw/*.cpp" ) +glob_unit("doc" ${doc_root} "core/regular/*.cpp" ) +glob_unit("doc" ${doc_root} "core/roundings/*.cpp" ) +glob_unit("doc" ${doc_root} "core/saturated/*.cpp" ) ##================================================================================================== ## GLOB and process elliptic doc tests diff --git a/test/doc/core/named_shuffles/blend.cpp b/test/doc/core/named_shuffles/blend.cpp new file mode 100644 index 0000000000..687092f8c6 --- /dev/null +++ b/test/doc/core/named_shuffles/blend.cpp @@ -0,0 +1,22 @@ +#include + +#include + +using w_t = eve::wide>; + +int +main() +{ + w_t x {0, 1, 2, 3}; + w_t y {4, 5, 6, 7}; + + // basic example + TTS_EXPECT(eve::all(w_t({4, 1, 2, 7}) == eve::blend(x, y, eve::pattern<1, 0, 0, 1>))); + + // basic example, formula + TTS_EXPECT( + eve::all(w_t({0, 5, 2, 7}) == eve::blend(x, y, [](int i, int /*size*/) { return i % 2; }))); + + // mixing groups + TTS_EXPECT(eve::all(w_t({4, 5, 2, 3}) == eve::blend(x, y, eve::lane<2>, eve::pattern<1, 0>))); +} diff --git a/test/doc/core/named_shuffles/swap_adjacent.cpp b/test/doc/core/named_shuffles/swap_adjacent.cpp new file mode 100644 index 0000000000..55e6bba6fb --- /dev/null +++ b/test/doc/core/named_shuffles/swap_adjacent.cpp @@ -0,0 +1,15 @@ +#include +#include + +using w_t = eve::wide>; + +int main() +{ + w_t x {0, 1, 2, 3}; + + TTS_EXPECT( eve::all( w_t({1, 0, 3, 2}) == eve::swap_adjacent(x)) ); + TTS_EXPECT( eve::all( w_t({1, 0, 3, 2}) == eve::swap_adjacent(x, eve::lane<1>)) ); + TTS_EXPECT( eve::all( w_t({2, 3, 0, 1}) == eve::swap_adjacent(x, eve::lane<2>)) ); + + // lane<0> and lane<4> are not OK. +} diff --git a/test/doc/core/regular/swap_adjacent_groups.cpp b/test/doc/core/regular/swap_adjacent_groups.cpp deleted file mode 100644 index e25fdfa6fc..0000000000 --- a/test/doc/core/regular/swap_adjacent_groups.cpp +++ /dev/null @@ -1,24 +0,0 @@ - -#include -#include -#include - -using wide_it = eve::wide>; - -int main() -{ - wide_it pi = {1, 2, 3, 4, 5, 6, 7, 8}; - constexpr auto _1 = eve::fixed<1>(); - constexpr auto _2 = eve::fixed<2>(); - constexpr auto _4 = eve::fixed<4>(); - constexpr auto _8 = eve::fixed<8>(); - - std::cout << "---- simd" << '\n' - << "<- pi = " << pi << '\n' - << "-> swap_adjacent_groups(pi, _1) = " << eve::swap_adjacent_groups(pi, _1) << '\n' - << "-> swap_adjacent_groups(pi, _2) = " << eve::swap_adjacent_groups(pi, _2) << '\n' - << "-> swap_adjacent_groups(pi, _4) = " << eve::swap_adjacent_groups(pi, _4) << '\n' - << "-> swap_adjacent_groups(pi, _8) = " << eve::swap_adjacent_groups(pi, _8) << '\n' -; - -} diff --git a/test/unit/api/regular/shuffle_v2/blend.cpp b/test/unit/api/regular/shuffle_v2/blend.cpp new file mode 100644 index 0000000000..6af0e114e3 --- /dev/null +++ b/test/unit/api/regular/shuffle_v2/blend.cpp @@ -0,0 +1,41 @@ +//================================================================================================== +/** + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +**/ +//================================================================================================== +#include "unit/api/regular/shuffle_v2/shuffle_v2_test.hpp" + +namespace +{ + +// This is enough +template +constexpr auto +all_blend_patterns_to_test_for(eve::fixed) +{ + return kumi::tuple { + eve::fix_pattern([](int i, int) { return i % 2; }), + eve::fix_pattern([](int i, int) { return i % 3 == 0; }), + eve::fix_pattern([](int i, int) { return i == 0 ? 0 : 1; }), + eve::fix_pattern([](int i, int) { return i == 0 ? 1 : 0; }), + }; +} + +TTS_CASE_TPL("Check blend, generic", eve::test::simd::all_types) +(tts::type) +{ + shuffle_test::named_shuffle2_test< + /*supports_G_eq_T_Size*/ true>( + eve::as {}, + eve::blend, + [](eve::fixed) + { + auto to_test_for = all_blend_patterns_to_test_for(eve::lane); + auto lifted = kumi::map([](auto x) { return kumi::make_tuple(x); }, to_test_for); + return lifted; + }); +}; + +} // namespace diff --git a/test/unit/api/regular/shuffle_v2/idxm.cpp b/test/unit/api/regular/shuffle_v2/idxm.cpp index 443acada29..c90a6bccec 100644 --- a/test/unit/api/regular/shuffle_v2/idxm.cpp +++ b/test/unit/api/regular/shuffle_v2/idxm.cpp @@ -606,6 +606,38 @@ TTS_CASE("repace_na") test(std::array {na_, na_}, 1, std::array {1, 1}); }; +TTS_CASE("just_second_shuffle") +{ + auto test = [](auto _in, std::ptrdiff_t with, auto _expected) + { + auto in = to_idxs(_in); + auto expected = to_idxs(_expected); + auto actual = eve::detail::idxm::just_second_shuffle(in, with); + TTS_EQUAL(expected, actual) << "with: " << with; + }; + + test(std::array {0, 1}, na_, std::array {na_, na_}); + test(std::array {3, 2}, na_, std::array {1, 0}); + test(std::array {na_, 0}, na_, std::array {na_, na_}); + test(std::array {we_, 0}, na_, std::array {we_, na_}); +}; + +TTS_CASE("just_first_shuffle") +{ + auto test = [](auto _in, std::ptrdiff_t with, auto _expected) + { + auto in = to_idxs(_in); + auto expected = to_idxs(_expected); + auto actual = eve::detail::idxm::just_first_shuffle(in, with); + TTS_EQUAL(expected, actual) << "with: " << with; + }; + + test(std::array {0, 1}, na_, std::array {0, 1}); + test(std::array {3, 2}, na_, std::array {na_, na_}); + test(std::array {na_, 2}, na_, std::array {na_, na_}); + test(std::array {we_, 2}, na_, std::array {we_, na_}); +}; + TTS_CASE("is_blend") { auto test = [](auto _in, std::ptrdiff_t cardinal, bool expected) @@ -757,4 +789,16 @@ TTS_CASE("split_to_groups") test(std::array {0, 1, 2, 3}, eve::lane<4>, std::array {0, 1, 2, 3}); }; +TTS_CASE("add shuffle levels") +{ + auto add = [](auto... args) { return eve::detail::idxm::add_shuffle_levels(args...); }; + TTS_EQUAL(1, add(eve::index<1>, eve::index<1>)); + TTS_EQUAL(1, add(eve::index<1>, eve::index<1>, eve::index<1>)); + TTS_EQUAL(3, add(eve::index<2>, eve::index<1>, eve::index<1>)); + TTS_EQUAL(4, add(eve::index<2>, eve::index<2>)); + TTS_EQUAL(6, add(eve::index<4>, eve::index<2>)); + TTS_EQUAL(7, add(eve::index<4>, eve::index<3>)); + TTS_EQUAL(7, add(eve::index<4>, eve::index<3>, eve::index<1>)); +}; + } diff --git a/test/unit/api/regular/shuffle_v2/shuffle_v2_test.hpp b/test/unit/api/regular/shuffle_v2/shuffle_v2_test.hpp index f02c173449..e5f0e426f5 100644 --- a/test/unit/api/regular/shuffle_v2/shuffle_v2_test.hpp +++ b/test/unit/api/regular/shuffle_v2/shuffle_v2_test.hpp @@ -53,16 +53,12 @@ verify(T x, eve::fixed, eve::pattern_t p, U shuffled) template void -run_one_case(std::ptrdiff_t expected_level, - T input, - auto g, - eve::pattern_t p, - auto shuffle_pattern_arg) +run_one_case(std::ptrdiff_t expected_level, T input, auto g, eve::pattern_t p) { using e_t = eve::element_type_t; - if constexpr( requires { eve::shuffle_v2_core(input, g, shuffle_pattern_arg); } ) + if constexpr( requires { eve::shuffle_v2_core(input, g, p); } ) { - auto [shuffled, l] = eve::shuffle_v2_core(input, g, shuffle_pattern_arg); + auto [shuffled, l] = eve::shuffle_v2_core(input, g, p); verify(input, g, p, shuffled); std::array idxs {I...}; @@ -77,22 +73,26 @@ run_one_case(std::ptrdiff_t expected_level, else { TTS_FAIL("Failed to shuffle, G: " << g() << "\npattern: " << p); } } -template +template void -run_one_case(std::ptrdiff_t expected_level, auto input, auto g, eve::pattern_t p) +run2_one_case(std::ptrdiff_t expected_level, T x, T y, auto g, eve::pattern_t p) { - run_one_case(expected_level, input, g, p, p); -} + using e_t = eve::element_type_t; + if constexpr( requires { eve::shuffle_v2_core(x, y, g, p); } ) + { + auto [shuffled, l] = eve::shuffle_v2_core(x, y, g, p); + verify(eve::combine(x, y), g, p, shuffled); -template -void -run_one_case(std::ptrdiff_t expected_level, - T input, - eve::fixed g, - eve::pattern_formula auto formula) -{ - static_assert(G != 0); - run_one_case(expected_level, input, g, eve::fix_pattern(formula), formula); + std::array idxs {I...}; + + if( g() >= T::size() || g() >= eve::expected_cardinal_v || idxs.size() == 1 ) + { + expected_level = eve::detail::idxm::has_zeroes(idxs); + } + if( eve::has_emulated_abi_v ) expected_level = 0; + TTS_EQUAL(expected_level, l()) << "G: " << g() << "\npattern: " << p; + } + else { TTS_FAIL("Failed to shuffle, G: " << g() << "\npattern: " << p); } } template @@ -115,15 +115,7 @@ run2(auto expected_level, eve::pattern_t p = {}) auto xy = input.slice(); auto x = get<0>(xy); auto y = get<1>(xy); - - if constexpr( requires { eve::shuffle_v2_core(x, y, eve::lane, p); } ) - { - auto [shuffled, l] = eve::shuffle_v2_core(x, y, eve::lane, p); - verify(input, eve::lane, p, shuffled); - TTS_EQUAL(expected_level(std::array {I...}), l()) - << "sizeof(T): " << sizeof(T) << " G: " << G << "\npattern: " << p; - } - else { TTS_FAIL("Failed to shuffle, G: " << G << "\npattern: " << p); } + run2_one_case(expected_level(std::array {I...}), x, y, eve::lane, p); } template @@ -186,6 +178,23 @@ for_each_group_size(eve::as, auto op) { (op(eve::lane<1 << (int)I>), ...); }(std::make_index_sequence {}); } +template +void +for_each_group_with_params(eve::as tgt, auto extra_param_gen, auto op) +{ + for_each_group_size(tgt, + [&](auto g) + { + kumi::for_each( + [&](auto extra) + { + auto params = kumi::cat(kumi::tuple {g}, extra); + kumi::apply(op, params); + }, + extra_param_gen(g)); + }); +} + template void named_shuffle1_test_one_input(T input, NamedShuffle named_shuffle, auto... args) @@ -203,6 +212,25 @@ named_shuffle1_test_one_input(T input, NamedShuffle named_shuffle, auto... args) }); } +template +void +named_shuffle2_test_one_input(T x, T y, NamedShuffle named_shuffle, auto extra_param_gen) +{ + auto tgt = eve::as {}; + for_each_group_with_params(tgt, + extra_param_gen, + [&](eve::fixed g, auto... extra) + { + if constexpr( G != T::size() || supports_G_eq_T_Size ) + { + std::ptrdiff_t expected_level = + named_shuffle.level(tgt, tgt, g, extra...); + auto pattern = named_shuffle.pattern(tgt, tgt, g, extra...); + run2_one_case(expected_level, x, y, g, pattern); + } + }); +} + template void named_shuffle1_test(eve::as, NamedShuffle named_shuffle, auto... args) @@ -221,6 +249,33 @@ named_shuffle1_test(eve::as, NamedShuffle named_shuffle, auto... args) named_shuffle1_test_one_input(mask, named_shuffle, args...); } +template +void +named_shuffle2_test(eve::as>, NamedShuffle named_shuffle, auto extra_args_gen) +{ + if( N::value == 1 && !supports_G_eq_T_Size ) + { + TTS_PASS(); + return; + } + + using wide2 = eve::wide>; + + { + wide2 xy {[](int i, int) { return i + 1; }}; + + auto [x, y] = xy.slice(); + named_shuffle2_test_one_input(x, y, named_shuffle, extra_args_gen); + } + + { + eve::logical xy {[](int i, int) { return std::countl_zero((unsigned)i) & 1; }}; + + auto [x, y] = xy.slice(); + named_shuffle2_test_one_input(x, y, named_shuffle, extra_args_gen); + } +} + #if !defined(EVE_NO_SIMD) template void @@ -234,21 +289,18 @@ debug_call_shuffle_l_directly() w_t x {[](int i, int) { return i + 1; }}; // disabled since no shuffle_l2 for now and breaks compilation -#if !defined(EVE_INCLUDE_SVE_HEADER) && !defined(EVE_INCLUDE_POWERPC_HEADER) +# if !defined(EVE_INCLUDE_SVE_HEADER) && !defined(EVE_INCLUDE_POWERPC_HEADER) if constexpr( l == 2 ) { eve::detail::shuffle_l2_(eve::detail::delay_t {}, eve::current_api, p, g, x); } else -#endif - if constexpr ( l == 99 ) +# endif + if constexpr( l == 99 ) { eve::detail::shuffle_l_fallback_(eve::detail::delay_t {}, eve::current_api, p, g, x); } - else - { - std::cout << __func__ << " you need to add l: " << l << std::endl; - } + else { std::cout << __func__ << " you need to add l: " << l << std::endl; } } #endif diff --git a/test/unit/api/regular/swizzle/swap_adjacent_groups.cpp b/test/unit/api/regular/swizzle/swap_adjacent_groups.cpp deleted file mode 100644 index aa992d8e08..0000000000 --- a/test/unit/api/regular/swizzle/swap_adjacent_groups.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//================================================================================================== -/** - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -**/ -//================================================================================================== -#include "test.hpp" -#include -#include -#include -#include - -//================================================================================================== -// SWAG test -//================================================================================================== -TTS_CASE_WITH( "Check behavior of SWAGs swizzle" - , eve::test::simd::all_types - , tts::generate ( tts::randoms(-50, 50) - , tts::logicals(1, 2) - ) - ) -(T data, L logicals) -{ - constexpr auto ssz = std::bit_width( std::size_t(T::size()) ); - [&]( std::index_sequence) - { - auto f = [&](S simd, std::integral_constant) - { - constexpr std::size_t sz = 1ULL << N; - - S ref = [=](auto i, auto c) - { - constexpr auto p = eve::swap_adjacent_groups_pattern; - return simd.get(p(i,c)); - }; - - constexpr auto swags = eve::swap_adjacent_groups_pattern; - TTS_EQUAL( eve::shuffle(simd,swags) , ref); - TTS_EQUAL( eve::swap_adjacent_groups(simd, eve::lane) , ref); - }; - - ( f(data , std::integral_constant{}), ... ); - ( f(logicals, std::integral_constant{}), ... ); - }( std::make_index_sequence{} ); - TTS_EQUAL(eve::swap_adjacent_groups(data, eve::fixed<0>{}), data); -}; diff --git a/test/unit/api/tuple/swizzle/swap_adjacent_groups.cpp b/test/unit/api/tuple/swizzle/swap_adjacent_groups.cpp deleted file mode 100644 index 11e12c42ee..0000000000 --- a/test/unit/api/tuple/swizzle/swap_adjacent_groups.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//================================================================================================== -/** - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -**/ -//================================================================================================== -#include "test.hpp" -#include -#include -#include -#include - -//================================================================================================== -// SWAG test -//================================================================================================== -//================================================================================================== -// slide_right test -//================================================================================================== -TTS_CASE_TPL( "Check behavior of slide_right swizzle", eve::test::scalar::all_types) -(tts::type) -{ - using s_t = kumi::tuple; - using w_t = eve::wide; - - constexpr auto ssz = std::bit_width( std::size_t(w_t::size()) ); - - w_t data = [](auto i, auto) { return s_t { static_cast(65+i) - , static_cast(i + 1) - , 1.5*(1+i) - }; - }; - [&]( std::index_sequence) - { - auto f = [&](S simd, std::integral_constant) - { - constexpr std::size_t sz = 1ULL << N; - S ref = [&](auto i, auto c) - { - constexpr auto p = eve::swap_adjacent_groups_pattern; - return simd.get(p(i,c)); - }; - - constexpr auto swags = eve::swap_adjacent_groups_pattern; - TTS_EQUAL( eve::shuffle(simd,swags) , ref); - TTS_EQUAL( eve::swap_adjacent_groups(simd, eve::lane) , ref); - }; - - ( f(data , std::integral_constant{}), ... ); - }( std::make_index_sequence{} ); -}; diff --git a/test/unit/internals/optimize_pattern.cpp b/test/unit/internals/optimize_pattern.cpp index 714ca06f06..de7cb18ad2 100644 --- a/test/unit/internals/optimize_pattern.cpp +++ b/test/unit/internals/optimize_pattern.cpp @@ -152,79 +152,79 @@ TTS_CASE("Check broadcast patterns get optimized") }(std::make_integer_sequence{}); }; -TTS_CASE("Check swap_adjacent_groups patterns get optimized") +TTS_CASE("Check swap_adjacent patterns get optimized") { using eve::fixed; using eve::detail::find_optimized_shuffle_pattern; using eve::detail::bound; - using eve::callable_swap_adjacent_groups_; + using callable = eve::detail::named_shuffle_1; TTS_EXPR_IS ( (find_optimized_shuffle_pattern<4,1,0>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<2,1,0>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<4,1,0,3,2>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<4,2,3,0,1>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<8,1,0,3,2,5,4,7,6>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<8,2,3,0,1,6,7,4,5>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<8,4,5,6,7,0,1,2,3>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14>()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<16,2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<16,4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<16,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<32,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<32,2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<32,4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,20,21,22,23,16,17,18,19,28,29,30,31,24,25,26,27 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<32,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<32,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,20,21,22,23,16,17,18,19,28,29,30,31,24,25,26,27,36,37,38,39,32,33,34,35,44,45,46,47,40,41,42,43,52,53,54,55,48,49,50,51,60,61,62,63,56,57,58,59 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23,40,41,42,43,44,45,46,47,32,33,34,35,36,37,38,39,56,57,58,59,60,61,62,63,48,49,50,51,52,53,54,55 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47 >()) - , (bound>) + , (bound>) ); TTS_EXPR_IS ( (find_optimized_shuffle_pattern<64,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 >()) - , (bound>) + , (bound>) ); };