From f30f245e66837edbfc1cfee61052e809c360bc20 Mon Sep 17 00:00:00 2001 From: jtlap Date: Fri, 24 May 2024 19:52:48 +0200 Subject: [PATCH] Cleanup popcount --- .../eve/module/core/regular/impl/popcount.hpp | 28 +-- .../regular/impl/simd/arm/sve/popcount.hpp | 28 +-- .../core/regular/impl/simd/x86/popcount.hpp | 168 +++++++++--------- include/eve/module/core/regular/popcount.hpp | 19 +- 4 files changed, 120 insertions(+), 123 deletions(-) diff --git a/include/eve/module/core/regular/impl/popcount.hpp b/include/eve/module/core/regular/impl/popcount.hpp index 7cf7b4de2b..01ddad2fdd 100644 --- a/include/eve/module/core/regular/impl/popcount.hpp +++ b/include/eve/module/core/regular/impl/popcount.hpp @@ -10,32 +10,23 @@ #include #include #include -#include #include -#include #include - #include -#include - #if defined(SPY_COMPILER_IS_MSVC) # include #endif namespace eve::detail { -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(cpu_), T x) noexcept -{ - using r_t = as_integer_t; - if constexpr( has_native_abi_v ) + template + EVE_FORCEINLINE constexpr auto popcount_(EVE_REQUIRES(cpu_), O const&, T x) noexcept { - if constexpr( scalar_value ) { return r_t(std::popcount(x)); } + if constexpr( scalar_value ) + return T(std::popcount(x)); else { - // return map(eve::popcount, v); constexpr auto siz = sizeof(eve::element_type_t) * 8; if constexpr( siz == 8 ) { @@ -63,15 +54,4 @@ popcount_(EVE_SUPPORTS(cpu_), T x) noexcept } } } - else { return apply_over(popcount, x); } -} - -// ----------------------------------------------------------------------------------------------- -// Masked case -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(cpu_), C const& cond, U const& t) noexcept -{ - return mask_op(cond, eve::popcount, t); -} } diff --git a/include/eve/module/core/regular/impl/simd/arm/sve/popcount.hpp b/include/eve/module/core/regular/impl/simd/arm/sve/popcount.hpp index 816a47415a..3642758426 100644 --- a/include/eve/module/core/regular/impl/simd/arm/sve/popcount.hpp +++ b/include/eve/module/core/regular/impl/simd/arm/sve/popcount.hpp @@ -13,26 +13,26 @@ namespace eve::detail { -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(sve_), wide v) noexcept -> wide +template +EVE_FORCEINLINE wide popcount_(EVE_REQUIRES(sve_), + O const&, + wide v) noexcept requires sve_abi> { - return popcount[ignore_none](v); + return svcnt_x(sve_true(), v); } -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(sve_), C const& cond, wide v) noexcept -> wide +template +EVE_FORCEINLINE wide popcount_(EVE_SUPPORTS(sve_), + C const& cond, + O const&, + wide v) noexcept requires sve_abi> { - if constexpr( C::is_complete && C::is_inverted ) - { - return svcnt_x(sve_true(), v); - } + auto alt = alternative(cond, v, as(v)); + if constexpr( C::is_complete ) + return alt; else - { - return svcnt_m(alternative(cond, v, as(v)), expand_mask(cond, as(v)), v); - } + return svcnt_m(alt, expand_mask(cond, as(v)), v); } } diff --git a/include/eve/module/core/regular/impl/simd/x86/popcount.hpp b/include/eve/module/core/regular/impl/simd/x86/popcount.hpp index a6a19394ee..e30150eedf 100644 --- a/include/eve/module/core/regular/impl/simd/x86/popcount.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/popcount.hpp @@ -23,91 +23,34 @@ namespace eve::detail { -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(sse2_), wide x) noexcept requires std::same_as, x86_128_> -{ - auto putcounts = [](auto xx) - { - using N8 = fixed; - using i8_t = wide; - const i8_t pattern_2bit(0x55); - const i8_t pattern_4bit(0x33); - const i8_t pattern_16bit(0x0f); - xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits - xx = (xx & pattern_4bit) - + (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits - xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits - return xx; - }; - - using r_t = wide, N>; - if constexpr( sizeof(T) == 8 || sizeof(T) == 1 ) - { - using N16 = fixed<(sizeof(T) < 8) ? 8u : sizeof(T)>; - using i16_t = wide; - auto xx = bit_cast(x, as()); - if constexpr( sizeof(T) == 8 ) - { - xx = putcounts(xx); - return bit_cast(_mm_sad_epu8(xx, _mm_setzero_si128()), as()); - } - else if constexpr( sizeof(T) == 1 ) - { - const i16_t masklow(0xff); - return bit_cast(popcount(xx & masklow) + (popcount(bit_shr(xx, 8) & masklow) << 8), - as()); - } - } - else if constexpr( sizeof(T) == 4 || sizeof(T) == 2 ) - { - using N8 = fixed; - using i8_t = wide; - const i8_t pattern_2bit(0x55); - const i8_t pattern_4bit(0x33); - const i8_t pattern_16bit(0x0f); - const r_t mask(0x7f); - x = putcounts(x); - if constexpr( sizeof(T) >= 2 ) - x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits - if constexpr( sizeof(T) >= 4 ) - x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits - return bit_cast(x & mask, as()); - } -} - -///////////////////////////////////////////////////////////////////////////// -// 256 bits -template -EVE_FORCEINLINE auto -popcount_(EVE_SUPPORTS(avx_), wide x) noexcept requires std::same_as, x86_256_> -{ - using r_t = wide, N>; - if constexpr( current_api >= avx2 ) + template + EVE_FORCEINLINE auto popcount_(EVE_REQUIRES(sse2_), O const&, wide x) noexcept + requires std::same_as, x86_128_> { auto putcounts = [](auto xx) - { - using N8 = fixed; - using i8_t = wide; - const i8_t pattern_2bit(0x55); - const i8_t pattern_4bit(0x33); - const i8_t pattern_16bit(0x0f); - xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits - xx = (xx & pattern_4bit) - + (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits - xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits - return xx; - }; + { + using N8 = fixed; + using i8_t = wide; + const i8_t pattern_2bit(0x55); + const i8_t pattern_4bit(0x33); + const i8_t pattern_16bit(0x0f); + xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits + xx = (xx & pattern_4bit) + + (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits + xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits + return xx; + }; + using r_t = wide; if constexpr( sizeof(T) == 8 || sizeof(T) == 1 ) { - using N16 = fixed<(sizeof(T) < 8) ? 16 : sizeof(T) * 2>; + using N16 = fixed<(sizeof(T) < 8) ? 8u : sizeof(T)>; using i16_t = wide; auto xx = bit_cast(x, as()); if constexpr( sizeof(T) == 8 ) { xx = putcounts(xx); - return bit_cast(_mm256_sad_epu8(xx, _mm256_setzero_si256()), as()); + return bit_cast(_mm_sad_epu8(xx, _mm_setzero_si128()), as()); } else if constexpr( sizeof(T) == 1 ) { @@ -118,25 +61,84 @@ popcount_(EVE_SUPPORTS(avx_), wide x) noexcept requires std::same_as; + using i8_t = wide; + const i8_t pattern_2bit(0x55); + const i8_t pattern_4bit(0x33); + const i8_t pattern_16bit(0x0f); + const r_t mask(0x7f); x = putcounts(x); if constexpr( sizeof(T) >= 2 ) x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits if constexpr( sizeof(T) >= 4 ) x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits - if constexpr( sizeof(T) >= 8 ) - x += bit_shr(x, 32); // put count of each 64 bits into their lowest 8 bits - const r_t mask(0x7f); return bit_cast(x & mask, as()); } } - else + + ///////////////////////////////////////////////////////////////////////////// + // 256 bits + template + EVE_FORCEINLINE auto + popcount_(EVE_REQUIRES(avx_), O const& o, wide x) noexcept + requires std::same_as, x86_256_> { - if constexpr( sizeof(T) >= 8 ) return popcount_(EVE_RETARGET(cpu_), x); + using r_t = wide; + if constexpr( current_api >= avx2 ) + { + auto putcounts = [](auto xx) + { + using N8 = fixed; + using i8_t = wide; + const i8_t pattern_2bit(0x55); + const i8_t pattern_4bit(0x33); + const i8_t pattern_16bit(0x0f); + xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits + xx = (xx & pattern_4bit) + + (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits + xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits + return xx; + }; + + if constexpr( sizeof(T) == 8 || sizeof(T) == 1 ) + { + using N16 = fixed<(sizeof(T) < 8) ? 16 : sizeof(T) * 2>; + using i16_t = wide; + auto xx = bit_cast(x, as()); + if constexpr( sizeof(T) == 8 ) + { + xx = putcounts(xx); + return bit_cast(_mm256_sad_epu8(xx, _mm256_setzero_si256()), as()); + } + else if constexpr( sizeof(T) == 1 ) + { + const i16_t masklow(0xff); + return bit_cast(popcount(xx & masklow) + (popcount(bit_shr(xx, 8) & masklow) << 8), + as()); + } + } + else if constexpr( sizeof(T) == 4 || sizeof(T) == 2 ) + { + x = putcounts(x); + if constexpr( sizeof(T) >= 2 ) + x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits + if constexpr( sizeof(T) >= 4 ) + x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits + if constexpr( sizeof(T) >= 8 ) + x += bit_shr(x, 32); // put count of each 64 bits into their lowest 8 bits + const r_t mask(0x7f); + return bit_cast(x & mask, as()); + } + } else { - auto [lo, hi] = x.slice(); - return r_t(popcount(lo), popcount(hi)); + if constexpr( sizeof(T) >= 8 ) + return popcount.behavior(cpu_{}, o, x); + else + { + auto [lo, hi] = x.slice(); + return r_t(popcount(lo), popcount(hi)); + } } } } -} diff --git a/include/eve/module/core/regular/popcount.hpp b/include/eve/module/core/regular/popcount.hpp index c80d68eb90..2a625b9696 100644 --- a/include/eve/module/core/regular/popcount.hpp +++ b/include/eve/module/core/regular/popcount.hpp @@ -7,10 +7,24 @@ #pragma once #include -#include +#include +#include +#include namespace eve { + template + struct popcount_t : elementwise_callable + { + template + EVE_FORCEINLINE constexpr T operator()(T t) const noexcept + { + return EVE_DISPATCH_CALL(t); + } + + EVE_CALLABLE_OBJECT(popcount_t, popcount_); + }; + //================================================================================================ //! @addtogroup core_bitops //! @{ @@ -47,7 +61,8 @@ namespace eve //! @godbolt{doc/core/popcount.cpp} //! @} //================================================================================================ -EVE_MAKE_CALLABLE(popcount_, popcount); + inline constexpr auto popcount = functor; + } #include