Skip to content

Commit

Permalink
Cleanup popcount
Browse files Browse the repository at this point in the history
  • Loading branch information
jtlap authored May 24, 2024
1 parent 2cf335f commit f30f245
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 123 deletions.
28 changes: 4 additions & 24 deletions include/eve/module/core/regular/impl/popcount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,23 @@
#include <eve/as.hpp>
#include <eve/concept/value.hpp>
#include <eve/detail/apply_over.hpp>
#include <eve/detail/has_abi.hpp>
#include <eve/detail/spy.hpp>
#include <eve/module/core/regular/bit_cast.hpp>
#include <eve/traits.hpp>

#include <bit>

#include <type_traits>

#if defined(SPY_COMPILER_IS_MSVC)
# include <intrin.h>
#endif

namespace eve::detail
{
template<unsigned_value T>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(cpu_), T x) noexcept
{
using r_t = as_integer_t<T, unsigned>;
if constexpr( has_native_abi_v<T> )
template<typename T, callable_options O>
EVE_FORCEINLINE constexpr auto popcount_(EVE_REQUIRES(cpu_), O const&, T x) noexcept
{
if constexpr( scalar_value<T> ) { return r_t(std::popcount(x)); }
if constexpr( scalar_value<T> )
return T(std::popcount(x));
else
{
// return map(eve::popcount, v);
constexpr auto siz = sizeof(eve::element_type_t<T>) * 8;
if constexpr( siz == 8 )
{
Expand Down Expand Up @@ -63,15 +54,4 @@ popcount_(EVE_SUPPORTS(cpu_), T x) noexcept
}
}
}
else { return apply_over(popcount, x); }
}

// -----------------------------------------------------------------------------------------------
// Masked case
template<conditional_expr C, unsigned_value U>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(cpu_), C const& cond, U const& t) noexcept
{
return mask_op(cond, eve::popcount, t);
}
}
28 changes: 14 additions & 14 deletions include/eve/module/core/regular/impl/simd/arm/sve/popcount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,26 @@

namespace eve::detail
{
template<unsigned_scalar_value T, typename N>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(sve_), wide<T, N> v) noexcept -> wide<T, N>
template<unsigned_scalar_value T, typename N, callable_options O>
EVE_FORCEINLINE wide<T, N> popcount_(EVE_REQUIRES(sve_),
O const&,
wide<T, N> v) noexcept
requires sve_abi<abi_t<T, N>>
{
return popcount[ignore_none](v);
return svcnt_x(sve_true<T>(), v);
}

template<conditional_expr C, unsigned_scalar_value T, typename N>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(sve_), C const& cond, wide<T, N> v) noexcept -> wide<T, N>
template<conditional_expr C, unsigned_scalar_value T, typename N, callable_options O>
EVE_FORCEINLINE wide<T, N> popcount_(EVE_SUPPORTS(sve_),
C const& cond,
O const&,
wide<T, N> v) noexcept
requires sve_abi<abi_t<T, N>>
{
if constexpr( C::is_complete && C::is_inverted )
{
return svcnt_x(sve_true<T>(), v);
}
auto alt = alternative(cond, v, as(v));
if constexpr( C::is_complete )
return alt;
else
{
return svcnt_m(alternative(cond, v, as(v)), expand_mask(cond, as(v)), v);
}
return svcnt_m(alt, expand_mask(cond, as(v)), v);
}
}
168 changes: 85 additions & 83 deletions include/eve/module/core/regular/impl/simd/x86/popcount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,91 +23,34 @@
namespace eve::detail
{

template<unsigned_scalar_value T, typename N>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(sse2_), wide<T, N> x) noexcept requires std::same_as<abi_t<T, N>, x86_128_>
{
auto putcounts = [](auto xx)
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits
xx = (xx & pattern_4bit)
+ (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits
xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits
return xx;
};

using r_t = wide<as_integer_t<T, unsigned>, N>;
if constexpr( sizeof(T) == 8 || sizeof(T) == 1 )
{
using N16 = fixed<(sizeof(T) < 8) ? 8u : sizeof(T)>;
using i16_t = wide<uint16_t, N16>;
auto xx = bit_cast(x, as<i16_t>());
if constexpr( sizeof(T) == 8 )
{
xx = putcounts(xx);
return bit_cast(_mm_sad_epu8(xx, _mm_setzero_si128()), as<r_t>());
}
else if constexpr( sizeof(T) == 1 )
{
const i16_t masklow(0xff);
return bit_cast(popcount(xx & masklow) + (popcount(bit_shr(xx, 8) & masklow) << 8),
as<r_t>());
}
}
else if constexpr( sizeof(T) == 4 || sizeof(T) == 2 )
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
const r_t mask(0x7f);
x = putcounts(x);
if constexpr( sizeof(T) >= 2 )
x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits
if constexpr( sizeof(T) >= 4 )
x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits
return bit_cast(x & mask, as<r_t>());
}
}

/////////////////////////////////////////////////////////////////////////////
// 256 bits
template<unsigned_scalar_value T, typename N>
EVE_FORCEINLINE auto
popcount_(EVE_SUPPORTS(avx_), wide<T, N> x) noexcept requires std::same_as<abi_t<T, N>, x86_256_>
{
using r_t = wide<as_integer_t<T, unsigned>, N>;
if constexpr( current_api >= avx2 )
template<unsigned_scalar_value T, typename N, callable_options O>
EVE_FORCEINLINE auto popcount_(EVE_REQUIRES(sse2_), O const&, wide<T, N> x) noexcept
requires std::same_as<abi_t<T, N>, x86_128_>
{
auto putcounts = [](auto xx)
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits
xx = (xx & pattern_4bit)
+ (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits
xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits
return xx;
};
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits
xx = (xx & pattern_4bit)
+ (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits
xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits
return xx;
};

using r_t = wide<T, N>;
if constexpr( sizeof(T) == 8 || sizeof(T) == 1 )
{
using N16 = fixed<(sizeof(T) < 8) ? 16 : sizeof(T) * 2>;
using N16 = fixed<(sizeof(T) < 8) ? 8u : sizeof(T)>;
using i16_t = wide<uint16_t, N16>;
auto xx = bit_cast(x, as<i16_t>());
if constexpr( sizeof(T) == 8 )
{
xx = putcounts(xx);
return bit_cast(_mm256_sad_epu8(xx, _mm256_setzero_si256()), as<r_t>());
return bit_cast(_mm_sad_epu8(xx, _mm_setzero_si128()), as<r_t>());
}
else if constexpr( sizeof(T) == 1 )
{
Expand All @@ -118,25 +61,84 @@ popcount_(EVE_SUPPORTS(avx_), wide<T, N> x) noexcept requires std::same_as<abi_t
}
else if constexpr( sizeof(T) == 4 || sizeof(T) == 2 )
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
const r_t mask(0x7f);
x = putcounts(x);
if constexpr( sizeof(T) >= 2 )
x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits
if constexpr( sizeof(T) >= 4 )
x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits
if constexpr( sizeof(T) >= 8 )
x += bit_shr(x, 32); // put count of each 64 bits into their lowest 8 bits
const r_t mask(0x7f);
return bit_cast(x & mask, as<r_t>());
}
}
else

/////////////////////////////////////////////////////////////////////////////
// 256 bits
template<unsigned_scalar_value T, typename N, callable_options O>
EVE_FORCEINLINE auto
popcount_(EVE_REQUIRES(avx_), O const& o, wide<T, N> x) noexcept
requires std::same_as<abi_t<T, N>, x86_256_>
{
if constexpr( sizeof(T) >= 8 ) return popcount_(EVE_RETARGET(cpu_), x);
using r_t = wide<T, N>;
if constexpr( current_api >= avx2 )
{
auto putcounts = [](auto xx)
{
using N8 = fixed<N::value * sizeof(T)>;
using i8_t = wide<std::int8_t, N8>;
const i8_t pattern_2bit(0x55);
const i8_t pattern_4bit(0x33);
const i8_t pattern_16bit(0x0f);
xx -= bit_shr(xx, 1) & pattern_2bit; // put count of each 2 bits into those 2 bits
xx = (xx & pattern_4bit)
+ (bit_shr(xx, 2) & pattern_4bit); // put count of each 4 bits into those 4 bits
xx = (xx + bit_shr(xx, 4)) & pattern_16bit; // put count of each 8 bits into those 8 bits
return xx;
};

if constexpr( sizeof(T) == 8 || sizeof(T) == 1 )
{
using N16 = fixed<(sizeof(T) < 8) ? 16 : sizeof(T) * 2>;
using i16_t = wide<uint16_t, N16>;
auto xx = bit_cast(x, as<i16_t>());
if constexpr( sizeof(T) == 8 )
{
xx = putcounts(xx);
return bit_cast(_mm256_sad_epu8(xx, _mm256_setzero_si256()), as<r_t>());
}
else if constexpr( sizeof(T) == 1 )
{
const i16_t masklow(0xff);
return bit_cast(popcount(xx & masklow) + (popcount(bit_shr(xx, 8) & masklow) << 8),
as<r_t>());
}
}
else if constexpr( sizeof(T) == 4 || sizeof(T) == 2 )
{
x = putcounts(x);
if constexpr( sizeof(T) >= 2 )
x += bit_shr(x, 8); // put count of each 16 bits into their lowest 8 bits
if constexpr( sizeof(T) >= 4 )
x += bit_shr(x, 16); // put count of each 32 bits into their lowest 8 bits
if constexpr( sizeof(T) >= 8 )
x += bit_shr(x, 32); // put count of each 64 bits into their lowest 8 bits
const r_t mask(0x7f);
return bit_cast(x & mask, as<r_t>());
}
}
else
{
auto [lo, hi] = x.slice();
return r_t(popcount(lo), popcount(hi));
if constexpr( sizeof(T) >= 8 )
return popcount.behavior(cpu_{}, o, x);
else
{
auto [lo, hi] = x.slice();
return r_t(popcount(lo), popcount(hi));
}
}
}
}
}
19 changes: 17 additions & 2 deletions include/eve/module/core/regular/popcount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,24 @@
#pragma once

#include <eve/arch.hpp>
#include <eve/detail/overload.hpp>
#include <eve/traits/overload.hpp>
#include <eve/module/core/decorator/core.hpp>
#include <eve/detail/assert_utils.hpp>

namespace eve
{
template<typename Options>
struct popcount_t : elementwise_callable<popcount_t, Options>
{
template<unsigned_value T>
EVE_FORCEINLINE constexpr T operator()(T t) const noexcept
{
return EVE_DISPATCH_CALL(t);
}

EVE_CALLABLE_OBJECT(popcount_t, popcount_);
};

//================================================================================================
//! @addtogroup core_bitops
//! @{
Expand Down Expand Up @@ -47,7 +61,8 @@ namespace eve
//! @godbolt{doc/core/popcount.cpp}
//! @}
//================================================================================================
EVE_MAKE_CALLABLE(popcount_, popcount);
inline constexpr auto popcount = functor<popcount_t>;

}

#include <eve/module/core/regular/impl/popcount.hpp>
Expand Down

0 comments on commit f30f245

Please sign in to comment.