Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
SadiinsoSnowfall committed Nov 12, 2024
1 parent 85a0ca0 commit ca0d1d5
Showing 1 changed file with 61 additions and 61 deletions.
122 changes: 61 additions & 61 deletions include/eve/detail/function/simd/x86/interleave.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ namespace eve::detail
// Interleave pairs of wides
//================================================================================================
template<callable_options O, scalar_value T, typename N>
EVE_FORCEINLINE kumi::tuple<wide<T, N>, wide<T, N>> interleave_(EVE_REQUIRES(sse2_), O const& o, wide<T, N> a, wide<T, N> b) noexcept
EVE_FORCEINLINE kumi::tuple<wide<T, N>, wide<T, N>> interleave_(EVE_REQUIRES(sse2_) , O const& o, wide<T,N> v0, wide<T,N> v1) noexcept
requires (N::value > 1) && x86_abi<abi_t<T,N>>
{
using type = wide<T,N>;
constexpr auto c = categorize<type>();

if constexpr( c == category::float64x2 )
{
return kumi::make_tuple(type(_mm_unpacklo_pd(a, b)),type(_mm_unpackhi_pd(a, b)));
return kumi::make_tuple(type(_mm_unpacklo_pd(v0, v1)),type(_mm_unpackhi_pd(v0, v1)));
}
else if constexpr( match(c,category::int64x2, category::uint64x2) )
{
return kumi::make_tuple(type(_mm_unpacklo_epi64(a,b)), type(_mm_unpackhi_epi64(a,b)));
return kumi::make_tuple(type(_mm_unpacklo_epi64(v0,v1)), type(_mm_unpackhi_epi64(v0,v1)));
}
else if constexpr( c == category::float64x4 )
{
auto[a0,a1] = a.slice();
auto[b0,b1] = b.slice();
auto[a0,a1] = v0.slice();
auto[b0,b1] = v1.slice();
auto[x0,x1] = eve::interleave(a0,b0);
auto[y0,y1] = eve::interleave(a1,b1);

Expand All @@ -44,33 +44,33 @@ namespace eve::detail
{
if constexpr(N::value == 2)
{
auto xy = _mm_shuffle_ps(a,b,0b10011001);
return kumi::make_tuple ( type(_mm_unpacklo_ps(a,b))
auto xy = _mm_shuffle_ps(v0,v1,0b10011001);
return kumi::make_tuple ( type(_mm_unpacklo_ps(v0,v1))
, type(_mm_shuffle_ps(xy,xy,0b11011000))
);
}
else if constexpr(N::value == 4)
{
return kumi::make_tuple(type(_mm_unpacklo_ps(a,b)), type(_mm_unpackhi_ps(a,b)));
return kumi::make_tuple(type(_mm_unpacklo_ps(v0,v1)), type(_mm_unpackhi_ps(v0,v1)));
}
}
else if constexpr( match(c,category::int32x4, category::uint32x4) )
{
if constexpr(N::value == 2)
{
using ftype = as<wide< as_floating_point_t<T>,N>>;
auto that = interleave(bit_cast(a,ftype()),bit_cast(b,ftype()));
return kumi::map( [](auto m){ return bit_cast(m,as<type>{}); }, that);
auto that = interleave(bit_cast(v0,ftype()),bit_cast(v1,ftype()));
return kumi::map( [](auto m){ return bit_cast(m,as<type>()); }, that);
}
else if constexpr(N::value == 4)
{
return kumi::make_tuple(type(_mm_unpacklo_epi32(a,b)), type(_mm_unpackhi_epi32(a,b)));
return kumi::make_tuple(type(_mm_unpacklo_epi32(v0,v1)), type(_mm_unpackhi_epi32(v0,v1)));
}
}
else if constexpr( c == category::float32x8 )
{
auto[a0,a1] = a.slice();
auto[b0,b1] = b.slice();
auto[a0,a1] = v0.slice();
auto[b0,b1] = v1.slice();
auto[x0,x1] = eve::interleave(a0,b0);
auto[y0,y1] = eve::interleave(a1,b1);

Expand All @@ -80,11 +80,11 @@ namespace eve::detail
{
if constexpr(N::value == 8)
{
return kumi::make_tuple(type(_mm_unpacklo_epi16(a,b)), type(_mm_unpackhi_epi16(a,b)));
return kumi::make_tuple(type(_mm_unpacklo_epi16(v0,v1)), type(_mm_unpackhi_epi16(v0,v1)));
}
else
{
wide<T,fixed<8>> v01 = _mm_unpacklo_epi16(a.storage(),b.storage());
wide<T,fixed<8>> v01 = _mm_unpacklo_epi16(v0.storage(),v1.storage());
if constexpr(N::value == 2 )
{
auto[l,h] = v01.slice(lower_).slice();
Expand All @@ -101,11 +101,11 @@ namespace eve::detail
{
if constexpr(N::value == 16)
{
return kumi::make_tuple(type(_mm_unpacklo_epi8(a,b)), type(_mm_unpackhi_epi8(a,b)));
return kumi::make_tuple(type(_mm_unpacklo_epi8(v0,v1)), type(_mm_unpackhi_epi8(v0,v1)));
}
else
{
wide<T,fixed<16>> v01 = _mm_unpacklo_epi8(a.storage(),b.storage());
wide<T,fixed<16>> v01 = _mm_unpacklo_epi8(v0.storage(),v1.storage());

if constexpr(N::value == 2)
{
Expand All @@ -126,38 +126,38 @@ namespace eve::detail
}
else if constexpr( match(c,category::int8x16, category::uint8x16) )
{
if constexpr (N() == 16) return kumi::make_tuple(type(_mm_unpacklo_epi8(a, b)), type(_mm_unpackhi_epi8(a, b)));
if constexpr (N() == 16) return kumi::make_tuple(type(_mm_unpacklo_epi8(v0, v1)), type(_mm_unpackhi_epi8(v0, v1)));
else
{
type combined = _mm_unpacklo_epi8(a, b);
type combined = _mm_unpacklo_epi8(v0, v1);
auto [lo, hi] = combined.slice();
return kumi::make_tuple(eve::bit_cast(lo, eve::as{a}), eve::bit_cast(lo, eve::as{b}));
return kumi::make_tuple(eve::bit_cast(lo, eve::as(v0)), eve::bit_cast(lo, eve::as(v1)));
}
}
else if constexpr ( match(c,category::int8x32, category::uint8x32,
category::int16x16, category::uint16x16)
&& current_api >= avx2 )
{
type ul_lanes = sizeof(T) == 2 ? _mm256_unpacklo_epi16(a, b) : _mm256_unpacklo_epi8(a, b);
type uh_lanes = sizeof(T) == 2 ? _mm256_unpackhi_epi16(a, b) : _mm256_unpackhi_epi8(a, b);
type ul_lanes = sizeof(T) == 2 ? _mm256_unpacklo_epi16(v0, v1) : _mm256_unpacklo_epi8(v0, v1);
type uh_lanes = sizeof(T) == 2 ? _mm256_unpackhi_epi16(v0, v1) : _mm256_unpackhi_epi8(v0, v1);
type ul = _mm256_permute2f128_si256(ul_lanes, uh_lanes, 0x20);
type uh = _mm256_permute2f128_si256(ul_lanes, uh_lanes, 0x31);
return kumi::make_tuple(ul, uh);
}
else if constexpr ( match(c, category::int8x32 , category::uint8x32,
category::int16x16, category::uint16x16) )
{
auto [a0, a1] = a.slice();
auto [b0, b1] = b.slice();
auto [a0, a1] = v0.slice();
auto [b0, b1] = v1.slice();
auto [ab00, ab01] = interleave(a0, b0);
auto [ab10, ab11] = interleave(a1, b1);
return kumi::make_tuple(type(ab00, ab01), type(ab10, ab11));
}
else if constexpr ( match(c, category::int8x64 , category::uint8x64,
category::int16x32, category::uint16x32) )
{
type ul_lanes = sizeof(T) == 2 ? _mm512_unpacklo_epi16(a, b) : _mm512_unpacklo_epi8(a, b);
type uh_lanes = sizeof(T) == 2 ? _mm512_unpackhi_epi16(a, b) : _mm512_unpackhi_epi8(a, b);
type ul_lanes = sizeof(T) == 2 ? _mm512_unpacklo_epi16(v0, v1) : _mm512_unpacklo_epi8(v0, v1);
type uh_lanes = sizeof(T) == 2 ? _mm512_unpackhi_epi16(v0, v1) : _mm512_unpackhi_epi8(v0, v1);

// Can't use shuffle_i32x4, only applies within lanes
using idx_t = typename type::template rebind<std::uint64_t, fixed<8>>;
Expand All @@ -170,14 +170,14 @@ namespace eve::detail
}
else
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1);
}
}

template<callable_options O, scalar_value T, typename N>
EVE_FORCEINLINE auto interleave_( EVE_REQUIRES(sse2_)
, O const& o
, logical<wide<T,N>> a, logical<wide<T,N>> b
, logical<wide<T,N>> v0, logical<wide<T,N>> v1
) noexcept
requires (N::value > 1) && x86_abi<abi_t<T,N>>
{
Expand All @@ -198,8 +198,8 @@ namespace eve::detail
else return _pdep_u64(v,0xAAAAAAAAAAAAAAAAULL);
};

auto x = a.storage().value;
auto y = b.storage().value;
auto x = v0.storage().value;
auto y = v1.storage().value;

auto lx = deposit_low (x);
auto ly = deposit_high(y);
Expand All @@ -214,7 +214,7 @@ namespace eve::detail
}
else
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1);
}
}

Expand All @@ -224,48 +224,48 @@ namespace eve::detail
template<callable_options O, scalar_value T, typename N>
EVE_FORCEINLINE auto interleave_( EVE_REQUIRES(sse2_)
, O const& o
, wide<T,N> a, wide<T,N> b, wide<T,N> c
, wide<T,N> v0, wide<T,N> v1, wide<T,N> v2
) noexcept
requires (N::value > 1) && x86_abi<abi_t<T,N>>
{
using type = wide<T,N>;
constexpr auto cat = categorize<type>();
constexpr auto c = categorize<type>();

if constexpr( cat == category::float32x4 )
if constexpr( c == category::float32x4 )
{
if constexpr(N::value == 2)
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b, c);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1, v2);
}
else if constexpr(N::value == 4)
{
type xy = _mm_unpacklo_ps(a,b);
type xz = _mm_shuffle_ps(a,c,197);
type xy = _mm_unpacklo_ps(v0,v1);
type xz = _mm_shuffle_ps(v0,v2,197);

xy = _mm_shuffle_ps(xy,xz,36);
xz = _mm_shuffle_ps(c,b,17);
xz = _mm_shuffle_ps(v2,v1,17);

auto dy = eve::bit_cast(b, as<wide<double,fixed<2>>>{});
dy = _mm_unpackhi_pd(dy,eve::bit_cast(a, as<wide<double,fixed<2>>>{}));
auto yyx = eve::bit_cast(dy, as<type>{});
auto dy = eve::bit_cast(v1, eve::as<wide<double,fixed<2>>>());
dy = _mm_unpackhi_pd(dy,eve::bit_cast(v0, eve::as<wide<double,fixed<2>>>()));
auto yyx = eve::bit_cast(dy, eve::as<type>());

xz = _mm_shuffle_ps(xz, yyx, 34);
a = _mm_shuffle_ps(a, c, 35);
c = _mm_shuffle_ps(c, b,255);
a = _mm_shuffle_ps(a, c, 34);
xz = _mm_shuffle_ps(xz,yyx, 34);
v0 = _mm_shuffle_ps(v0, v2, 35);
v2 = _mm_shuffle_ps(v2, v1,255);
v0 = _mm_shuffle_ps(v0, v2, 34);

return kumi::make_tuple(xy, xz, a);
return kumi::make_tuple(xy,xz,v0);
}
}
else if constexpr( match(cat,category::int32x4, category::uint32x4) )
else if constexpr( match(c,category::int32x4, category::uint32x4) )
{
using ftype = as<wide< as_floating_point_t<T>,N>>;
auto that = interleave(bit_cast(a,ftype()),bit_cast(b,ftype()),bit_cast(c,ftype()));
return kumi::map( [](auto m){ return bit_cast(m,as<type>{}); }, that);
auto that = interleave(bit_cast(v0,ftype()),bit_cast(v1,ftype()),bit_cast(v2,ftype()));
return kumi::map( [](auto m){ return bit_cast(m,as<type>()); }, that);
}
else
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b, c);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1, v2);
}
}

Expand All @@ -275,41 +275,41 @@ namespace eve::detail
template<callable_options O, scalar_value T, typename N>
EVE_FORCEINLINE auto interleave_( EVE_REQUIRES(sse2_)
, O const& o
, wide<T,N> a, wide<T,N> b, wide<T,N> c, wide<T,N> d
, wide<T,N> v0, wide<T,N> v1, wide<T,N> v2, wide<T,N> v3
) noexcept
requires (N::value > 1) && x86_abi<abi_t<T,N>>
{
using type = wide<T,N>;
constexpr auto cat = categorize<type>();
constexpr auto c = categorize<type>();

if constexpr( cat == category::float32x4 )
if constexpr( c == category::float32x4 )
{
if constexpr(N::value == 2)
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>, wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b, c, d);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1, v2, v3);
}
else if constexpr(N::value == 4)
{
auto[t0,t1] = interleave(a,c);
auto[t2,t3] = interleave(b,d);
auto[t0,t1] = interleave(v0,v2);
auto[t2,t3] = interleave(v1,v3);

auto[r0,r1] = interleave(t0,t2);
auto[r2,r3] = interleave(t1,t3);

return kumi::tuple{r0,r1,r2,r3};
}
}
else if constexpr( match(cat, category::int32x4, category::uint32x4) )
else if constexpr( match(c,category::int32x4, category::uint32x4) )
{
using ftype = as<wide< as_floating_point_t<T>,N>>;
auto that = interleave( bit_cast(a, ftype()),bit_cast(b, ftype())
, bit_cast(c, ftype()),bit_cast(d, ftype())
auto that = interleave( bit_cast(v0,ftype()),bit_cast(v1,ftype())
, bit_cast(v2,ftype()),bit_cast(v3,ftype())
);
return kumi::map( [](auto m){ return bit_cast(m,as<type>{}); }, that);
return kumi::map( [](auto m){ return bit_cast(m,as<type>()); }, that);
}
else
{
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>, wide<T, N>, wide<T, N>>>{}, cpu_{}, o, a, b, c, d);
return interleave.behavior(as<kumi::tuple<wide<T, N>, wide<T, N>>>{}, cpu_{}, o, v0, v1, v2, v3);
}
}
}

0 comments on commit ca0d1d5

Please sign in to comment.