Skip to content

Commit

Permalink
More slide implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
DenisYaroshevskiy authored Feb 20, 2024
1 parent baa0924 commit 2d3f83c
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 34 deletions.
57 changes: 44 additions & 13 deletions include/eve/detail/shuffle_v2/idxm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,26 +246,30 @@ fix_indexes_to_fundamental(const std::array<std::ptrdiff_t, PatternSize>& p,
}

constexpr bool
shuffle_within_halves(std::span<const std::ptrdiff_t> idxs)
shuffle_within_n(std::span<const std::ptrdiff_t> idxs, std::ptrdiff_t n)
{
const std::ptrdiff_t ssize = std::ssize(idxs);
const std::ptrdiff_t half = ssize / 2;

if( ssize % 2 ) return false;

for( std::ptrdiff_t i = 0; i != half; ++i )
{
if( idxs[i] >= half ) return false;
}

for( std::ptrdiff_t i = half; i != ssize; ++i )
{
if( 0 <= idxs[i] && idxs[i] < half ) return false;
if (ssize % n) return false;

for (std::ptrdiff_t part = 0; part != ssize; part += n) {
std::ptrdiff_t ub = part + n;
for (std::ptrdiff_t i = part; i != ub; ++i) {
std::ptrdiff_t x = idxs[i];
if (x < 0) continue;
if (x < part) return false;
if (x >= ub) return false;
}
}

return true;
}

// Returns true when the indexes can be split in two equal halves such that
// every non-negative index stays inside the half it is located in
// (delegates the per-group check to `shuffle_within_n` with n == size / 2).
//
// Edge cases are handled here because `shuffle_within_n` computes `ssize % n`
// and must never be called with n == 0:
//   * odd number of elements - there are no two equal halves => false
//   * empty input            - nothing can cross a half      => true
constexpr bool
shuffle_within_halves(std::span<const std::ptrdiff_t> idxs)
{
  const std::ptrdiff_t ssize = std::ssize(idxs);

  if( ssize % 2 ) return false;  // also covers ssize == 1, where ssize / 2 == 0
  if( ssize == 0 ) return true;  // avoids `0 % 0` inside shuffle_within_n

  return shuffle_within_n(idxs, ssize / 2);
}

template<std::size_t N>
constexpr auto
shuffle_halves_independetly(const std::array<std::ptrdiff_t, N>& p)
Expand Down Expand Up @@ -849,6 +853,33 @@ is_slide_right(std::span<const std::ptrdiff_t> idxs)
return (m - idxs.data()) - *m;
}

// Tries to interpret `idxs` as a left slide over two concatenated registers:
// the leading part has to be in order starting from `slide`, the trailing
// `slide` elements have to be in order starting from `reg_groups`
// (both checks are delegated to `is_in_order_from`).
//
// idxs       - pattern to analyse, may contain `we_` entries.
// reg_groups - value the first index of the second part is compared against.
//              NOTE(review): presumably the number of groups in one register,
//              i.e. the index of the first group of the second register.
//
// Returns the slide amount, or std::nullopt when the input is empty, when the
// deduced slide is longer than the pattern, or when either part is not in order.
// When the last non-`we_` index is below `reg_groups` (or there is no such
// index at all) the slide is taken to be 0.
constexpr std::optional<std::ptrdiff_t>
is_slide_left_2(std::span<const std::ptrdiff_t> idxs, std::ptrdiff_t reg_groups)
{
  if( idxs.empty() ) return std::nullopt;

  const auto *f = idxs.data();
  const auto *l = idxs.data() + idxs.size();

  // One past the last element that is not `we_`.
  // Bounded by `f` so that an all-`we_` pattern never reads before the buffer.
  const auto *last = l;
  while( last != f && *(last - 1) == we_ ) --last;

  std::ptrdiff_t slide = 0;

  if( last != f && *(last - 1) >= reg_groups )
  {
    // The anchor is the last meaningful index and it belongs to the second part:
    // its distance to the end plus its offset past `reg_groups` gives the slide.
    const auto *anchor = last - 1;
    slide              = (l - anchor) + (*anchor - reg_groups);
  }

  // A slide longer than the pattern would place `start2` before `f`.
  if( slide > l - f ) return std::nullopt;

  const auto *start2 = l - slide;

  if( !is_in_order_from({start2, l}, reg_groups) ) return std::nullopt;
  if( !is_in_order_from({f, start2}, slide) ) return std::nullopt;
  return slide;
}

template<std::size_t N>
constexpr std::optional<std::array<std::ptrdiff_t, N>>
slide_as_slide2_with_0(std::span<const std::ptrdiff_t, N> idxs)
Expand Down
1 change: 1 addition & 0 deletions include/eve/detail/shuffle_v2/native_shuffle_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ struct expanded_pattern_t : pattern_t<I...>
static constexpr std::ptrdiff_t e_t_size = (int)sizeof(eve::element_type_t<T>);
static constexpr std::ptrdiff_t g_size = e_t_size * G;
static constexpr std::ptrdiff_t reg_size = T::size() * e_t_size;
static constexpr std::ptrdiff_t reg_groups = T::size() / G;
static constexpr std::ptrdiff_t out_reg_size = idxs.size() * g_size;

static constexpr bool has_zeroes = idxm::has_zeroes(idxs);
Expand Down
52 changes: 41 additions & 11 deletions include/eve/detail/shuffle_v2/simd/x86/idxm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ x86_blend_immediate_mask(std::span<const std::ptrdiff_t> idxs, std::ptrdiff_t g)
}

template<std::ptrdiff_t G, std::size_t N>
constexpr auto
x86_pshuvb_pattern(const std::array<std::ptrdiff_t, N>& idxs);
constexpr auto x86_pshuvb_pattern(const std::array<std::ptrdiff_t, N>& idxs);

template<std::ptrdiff_t G, std::size_t N>
constexpr auto
Expand All @@ -89,6 +88,8 @@ x86_pshuvb_pattern(std::span<const std::ptrdiff_t, N> idxs)
using arr_t = std::array<std::ptrdiff_t, N>;
using res_t = std::optional<arr_t>;

if( !shuffle_within_n(idxs, 16) ) return res_t {};

arr_t res = {};
for( std::size_t i = 0; i != N; ++i )
{
Expand Down Expand Up @@ -116,20 +117,49 @@ x86_pshuvb_pattern(const std::array<std::ptrdiff_t, N>& idxs)

template<std::size_t N>
constexpr std::optional<int>
mm512_shuffle_i64x2_idx(std::array<std::ptrdiff_t, N> idxs) {
if constexpr (N == 2) return mm512_shuffle_i64x2_idx(expand_group<2>(idxs));
else if constexpr ( N > 4 ) {
mm512_shuffle_i64x2_idx(std::array<std::ptrdiff_t, N> idxs)
{
if constexpr( N == 2 ) return mm512_shuffle_i64x2_idx(expand_group<2>(idxs));
else if constexpr( N > 4 )
{
auto upscaled = upscale_pattern(idxs);
if (upscaled) return mm512_shuffle_i64x2_idx(*upscaled);
if( upscaled ) return mm512_shuffle_i64x2_idx(*upscaled);
else return std::nullopt;
} else {
if (idxs[0] >= 4 || idxs[1] >= 4) return std::nullopt;
if (idxs[2] < 4 || idxs[3] < 4) return std::nullopt;
idxs[2]-=4;
idxs[3]-=4;
}
else
{
if( idxs[0] >= 4 || idxs[1] >= 4 ) return std::nullopt;
if( idxs[2] < 4 || idxs[3] < 4 ) return std::nullopt;
idxs[2] -= 4;
idxs[3] -= 4;
return idxs[0] | (idxs[1] << 2) | (idxs[2] << 4) | (idxs[3] << 6);
}
}

// Builds an N element index pattern, processed in runs of `16 / g_size`
// consecutive positions.
//
// Inside every run the first `run - slide` positions map to their own index
// plus `slide`; the remaining positions map to `index + N - (run - slide)`.
// NOTE(review): results >= N presumably select from a second operand of a two
// register shuffle - confirm against the caller.
//
// g_size - size of one group; assumes it divides 16 and that N is a multiple
//          of `16 / g_size` (not checked here).
// slide  - how many positions to slide by inside each run.
template<std::size_t N>
constexpr auto
slide_2_left_in_16_pattern(std::ptrdiff_t g_size, std::ptrdiff_t slide)
{
  const auto           total    = static_cast<std::ptrdiff_t>(N);
  const std::ptrdiff_t per_lane = 16 / g_size;
  const std::ptrdiff_t kept     = per_lane - slide;

  std::array<std::ptrdiff_t, N> out = {};

  for( std::ptrdiff_t lane_start = 0; lane_start != total; lane_start += per_lane )
  {
    for( std::ptrdiff_t offset = 0; offset != per_lane; ++offset )
    {
      const std::ptrdiff_t pos = lane_start + offset;
      out[pos]                 = (offset < kept) ? pos + slide : pos + total - kept;
    }
  }

  return out;
}

template<std::size_t N>
constexpr auto
slide_by_16_then_alignr(const std::array<std::ptrdiff_t, N>& idxs, std::ptrdiff_t g_size)
{
return slide_by_16_then_alignr(std::span<const std::ptrdiff_t, N>(idxs), g_size);
}

}
2 changes: 1 addition & 1 deletion include/eve/detail/shuffle_v2/simd/x86/shuffle_l2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ shuffle_l2_x86_within_128x2_alignr(P, fixed<G>, wide<T, N> x, wide<T, N> y)
else if constexpr( current_api < ssse3 ) return no_matching_shuffle;
else
{
constexpr auto starts_from = idxm::is_in_order(*P::repeated_16);
constexpr auto starts_from = idxm::is_slide_left_2(*P::repeated_16, P::reg_groups);

if constexpr( !starts_from ) return no_matching_shuffle;
else
Expand Down
2 changes: 1 addition & 1 deletion include/eve/detail/shuffle_v2/simd/x86/shuffle_l3.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ shuffle_l3_x86_pshuvb(P, fixed<G>, wide<T, N> x)
else if constexpr( current_api == avx && P::reg_size == 32 ) return no_matching_shuffle;
else
{
constexpr auto pshuvb_pattern = idxm::x86_pshuvb_pattern<G * sizeof(T)>(P::idxs);
constexpr auto pshuvb_pattern = idxm::x86_pshuvb_pattern<P::g_size>(P::idxs);
if constexpr( !pshuvb_pattern ) return no_matching_shuffle;
else
{
Expand Down
44 changes: 36 additions & 8 deletions include/eve/detail/shuffle_v2/simd/x86/shuffle_l4_l5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,34 @@ shuffle_l4_l5_x86_put_u64x2_in_position(P, fixed<G>, wide<T, N> x)
{
constexpr auto p0 = get<0>(*P::shuffle_16in16);
constexpr auto p1 = get<1>(*P::shuffle_16in16);
auto [r0, l0] = shuffle_v2_core(x, eve::lane<G>, idxm::to_pattern<p0>());
auto [r1, l1] = shuffle_v2_core(r0, eve::lane<G>, idxm::to_pattern<p1>());
auto [r0, l0] = shuffle_v2_core(x, eve::lane<G>, idxm::to_pattern<p0>());
auto [r1, l1] = shuffle_v2_core(r0, eve::lane<G>, idxm::to_pattern<p1>());

return kumi::tuple {r1, idxm::add_shuffle_levels(l0, l1)};
}
}

// Attempts to implement a left slide as two level 2 shuffles.
//
// Returns a kumi::tuple of {shuffled register, index<4>} on a match and
// {no_matching_shuffle, index<-1>} otherwise (api below avx2, or
// idxm::is_slide_left(P::idxs) yields no value).
//
// NOTE(review): the name suggests slides shorter than 16 bytes, but the slide
// amount is not range checked here - confirm earlier shuffles cover the rest.
template<typename P, typename T, typename N, std::ptrdiff_t G>
EVE_FORCEINLINE auto
shuffle_l4_l5_x86_slide_less_than_16(P, fixed<G>, wide<T, N> x)
{
constexpr auto no = kumi::tuple {no_matching_shuffle, eve::index<-1>};
// Couldn't figure out how to generalize this well:
//  * only slide left is handled for now
//  * no masking of 0s on avx512
if constexpr( current_api < avx2 ) return no;
else if constexpr ( constexpr auto slide = idxm::is_slide_left(P::idxs) )
{
static_assert(G == 1, "verifying assumptions");
// Per 16 byte run pattern combining `x` and `y`, see slide_2_left_in_16_pattern.
constexpr auto alignr_p = idxm::slide_2_left_in_16_pattern<N::value>(P::g_size, *slide);

// Second operand: x shuffled in 16 byte groups with pattern<1, na_>.
// NOTE(review): presumably moves the upper 16 bytes down and zeroes the rest - confirm.
wide<T, N> y = shuffle_l<2>(x, lane<16 / sizeof(T)>, pattern<1, na_>);

// Two level 2 shuffles are reported as level 4.
return kumi::tuple{shuffle_l<2>(x, y, idxm::to_pattern<alignr_p>()), index<4>};
}
else return no;
}

template<typename P, arithmetic_scalar_value T, typename N, std::ptrdiff_t G>
EVE_FORCEINLINE auto
shuffle_l4_l5_(EVE_SUPPORTS(sse2_), P p, fixed<G> g, wide<T, N> x)
Expand All @@ -65,7 +86,8 @@ requires(P::out_reg_size == P::reg_size)
{
return r;
}
else if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); matched_shuffle<decltype(get<0>(r))> )
else if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x);
matched_shuffle<decltype(get<0>(r))> )
{
return r;
}
Expand All @@ -74,6 +96,11 @@ requires(P::out_reg_size == P::reg_size)
{
return r;
}
else if constexpr( auto r = shuffle_l4_l5_x86_slide_less_than_16(p, g, x);
matched_shuffle<decltype(get<0>(r))> )
{
return r;
}
else return kumi::tuple {no_matching_shuffle, eve::index<-1>};
}

Expand All @@ -82,11 +109,12 @@ EVE_FORCEINLINE auto
shuffle_l4_l5_(EVE_SUPPORTS(avx512_), P p, fixed<G> g, logical<wide<T, N>> x)
requires(P::out_reg_size == P::reg_size)
{
if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x); matched_shuffle<decltype(get<0>(r))> )
{
return r;
}
else return kumi::tuple {no_matching_shuffle, eve::index<-1>};
if constexpr( auto r = shuffle_l4_broadcast_lane_set_get(p, g, x);
matched_shuffle<decltype(get<0>(r))> )
{
return r;
}
else return kumi::tuple {no_matching_shuffle, eve::index<-1>};
}

}
45 changes: 45 additions & 0 deletions test/unit/api/regular/shuffle_v2/idxm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ TTS_CASE("shuffle_within_halves")
auto in = to_idxs(_in);
bool actual = eve::detail::idxm::shuffle_within_halves(in);
TTS_EQUAL(expected, actual) << tts::as_string(in);
actual = eve::detail::idxm::shuffle_within_n(in, std::ssize(in) / 2);
TTS_EQUAL(expected, actual) << tts::as_string(in);
};

test(std::array {0, 1, we_, we_}, true);
Expand All @@ -166,6 +168,23 @@ TTS_CASE("shuffle_within_halves")
test(std::array {3, we_, we_, we_}, false);
};

TTS_CASE("shuffle_within_n")
{
  // Runs shuffle_within_n on `raw` with groups of `n` elements
  // and compares the answer with `expected`.
  auto check = [](auto raw, std::ptrdiff_t n, bool expected)
  {
    auto idxs = to_idxs(raw);
    bool got  = eve::detail::idxm::shuffle_within_n(idxs, n);
    TTS_EQUAL(expected, got) << tts::as_string(idxs);
  };

  // every index stays inside its own pair
  check(std::array {0, 1, 3, 2, 5, 4, 7, 6}, 2, true);
  // we_ / na_ entries are not restricted
  check(std::array {0, we_, 3, na_, 5, na_, 7, 6}, 2, true);
  // index escapes its pair upwards
  check(std::array {0, 3, 1, 2, 5, 4, 7, 6}, 2, false);
  // index escapes its pair downwards
  check(std::array {0, 1, 3, 2, 5, 4, 7, 0}, 2, false);
  check(std::array {0, 1, 3, 2, 5, 4, 7, na_}, 2, true);
  // groups of 4: 4 does not belong to the first group
  check(std::array {1, 2, 3, 4, 5, 6, 7, na_}, 4, false);
};

TTS_CASE("shuffle_halves_independetly")
{
auto test = [](auto _in, auto _expected_lo, auto _expected_hi)
Expand Down Expand Up @@ -780,6 +799,32 @@ TTS_CASE("is_slide_right")
test(std::array {na_, na_, 1, 2}, -1);
};

TTS_CASE("is_slide_left2")
{
  // Runs is_slide_left_2 on `raw`; a missing result is reported as -1.
  auto check = [](auto raw, int reg_groups, int expected)
  {
    auto idxs = to_idxs(raw);
    auto got  = eve::detail::idxm::is_slide_left_2(idxs, reg_groups).value_or(-1);
    TTS_EQUAL(expected, got) << tts::as_string(idxs);
  };

  // Not a slide over two registers at all, the exact result does not matter.
  check(std::array {0, 1, 2, 3}, 4, 0);

  // slide by 1
  check(std::array {1, 2, 3, 4}, 4, 1);
  check(std::array {1, 2, 3, 16}, 16, 1);
  check(std::array {1, we_, 3, 4}, 4, 1);
  check(std::array {1, we_, 3, 16}, 16, 1);

  // slide by 2
  check(std::array {2, 3, 4, we_}, 4, 2);
  check(std::array {2, 3, 16, we_}, 16, 2);
  // {2, 3, we_, we_} is deliberately not covered: it is not a two register slide.
  check(std::array {2, 3, we_, 17}, 16, 2);

  // bigger slides
  check(std::array {3, 4, 5, 6}, 4, 3);
  check(std::array {4, 5, 6, 7}, 4, 4);

  // not a slide
  check(std::array {1, 2, na_, 4}, 4, -1);
  check(std::array {1, 2, 3, na_}, 4, -1);
  check(std::array {2, we_, 3, 16}, 16, -1);
};


TTS_CASE("slide_as_slide2_with_0")
{
auto yes_test = [](auto _in, auto _expected)
Expand Down

0 comments on commit 2d3f83c

Please sign in to comment.