diff --git a/include/eve/module/core/regular/impl/simd/x86/add.hpp b/include/eve/module/core/regular/impl/simd/x86/add.hpp index b267acdf5e..fa480cecf4 100644 --- a/include/eve/module/core/regular/impl/simd/x86/add.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/add.hpp @@ -126,42 +126,36 @@ namespace eve::detail if constexpr(floating_value &&( O::contains(lower) || O::contains(upper)) && !O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( c == category::float64x8 ) return _mm512_mask_add_round_pd (src, m, v, w, dir); + else if constexpr ( c == category::float32x16 ) return _mm512_mask_add_round_ps (src, m, v, w, dir); + else if constexpr ( c == category::float64x4 || c == category::float64x2 || + c == category::float32x8 || c == category::float32x4 || c == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( c == category::float64x8 ) return _mm512_mask_add_round_pd (src, m, v, w, dir); - else if constexpr ( c == category::float32x16 ) return _mm512_mask_add_round_ps (src, m, v, w, dir); - else if constexpr ( c == category::float64x4 || c == category::float64x2 || - c == category::float32x8 || c == category::float32x4 || c == category::float32x2) - { - auto vv = eve::combine(v, w); - auto ww = eve::combine(w, v); - auto vvpww = add[opts.drop(condition_key)](vv, ww); - auto s = slice(vvpww, eve::upper_); - return if_else(cx,s,src); - } - else return add.behavior(cpu_{}, opts, v, w); + auto vv = eve::combine(v, w); + auto ww = eve::combine(w, v); + auto vvpww = add[opts.drop(condition_key)](vv, ww); + auto s = slice(vvpww, eve::upper_); + return if_else(cx,s,src); } - else return add.behavior(cpu_{}, opts, v, w); + else return add.behavior(cpu_{}, opts, v, w); } else if constexpr(O::contains(saturated)) { - constexpr auto sup_avx2 = current_api >= avx2; - - if constexpr( floating_value ) return add[cx](v, w); - else if constexpr( c == category::int16x32 ) return _mm512_mask_adds_epi16(src, m, v, w); - else if constexpr( c == category::uint16x32 ) return _mm512_mask_adds_epu16(src, m, v, w); - else if constexpr( c == category::int8x64 ) return _mm512_mask_adds_epi8(src, m, v, w); - else if constexpr( c == category::uint8x64 ) return _mm512_mask_adds_epu8(src, m, v, w); - else if constexpr( sup_avx2 && c == category::int16x16 ) return _mm256_mask_adds_epi16(src, m, v, w); - else if constexpr( sup_avx2 && c == category::uint16x16 ) return _mm256_mask_adds_epu16(src, m, v, w); - else if constexpr( sup_avx2 && c == category::int8x32 ) return _mm256_mask_adds_epi8(src, m, v, w); - else if constexpr( sup_avx2 && c == category::uint8x32 ) return _mm256_mask_adds_epu8(src, m, v, w); - else if constexpr( c == category::int16x8 ) return _mm_mask_adds_epi16(src, m, v, w); - else if constexpr( c == category::uint16x8 ) return _mm_mask_adds_epu16(src, m, v, w); - else if constexpr( c == category::int8x16 ) return _mm_mask_adds_epi8(src, m, v, w); - else if constexpr( c == category::uint8x16 ) return _mm_mask_adds_epu8(src, m, v, w); - else return add.behavior(cpu_{}, opts, v, w); + if constexpr( floating_value ) return add[cx](v, w); + else if constexpr( c == category::int16x32 ) return _mm512_mask_adds_epi16(src, m, v, w); + else if constexpr( c == category::uint16x32 ) return _mm512_mask_adds_epu16(src, m, v, w); + else if constexpr( c == category::int8x64 ) return _mm512_mask_adds_epi8(src, m, v, w); + else if constexpr( c == category::uint8x64 ) return _mm512_mask_adds_epu8(src, m, v, w); + else if constexpr( c == category::int16x16 ) return _mm256_mask_adds_epi16(src, m, v, w); + else if constexpr( c == category::uint16x16 ) return _mm256_mask_adds_epu16(src, m, v, w); + else if constexpr( c == category::int8x32 ) return _mm256_mask_adds_epi8(src, m, v, w); + else if constexpr( c == category::uint8x32 ) return _mm256_mask_adds_epu8(src, m, v, w); + else if constexpr( c == category::int16x8 ) return _mm_mask_adds_epi16(src, m, v, w); + else if constexpr( c == category::uint16x8 ) return _mm_mask_adds_epu16(src, m, v, w); + else if constexpr( c == category::int8x16 ) return _mm_mask_adds_epi8(src, m, v, w); + else if constexpr( c == category::uint8x16 ) return _mm_mask_adds_epu8(src, m, v, w); + else return add.behavior(cpu_{}, opts, v, w); } else { diff --git a/include/eve/module/core/regular/impl/simd/x86/countl_zero.hpp b/include/eve/module/core/regular/impl/simd/x86/countl_zero.hpp index 75bf7a2621..f1e4f499d3 100644 --- a/include/eve/module/core/regular/impl/simd/x86/countl_zero.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/countl_zero.hpp @@ -38,7 +38,7 @@ namespace eve::detail else if constexpr( c == category::uint64x2 ) return r_t(_mm_lzcnt_epi64(a0)); else if constexpr( c == category::uint32x4 ) return r_t(_mm_lzcnt_epi32(a0)); } - else if constexpr( current_api >= sse2 ) + else { //Inspired from: https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep using ri_t = wide; @@ -55,8 +55,6 @@ namespace eve::detail } else return countl_zero.behavior(cpu_{}, opts, a0); } - else - return countl_zero.behavior(cpu_{}, opts, a0); } else { diff --git a/include/eve/module/core/regular/impl/simd/x86/div.hpp b/include/eve/module/core/regular/impl/simd/x86/div.hpp index 2cf2db193e..382904c602 100644 --- a/include/eve/module/core/regular/impl/simd/x86/div.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/div.hpp @@ -96,22 +96,19 @@ namespace eve::detail } else if constexpr (floating_value && !O::contains(strict) && (O::contains(lower) || O::contains(upper))) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + + if constexpr ( c == category::float64x8 ) return _mm512_add_round_pd (v, w, dir); + else if constexpr ( c == category::float32x16 ) return _mm512_add_round_ps (v, w, dir); + else if constexpr ( c == category::float64x4 || c == category::float64x2 || + c == category::float32x8 || c == category::float32x4 || c == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( c == category::float64x8 ) return _mm512_add_round_pd (v, w, dir); - else if constexpr ( c == category::float32x16 ) return _mm512_add_round_ps (v, w, dir); - else if constexpr ( c == category::float64x4 || c == category::float64x2 || - c == category::float32x8 || c == category::float32x4 || c == category::float32x2) - { - auto vv = combine(v, v); - auto ww = combine(w, w); - auto vvpww = div[o](vv, ww); - auto s = slice(vvpww, eve::upper_); - return if_else(cx,s,src); - } + auto vv = combine(v, v); + auto ww = combine(w, w); + auto vvpww = div[o](vv, ww); + auto s = slice(vvpww, eve::upper_); + return if_else(cx,s,src); } - return div.behavior(cpu_{}, o, v, w); } else if constexpr (O::contains(toward_zero) || O::contains(upward) || O::contains(downward) || O::contains(to_nearest)) diff --git a/include/eve/module/core/regular/impl/simd/x86/fma.hpp b/include/eve/module/core/regular/impl/simd/x86/fma.hpp index 43e0c45602..584fac755c 100644 --- a/include/eve/module/core/regular/impl/simd/x86/fma.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/fma.hpp @@ -103,26 +103,22 @@ namespace eve::detail { if constexpr(!O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( cx == category::float64x8 ) return _mm512_mask_fmadd_round_pd (a, m, b, c, dir); + else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fmadd_round_ps (a, m, b, c, dir); + else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || + cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( cx == category::float64x8 ) return _mm512_mask_fmadd_round_pd (a, m, b, c, dir); - else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fmadd_round_ps (a, m, b, c, dir); - else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || - cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) - { - auto aa = eve::combine(a, a); - auto bb = eve::combine(b, b); - auto cc = eve::combine(c, c); - auto aabbcc = fma[opts.drop(condition_key)](aa, bb, cc); - auto s = slice(aabbcc, eve::upper_); - return if_else(mask,s,src); - } - else return fma.behavior(cpu_{}, opts, a, b, c); + auto aa = eve::combine(a, a); + auto bb = eve::combine(b, b); + auto cc = eve::combine(c, c); + auto aabbcc = fma[opts.drop(condition_key)](aa, bb, cc); + auto s = slice(aabbcc, eve::upper_); + return if_else(mask,s,src); } - else return fma.behavior(cpu_{}, opts, a, b, c); + else return fma.behavior(cpu_{}, opts, a, b, c); } - else return fma.behavior(cpu_{}, opts, a, b, c); + else return fma.behavior(cpu_{}, opts, a, b, c); } else if constexpr( cx == category::float32x16 ) return _mm512_mask_fmadd_ps(a, m, b, c); else if constexpr( cx == category::float64x8 ) return _mm512_mask_fmadd_pd(a, m, b, c); diff --git a/include/eve/module/core/regular/impl/simd/x86/fms.hpp b/include/eve/module/core/regular/impl/simd/x86/fms.hpp index 199814bc8e..419d269f87 100644 --- a/include/eve/module/core/regular/impl/simd/x86/fms.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/fms.hpp @@ -107,24 +107,20 @@ namespace eve::detail { if constexpr(!O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( cx == category::float64x8 ) return _mm512_mask_fmsub_round_pd (v, m, w, x, dir); + else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fmsub_round_ps (v, m, w, x, dir); + else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || + cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( cx == category::float64x8 ) return _mm512_mask_fmsub_round_pd (v, m, w, x, dir); - else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fmsub_round_ps (v, m, w, x, dir); - else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || - cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) - { - auto aa = eve::combine(v, v); - auto bb = eve::combine(w, w); - auto cc = eve::combine(x, x); - auto aabbcc = fms[opts.drop(condition_key)](aa, bb, cc); - auto s = slice(aabbcc, eve::upper_); - return if_else(mask,s,src); - } - else return fms.behavior(cpu_{}, opts, v, w, x); + auto aa = eve::combine(v, v); + auto bb = eve::combine(w, w); + auto cc = eve::combine(x, x); + auto aabbcc = fms[opts.drop(condition_key)](aa, bb, cc); + auto s = slice(aabbcc, eve::upper_); + return if_else(mask,s,src); } - else return fms.behavior(cpu_{}, opts, v, w, x); + else return fms.behavior(cpu_{}, opts, v, w, x); } else return fms.behavior(cpu_{}, opts, v, w, x); } diff --git a/include/eve/module/core/regular/impl/simd/x86/fnma.hpp b/include/eve/module/core/regular/impl/simd/x86/fnma.hpp index 91e9785efb..89bf13374a 100644 --- a/include/eve/module/core/regular/impl/simd/x86/fnma.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/fnma.hpp @@ -108,26 +108,22 @@ namespace eve::detail { if constexpr(!O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( cx == category::float64x8 ) return _mm512_mask_fnmadd_round_pd (a, m, b, c, dir); + else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fnmadd_round_ps (a, m, b, c, dir); + else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || + cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( cx == category::float64x8 ) return _mm512_mask_fnmadd_round_pd (a, m, b, c, dir); - else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fnmadd_round_ps (a, m, b, c, dir); - else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || - cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) - { - auto aa = eve::combine(a, a); - auto bb = eve::combine(b, b); - auto cc = eve::combine(c, c); - auto aabbcc = fnma[opts.drop(condition_key)](aa, bb, cc); - auto s = slice(aabbcc, eve::upper_); - return if_else(mask,s,src); - } - else return fnma.behavior(cpu_{}, opts, a, b, c); + auto aa = eve::combine(a, a); + auto bb = eve::combine(b, b); + auto cc = eve::combine(c, c); + auto aabbcc = fnma[opts.drop(condition_key)](aa, bb, cc); + auto s = slice(aabbcc, eve::upper_); + return if_else(mask,s,src); } - else return fnma.behavior(cpu_{}, opts, a, b, c); + else return fnma.behavior(cpu_{}, opts, a, b, c); } - else return fnma.behavior(cpu_{}, opts, a, b, c); + else return fnma.behavior(cpu_{}, opts, a, b, c); } if ((O::contains(lower) || O::contains(upper))&& floating_value) return if_else(mask, eve::fnma[opts.drop(condition_key)](a, b, c), a); else if constexpr( cx == category::float32x16 ) return _mm512_mask_fnmadd_ps(a, m, b, c); diff --git a/include/eve/module/core/regular/impl/simd/x86/fnms.hpp b/include/eve/module/core/regular/impl/simd/x86/fnms.hpp index 7ba679f138..c03e6bdf00 100644 --- a/include/eve/module/core/regular/impl/simd/x86/fnms.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/fnms.hpp @@ -105,26 +105,22 @@ namespace eve::detail { if constexpr(!O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( cx == category::float64x8 ) return _mm512_mask_fnmsub_round_pd (a, m, b, c, dir); + else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fnmsub_round_ps (a, m, b, c, dir); + else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || + cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( cx == category::float64x8 ) return _mm512_mask_fnmsub_round_pd (a, m, b, c, dir); - else if constexpr ( cx == category::float32x16 ) return _mm512_mask_fnmsub_round_ps (a, m, b, c, dir); - else if constexpr ( cx == category::float64x4 || cx == category::float64x2 || - cx == category::float32x8 || cx == category::float32x4 || cx == category::float32x2) - { - auto aa = eve::combine(a, a); - auto bb = eve::combine(b, b); - auto cc = eve::combine(c, c); - auto aabbcc = fnms[opts.drop(condition_key)](aa, bb, cc); - auto s = slice(aabbcc, eve::upper_); - return if_else(mask,s,src); - } - else return fnms.behavior(cpu_{}, opts, a, b, c); + auto aa = eve::combine(a, a); + auto bb = eve::combine(b, b); + auto cc = eve::combine(c, c); + auto aabbcc = fnms[opts.drop(condition_key)](aa, bb, cc); + auto s = slice(aabbcc, eve::upper_); + return if_else(mask,s,src); } - else return fnms.behavior(cpu_{}, opts, a, b, c); + else return fnms.behavior(cpu_{}, opts, a, b, c); } - else return fnms.behavior(cpu_{}, opts, a, b, c); + else return fnms.behavior(cpu_{}, opts, a, b, c); } if ((O::contains(lower) || O::contains(upper))&& floating_value) return if_else(mask, eve::fnms[opts.drop(condition_key)](a, b, c), a); else if constexpr( cx == category::float32x16 ) return _mm512_mask_fnmsub_ps(a, m, b, c); diff --git a/include/eve/module/core/regular/impl/simd/x86/mul.hpp b/include/eve/module/core/regular/impl/simd/x86/mul.hpp index f28bea9e37..ad0187eb38 100644 --- a/include/eve/module/core/regular/impl/simd/x86/mul.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/mul.hpp @@ -135,23 +135,19 @@ namespace eve::detail if constexpr(floating_value &&( O::contains(lower) || O::contains(upper)) && !O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( c == category::float64x8 ) return _mm512_mask_mul_round_pd (src, m, a, b, dir); + else if constexpr ( c == category::float32x16 ) return _mm512_mask_mul_round_ps (src, m, a, b, dir); + else if constexpr ( c == category::float64x4 || c == category::float64x2 || + c == category::float32x8 || c == category::float32x4 || c == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( c == category::float64x8 ) return _mm512_mask_mul_round_pd (src, m, a, b, dir); - else if constexpr ( c == category::float32x16 ) return _mm512_mask_mul_round_ps (src, m, a, b, dir); - else if constexpr ( c == category::float64x4 || c == category::float64x2 || - c == category::float32x8 || c == category::float32x4 || c == category::float32x2) - { - auto aa = eve::combine(a, a); - auto bb = eve::combine(b, b); - auto aapbb = mul[opts.drop(condition_key)](aa, bb); - auto s = slice(aapbb, eve::upper_); - return if_else(cx,s,src); - } - else return add.behavior(cpu_{}, opts, a, b); + auto aa = eve::combine(a, a); + auto bb = eve::combine(b, b); + auto aapbb = mul[opts.drop(condition_key)](aa, bb); + auto s = slice(aapbb, eve::upper_); + return if_else(cx,s,src); } - else return add.behavior(cpu_{}, opts, a, b); + else return add.behavior(cpu_{}, opts, a, b); } else if constexpr(O::contains(saturated)) { diff --git a/include/eve/module/core/regular/impl/simd/x86/negate.hpp b/include/eve/module/core/regular/impl/simd/x86/negate.hpp index cbe9fc387c..dd909812c3 100644 --- a/include/eve/module/core/regular/impl/simd/x86/negate.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/negate.hpp @@ -21,10 +21,10 @@ namespace eve::detail wide a1) noexcept requires std::same_as, x86_128_> { - if( sizeof(T) == 8 ) return negate.behavior(cpu_{}, opts, a0, a1); - else if( sizeof(T) == 4 ) return _mm_sign_epi32(a0, a1); - else if( sizeof(T) == 2 ) return _mm_sign_epi16(a0, a1); - else if( sizeof(T) == 1 ) return _mm_sign_epi8(a0, a1); + if constexpr ( sizeof(T) == 8 ) return negate.behavior(cpu_{}, opts, a0, a1); + else if constexpr ( sizeof(T) == 4 ) return _mm_sign_epi32(a0, a1); + else if constexpr ( sizeof(T) == 2 ) return _mm_sign_epi16(a0, a1); + else if constexpr ( sizeof(T) == 1 ) return _mm_sign_epi8(a0, a1); } // ----------------------------------------------------------------------------------------------- @@ -36,9 +36,9 @@ namespace eve::detail wide a1) noexcept requires std::same_as, x86_256_> { - if( sizeof(T) == 8 ) return negate.behavior(cpu_{}, opts, a0, a1); - else if( sizeof(T) == 4 ) return _mm256_sign_epi32(a0, a1); - else if( sizeof(T) == 2 ) return _mm256_sign_epi16(a0, a1); - else if( sizeof(T) == 1 ) return _mm256_sign_epi8(a0, a1); + if constexpr ( sizeof(T) == 8 ) return negate.behavior(cpu_{}, opts, a0, a1); + else if constexpr ( sizeof(T) == 4 ) return _mm256_sign_epi32(a0, a1); + else if constexpr ( sizeof(T) == 2 ) return _mm256_sign_epi16(a0, a1); + else if constexpr ( sizeof(T) == 1 ) return _mm256_sign_epi8(a0, a1); } } diff --git a/include/eve/module/core/regular/impl/simd/x86/rec.hpp b/include/eve/module/core/regular/impl/simd/x86/rec.hpp index 7de1c0d438..07023def12 100644 --- a/include/eve/module/core/regular/impl/simd/x86/rec.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/rec.hpp @@ -62,22 +62,24 @@ namespace eve::detail } else if constexpr(O::contains(pedantic) || current_api < avx512) { - if (current_api >= avx512) + if constexpr (current_api >= avx512) { if constexpr( c == category::float32x16) return _mm512_div_ps(one(eve::as(v)), v); else if constexpr( c == category::float64x8 ) return _mm512_div_pd(one(eve::as(v)), v); + else return rec.behavior(cpu_{}, o, v); } - if (current_api >= avx) + else if constexpr (current_api >= avx) { if constexpr( c == category::float32x8 ) return _mm256_div_ps(one(eve::as(v)), v); else if constexpr( c == category::float64x4 ) return _mm256_div_pd(one(eve::as(v)), v); + else return rec.behavior(cpu_{}, o, v); } - if (current_api >= sse2) + else { if constexpr( c == category::float32x4 ) return _mm_div_ps(one(eve::as(v)), v); else if constexpr( c == category::float64x2 ) return _mm_div_pd(one(eve::as(v)), v); + else return rec.behavior(cpu_{}, o, v); } - return rec.behavior(cpu_{}, o, v); } else { diff --git a/include/eve/module/core/regular/impl/simd/x86/sub.hpp b/include/eve/module/core/regular/impl/simd/x86/sub.hpp index 15dd6a0103..c391d2fd37 100644 --- a/include/eve/module/core/regular/impl/simd/x86/sub.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/sub.hpp @@ -134,42 +134,36 @@ namespace eve::detail if constexpr(floating_value &&( O::contains(lower) || O::contains(upper)) && !O::contains(strict)) { - if constexpr(current_api >= avx512) + auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; + if constexpr ( c == category::float64x8 ) return _mm512_mask_sub_round_pd (src, m, v, w, dir); + else if constexpr ( c == category::float32x16 ) return _mm512_mask_sub_round_ps (src, m, v, w, dir); + else if constexpr ( c == category::float64x4 || c == category::float64x2 || + c == category::float32x8 || c == category::float32x4 || c == category::float32x2) { - auto constexpr dir =(O::contains(lower) ? _MM_FROUND_TO_NEG_INF : _MM_FROUND_TO_POS_INF) |_MM_FROUND_NO_EXC; - if constexpr ( c == category::float64x8 ) return _mm512_mask_sub_round_pd (src, m, v, w, dir); - else if constexpr ( c == category::float32x16 ) return _mm512_mask_sub_round_ps (src, m, v, w, dir); - else if constexpr ( c == category::float64x4 || c == category::float64x2 || - c == category::float32x8 || c == category::float32x4 || c == category::float32x2) - { - auto vv = eve::combine(v, w); - auto ww = eve::combine(w, v); - auto vvpww = sub[opts.drop(condition_key)](vv, ww); - auto s = slice(vvpww, eve::upper_); - return if_else(cx,s,src); - } - else return add.behavior(cpu_{}, opts, v, w); + auto vv = eve::combine(v, w); + auto ww = eve::combine(w, v); + auto vvpww = sub[opts.drop(condition_key)](vv, ww); + auto s = slice(vvpww, eve::upper_); + return if_else(cx,s,src); } - else return add.behavior(cpu_{}, opts, v, w); + else return add.behavior(cpu_{}, opts, v, w); } else if constexpr(O::contains(saturated)) { - constexpr auto sup_avx2 = current_api >= avx2; - - if constexpr( floating_value ) return sub[cx](v, w); - else if constexpr( c == category::int16x32 ) return _mm512_mask_subs_epi16(src, m, v, w); - else if constexpr( c == category::uint16x32 ) return _mm512_mask_subs_epu16(src, m, v, w); - else if constexpr( c == category::int8x64 ) return _mm512_mask_subs_epi8(src, m, v, w); - else if constexpr( c == category::uint8x64 ) return _mm512_mask_subs_epu8(src, m, v, w); - else if constexpr( sup_avx2 && c == category::int16x16 ) return _mm256_mask_subs_epi16(src, m, v, w); - else if constexpr( sup_avx2 && c == category::uint16x16 ) return _mm256_mask_subs_epu16(src, m, v, w); - else if constexpr( sup_avx2 && c == category::int8x32 ) return _mm256_mask_subs_epi8(src, m, v, w); - else if constexpr( sup_avx2 && c == category::uint8x32 ) return _mm256_mask_subs_epu8(src, m, v, w); - else if constexpr( c == category::int16x8 ) return _mm_mask_subs_epi16(src, m, v, w); - else if constexpr( c == category::uint16x8 ) return _mm_mask_subs_epu16(src, m, v, w); - else if constexpr( c == category::int8x16 ) return _mm_mask_subs_epi8(src, m, v, w); - else if constexpr( c == category::uint8x16 ) return _mm_mask_subs_epu8(src, m, v, w); - else return sub.behavior(cpu_{}, opts, v, w); + if constexpr( floating_value ) return sub[cx](v, w); + else if constexpr( c == category::int16x32 ) return _mm512_mask_subs_epi16(src, m, v, w); + else if constexpr( c == category::uint16x32 ) return _mm512_mask_subs_epu16(src, m, v, w); + else if constexpr( c == category::int8x64 ) return _mm512_mask_subs_epi8(src, m, v, w); + else if constexpr( c == category::uint8x64 ) return _mm512_mask_subs_epu8(src, m, v, w); + else if constexpr( c == category::int16x16 ) return _mm256_mask_subs_epi16(src, m, v, w); + else if constexpr( c == category::uint16x16 ) return _mm256_mask_subs_epu16(src, m, v, w); + else if constexpr( c == category::int8x32 ) return _mm256_mask_subs_epi8(src, m, v, w); + else if constexpr( c == category::uint8x32 ) return _mm256_mask_subs_epu8(src, m, v, w); + else if constexpr( c == category::int16x8 ) return _mm_mask_subs_epi16(src, m, v, w); + else if constexpr( c == category::uint16x8 ) return _mm_mask_subs_epu16(src, m, v, w); + else if constexpr( c == category::int8x16 ) return _mm_mask_subs_epi8(src, m, v, w); + else if constexpr( c == category::uint8x16 ) return _mm_mask_subs_epu8(src, m, v, w); + else return sub.behavior(cpu_{}, opts, v, w); } else {