Commit

wip
howjmay committed Jun 28, 2024
1 parent 79827a4 commit 0183320
Showing 1 changed file with 121 additions and 104 deletions.
225 changes: 121 additions & 104 deletions sse2neon.h
@@ -3274,8 +3274,8 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
 b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (a0.f64) >= (b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (a1.f64) >= (b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
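
The fallback paths in this commit read each lane into a bit64_union_t and access it either as raw bits (.u64) or as a double (.f64), replacing the earlier pointer casts. The union's definition is not part of this diff; a minimal sketch of what it could look like, assuming only the two members the diff actually uses:

#include <stdint.h>

/* Assumed helper, not shown in this diff: a 64-bit punning union so the
 * scalar fallbacks can reinterpret a lane's bits as a double without the
 * strict-aliasing hazards of casting through a pointer. */
typedef union {
    uint64_t u64; /* raw lane bits, e.g. from vgetq_lane_u64() */
    double f64;   /* the same bits viewed as an IEEE-754 double */
} bit64_union_t;

/* Old pattern (removed): d[0] = (*(double *) &a0) >= (*(double *) &b0) ... */
/* New pattern (added):   d[0] = a0.f64 >= b0.f64 ...                       */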
Expand All @@ -3291,12 +3291,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3338,13 +3339,14 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 return vreinterpretq_m128d_u64(
 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3360,12 +3362,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3380,13 +3383,14 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3402,12 +3406,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3452,13 +3457,14 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 return vreinterpretq_m128d_u64(
 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3473,12 +3479,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3520,15 +3527,16 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
 vdupq_n_u64(UINT64_MAX)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
 d[0] =
-    !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
 d[1] =
-    !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3553,15 +3561,16 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3586,15 +3595,16 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3619,15 +3629,16 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3655,17 +3666,18 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-        (*(double *) &b0) == (*(double *) &b0))
+d[0] = (a0.f64 == a0.f64 &&
+        b0.f64 == b0.f64)
            ? ~UINT64_C(0)
            : UINT64_C(0);
-d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-        (*(double *) &b1) == (*(double *) &b1))
+d[1] = (a1.f64 == a1.f64 &&
+        b1.f64 == b1.f64)
            ? ~UINT64_C(0)
            : UINT64_C(0);
 
Expand All @@ -3682,15 +3694,16 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? ~UINT64_C(0)
: UINT64_C(0);
d[1] = a1;
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
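
The ordered/unordered fallbacks above rely on the IEEE-754 rule that a value compares unequal to itself exactly when it is NaN, which is why each lane is compared against itself. A tiny standalone illustration (not part of the diff):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double ordinary = 1.5, not_a_number = NAN;
    /* Self-comparison is true for every non-NaN value and false for NaN;
     * _mm_cmpord_pd sets a lane to all-ones only when both inputs pass
     * this test, and _mm_cmpunord_pd when at least one input fails it. */
    printf("%d %d\n", ordinary == ordinary, not_a_number == not_a_number);
    /* prints: 1 0 */
    return 0;
}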
Expand All @@ -3710,17 +3723,18 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_s32(
vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? UINT64_C(0)
: ~UINT64_C(0);
d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
(*(double *) &b1) == (*(double *) &b1))
d[1] = (a1.f64 == a1.f64 &&
b1.f64 == b1.f64)
? UINT64_C(0)
: ~UINT64_C(0);

Expand All @@ -3737,15 +3751,16 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? UINT64_C(0)
: ~UINT64_C(0);
d[1] = a1;
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -4471,13 +4486,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64;
+d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64;
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -4532,13 +4548,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64;
+d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64;
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
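
For context, a small self-contained check of the all-ones/all-zeros mask semantics these scalar fallbacks reproduce, written against the standard SSE2 intrinsics (illustrative only, not part of the commit):

#include <emmintrin.h> /* or include "sse2neon.h" when targeting ARM */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m128d a = _mm_set_pd(1.0, 3.0); /* lanes: [0] = 3.0, [1] = 1.0 */
    __m128d b = _mm_set_pd(2.0, 3.0); /* lanes: [0] = 3.0, [1] = 2.0 */
    __m128d r = _mm_cmpge_pd(a, b);

    uint64_t mask[2];
    memcpy(mask, &r, sizeof mask);
    /* Expected: lane 0 is all-ones (3.0 >= 3.0), lane 1 is zero (1.0 < 2.0). */
    printf("%016llx %016llx\n",
           (unsigned long long) mask[0], (unsigned long long) mask[1]);
    return 0;
}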
