Commit

wip
howjmay committed Jun 28, 2024
1 parent 79827a4 commit 0183320
Showing 1 changed file with 121 additions and 104 deletions.
225 changes: 121 additions & 104 deletions sse2neon.h
@@ -3274,8 +3274,8 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
 b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
 b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (a0.f64) >= (b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (a1.f64) >= (b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
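
The fallback paths in this commit read each lane into a bit64_union_t and access it either as raw bits (.u64) or as a double (.f64), replacing the earlier pointer casts. The union's definition is not part of this diff; a minimal sketch of what it could look like, assuming only the two members the diff actually uses:

#include <stdint.h>

/* Assumed helper, not shown in this diff: a 64-bit punning union so the
 * scalar fallbacks can reinterpret a lane's bits as a double without the
 * strict-aliasing hazards of casting through a pointer. */
typedef union {
    uint64_t u64; /* raw lane bits, e.g. from vgetq_lane_u64() */
    double f64;   /* the same bits viewed as an IEEE-754 double */
} bit64_union_t;

/* Old pattern (removed): d[0] = (*(double *) &a0) >= (*(double *) &b0) ... */
/* New pattern (added):   d[0] = a0.f64 >= b0.f64 ...                       */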
Expand All @@ -3291,12 +3291,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3338,13 +3339,14 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
 return vreinterpretq_m128d_u64(
 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3360,12 +3362,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3380,13 +3383,14 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3402,12 +3406,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
// expand "_mm_cmpge_pd()" to reduce unnecessary operations
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3452,13 +3457,14 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
 return vreinterpretq_m128d_u64(
 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3473,12 +3479,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3520,15 +3527,16 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
 vdupq_n_u64(UINT64_MAX)));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
 d[0] =
-    !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
 d[1] =
-    !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
Expand All @@ -3553,15 +3561,16 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3586,15 +3595,16 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
Expand All @@ -3619,15 +3629,16 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
vdupq_n_u64(UINT64_MAX)));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] =
!((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
!(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
d[1] =
!((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
!(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0);

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -3655,17 +3666,18 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-        (*(double *) &b0) == (*(double *) &b0))
+d[0] = (a0.f64 == a0.f64 &&
+        b0.f64 == b0.f64)
            ? ~UINT64_C(0)
            : UINT64_C(0);
-d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-        (*(double *) &b1) == (*(double *) &b1))
+d[1] = (a1.f64 == a1.f64 &&
+        b1.f64 == b1.f64)
            ? ~UINT64_C(0)
            : UINT64_C(0);
 
Expand All @@ -3682,15 +3694,16 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? ~UINT64_C(0)
: UINT64_C(0);
d[1] = a1;
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
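
The ordered/unordered fallbacks above rely on the IEEE-754 rule that a value compares unequal to itself exactly when it is NaN, which is why each lane is compared against itself. A tiny standalone illustration (not part of the diff):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double ordinary = 1.5, not_a_number = NAN;
    /* Self-comparison is true for every non-NaN value and false for NaN;
     * _mm_cmpord_pd sets a lane to all-ones only when both inputs pass
     * this test, and _mm_cmpunord_pd when at least one input fails it. */
    printf("%d %d\n", ordinary == ordinary, not_a_number == not_a_number);
    /* prints: 1 0 */
    return 0;
}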
Expand All @@ -3710,17 +3723,18 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_s32(
vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? UINT64_C(0)
: ~UINT64_C(0);
d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
(*(double *) &b1) == (*(double *) &b1))
d[1] = (a1.f64 == a1.f64 &&
b1.f64 == b1.f64)
? UINT64_C(0)
: ~UINT64_C(0);

Expand All @@ -3737,15 +3751,16 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
uint64_t d[2];
d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
(*(double *) &b0) == (*(double *) &b0))
d[0] = (a0.f64 == a0.f64 &&
b0.f64 == b0.f64)
? UINT64_C(0)
: ~UINT64_C(0);
d[1] = a1;
d[1] = a1.u64;

return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
@@ -4471,13 +4486,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64;
+d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64;
 
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -4532,13 +4548,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+bit64_union_t a0, a1, b0, b1;
+a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
 uint64_t d[2];
-d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64;
+d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64;
 return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
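
For context, a small self-contained check of the all-ones/all-zeros mask semantics these scalar fallbacks reproduce, written against the standard SSE2 intrinsics (illustrative only, not part of the commit):

#include <emmintrin.h> /* or include "sse2neon.h" when targeting ARM */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m128d a = _mm_set_pd(1.0, 3.0); /* lanes: [0] = 3.0, [1] = 1.0 */
    __m128d b = _mm_set_pd(2.0, 3.0); /* lanes: [0] = 3.0, [1] = 2.0 */
    __m128d r = _mm_cmpge_pd(a, b);

    uint64_t mask[2];
    memcpy(mask, &r, sizeof mask);
    /* Expected: lane 0 is all-ones (3.0 >= 3.0), lane 1 is zero (1.0 < 2.0). */
    printf("%016llx %016llx\n",
           (unsigned long long) mask[0], (unsigned long long) mask[1]);
    return 0;
}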
