Fix vec_copysign implementations per issue open-power-sdk#158, Part 1B.

Rebased after the add/subqpo merge. Seems the GCC (and Clang followed) initially reversed the operands for vec_cpsgn() (so the sign is copied from operand b into a). This differs from the Intrinsic reference and ISA that say the sign is copied from operand a into b. Unfortunately PVECLIB implementations of vec_copysignf32(), vec_copysignf64(), and vec_copysignf128() duplicated the results of original GCC vec_cpsgn() for the implementation, This has been reported as a bug and now GCC and Clang are in the process of "fixing" this bug to match the Intrinsic reference guide. The fix will be applied to currently supported versions but older compiler version will remain unchanged. This change set will correct the PVECLIB copysign implementation to match the intrinsic reference manual. We will the macro PVECLIB_CPSGN_FIXED to isolate the PVECLIB implementations from the compiler changes. * src/pveclib/vec_f128_ppc.h (vec_copysignf128): Change to match operand order from Vector Intrinsic Reference. (vec_xscvsqqp [_ARCH_PWR9]): Use correct vec_copysignf128 operand order. * src/pveclib/vec_f32_ppc.h (vec_copysignf32): Change to match operand order from Vector Intrinsic Reference. * src/pveclib/vec_f64_ppc.h (vec_copysignf64): Change to match operand order from Vector Intrinsic Reference. * src/testsuite/arith128_test_f128.c (test_copysignf128): Define new __float128 const f128_nsnan. Reverse test operands and results to match corrected vec_copysignf128() implementation. Remove duplicated tests. * src/testsuite/arith128_test_f32.c (test_float_cpsgn): Reverse test operands and results to match corrected vec_copysignf32() implementation. * src/testsuite/arith128_test_f64.c (test_double_cpsgn): Reverse test operands and results to match corrected vec_copysignf64() implementation. * src/testsuite/vec_f32_dummy.c (test_vec_copysignf32): new compile test. * src/testsuite/vec_f64_dummy.c (test_vec_copysignf64): new compile test. Signed-off-by: Steven Munroe <[email protected]>
munroesj52 · Jan 25, 2022 · 8f14ea4 · 8f14ea4
1 parent ef3aff3
commit 8f14ea4
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 206 deletions.
diff --git a/src/pveclib/vec_f128_ppc.h b/src/pveclib/vec_f128_ppc.h
@@ -441,7 +441,7 @@ vec_xsxexpqp (__binary128 f128)
  * \endcode
  *
  * \note Would like to use the intrinsic scalar_extract_exp() here but
- * this is not available until GCC 11.
+ * this is not available until GCC 11 (or later).
  * Also GCC defines these scalar built-ins to return integer scalar
  * values in GPRs.
  * This would defeat the purpose of an all vector implementation.
@@ -1769,7 +1769,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif  defined (_ARCH_PWR8)
 ...
 #endif
@@ -3806,26 +3806,37 @@ vec_all_iszerof128 (__binary128 f128)
 #endif
 }
 
-/** \brief Copy the sign bit from f128y and merge with the magnitude
- *  from f128x. The merged result is returned as a __float128 value.
+/** \brief Copy the sign bit from f128x and merge with the magnitude
+ *  from f128y. The merged result is returned as a __float128 value.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 2-11  | 2/cycle  |
  *  |power9   | 2     | 4/cycle  |
  *
- *  @param f128x a __float128 value containing the magnitude.
- *  @param f128y a __float128 value containing the sign bit.
- *  @return a __float128 value with magnitude from f128x and the
- *  sign of f128y.
+ *  @param f128x a __float128 value containing the sign bit.
+ *  @param f128y a __float128 value containing the magnitude.
+ *  @return a __float128 value with magnitude from f128y and the
+ *  sign of f128x.
  */
 static inline __binary128
 vec_copysignf128 (__binary128 f128x, __binary128 f128y)
 {
   __binary128 result;
 #if _ARCH_PWR9
   __asm__(
-      "xscpsgnqp %0,%2,%1;\n"
+      "xscpsgnqp %0,%1,%2;\n"
       : "=v" (result)
       : "v" (f128x), "v" (f128y)
       :);
@@ -3835,7 +3846,7 @@ vec_copysignf128 (__binary128 f128x, __binary128 f128y)
   tmpx = vec_xfer_bin128_2_vui32t (f128x);
   tmpy = vec_xfer_bin128_2_vui32t (f128y);
 
-  tmp = vec_sel (tmpx, tmpy, signmask);
+  tmp = vec_sel (tmpy, tmpx, signmask);
   result = vec_xfer_vui32t_2_bin128 (tmp);
 #endif
   return (result);
@@ -8006,7 +8017,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // Copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif  defined (_ARCH_PWR8)
   vui64_t q_exp;
   vui128_t q_sig;

diff --git a/src/pveclib/vec_f32_ppc.h b/src/pveclib/vec_f32_ppc.h
@@ -788,31 +788,52 @@ vec_any_iszerof32 (vf32_t vf32)
 #endif
 }
 
-/** \brief Copy the sign bit from vf32y merged with magnitude from
- *  vf32x and return the resulting vector float values.
+/** \brief Copy the sign bit from vf32x merged with magnitude from
+ *  vf32y and return the resulting vector float values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf32x vector float values containing the magnitudes.
- *  @param vf32y vector float values containing the sign bits.
- *  @return vector float values with magnitude from vf32x and the
- *  sign of vf32y.
+ *  @param vf32x vector float values containing the sign bits.
+ *  @param vf32y vector float values containing the magnitudes.
+ *  @return vector float values with magnitude from vf32y and the
+ *  sign of vf32x.
  */
 static inline vf32_t
 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 {
 #if _ARCH_PWR7
-  /* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
+#ifdef PVECLIB_CPSGN_FIXED
   return (vec_cpsgn (vf32x, vf32y));
+#else
+  vf32_t result;
+  __asm__(
+      "xvcpsgnsp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf32x), "wa" (vf32y)
+      :);
+  return (result);
+#endif
 #else
   const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
       0x80000000, 0x80000000);
   vf32_t result;
 
-  result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
+  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
   return (result);
 #endif
 }

diff --git a/src/pveclib/vec_f64_ppc.h b/src/pveclib/vec_f64_ppc.h
@@ -780,31 +780,53 @@ vec_any_iszerof64 (vf64_t vf64)
 #endif
 }
 
-/** \brief Copy the sign bit from vf64y merged with magnitude from
- *  vf64x and return the resulting vector double values.
+/** \brief Copy the sign bit from vf64x merged with magnitude from
+ *  vf64y and return the resulting vector double values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf64x vector double values containing the magnitudes.
- *  @param vf64y vector double values containing the sign bits.
- *  @return vector double values with magnitude from vf64x and the
- *  sign of vf64y.
+ *  @param vf64x vector double values containing the sign bits.
+ *  @param vf64y vector double values containing the magnitudes.
+ *  @return vector double values with magnitude from vf64y and the
+ *  sign of vf64x.
  */
 static inline vf64_t
-vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
+vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
 {
 #if _ARCH_PWR7
   /* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
-	return (vec_cpsgn (vf64x, vf64y));
+#ifdef PVECLIB_CPSGN_FIXED
+  return (vec_cpsgn (vf64x, vf64y));
+#else
+  vf64_t result;
+  __asm__(
+      "xvcpsgndp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf64x), "wa" (vf64y)
+      :);
+  return (result);
+#endif
 #else
-	const vui32_t signmask  = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
-	vf64_t result;
+  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
+  vf64_t result;
 
-	result  = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
-	return (result);
+  result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
+  return (result);
 #endif
 }