Merge pull request #161 from munroesj52/cpsgn-fix1b

Fix vec_copysign implementations per issue #158, Part 1B.
open-power-sdk · Feb 2, 2022 · 8e670f4 · 8e670f4
2 parents ef3aff3 + 8f14ea4
commit 8e670f4
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 206 deletions.
diff --git a/src/pveclib/vec_f128_ppc.h b/src/pveclib/vec_f128_ppc.h
@@ -441,7 +441,7 @@ vec_xsxexpqp (__binary128 f128)
  * \endcode
  *
  * \note Would like to use the intrinsic scalar_extract_exp() here but
- * this is not available until GCC 11.
+ * this is not available until GCC 11 (or later).
  * Also GCC defines these scalar built-ins to return integer scalar
  * values in GPRs.
  * This would defeat the purpose of an all vector implementation.
@@ -1769,7 +1769,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif  defined (_ARCH_PWR8)
 ...
 #endif
@@ -3806,26 +3806,37 @@ vec_all_iszerof128 (__binary128 f128)
 #endif
 }
 
-/** \brief Copy the sign bit from f128y and merge with the magnitude
- *  from f128x. The merged result is returned as a __float128 value.
+/** \brief Copy the sign bit from f128x and merge with the magnitude
+ *  from f128y. The merged result is returned as a __float128 value.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as a bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 2-11  | 2/cycle  |
  *  |power9   | 2     | 4/cycle  |
  *
- *  @param f128x a __float128 value containing the magnitude.
- *  @param f128y a __float128 value containing the sign bit.
- *  @return a __float128 value with magnitude from f128x and the
- *  sign of f128y.
+ *  @param f128x a __float128 value containing the sign bit.
+ *  @param f128y a __float128 value containing the magnitude.
+ *  @return a __float128 value with magnitude from f128y and the
+ *  sign of f128x.
  */
 static inline __binary128
 vec_copysignf128 (__binary128 f128x, __binary128 f128y)
 {
   __binary128 result;
 #if _ARCH_PWR9
   __asm__(
-      "xscpsgnqp %0,%2,%1;\n"
+      "xscpsgnqp %0,%1,%2;\n"
       : "=v" (result)
       : "v" (f128x), "v" (f128y)
       :);
@@ -3835,7 +3846,7 @@ vec_copysignf128 (__binary128 f128x, __binary128 f128y)
   tmpx = vec_xfer_bin128_2_vui32t (f128x);
   tmpy = vec_xfer_bin128_2_vui32t (f128y);
 
-  tmp = vec_sel (tmpx, tmpy, signmask);
+  tmp = vec_sel (tmpy, tmpx, signmask);
   result = vec_xfer_vui32t_2_bin128 (tmp);
 #endif
   return (result);
@@ -8006,7 +8017,7 @@ static inline vec_xscvsqqp (vi128_t int128)
   lo64 = int64[VEC_DW_L];
   result = (hi64 * two64) + lo64;
   // Copy the __int128's sign into the __binary128 result
-  result = vec_copysignf128 (result, i_sign);
+  result = vec_copysignf128 (i_sign, result);
 #elif  defined (_ARCH_PWR8)
   vui64_t q_exp;
   vui128_t q_sig;

diff --git a/src/pveclib/vec_f32_ppc.h b/src/pveclib/vec_f32_ppc.h
@@ -788,31 +788,52 @@ vec_any_iszerof32 (vf32_t vf32)
 #endif
 }
 
-/** \brief Copy the sign bit from vf32y merged with magnitude from
- *  vf32x and return the resulting vector float values.
+/** \brief Copy the sign bit from vf32x merged with magnitude from
+ *  vf32y and return the resulting vector float values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as a bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers' update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf32x vector float values containing the magnitudes.
- *  @param vf32y vector float values containing the sign bits.
- *  @return vector float values with magnitude from vf32x and the
- *  sign of vf32y.
+ *  @param vf32x vector float values containing the sign bits.
+ *  @param vf32y vector float values containing the magnitudes.
+ *  @return vector float values with magnitude from vf32y and the
+ *  sign of vf32x.
  */
 static inline vf32_t
 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 {
 #if _ARCH_PWR7
-  /* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
+#ifdef PVECLIB_CPSGN_FIXED
   return (vec_cpsgn (vf32x, vf32y));
+#else
+  vf32_t result;
+  __asm__(
+      "xvcpsgnsp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf32x), "wa" (vf32y)
+      :);
+  return (result);
+#endif
 #else
   const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
       0x80000000, 0x80000000);
   vf32_t result;
 
-  result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
+  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
   return (result);
 #endif
 }

diff --git a/src/pveclib/vec_f64_ppc.h b/src/pveclib/vec_f64_ppc.h
@@ -780,31 +780,53 @@ vec_any_iszerof64 (vf64_t vf64)
 #endif
 }
 
-/** \brief Copy the sign bit from vf64y merged with magnitude from
- *  vf64x and return the resulting vector double values.
+/** \brief Copy the sign bit from vf64x merged with magnitude from
+ *  vf64y and return the resulting vector double values.
+ *
+ *  \note This operation was patterned after the intrinsic vec_cpsgn
+ *  (altivec.h) introduced for POWER7 and VSX. It turns out the
+ *  original (GCC 4.9) compiler implementation reversed the operands
+ *  and does not match the PowerISA or the Vector Intrinsic Programming
+ *  Reference manuals. Subsequent compilers and PVECLIB
+ *  implementations replicated this (operand order) error.
+ *  This has now been reported as a bug against the compilers, which are
+ *  in the process of applying fixes and distributing updates.
+ *  This version of PVECLIB is updated to match the Vector Intrinsic
+ *  Programming Reference. This implementation is independent of the
+ *  compilers' update status.
  *
  *  |processor|Latency|Throughput|
  *  |--------:|:-----:|:---------|
  *  |power8   | 6-7   | 2/cycle  |
  *  |power9   | 2     | 2/cycle  |
  *
- *  @param vf64x vector double values containing the magnitudes.
- *  @param vf64y vector double values containing the sign bits.
- *  @return vector double values with magnitude from vf64x and the
- *  sign of vf64y.
+ *  @param vf64x vector double values containing the sign bits.
+ *  @param vf64y vector double values containing the magnitudes.
+ *  @return vector double values with magnitude from vf64y and the
+ *  sign of vf64x.
  */
 static inline vf64_t
-vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
+vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
 {
 #if _ARCH_PWR7
   /* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
-	return (vec_cpsgn (vf64x, vf64y));
+#ifdef PVECLIB_CPSGN_FIXED
+  return (vec_cpsgn (vf64x, vf64y));
+#else
+  vf64_t result;
+  __asm__(
+      "xvcpsgndp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf64x), "wa" (vf64y)
+      :);
+  return (result);
+#endif
 #else
-	const vui32_t signmask  = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
-	vf64_t result;
+  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
+  vf64_t result;
 
-	result  = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
-	return (result);
+  result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
+  return (result);
 #endif
 }