From 6f156dffd88824d316eabbf24b3f1cf70d0619f8 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Wed, 2 Aug 2023 05:56:53 -0700 Subject: [PATCH] Fix math operator error on CUDA 12.2 --- .../src/cpu/layers/batch_norm_layer_cpu.cpp | 4 +- HugeCTR/src/cpu/layers/elu_layer_cpu.cpp | 3 +- .../src/cpu/layers/fm_order2_layer_cpu.cpp | 4 +- .../layers/fully_connected_layer_half_cpu.cpp | 4 +- .../fused_fully_connected_layer_cpu.cpp | 8 ++-- .../src/cpu/layers/interaction_layer_cpu.cpp | 4 +- .../batch_norm_layer_test.cpp | 4 +- .../fully_connected_layer_half_test.cpp | 6 +-- .../fused_fully_connected_layer_test.cpp | 10 ++--- ...d_relu_bias_fully_connected_layer_test.cpp | 10 ++--- .../layer_norm_layer_test.cpp | 11 +++--- .../trainable_layers/mlp_test.cpp | 17 +++++---- .../multi_cross_layer_test.cpp | 6 +-- test/utest/layers/add_layer_test.cpp | 2 +- test/utest/layers/batch_norm_layer_test.cpp | 4 +- test/utest/layers/dropout_layer_test.cpp | 4 +- test/utest/layers/dropout_layer_test_old.cpp | 4 +- .../elementwise_multiply_layer_test.cpp | 2 +- test/utest/layers/elu_layer_test.cpp | 19 ++++++++++ test/utest/layers/elu_layer_test_old.cpp | 19 ++++++++++ test/utest/layers/fm_order2_layer_test.cpp | 3 +- .../utest/layers/fm_order2_layer_test_old.cpp | 4 +- .../fully_connected_layer_half_test.cpp | 6 +-- .../fused_fully_connected_layer_test.cpp | 10 ++--- ...d_relu_bias_fully_connected_layer_test.cpp | 10 ++--- test/utest/layers/group_dense_layer_test.cpp | 17 +++++---- test/utest/layers/interaction_layer_test.cpp | 29 +++++++++++++-- .../layers/interaction_layer_test_old.cpp | 29 +++++++++++++-- test/utest/layers/layer_norm_layer_test.cpp | 12 +++--- test/utest/layers/mlp_test.cpp | 19 +++++----- test/utest/layers/multi_cross_layer_test.cpp | 18 +++++++-- .../multi_head_attention_layer_test.cpp | 34 +++++++++++++++++ .../utest/layers/sequence_mask_layer_test.cpp | 2 +- .../layers/sequence_mask_layer_test_old.cpp | 2 +- test/utest/test_utils.hpp | 37 ++++++++++++++++++- 35 files changed, 271 insertions(+), 106 deletions(-) diff --git a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp b/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp index c9d5211dbb..85fe14dfbd 100644 --- a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp @@ -142,7 +142,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -162,7 +162,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += (__half2float(in[idx]) - mean); } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp b/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp index 01b75a00a1..891de22d61 100644 --- a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp @@ -27,7 +27,8 @@ namespace { template void elu_cpu(const T* in, T* out, int len, T alpha) { for (int i = 0; i < len; ++i) { - out[i] = (in[i] < 0) ? T(alpha * (exp(in[i]) - 1)) : in[i]; + out[i] = + (__half2float(in[i]) < 0) ? 
T(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) : in[i]; } } diff --git a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp index a593e1a623..0293af5f0e 100644 --- a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp @@ -91,8 +91,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) * + (sum - __half2float(in[index]))); } } } diff --git a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp b/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp index 52e23dc1ad..1b2566327b 100644 --- a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp @@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -47,7 +47,7 @@ void cpu_add_bias(__half* top, const __half* bias, int m, int n) { void cpu_reverse_add_bias(__half* bias_grad, const __half* top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } diff --git a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp index a25317dc1b..d2da98d944 100644 --- a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp @@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -41,7 +41,7 @@ void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, int m, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; } } } @@ -50,7 +50,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -59,7 +59,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } diff --git a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp b/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp index 8eb424f77a..74e3ea977f 100644 --- a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp @@ -74,8 +74,8 @@ void matmul_cpu(size_t height, size_t in_width, size_t n_ins, T *h_concat, T *h_ for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += __half2float(h_concat[concat_stride + m * in_width + k] * + h_concat[concat_stride + n * in_width + k]); } h_mat[mat_stride + m * n_ins + n] = accum; } diff --git a/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp index e47a30027a..f9201ca408 100644 --- a/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp @@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += __half2float(in[idx]) - mean; } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp b/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp index 372bcef6f9..97b1b93d94 100644 --- a/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -51,7 +51,7 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) { static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -59,7 +59,7 @@ static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, in static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp index 9dc200d0d9..2c9102e422 100644 --- a/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp @@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -64,7 +64,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } @@ -72,7 +72,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp index ea54c8a397..9f984cd941 100644 --- a/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp @@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ int m, int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (top[i * n + j] < 0) { + if (__half2float(top[i * n + j]) < 0) { top[i * n + j] = 0.0f; } else { top[i * n + j] = bprop_out[i * n + j]; @@ -64,7 +64,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -72,7 +72,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp index 4709f91f03..281586a143 100644 --- a/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp @@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps); - out[idx] = __float2half(gamma[j] * in_norm + beta[j]); + out[idx] = gamma[j] * __float2half(in_norm) + beta[j]; } } } @@ -176,7 +176,7 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_var = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean); + float val = __half2float(out[idx] * gamma[j]) * (__half2float(in[idx]) - mean); d_var += val; } d_var *= (-0.5f) * pow(inv_std, 3); @@ -184,15 +184,16 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_mu = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - d_mu += __half2float(out[idx]) * gamma[j] * inv_std; + d_mu += __half2float(out[idx] * gamma[j]) * inv_std; } d_mu *= (-1.0f / num_feature); for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std; + gamma_grad[j] = + gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std); beta_grad[j] = beta_grad[j] + out[idx]; - in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std + + in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std + d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu); } } diff --git a/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp b/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp index 0b0dd5fa6b..4220cb2041 100644 --- 
a/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp @@ -65,7 +65,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -80,7 +80,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu, T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f)); middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? T(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t; else top[i * n + j] = t; } @@ -92,7 +92,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const int m, int n, bool is_tail, bool use_bias) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -102,7 +103,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const if (use_bias) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -135,11 +136,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -484,7 +485,7 @@ static void mlp_test(std::vector network, for (uint32_t col = 0; col < fc_out_dims[i]; col++) { float sum = 0.0; for (uint32_t row = 0; row < batch_size; row++) { - sum = sum + p.h_top_grad[i].get()[row * fc_out_dims[i] + col]; + sum = sum + __half2float(p.h_top_grad[i].get()[row * fc_out_dims[i] + col]); } p.h_bias_grad[i].get()[col] = sum; } diff --git a/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp index 2b25676242..b85c27aeb5 100644 --- a/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp @@ -185,7 +185,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[r * rowB + k] * B[c * rowB + k]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc)); } } } else if (transA) { @@ -197,7 +197,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[k * rowA + r] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + 
__half2float(acc)); } } } else { @@ -208,7 +208,7 @@ class MultiCrossLayerTest { // column of A is rowB acc = acc + (A[r * rowB + k] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc)); } } } diff --git a/test/utest/layers/add_layer_test.cpp b/test/utest/layers/add_layer_test.cpp index dbc3cef9dd..a5471c6b3a 100644 --- a/test/utest/layers/add_layer_test.cpp +++ b/test/utest/layers/add_layer_test.cpp @@ -49,7 +49,7 @@ void add_cpu(Vector input, T *output, size_t size, size_t num) { for (auto i = 0; i < size; i++) { float tmp = 0.f; for (size_t j = 0; j < num; j++) { - tmp += input[j][i]; + tmp += (float)input[j][i]; } output[i] = tmp; } diff --git a/test/utest/layers/batch_norm_layer_test.cpp b/test/utest/layers/batch_norm_layer_test.cpp index 5b057d825c..7a8ebbbf20 100644 --- a/test/utest/layers/batch_norm_layer_test.cpp +++ b/test/utest/layers/batch_norm_layer_test.cpp @@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += __half2float(in[idx]) - mean; } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/test/utest/layers/dropout_layer_test.cpp b/test/utest/layers/dropout_layer_test.cpp index 5a205bbb35..ee9c2166c0 100644 --- a/test/utest/layers/dropout_layer_test.cpp +++ b/test/utest/layers/dropout_layer_test.cpp @@ -72,7 +72,7 @@ void dropout_test(int64_t dim0, int64_t dim1, float rate) { int cnt_zero_fprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_top[i] - 0.f) < eps) { + if (std::abs(__half2float(h_top[i]) - 0.f) < eps) { cnt_zero_fprop++; } } @@ -91,7 +91,7 @@ void dropout_test(int64_t dim0, int64_t dim1, float rate) { core23::DeviceType::CPU, bottom_tensor.device()); int cnt_zero_bprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_bottom[i] - 0.f) < eps) { + if (std::abs(__half2float(h_bottom[i]) - 0.f) < eps) { cnt_zero_bprop++; } } diff --git a/test/utest/layers/dropout_layer_test_old.cpp b/test/utest/layers/dropout_layer_test_old.cpp index c4dcc6f4f7..e2331912f5 100644 --- a/test/utest/layers/dropout_layer_test_old.cpp +++ b/test/utest/layers/dropout_layer_test_old.cpp @@ -64,7 +64,7 @@ void dropout_test(size_t dim0, size_t dim1, float rate) { HCTR_LIB_THROW(cudaMemcpy(h_out.get(), d_out, n_bytes, cudaMemcpyDeviceToHost)); int cnt_zero_fprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_out[i] - 0.f) < eps) { + if (std::abs(__half2float(h_out[i]) - 0.f) < eps) { cnt_zero_fprop++; } } @@ -82,7 +82,7 @@ void dropout_test(size_t dim0, size_t dim1, float rate) { HCTR_LIB_THROW(cudaMemcpy(h_in.get(), d_in, n_bytes, cudaMemcpyDeviceToHost)); int cnt_zero_bprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_out[i] - 0.f) < eps) { + if (std::abs(__half2float(h_out[i]) - 0.f) < eps) { cnt_zero_bprop++; } } diff --git a/test/utest/layers/elementwise_multiply_layer_test.cpp b/test/utest/layers/elementwise_multiply_layer_test.cpp index 111c6f4126..61d331a442 100644 --- 
a/test/utest/layers/elementwise_multiply_layer_test.cpp +++ b/test/utest/layers/elementwise_multiply_layer_test.cpp @@ -58,7 +58,7 @@ void elementwise_multiply_dgrad_cpu(const T *top_grad, T **dgrad, const T *fprop for (size_t i = 0; i < size; i++) { for (size_t j = 0; j < num; j++) { - if (0 == fprop_output[i]) { + if (0 == __half2float(fprop_output[i])) { dgrad[j][i] = zero; } else { T d_input = dgrad[j][i]; diff --git a/test/utest/layers/elu_layer_test.cpp b/test/utest/layers/elu_layer_test.cpp index 032a60951b..5f6805318f 100644 --- a/test/utest/layers/elu_layer_test.cpp +++ b/test/utest/layers/elu_layer_test.cpp @@ -56,6 +56,25 @@ void elu_bprop_cpu(const T* d_out, T* d_in, int len, T alpha) { } } +template <> +void elu_cpu(const __half* in, __half* out, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + out[i] = (__half2float(in[i]) < 0) + ? __half(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) + : in[i]; + } +} + +template <> +void elu_bprop_cpu(const __half* d_out, __half* d_in, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + d_in[i] = + (__half2float(d_in[i]) < 0) + ? __half(__half2float(alpha) * exp(__half2float(d_in[i])) * __half2float(d_out[i])) + : d_out[i]; + } +} + template void elu_test(int64_t dim0, int64_t dim1, T alpha) { constexpr bool use_mixed_precision = std::is_same_v; diff --git a/test/utest/layers/elu_layer_test_old.cpp b/test/utest/layers/elu_layer_test_old.cpp index dfac55cb4f..944031fd0a 100644 --- a/test/utest/layers/elu_layer_test_old.cpp +++ b/test/utest/layers/elu_layer_test_old.cpp @@ -53,6 +53,25 @@ void elu_bprop_cpu(const T* d_out, T* d_in, int len, T alpha) { } } +template <> +void elu_cpu(const __half* in, __half* out, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + out[i] = (__half2float(in[i]) < 0) + ? __half(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) + : in[i]; + } +} + +template <> +void elu_bprop_cpu(const __half* d_out, __half* d_in, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + d_in[i] = + (__half2float(d_in[i]) < 0) + ? 
__half(__half2float(alpha) * exp(__half2float(d_in[i])) * __half2float(d_out[i])) + : d_out[i]; + } +} + template void elu_test(size_t dim0, size_t dim1, T alpha) { std::shared_ptr> buf = GeneralBuffer2::create(); diff --git a/test/utest/layers/fm_order2_layer_test.cpp b/test/utest/layers/fm_order2_layer_test.cpp index dc341b8002..aa0b3a07ae 100644 --- a/test/utest/layers/fm_order2_layer_test.cpp +++ b/test/utest/layers/fm_order2_layer_test.cpp @@ -109,8 +109,7 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = top_grad[i * emb_vec_size + j] * (__float2half(sum) - in[index]); } } } diff --git a/test/utest/layers/fm_order2_layer_test_old.cpp b/test/utest/layers/fm_order2_layer_test_old.cpp index 8fddc654ca..5177749878 100644 --- a/test/utest/layers/fm_order2_layer_test_old.cpp +++ b/test/utest/layers/fm_order2_layer_test_old.cpp @@ -108,8 +108,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) * + (sum - __half2float(in[index]))); } } } diff --git a/test/utest/layers/fully_connected_layer_half_test.cpp b/test/utest/layers/fully_connected_layer_half_test.cpp index b6da5c3368..dfc8ccbbb2 100644 --- a/test/utest/layers/fully_connected_layer_half_test.cpp +++ b/test/utest/layers/fully_connected_layer_half_test.cpp @@ -32,7 +32,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -50,7 +50,7 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) { static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -58,7 +58,7 @@ static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, in static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/fused_fully_connected_layer_test.cpp b/test/utest/layers/fused_fully_connected_layer_test.cpp index acd3e10ed6..eb7ec0dff1 100644 --- a/test/utest/layers/fused_fully_connected_layer_test.cpp +++ b/test/utest/layers/fused_fully_connected_layer_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai]) * __half2float(b[bi]); } c[i * n + j] = sum; } @@ -45,7 +45,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -54,7 +54,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -63,7 +63,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } @@ -71,7 +71,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp b/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp index 0370ac750c..721c78a195 100644 --- a/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp +++ b/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai]) * __half2float(b[bi]); } c[i * n + j] = sum; } @@ -45,7 +45,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; } } } @@ -54,7 +54,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ int m, int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (top[i * n + j] < 0) { + if (__half2float(top[i * n + j]) < 0) { top[i * n + j] = 0.0f; } else { top[i * n + j] = bprop_out[i * n + j]; @@ -63,7 +63,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -71,7 +71,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/group_dense_layer_test.cpp b/test/utest/layers/group_dense_layer_test.cpp index 11a7516655..85aaa0ff81 100644 --- a/test/utest/layers/group_dense_layer_test.cpp +++ b/test/utest/layers/group_dense_layer_test.cpp @@ -65,7 +65,7 @@ static void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -79,7 +79,7 @@ static void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; else top[i * n + j] = t; } @@ -90,7 +90,8 @@ static void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* dRelu, __half const __half* bprop_out, int m, int n, bool is_tail) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -99,7 +100,7 @@ static void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* dRelu, __half } for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -130,11 +131,11 @@ static float compare_array(const __half* arr1, const __half* arr2, size_t n, flo HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -451,7 +452,7 @@ static void group_dense_layer_test(uint32_t* input_dims, uint32_t* output_dims, for (uint32_t col = 0; col < output_dims[i]; col++) { float sum = 0.0; for (uint32_t row = 0; row < batch_size; row++) { - sum = sum + h_top_grad[i][row * output_dims[i] + col]; + sum = sum + __half2float(h_top_grad[i][row * output_dims[i] + col]); } h_bias_grad[i][col] = sum; } diff --git a/test/utest/layers/interaction_layer_test.cpp b/test/utest/layers/interaction_layer_test.cpp index 320a7a3657..2165005f91 100644 --- a/test/utest/layers/interaction_layer_test.cpp +++ b/test/utest/layers/interaction_layer_test.cpp @@ -45,6 +45,26 @@ __half get_eps(bool use_tf32) { return __float2half(1); } +template +float get_accum(const T a, const T b) { + return float(a * b); +} + +template <> +float get_accum(const __half a, const __half b) { + return __half2float(a) * __half2float(b); +} + +template +float get_sec_accum(const T a, const T b, const T c) { + return float((a + b) * c); +} + +template <> +float get_sec_accum(const __half a, const __half b, const __half c) { + return (__half2float(a) + __half2float(b)) * __half2float(c); +} + template void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, bool enable_tf32_compute = false) { @@ -141,8 +161,8 @@ void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += get_accum(h_concat[concat_stride + m * in_width + k], + h_concat[concat_stride + n * in_width + k]); } h_matmul[mat_stride + m * n_ins + n] = accum; } @@ -217,8 +237,9 @@ void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, for (size_t n = 0; n < in_width; n++) { float accum = 0.0f; for (size_t k = 0; k < n_ins; k++) { - accum += (h_matmul[mat_stride + m * n_ins + k] + h_matmul[mat_stride + k * n_ins + 
m]) * - h_concat_tmp[concat_stride + k * in_width + n]; + accum += get_sec_accum(h_matmul[mat_stride + m * n_ins + k], + h_matmul[mat_stride + k * n_ins + m], + h_concat_tmp[concat_stride + k * in_width + n]); } h_concat[concat_stride + m * in_width + n] = 1.0f * accum; } diff --git a/test/utest/layers/interaction_layer_test_old.cpp b/test/utest/layers/interaction_layer_test_old.cpp index 4b35f9bcc2..6b7f10b789 100644 --- a/test/utest/layers/interaction_layer_test_old.cpp +++ b/test/utest/layers/interaction_layer_test_old.cpp @@ -40,6 +40,26 @@ __half get_eps(bool use_tf32) { return __float2half(1); } +template +float get_accum(const T a, const T b) { + return float(a * b); +} + +template <> +float get_accum(const __half a, const __half b) { + return __half2float(a) * __half2float(b); +} + +template +float get_sec_accum(const T a, const T b, const T c) { + return float((a + b) * c); +} + +template <> +float get_sec_accum(const __half a, const __half b, const __half c) { + return (__half2float(a) + __half2float(b)) * __half2float(c); +} + template void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, bool enable_tf32_compute = false) { @@ -138,8 +158,8 @@ void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += get_accum(h_concat[concat_stride + m * in_width + k], + h_concat[concat_stride + n * in_width + k]); } h_mat[mat_stride + m * n_ins + n] = accum; } @@ -217,8 +237,9 @@ void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, for (size_t n = 0; n < in_width; n++) { float accum = 0.0f; for (size_t k = 0; k < n_ins; k++) { - accum += (h_mat[mat_stride + m * n_ins + k] + h_mat[mat_stride + k * n_ins + m]) * - h_concat_tmp[concat_stride + k * in_width + n]; + accum += + get_sec_accum(h_mat[mat_stride + m * n_ins + k], h_mat[mat_stride + k * n_ins + m], + h_concat_tmp[concat_stride + k * in_width + n]); } h_concat[concat_stride + m * in_width + n] = 1.0f * accum; } diff --git a/test/utest/layers/layer_norm_layer_test.cpp b/test/utest/layers/layer_norm_layer_test.cpp index ba916f61ed..c790fbb8a5 100644 --- a/test/utest/layers/layer_norm_layer_test.cpp +++ b/test/utest/layers/layer_norm_layer_test.cpp @@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps); - out[idx] = __float2half(gamma[j] * in_norm + beta[j]); + out[idx] = gamma[j] * __float2half(in_norm) + beta[j]; } } } @@ -176,7 +176,8 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_var = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean); + float val = + (__half2float(out[idx]) * __half2float(gamma[j])) * (__half2float(in[idx]) - mean); d_var += val; } d_var *= (-0.5f) * pow(inv_std, 3); @@ -184,15 +185,16 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_mu = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - d_mu += __half2float(out[idx]) * gamma[j] * inv_std; + d_mu += __half2float(out[idx]) * __half2float(gamma[j]) * inv_std; } d_mu *= (-1.0f / num_feature); for (int j = 0; j < 
num_feature; j++) { int idx = i * num_feature + j; - gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std; + gamma_grad[j] = + gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std); beta_grad[j] = beta_grad[j] + out[idx]; - in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std + + in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std + d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu); } } diff --git a/test/utest/layers/mlp_test.cpp b/test/utest/layers/mlp_test.cpp index 3a0226630e..9d743eee58 100644 --- a/test/utest/layers/mlp_test.cpp +++ b/test/utest/layers/mlp_test.cpp @@ -64,7 +64,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -79,7 +79,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu, T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f)); middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? T(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t; else top[i * n + j] = t; } @@ -91,7 +91,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const int m, int n, bool is_tail, bool use_bias) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -101,7 +102,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const if (use_bias) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -134,11 +135,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -465,7 +466,7 @@ static void mlp_test(std::vector network, std::vector(network, mlp_num_outputs, use_relu, use_bias, use_fuse_wb, true, input_dim, batch_size, perf_config_set); -}; \ No newline at end of file +}; diff --git a/test/utest/layers/multi_cross_layer_test.cpp b/test/utest/layers/multi_cross_layer_test.cpp index c52bcad136..01f9825363 100644 --- a/test/utest/layers/multi_cross_layer_test.cpp +++ b/test/utest/layers/multi_cross_layer_test.cpp @@ -26,6 +26,16 @@ using namespace HugeCTR; +template +float cal_C(const T a, const float beta, const T b) { + return float(a * beta + b); +} + +template <> +float cal_C(const __half a, const float beta, const __half b) { + return __half2float(a) 
* beta + __half2float(b); +} + template class MultiCrossLayerTest { private: @@ -184,7 +194,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[r * rowB + k] * B[c * rowB + k]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } else if (transA) { @@ -196,7 +206,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[k * rowA + r] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } else { @@ -207,7 +217,7 @@ class MultiCrossLayerTest { // column of A is rowB acc = acc + (A[r * rowB + k] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } @@ -877,4 +887,4 @@ TEST(multi_cross_layer_v2, fp16_3x1024x2) { // TEST(multi_cross_layer_v2, fp16_dlrm) { // MultiCrossLayerTest<__half> test(65536, 3456, 3, 512); // test.test(); -// } \ No newline at end of file +// } diff --git a/test/utest/layers/multi_head_attention_layer_test.cpp b/test/utest/layers/multi_head_attention_layer_test.cpp index 5829bccadc..a35aaa7612 100644 --- a/test/utest/layers/multi_head_attention_layer_test.cpp +++ b/test/utest/layers/multi_head_attention_layer_test.cpp @@ -196,6 +196,18 @@ void multi_head_attention_cpu(T *in1, T *in2, T *output, int64_t b, int64_t h, i } } +template <> +void multi_head_attention_cpu(__half *in1, __half *in2, __half *output, int64_t b, int64_t h, + int64_t m, int64_t n, int64_t k) { + transpose(in2, b, h, n, k); + matmul_cpu(in1, in2, output, b, h, m, k, n); + // Just to revert in2 back + transpose(in2, b, h, k, n); + for (int64_t i = 0; i < b * h * m * n; i++) { + output[i] = __half2float(output[i]) / ((float)sqrt(k)); + } +} + template void multi_head_attention_cpu_noT(T *in1, T *in2, T *output, int64_t b, int64_t h, int64_t m, int64_t n, int64_t k) { @@ -232,6 +244,28 @@ void multi_head_attention_dgrad_cpu(T *out, T **h_ins, T **h_b_ins, int64_t b, i h_b_ins[1][i] = h_b_ins[1][i] / ((float)sqrt(k)); } } + +template <> +void multi_head_attention_dgrad_cpu(__half *out, __half **h_ins, __half **h_b_ins, int64_t b, + int64_t h, int64_t m, int64_t n, int64_t k) { + // transpose(h_ins[1], h, b, n, k); + // transpose(h_ins[0], h, b, m, n); + // out [b,h,m,n] + // in1 [b,h,m,k] + // in2 [b,h,n,k] + matmul_cpu(out, h_ins[1], h_b_ins[0], b, h, m, n, k); + transpose(out, b, h, m, n); + matmul_cpu(out, h_ins[0], h_b_ins[1], b, h, n, m, k); + // Just revert out back + transpose(out, b, h, m, n); + for (int64_t i = 0; i < b * h * m * k; i++) { + h_b_ins[0][i] = __half2float(h_b_ins[0][i]) / ((float)sqrt(k)); + } + for (int64_t i = 0; i < b * h * n * k; i++) { + h_b_ins[1][i] = __half2float(h_b_ins[1][i]) / ((float)sqrt(k)); + } +} + template void multi_head_attention_dgrad_3d_cpu(T *out, T *value_out, T **h_ins, T **h_b_ins, int64_t batch_size, int64_t head_num, int64_t seq_len, diff --git a/test/utest/layers/sequence_mask_layer_test.cpp b/test/utest/layers/sequence_mask_layer_test.cpp index aa38ebb407..b023aa097a 100644 --- a/test/utest/layers/sequence_mask_layer_test.cpp +++ b/test/utest/layers/sequence_mask_layer_test.cpp @@ -46,7 +46,7 @@ struct Eps<__half> { template void f2i_input(T* input, size_t in_size, size_t max_sequence_len) { for (size_t i = 0; i < in_size; i++) { - input[i] = abs(floor(input[i] * max_sequence_len)); + input[i] = abs(floor(__half2float(input[i]) * max_sequence_len)); } } diff --git 
a/test/utest/layers/sequence_mask_layer_test_old.cpp b/test/utest/layers/sequence_mask_layer_test_old.cpp
index dec57c31cb..638a409b11 100644
--- a/test/utest/layers/sequence_mask_layer_test_old.cpp
+++ b/test/utest/layers/sequence_mask_layer_test_old.cpp
@@ -43,7 +43,7 @@ struct Eps<__half> {
 template <typename T>
 void f2i_input(T* input, size_t in_size, size_t max_sequence_len) {
   for (size_t i = 0; i < in_size; i++) {
-    input[i] = abs(floor(input[i] * max_sequence_len));
+    input[i] = abs(floor(__half2float(input[i]) * max_sequence_len));
   }
 }
 
diff --git a/test/utest/test_utils.hpp b/test/utest/test_utils.hpp
index ef4210d10c..decf23e2d3 100644
--- a/test/utest/test_utils.hpp
+++ b/test/utest/test_utils.hpp
@@ -53,7 +53,7 @@ void normal_sync_cpu(T* data, int64_t num_elements, const float mean, const floa
 
 template <typename T>
 T abs(const T& val) {
-  return val > T(0) ? val : -val;
+  return val > T(0.0) ? val : -val;
 }
 
 template <typename T>
@@ -72,6 +72,22 @@ ::testing::AssertionResult compare_array_approx(const T* h_out, const T* h_exp,
   return ::testing::AssertionSuccess();
 }
 
+template <>
+::testing::AssertionResult compare_array_approx(const __half* h_out, const __half* h_exp, int len,
+                                                __half eps) {
+  for (int i = 0; i < len; ++i) {
+    float output = __half2float(h_out[i]);
+    float expected = __half2float(h_exp[i]);
+    float diff = abs(output - expected);
+    if (diff > __half2float(eps)) {
+      return ::testing::AssertionFailure()
+             << "output: " << output << " != expected: " << expected << " at idx " << i
+             << std::endl;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
 template <typename T>
 ::testing::AssertionResult compare_array_approx(const T* h_out, const T expected, int len, T eps) {
   for (int i = 0; i < len; ++i) {
@@ -118,6 +134,22 @@ ::testing::AssertionResult compare_array_approx_rel(const T* h_out, const T* h_e
   return ::testing::AssertionSuccess();
 }
 
+template <>
+::testing::AssertionResult compare_array_approx_rel(const __half* h_out, const __half* h_exp,
+                                                    int len, __half max_rel_err,
+                                                    __half max_abs_err) {
+  for (int i = 0; i < len; ++i) {
+    float output = __half2float(h_out[i]);
+    float expected = __half2float(h_exp[i]);
+    float abs_err = abs(output - expected);
+    float rel_err = abs_err / expected;
+    if (abs_err > __half2float(max_abs_err) && rel_err > __half2float(max_rel_err)) {
+      return ::testing::AssertionFailure()
+             << "output: " << output << " != expected: " << expected << " at idx " << i;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
 __forceinline__ bool cpu_gpu_cmp(float* cpu_p, float* gpu_p, int len) {
   float* gpu_tmp = (float*)malloc(sizeof(float) * len);
   cudaMemcpy(gpu_tmp, gpu_p, sizeof(float) * len, cudaMemcpyDeviceToHost);
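
Every hunk in this patch applies the same pattern: host-side reference code (the CPU layer implementations and the test helpers) stops relying on implicit __half arithmetic and comparison operators, which stop compiling for the host under CUDA 12.2 (the "math operator error" in the subject line), and instead converts operands to float with __half2float, does the math in float, and converts back with __float2half only when storing a __half result. A minimal standalone sketch of the pattern, assuming only cuda_fp16.h; the half_sum_positive function and its test values are illustrative, not code from this patch:

  // Hypothetical sketch of the conversion pattern used throughout this patch.
  #include <cuda_fp16.h>

  #include <cstdio>

  // Before (may fail to compile for the host on CUDA 12.2):
  //   if (in[i] < 0) ...   // implicit __half vs. int comparison
  //   sum += in[i];        // implicit __half -> float conversion in arithmetic
  // After: convert explicitly, compute in float.
  float half_sum_positive(const __half* in, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
      float v = __half2float(in[i]);  // explicit half -> float
      if (v < 0.0f) continue;         // compare in float
      sum += v;                       // accumulate in float
    }
    return sum;
  }

  int main() {
    __half v[4];
    for (int i = 0; i < 4; ++i) v[i] = __float2half(i - 1.0f);  // -1, 0, 1, 2
    std::printf("%f\n", half_sum_positive(v, 4));               // prints 3.000000
    return 0;
  }

Doing reductions in a float accumulator is also numerically safer than chaining __half additions, which is why the sum and bias_grad loops above already kept float accumulators; only the element loads and comparisons needed the explicit conversions.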