Merge branch 'fix_operator_error_in_mlperf' into 'main'
Fix math operator error on CUDA 12.2

See merge request dl/hugectr/hugectr!1426
minseokl committed Aug 2, 2023
2 parents f5452a6 + 6f156df commit 734dc44
Showing 35 changed files with 271 additions and 106 deletions.
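
Every hunk below applies the same one-line pattern: host-side reference code that fed __half values directly into float arithmetic or comparisons (for example sum += a[ai] * b[bi]; or t < 0) now converts explicitly with __half2float first, converting back with __float2half only where a __half result is stored. The sketch below illustrates the two shapes of the fix; it assumes the implicit __half operators and conversions are what stopped compiling cleanly under the CUDA 12.2 headers, and the file and function names are illustrative rather than taken from the repository.

// half_fix_sketch.cu -- illustrative sketch only; compile with: nvcc -c half_fix_sketch.cu
#include <cuda_fp16.h>  // __half, __half2float, __float2half

// Accumulate __half products in a float accumulator, converting each product
// explicitly before it meets float arithmetic (same shape as the cpu_mm fixes below).
float half_dot(const __half* a, const __half* b, int n) {
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    sum += __half2float(a[i] * b[i]);  // explicit __half -> float
  }
  return sum;
}

// Compare in float instead of relying on an implicit __half-versus-int
// comparison (same shape as the ReLU-style fixes below).
__half relu_half(__half t) {
  return (__half2float(t) < 0.0f) ? __float2half(0.0f) : t;
}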
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp
@@ -142,7 +142,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -162,7 +162,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += (__half2float(in[idx]) - mean);
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
3 changes: 2 additions & 1 deletion HugeCTR/src/cpu/layers/elu_layer_cpu.cpp
@@ -27,7 +27,8 @@ namespace {
template <typename T>
void elu_cpu(const T* in, T* out, int len, T alpha) {
for (int i = 0; i < len; ++i) {
out[i] = (in[i] < 0) ? T(alpha * (exp(in[i]) - 1)) : in[i];
out[i] =
(__half2float(in[i]) < 0) ? T(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) : in[i];
}
}

4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp
@@ -91,8 +91,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad
}
for (int k = 0; k < slot_num; k++) {
int index = offset + k * emb_vec_size;
dgrad[index] =
__float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index]));
dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) *
(sum - __half2float(in[index])));
}
}
}
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp
@@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -47,7 +47,7 @@ void cpu_add_bias(__half* top, const __half* bias, int m, int n) {
void cpu_reverse_add_bias(__half* bias_grad, const __half* top, int m, int n) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}
8 changes: 4 additions & 4 deletions HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp
@@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -41,7 +41,7 @@ void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, int m,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -50,7 +50,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half
int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (middle[i * n + j] < 0) {
if (__half2float(middle[i * n + j]) < 0) {
middle[i * n + j] = 0.0f;
} else {
middle[i * n + j] = top[i * n + j];
@@ -59,7 +59,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += middle[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]);
bias_grad[i] = sum;
}
}
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp
@@ -74,8 +74,8 @@ void matmul_cpu(size_t height, size_t in_width, size_t n_ins, T *h_concat, T *h_
for (size_t n = 0; n < n_ins; n++) {
float accum = 0.0f;
for (size_t k = 0; k < in_width; k++) {
accum += h_concat[concat_stride + m * in_width + k] *
h_concat[concat_stride + n * in_width + k];
accum += __half2float(h_concat[concat_stride + m * in_width + k] *
h_concat[concat_stride + n * in_width + k]);
}
h_mat[mat_stride + m * n_ins + n] = accum;
}
@@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += __half2float(in[idx]) - mean;
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
@@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -51,15 +51,15 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) {
static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const
int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (middle[i * n + j] < 0) {
if (__half2float(middle[i * n + j]) < 0) {
middle[i * n + j] = 0.0f;
} else {
middle[i * n + j] = top[i * n + j];
@@ -64,15 +64,15 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += middle[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __
int m, int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (top[i * n + j] < 0) {
if (__half2float(top[i * n + j]) < 0) {
top[i * n + j] = 0.0f;
} else {
top[i * n + j] = bprop_out[i * n + j];
@@ -64,15 +64,15 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps);
out[idx] = __float2half(gamma[j] * in_norm + beta[j]);
out[idx] = gamma[j] * __float2half(in_norm) + beta[j];
}
}
}
@@ -176,23 +176,24 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half*
float d_var = 0.0f;
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean);
float val = __half2float(out[idx] * gamma[j]) * (__half2float(in[idx]) - mean);
d_var += val;
}
d_var *= (-0.5f) * pow(inv_std, 3);

float d_mu = 0.0f;
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
d_mu += __half2float(out[idx]) * gamma[j] * inv_std;
d_mu += __half2float(out[idx] * gamma[j]) * inv_std;
}
d_mu *= (-1.0f / num_feature);

for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std;
gamma_grad[j] =
gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std);
beta_grad[j] = beta_grad[j] + out[idx];
in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std +
in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std +
d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu);
}
}
17 changes: 9 additions & 8 deletions test/utest/core23_dev_test/trainable_layers/mlp_test.cpp
@@ -65,7 +65,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = static_cast<half>(beta * static_cast<float>(c[i * n + j]) + sum);
}
@@ -80,7 +80,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu,
T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f));
middle[i * n + j] = t;
if (is_relu)
top[i * n + j] = t < 0 ? T(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t;
else
top[i * n + j] = t;
}
@@ -92,7 +92,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const
int m, int n, bool is_tail, bool use_bias) {
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) {
if ((__half2float(middle[i * n + j]) <= 0 && is_tail) ||
(__half2float(middle[i * n + j]) < 0 && !is_tail)) {
dRelu[i * n + j] = 0.0f;
} else {
dRelu[i * n + j] = bprop_out[i * n + j];
@@ -102,7 +103,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const
if (use_bias) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += dRelu[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]);
bias_grad[i] = sum;
}
}
@@ -135,11 +136,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho
HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n");
return INT_MAX;
}
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (arr2[i] == 0 && fabs(arr1[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) {
HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]);
m++;
} else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) {
} else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) {
HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]);
m++;
}
@@ -484,7 +485,7 @@ static void mlp_test(std::vector<Layer_t> network,
for (uint32_t col = 0; col < fc_out_dims[i]; col++) {
float sum = 0.0;
for (uint32_t row = 0; row < batch_size; row++) {
sum = sum + p.h_top_grad[i].get()[row * fc_out_dims[i] + col];
sum = sum + __half2float(p.h_top_grad[i].get()[row * fc_out_dims[i] + col]);
}
p.h_bias_grad[i].get()[col] = sum;
}
@@ -185,7 +185,7 @@ class MultiCrossLayerTest {
// column of A is rowA
acc = acc + (A[r * rowB + k] * B[c * rowB + k]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
} else if (transA) {
@@ -197,7 +197,7 @@ class MultiCrossLayerTest {
// column of A is rowA
acc = acc + (A[k * rowA + r] * B[k * colB + c]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
} else {
@@ -208,7 +208,7 @@ class MultiCrossLayerTest {
// column of A is rowB
acc = acc + (A[r * rowB + k] * B[k * colB + c]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
}
2 changes: 1 addition & 1 deletion test/utest/layers/add_layer_test.cpp
@@ -49,7 +49,7 @@ void add_cpu(Vector input, T *output, size_t size, size_t num) {
for (auto i = 0; i < size; i++) {
float tmp = 0.f;
for (size_t j = 0; j < num; j++) {
tmp += input[j][i];
tmp += (float)input[j][i];
}
output[i] = tmp;
}
4 changes: 2 additions & 2 deletions test/utest/layers/batch_norm_layer_test.cpp
@@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += __half2float(in[idx]) - mean;
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
(Diffs for the remaining changed files were not loaded in this view.)
