From 6f156dffd88824d316eabbf24b3f1cf70d0619f8 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Wed, 2 Aug 2023 05:56:53 -0700 Subject: [PATCH] Fix math operator error on CUDA 12.2 --- .../src/cpu/layers/batch_norm_layer_cpu.cpp | 4 +- HugeCTR/src/cpu/layers/elu_layer_cpu.cpp | 3 +- .../src/cpu/layers/fm_order2_layer_cpu.cpp | 4 +- .../layers/fully_connected_layer_half_cpu.cpp | 4 +- .../fused_fully_connected_layer_cpu.cpp | 8 ++-- .../src/cpu/layers/interaction_layer_cpu.cpp | 4 +- .../batch_norm_layer_test.cpp | 4 +- .../fully_connected_layer_half_test.cpp | 6 +-- .../fused_fully_connected_layer_test.cpp | 10 ++--- ...d_relu_bias_fully_connected_layer_test.cpp | 10 ++--- .../layer_norm_layer_test.cpp | 11 +++--- .../trainable_layers/mlp_test.cpp | 17 +++++---- .../multi_cross_layer_test.cpp | 6 +-- test/utest/layers/add_layer_test.cpp | 2 +- test/utest/layers/batch_norm_layer_test.cpp | 4 +- test/utest/layers/dropout_layer_test.cpp | 4 +- test/utest/layers/dropout_layer_test_old.cpp | 4 +- .../elementwise_multiply_layer_test.cpp | 2 +- test/utest/layers/elu_layer_test.cpp | 19 ++++++++++ test/utest/layers/elu_layer_test_old.cpp | 19 ++++++++++ test/utest/layers/fm_order2_layer_test.cpp | 3 +- .../utest/layers/fm_order2_layer_test_old.cpp | 4 +- .../fully_connected_layer_half_test.cpp | 6 +-- .../fused_fully_connected_layer_test.cpp | 10 ++--- ...d_relu_bias_fully_connected_layer_test.cpp | 10 ++--- test/utest/layers/group_dense_layer_test.cpp | 17 +++++---- test/utest/layers/interaction_layer_test.cpp | 29 +++++++++++++-- .../layers/interaction_layer_test_old.cpp | 29 +++++++++++++-- test/utest/layers/layer_norm_layer_test.cpp | 12 +++--- test/utest/layers/mlp_test.cpp | 19 +++++----- test/utest/layers/multi_cross_layer_test.cpp | 18 +++++++-- .../multi_head_attention_layer_test.cpp | 34 +++++++++++++++++ .../utest/layers/sequence_mask_layer_test.cpp | 2 +- .../layers/sequence_mask_layer_test_old.cpp | 2 +- test/utest/test_utils.hpp | 37 ++++++++++++++++++- 35 files changed, 271 insertions(+), 106 deletions(-) diff --git a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp b/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp index c9d5211dbb..85fe14dfbd 100644 --- a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp @@ -142,7 +142,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -162,7 +162,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += (__half2float(in[idx]) - mean); } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp b/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp index 01b75a00a1..891de22d61 100644 --- a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp @@ -27,7 +27,8 @@ namespace { template void elu_cpu(const T* in, T* out, int len, T alpha) { for (int i = 0; i < len; ++i) { - out[i] = (in[i] < 0) ? T(alpha * (exp(in[i]) - 1)) : in[i]; + out[i] = + (__half2float(in[i]) < 0) ? 
T(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) : in[i]; } } diff --git a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp index a593e1a623..0293af5f0e 100644 --- a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp @@ -91,8 +91,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) * + (sum - __half2float(in[index]))); } } } diff --git a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp b/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp index 52e23dc1ad..1b2566327b 100644 --- a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp @@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -47,7 +47,7 @@ void cpu_add_bias(__half* top, const __half* bias, int m, int n) { void cpu_reverse_add_bias(__half* bias_grad, const __half* top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } diff --git a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp index a25317dc1b..d2da98d944 100644 --- a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp @@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -41,7 +41,7 @@ void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, int m, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; } } } @@ -50,7 +50,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -59,7 +59,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } diff --git a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp b/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp index 8eb424f77a..74e3ea977f 100644 --- a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp +++ b/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp @@ -74,8 +74,8 @@ void matmul_cpu(size_t height, size_t in_width, size_t n_ins, T *h_concat, T *h_ for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += __half2float(h_concat[concat_stride + m * in_width + k] * + h_concat[concat_stride + n * in_width + k]); } h_mat[mat_stride + m * n_ins + n] = accum; } diff --git a/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp index e47a30027a..f9201ca408 100644 --- a/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/batch_norm_layer_test.cpp @@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += __half2float(in[idx]) - mean; } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp b/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp index 372bcef6f9..97b1b93d94 100644 --- a/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fully_connected_layer_half_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -51,7 +51,7 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) { static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -59,7 +59,7 @@ static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, in static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp index 9dc200d0d9..2c9102e422 100644 --- a/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fused_fully_connected_layer_test.cpp @@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -64,7 +64,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } @@ -72,7 +72,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp index ea54c8a397..9f984cd941 100644 --- a/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/fused_relu_bias_fully_connected_layer_test.cpp @@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ int m, int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (top[i * n + j] < 0) { + if (__half2float(top[i * n + j]) < 0) { top[i * n + j] = 0.0f; } else { top[i * n + j] = bprop_out[i * n + j]; @@ -64,7 +64,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -72,7 +72,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp index 4709f91f03..281586a143 100644 --- a/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/layer_norm_layer_test.cpp @@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps); - out[idx] = __float2half(gamma[j] * in_norm + beta[j]); + out[idx] = gamma[j] * __float2half(in_norm) + beta[j]; } } } @@ -176,7 +176,7 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_var = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean); + float val = __half2float(out[idx] * gamma[j]) * (__half2float(in[idx]) - mean); d_var += val; } d_var *= (-0.5f) * pow(inv_std, 3); @@ -184,15 +184,16 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_mu = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - d_mu += __half2float(out[idx]) * gamma[j] * inv_std; + d_mu += __half2float(out[idx] * gamma[j]) * inv_std; } d_mu *= (-1.0f / num_feature); for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std; + gamma_grad[j] = + gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std); beta_grad[j] = beta_grad[j] + out[idx]; - in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std + + in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std + d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu); } } diff --git a/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp b/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp index 0b0dd5fa6b..4220cb2041 100644 --- 
a/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/mlp_test.cpp @@ -65,7 +65,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -80,7 +80,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu, T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f)); middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? T(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t; else top[i * n + j] = t; } @@ -92,7 +92,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const int m, int n, bool is_tail, bool use_bias) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -102,7 +103,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const if (use_bias) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -135,11 +136,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -484,7 +485,7 @@ static void mlp_test(std::vector network, for (uint32_t col = 0; col < fc_out_dims[i]; col++) { float sum = 0.0; for (uint32_t row = 0; row < batch_size; row++) { - sum = sum + p.h_top_grad[i].get()[row * fc_out_dims[i] + col]; + sum = sum + __half2float(p.h_top_grad[i].get()[row * fc_out_dims[i] + col]); } p.h_bias_grad[i].get()[col] = sum; } diff --git a/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp b/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp index 2b25676242..b85c27aeb5 100644 --- a/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp +++ b/test/utest/core23_dev_test/trainable_layers/multi_cross_layer_test.cpp @@ -185,7 +185,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[r * rowB + k] * B[c * rowB + k]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc)); } } } else if (transA) { @@ -197,7 +197,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[k * rowA + r] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + 
__half2float(acc)); } } } else { @@ -208,7 +208,7 @@ class MultiCrossLayerTest { // column of A is rowB acc = acc + (A[r * rowB + k] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc)); } } } diff --git a/test/utest/layers/add_layer_test.cpp b/test/utest/layers/add_layer_test.cpp index dbc3cef9dd..a5471c6b3a 100644 --- a/test/utest/layers/add_layer_test.cpp +++ b/test/utest/layers/add_layer_test.cpp @@ -49,7 +49,7 @@ void add_cpu(Vector input, T *output, size_t size, size_t num) { for (auto i = 0; i < size; i++) { float tmp = 0.f; for (size_t j = 0; j < num; j++) { - tmp += input[j][i]; + tmp += (float)input[j][i]; } output[i] = tmp; } diff --git a/test/utest/layers/batch_norm_layer_test.cpp b/test/utest/layers/batch_norm_layer_test.cpp index 5b057d825c..7a8ebbbf20 100644 --- a/test/utest/layers/batch_norm_layer_test.cpp +++ b/test/utest/layers/batch_norm_layer_test.cpp @@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* float var = 0.0f; for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; - float diff = in[idx] - mean; + float diff = __half2float(in[idx]) - mean; var += (diff * diff); } var /= batch_size; @@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* for (int i = 0; i < batch_size; i++) { int idx = i * num_feature + j; val1 += (__half2float(out[idx]) * gamma[j]); - val2 += __half2float(in[idx] - mean); + val2 += __half2float(in[idx]) - mean; } val1 *= (-inv_std); val2 *= (d_var / batch_size) * -2; diff --git a/test/utest/layers/dropout_layer_test.cpp b/test/utest/layers/dropout_layer_test.cpp index 5a205bbb35..ee9c2166c0 100644 --- a/test/utest/layers/dropout_layer_test.cpp +++ b/test/utest/layers/dropout_layer_test.cpp @@ -72,7 +72,7 @@ void dropout_test(int64_t dim0, int64_t dim1, float rate) { int cnt_zero_fprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_top[i] - 0.f) < eps) { + if (std::abs(__half2float(h_top[i]) - 0.f) < eps) { cnt_zero_fprop++; } } @@ -91,7 +91,7 @@ void dropout_test(int64_t dim0, int64_t dim1, float rate) { core23::DeviceType::CPU, bottom_tensor.device()); int cnt_zero_bprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_bottom[i] - 0.f) < eps) { + if (std::abs(__half2float(h_bottom[i]) - 0.f) < eps) { cnt_zero_bprop++; } } diff --git a/test/utest/layers/dropout_layer_test_old.cpp b/test/utest/layers/dropout_layer_test_old.cpp index c4dcc6f4f7..e2331912f5 100644 --- a/test/utest/layers/dropout_layer_test_old.cpp +++ b/test/utest/layers/dropout_layer_test_old.cpp @@ -64,7 +64,7 @@ void dropout_test(size_t dim0, size_t dim1, float rate) { HCTR_LIB_THROW(cudaMemcpy(h_out.get(), d_out, n_bytes, cudaMemcpyDeviceToHost)); int cnt_zero_fprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_out[i] - 0.f) < eps) { + if (std::abs(__half2float(h_out[i]) - 0.f) < eps) { cnt_zero_fprop++; } } @@ -82,7 +82,7 @@ void dropout_test(size_t dim0, size_t dim1, float rate) { HCTR_LIB_THROW(cudaMemcpy(h_in.get(), d_in, n_bytes, cudaMemcpyDeviceToHost)); int cnt_zero_bprop = 0; for (int i = 0; i < len; i++) { - if (std::abs(h_out[i] - 0.f) < eps) { + if (std::abs(__half2float(h_out[i]) - 0.f) < eps) { cnt_zero_bprop++; } } diff --git a/test/utest/layers/elementwise_multiply_layer_test.cpp b/test/utest/layers/elementwise_multiply_layer_test.cpp index 111c6f4126..61d331a442 100644 --- 
a/test/utest/layers/elementwise_multiply_layer_test.cpp +++ b/test/utest/layers/elementwise_multiply_layer_test.cpp @@ -58,7 +58,7 @@ void elementwise_multiply_dgrad_cpu(const T *top_grad, T **dgrad, const T *fprop for (size_t i = 0; i < size; i++) { for (size_t j = 0; j < num; j++) { - if (0 == fprop_output[i]) { + if (0 == __half2float(fprop_output[i])) { dgrad[j][i] = zero; } else { T d_input = dgrad[j][i]; diff --git a/test/utest/layers/elu_layer_test.cpp b/test/utest/layers/elu_layer_test.cpp index 032a60951b..5f6805318f 100644 --- a/test/utest/layers/elu_layer_test.cpp +++ b/test/utest/layers/elu_layer_test.cpp @@ -56,6 +56,25 @@ void elu_bprop_cpu(const T* d_out, T* d_in, int len, T alpha) { } } +template <> +void elu_cpu(const __half* in, __half* out, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + out[i] = (__half2float(in[i]) < 0) + ? __half(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) + : in[i]; + } +} + +template <> +void elu_bprop_cpu(const __half* d_out, __half* d_in, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + d_in[i] = + (__half2float(d_in[i]) < 0) + ? __half(__half2float(alpha) * exp(__half2float(d_in[i])) * __half2float(d_out[i])) + : d_out[i]; + } +} + template void elu_test(int64_t dim0, int64_t dim1, T alpha) { constexpr bool use_mixed_precision = std::is_same_v; diff --git a/test/utest/layers/elu_layer_test_old.cpp b/test/utest/layers/elu_layer_test_old.cpp index dfac55cb4f..944031fd0a 100644 --- a/test/utest/layers/elu_layer_test_old.cpp +++ b/test/utest/layers/elu_layer_test_old.cpp @@ -53,6 +53,25 @@ void elu_bprop_cpu(const T* d_out, T* d_in, int len, T alpha) { } } +template <> +void elu_cpu(const __half* in, __half* out, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + out[i] = (__half2float(in[i]) < 0) + ? __half(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) + : in[i]; + } +} + +template <> +void elu_bprop_cpu(const __half* d_out, __half* d_in, int len, __half alpha) { + for (int i = 0; i < len; ++i) { + d_in[i] = + (__half2float(d_in[i]) < 0) + ? 
__half(__half2float(alpha) * exp(__half2float(d_in[i])) * __half2float(d_out[i])) + : d_out[i]; + } +} + template void elu_test(size_t dim0, size_t dim1, T alpha) { std::shared_ptr> buf = GeneralBuffer2::create(); diff --git a/test/utest/layers/fm_order2_layer_test.cpp b/test/utest/layers/fm_order2_layer_test.cpp index dc341b8002..aa0b3a07ae 100644 --- a/test/utest/layers/fm_order2_layer_test.cpp +++ b/test/utest/layers/fm_order2_layer_test.cpp @@ -109,8 +109,7 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = top_grad[i * emb_vec_size + j] * (__float2half(sum) - in[index]); } } } diff --git a/test/utest/layers/fm_order2_layer_test_old.cpp b/test/utest/layers/fm_order2_layer_test_old.cpp index 8fddc654ca..5177749878 100644 --- a/test/utest/layers/fm_order2_layer_test_old.cpp +++ b/test/utest/layers/fm_order2_layer_test_old.cpp @@ -108,8 +108,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad } for (int k = 0; k < slot_num; k++) { int index = offset + k * emb_vec_size; - dgrad[index] = - __float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index])); + dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) * + (sum - __half2float(in[index]))); } } } diff --git a/test/utest/layers/fully_connected_layer_half_test.cpp b/test/utest/layers/fully_connected_layer_half_test.cpp index b6da5c3368..dfc8ccbbb2 100644 --- a/test/utest/layers/fully_connected_layer_half_test.cpp +++ b/test/utest/layers/fully_connected_layer_half_test.cpp @@ -32,7 +32,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = sum; } @@ -50,7 +50,7 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) { static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -58,7 +58,7 @@ static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, in static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/fused_fully_connected_layer_test.cpp b/test/utest/layers/fused_fully_connected_layer_test.cpp index acd3e10ed6..eb7ec0dff1 100644 --- a/test/utest/layers/fused_fully_connected_layer_test.cpp +++ b/test/utest/layers/fused_fully_connected_layer_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai]) * __half2float(b[bi]); } c[i * n + j] = sum; } @@ -45,7 +45,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; } } } @@ -54,7 +54,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (middle[i * n + j] < 0) { + if (__half2float(middle[i * n + j]) < 0) { middle[i * n + j] = 0.0f; } else { middle[i * n + j] = top[i * n + j]; @@ -63,7 +63,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += middle[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); bias_grad[i] = sum; } } @@ -71,7 +71,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp b/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp index 0370ac750c..721c78a195 100644 --- a/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp +++ b/test/utest/layers/fused_relu_bias_fully_connected_layer_test.cpp @@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai]) * __half2float(b[bi]); } c[i * n + j] = sum; } @@ -45,7 +45,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias, for (int j = 0; j < n; ++j) { __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; } } } @@ -54,7 +54,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ int m, int n) { for (int i = 0; i < m; ++i) for (int j = 0; j < n; ++j) { - if (top[i * n + j] < 0) { + if (__half2float(top[i * n + j]) < 0) { top[i * n + j] = 0.0f; } else { top[i * n + j] = bprop_out[i * n + j]; @@ -63,7 +63,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += top[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); bias_grad[i] = sum; } } @@ -71,7 +71,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __ static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) { size_t m = 0; for (size_t i = 0; i < n; i++) { - if (fabs(arr1[i] - arr2[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { m++; } } diff --git a/test/utest/layers/group_dense_layer_test.cpp b/test/utest/layers/group_dense_layer_test.cpp index 11a7516655..85aaa0ff81 100644 --- a/test/utest/layers/group_dense_layer_test.cpp +++ b/test/utest/layers/group_dense_layer_test.cpp @@ -65,7 +65,7 @@ static void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -79,7 +79,7 @@ static void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, __half t = top[i * n + j] + bias[j]; middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? __float2half(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? 
__float2half(0.0f) : t; else top[i * n + j] = t; } @@ -90,7 +90,8 @@ static void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* dRelu, __half const __half* bprop_out, int m, int n, bool is_tail) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -99,7 +100,7 @@ static void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* dRelu, __half } for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -130,11 +131,11 @@ static float compare_array(const __half* arr1, const __half* arr2, size_t n, flo HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -451,7 +452,7 @@ static void group_dense_layer_test(uint32_t* input_dims, uint32_t* output_dims, for (uint32_t col = 0; col < output_dims[i]; col++) { float sum = 0.0; for (uint32_t row = 0; row < batch_size; row++) { - sum = sum + h_top_grad[i][row * output_dims[i] + col]; + sum = sum + __half2float(h_top_grad[i][row * output_dims[i] + col]); } h_bias_grad[i][col] = sum; } diff --git a/test/utest/layers/interaction_layer_test.cpp b/test/utest/layers/interaction_layer_test.cpp index 320a7a3657..2165005f91 100644 --- a/test/utest/layers/interaction_layer_test.cpp +++ b/test/utest/layers/interaction_layer_test.cpp @@ -45,6 +45,26 @@ __half get_eps(bool use_tf32) { return __float2half(1); } +template +float get_accum(const T a, const T b) { + return float(a * b); +} + +template <> +float get_accum(const __half a, const __half b) { + return __half2float(a) * __half2float(b); +} + +template +float get_sec_accum(const T a, const T b, const T c) { + return float((a + b) * c); +} + +template <> +float get_sec_accum(const __half a, const __half b, const __half c) { + return (__half2float(a) + __half2float(b)) * __half2float(c); +} + template void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, bool enable_tf32_compute = false) { @@ -141,8 +161,8 @@ void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += get_accum(h_concat[concat_stride + m * in_width + k], + h_concat[concat_stride + n * in_width + k]); } h_matmul[mat_stride + m * n_ins + n] = accum; } @@ -217,8 +237,9 @@ void interaction_layer_test(int64_t height, int64_t n_emb, int64_t in_width, for (size_t n = 0; n < in_width; n++) { float accum = 0.0f; for (size_t k = 0; k < n_ins; k++) { - accum += (h_matmul[mat_stride + m * n_ins + k] + h_matmul[mat_stride + k * n_ins + 
m]) * - h_concat_tmp[concat_stride + k * in_width + n]; + accum += get_sec_accum(h_matmul[mat_stride + m * n_ins + k], + h_matmul[mat_stride + k * n_ins + m], + h_concat_tmp[concat_stride + k * in_width + n]); } h_concat[concat_stride + m * in_width + n] = 1.0f * accum; } diff --git a/test/utest/layers/interaction_layer_test_old.cpp b/test/utest/layers/interaction_layer_test_old.cpp index 4b35f9bcc2..6b7f10b789 100644 --- a/test/utest/layers/interaction_layer_test_old.cpp +++ b/test/utest/layers/interaction_layer_test_old.cpp @@ -40,6 +40,26 @@ __half get_eps(bool use_tf32) { return __float2half(1); } +template +float get_accum(const T a, const T b) { + return float(a * b); +} + +template <> +float get_accum(const __half a, const __half b) { + return __half2float(a) * __half2float(b); +} + +template +float get_sec_accum(const T a, const T b, const T c) { + return float((a + b) * c); +} + +template <> +float get_sec_accum(const __half a, const __half b, const __half c) { + return (__half2float(a) + __half2float(b)) * __half2float(c); +} + template void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, bool enable_tf32_compute = false) { @@ -138,8 +158,8 @@ void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, for (size_t n = 0; n < n_ins; n++) { float accum = 0.0f; for (size_t k = 0; k < in_width; k++) { - accum += h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]; + accum += get_accum(h_concat[concat_stride + m * in_width + k], + h_concat[concat_stride + n * in_width + k]); } h_mat[mat_stride + m * n_ins + n] = accum; } @@ -217,8 +237,9 @@ void interaction_layer_test(size_t height, size_t n_emb, size_t in_width, for (size_t n = 0; n < in_width; n++) { float accum = 0.0f; for (size_t k = 0; k < n_ins; k++) { - accum += (h_mat[mat_stride + m * n_ins + k] + h_mat[mat_stride + k * n_ins + m]) * - h_concat_tmp[concat_stride + k * in_width + n]; + accum += + get_sec_accum(h_mat[mat_stride + m * n_ins + k], h_mat[mat_stride + k * n_ins + m], + h_concat_tmp[concat_stride + k * in_width + n]); } h_concat[concat_stride + m * in_width + n] = 1.0f * accum; } diff --git a/test/utest/layers/layer_norm_layer_test.cpp b/test/utest/layers/layer_norm_layer_test.cpp index ba916f61ed..c790fbb8a5 100644 --- a/test/utest/layers/layer_norm_layer_test.cpp +++ b/test/utest/layers/layer_norm_layer_test.cpp @@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps); - out[idx] = __float2half(gamma[j] * in_norm + beta[j]); + out[idx] = gamma[j] * __float2half(in_norm) + beta[j]; } } } @@ -176,7 +176,8 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_var = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean); + float val = + (__half2float(out[idx]) * __half2float(gamma[j])) * (__half2float(in[idx]) - mean); d_var += val; } d_var *= (-0.5f) * pow(inv_std, 3); @@ -184,15 +185,16 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half float d_mu = 0.0f; for (int j = 0; j < num_feature; j++) { int idx = i * num_feature + j; - d_mu += __half2float(out[idx]) * gamma[j] * inv_std; + d_mu += __half2float(out[idx]) * __half2float(gamma[j]) * inv_std; } d_mu *= (-1.0f / num_feature); for (int j = 0; j < 
num_feature; j++) { int idx = i * num_feature + j; - gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std; + gamma_grad[j] = + gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std); beta_grad[j] = beta_grad[j] + out[idx]; - in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std + + in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std + d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu); } } diff --git a/test/utest/layers/mlp_test.cpp b/test/utest/layers/mlp_test.cpp index 3a0226630e..9d743eee58 100644 --- a/test/utest/layers/mlp_test.cpp +++ b/test/utest/layers/mlp_test.cpp @@ -64,7 +64,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos for (int kk = 0; kk < k; ++kk) { int ai = transpose_a ? kk * m + i : i * k + kk; int bi = transpose_b ? j * k + kk : kk * n + j; - sum += a[ai] * b[bi]; + sum += __half2float(a[ai] * b[bi]); } c[i * n + j] = static_cast(beta * static_cast(c[i * n + j]) + sum); } @@ -79,7 +79,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu, T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f)); middle[i * n + j] = t; if (is_relu) - top[i * n + j] = t < 0 ? T(0.0f) : t; + top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t; else top[i * n + j] = t; } @@ -91,7 +91,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const int m, int n, bool is_tail, bool use_bias) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { - if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) { + if ((__half2float(middle[i * n + j]) <= 0 && is_tail) || + (__half2float(middle[i * n + j]) < 0 && !is_tail)) { dRelu[i * n + j] = 0.0f; } else { dRelu[i * n + j] = bprop_out[i * n + j]; @@ -101,7 +102,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const if (use_bias) { for (int i = 0; i < n; ++i) { float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += dRelu[j * n + i]; + for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]); bias_grad[i] = sum; } } @@ -134,11 +135,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n"); return INT_MAX; } - if (fabs(arr1[i] - arr2[i]) > threshold) { - if (arr2[i] == 0 && fabs(arr1[i]) > threshold) { + if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) { + if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; - } else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) { + } else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) { HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]); m++; } @@ -465,7 +466,7 @@ static void mlp_test(std::vector network, std::vector(network, mlp_num_outputs, use_relu, use_bias, use_fuse_wb, true, input_dim, batch_size, perf_config_set); -}; \ No newline at end of file +}; diff --git a/test/utest/layers/multi_cross_layer_test.cpp b/test/utest/layers/multi_cross_layer_test.cpp index c52bcad136..01f9825363 100644 --- a/test/utest/layers/multi_cross_layer_test.cpp +++ b/test/utest/layers/multi_cross_layer_test.cpp @@ -26,6 +26,16 @@ using namespace HugeCTR; +template +float cal_C(const T a, const float beta, const T b) { + return float(a * beta + b); +} + +template <> +float cal_C(const __half a, const float beta, const __half b) { + return __half2float(a) 
* beta + __half2float(b); +} + template class MultiCrossLayerTest { private: @@ -184,7 +194,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[r * rowB + k] * B[c * rowB + k]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } else if (transA) { @@ -196,7 +206,7 @@ class MultiCrossLayerTest { // column of A is rowA acc = acc + (A[k * rowA + r] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } else { @@ -207,7 +217,7 @@ class MultiCrossLayerTest { // column of A is rowB acc = acc + (A[r * rowB + k] * B[k * colB + c]); } - C[r * colB + c] = C[r * colB + c] * beta + acc; + C[r * colB + c] = cal_C(C[r * colB + c], beta, acc); } } } @@ -877,4 +887,4 @@ TEST(multi_cross_layer_v2, fp16_3x1024x2) { // TEST(multi_cross_layer_v2, fp16_dlrm) { // MultiCrossLayerTest<__half> test(65536, 3456, 3, 512); // test.test(); -// } \ No newline at end of file +// } diff --git a/test/utest/layers/multi_head_attention_layer_test.cpp b/test/utest/layers/multi_head_attention_layer_test.cpp index 5829bccadc..a35aaa7612 100644 --- a/test/utest/layers/multi_head_attention_layer_test.cpp +++ b/test/utest/layers/multi_head_attention_layer_test.cpp @@ -196,6 +196,18 @@ void multi_head_attention_cpu(T *in1, T *in2, T *output, int64_t b, int64_t h, i } } +template <> +void multi_head_attention_cpu(__half *in1, __half *in2, __half *output, int64_t b, int64_t h, + int64_t m, int64_t n, int64_t k) { + transpose(in2, b, h, n, k); + matmul_cpu(in1, in2, output, b, h, m, k, n); + // Just to revert in2 back + transpose(in2, b, h, k, n); + for (int64_t i = 0; i < b * h * m * n; i++) { + output[i] = __half2float(output[i]) / ((float)sqrt(k)); + } +} + template void multi_head_attention_cpu_noT(T *in1, T *in2, T *output, int64_t b, int64_t h, int64_t m, int64_t n, int64_t k) { @@ -232,6 +244,28 @@ void multi_head_attention_dgrad_cpu(T *out, T **h_ins, T **h_b_ins, int64_t b, i h_b_ins[1][i] = h_b_ins[1][i] / ((float)sqrt(k)); } } + +template <> +void multi_head_attention_dgrad_cpu(__half *out, __half **h_ins, __half **h_b_ins, int64_t b, + int64_t h, int64_t m, int64_t n, int64_t k) { + // transpose(h_ins[1], h, b, n, k); + // transpose(h_ins[0], h, b, m, n); + // out [b,h,m,n] + // in1 [b,h,m,k] + // in2 [b,h,n,k] + matmul_cpu(out, h_ins[1], h_b_ins[0], b, h, m, n, k); + transpose(out, b, h, m, n); + matmul_cpu(out, h_ins[0], h_b_ins[1], b, h, n, m, k); + // Just revert out back + transpose(out, b, h, m, n); + for (int64_t i = 0; i < b * h * m * k; i++) { + h_b_ins[0][i] = __half2float(h_b_ins[0][i]) / ((float)sqrt(k)); + } + for (int64_t i = 0; i < b * h * n * k; i++) { + h_b_ins[1][i] = __half2float(h_b_ins[1][i]) / ((float)sqrt(k)); + } +} + template void multi_head_attention_dgrad_3d_cpu(T *out, T *value_out, T **h_ins, T **h_b_ins, int64_t batch_size, int64_t head_num, int64_t seq_len, diff --git a/test/utest/layers/sequence_mask_layer_test.cpp b/test/utest/layers/sequence_mask_layer_test.cpp index aa38ebb407..b023aa097a 100644 --- a/test/utest/layers/sequence_mask_layer_test.cpp +++ b/test/utest/layers/sequence_mask_layer_test.cpp @@ -46,7 +46,7 @@ struct Eps<__half> { template void f2i_input(T* input, size_t in_size, size_t max_sequence_len) { for (size_t i = 0; i < in_size; i++) { - input[i] = abs(floor(input[i] * max_sequence_len)); + input[i] = abs(floor(__half2float(input[i]) * max_sequence_len)); } } diff --git 
a/test/utest/layers/sequence_mask_layer_test_old.cpp b/test/utest/layers/sequence_mask_layer_test_old.cpp
index dec57c31cb..638a409b11 100644
--- a/test/utest/layers/sequence_mask_layer_test_old.cpp
+++ b/test/utest/layers/sequence_mask_layer_test_old.cpp
@@ -43,7 +43,7 @@ struct Eps<__half> {
 template <typename T>
 void f2i_input(T* input, size_t in_size, size_t max_sequence_len) {
   for (size_t i = 0; i < in_size; i++) {
-    input[i] = abs(floor(input[i] * max_sequence_len));
+    input[i] = abs(floor(__half2float(input[i]) * max_sequence_len));
   }
 }
 
diff --git a/test/utest/test_utils.hpp b/test/utest/test_utils.hpp
index ef4210d10c..decf23e2d3 100644
--- a/test/utest/test_utils.hpp
+++ b/test/utest/test_utils.hpp
@@ -53,7 +53,7 @@ void normal_sync_cpu(T* data, int64_t num_elements, const float mean, const floa
 
 template <typename T>
 T abs(const T& val) {
-  return val > T(0) ? val : -val;
+  return val > T(0.0) ? val : -val;
 }
 
 template <typename T>
@@ -72,6 +72,22 @@ ::testing::AssertionResult compare_array_approx(const T* h_out, const T* h_exp,
   return ::testing::AssertionSuccess();
 }
 
+template <>
+::testing::AssertionResult compare_array_approx(const __half* h_out, const __half* h_exp, int len,
+                                                __half eps) {
+  for (int i = 0; i < len; ++i) {
+    float output = __half2float(h_out[i]);
+    float expected = __half2float(h_exp[i]);
+    float diff = abs(output - expected);
+    if (diff > __half2float(eps)) {
+      return ::testing::AssertionFailure()
+             << "output: " << output << " != expected: " << expected << " at idx " << i
+             << std::endl;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
+
 template <typename T>
 ::testing::AssertionResult compare_array_approx(const T* h_out, const T expected, int len, T eps) {
   for (int i = 0; i < len; ++i) {
@@ -118,6 +134,22 @@ ::testing::AssertionResult compare_array_approx_rel(const T* h_out, const T* h_e
   return ::testing::AssertionSuccess();
 }
 
+template <>
+::testing::AssertionResult compare_array_approx_rel(const __half* h_out, const __half* h_exp,
+                                                    int len, __half max_rel_err,
+                                                    __half max_abs_err) {
+  for (int i = 0; i < len; ++i) {
+    float output = __half2float(h_out[i]);
+    float expected = __half2float(h_exp[i]);
+    float abs_err = abs(output - expected);
+    float rel_err = abs_err / expected;
+    if (abs_err > __half2float(max_abs_err) && rel_err > __half2float(max_rel_err)) {
+      return ::testing::AssertionFailure()
+             << "output: " << output << " != expected: " << expected << " at idx " << i;
+    }
+  }
+  return ::testing::AssertionSuccess();
+}
 __forceinline__ bool cpu_gpu_cmp(float* cpu_p, float* gpu_p, int len) {
   float* gpu_tmp = (float*)malloc(sizeof(float) * len);
   cudaMemcpy(gpu_tmp, gpu_p, sizeof(float) * len, cudaMemcpyDeviceToHost);
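
Every hunk in this patch applies the same pattern: host-side reference code (the CPU layer implementations and the test helpers) stops relying on implicit __half arithmetic and comparison operators, which stop compiling for the host under CUDA 12.2 (the "math operator error" in the subject line), and instead converts operands to float with __half2float, does the math in float, and converts back with __float2half only when storing a __half result. A minimal standalone sketch of the pattern, assuming only cuda_fp16.h; the half_sum_positive function and its test values are illustrative, not code from this patch:

  // Hypothetical sketch of the conversion pattern used throughout this patch.
  #include <cuda_fp16.h>

  #include <cstdio>

  // Before (may fail to compile for the host on CUDA 12.2):
  //   if (in[i] < 0) ...   // implicit __half vs. int comparison
  //   sum += in[i];        // implicit __half -> float conversion in arithmetic
  // After: convert explicitly, compute in float.
  float half_sum_positive(const __half* in, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
      float v = __half2float(in[i]);  // explicit half -> float
      if (v < 0.0f) continue;         // compare in float
      sum += v;                       // accumulate in float
    }
    return sum;
  }

  int main() {
    __half v[4];
    for (int i = 0; i < 4; ++i) v[i] = __float2half(i - 1.0f);  // -1, 0, 1, 2
    std::printf("%f\n", half_sum_positive(v, 4));               // prints 3.000000
    return 0;
  }

Doing reductions in a float accumulator is also numerically safer than chaining __half additions, which is why the sum and bias_grad loops above already kept float accumulators; only the element loads and comparisons needed the explicit conversions.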