Merge branch 'fix_operator_error_in_mlperf' into 'main'
Fix math operator error on CUDA 12.2

See merge request dl/hugectr/hugectr!1426
minseokl committed Aug 2, 2023
2 parents f5452a6 + 6f156df commit 734dc44
Showing 35 changed files with 271 additions and 106 deletions.
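
Every hunk below applies the same one-line pattern: host-side reference code that fed __half values directly into float arithmetic or comparisons (for example sum += a[ai] * b[bi]; or t < 0) now converts explicitly with __half2float first, converting back with __float2half only where a __half result is stored. The sketch below illustrates the two shapes of the fix; it assumes the implicit __half operators and conversions are what stopped compiling cleanly under the CUDA 12.2 headers, and the file and function names are illustrative rather than taken from the repository.

// half_fix_sketch.cu -- illustrative sketch only; compile with: nvcc -c half_fix_sketch.cu
#include <cuda_fp16.h>  // __half, __half2float, __float2half

// Accumulate __half products in a float accumulator, converting each product
// explicitly before it meets float arithmetic (same shape as the cpu_mm fixes below).
float half_dot(const __half* a, const __half* b, int n) {
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    sum += __half2float(a[i] * b[i]);  // explicit __half -> float
  }
  return sum;
}

// Compare in float instead of relying on an implicit __half-versus-int
// comparison (same shape as the ReLU-style fixes below).
__half relu_half(__half t) {
  return (__half2float(t) < 0.0f) ? __float2half(0.0f) : t;
}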
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp
@@ -142,7 +142,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -162,7 +162,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += (__half2float(in[idx]) - mean);
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
3 changes: 2 additions & 1 deletion HugeCTR/src/cpu/layers/elu_layer_cpu.cpp
@@ -27,7 +27,8 @@ namespace {
template <typename T>
void elu_cpu(const T* in, T* out, int len, T alpha) {
for (int i = 0; i < len; ++i) {
out[i] = (in[i] < 0) ? T(alpha * (exp(in[i]) - 1)) : in[i];
out[i] =
(__half2float(in[i]) < 0) ? T(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) : in[i];
}
}

4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp
@@ -91,8 +91,8 @@ void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad
}
for (int k = 0; k < slot_num; k++) {
int index = offset + k * emb_vec_size;
dgrad[index] =
__float2half(__half2float(top_grad[i * emb_vec_size + j]) * (sum - in[index]));
dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) *
(sum - __half2float(in[index])));
}
}
}
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp
@@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -47,7 +47,7 @@ void cpu_add_bias(__half* top, const __half* bias, int m, int n) {
void cpu_reverse_add_bias(__half* bias_grad, const __half* top, int m, int n) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}
8 changes: 4 additions & 4 deletions HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp
@@ -29,7 +29,7 @@ void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -41,7 +41,7 @@ void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, int m,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -50,7 +50,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half
int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (middle[i * n + j] < 0) {
if (__half2float(middle[i * n + j]) < 0) {
middle[i * n + j] = 0.0f;
} else {
middle[i * n + j] = top[i * n + j];
@@ -59,7 +59,7 @@ void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += middle[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]);
bias_grad[i] = sum;
}
}
4 changes: 2 additions & 2 deletions HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp
@@ -74,8 +74,8 @@ void matmul_cpu(size_t height, size_t in_width, size_t n_ins, T *h_concat, T *h_
for (size_t n = 0; n < n_ins; n++) {
float accum = 0.0f;
for (size_t k = 0; k < in_width; k++) {
accum += h_concat[concat_stride + m * in_width + k] *
h_concat[concat_stride + n * in_width + k];
accum += __half2float(h_concat[concat_stride + m * in_width + k] *
h_concat[concat_stride + n * in_width + k]);
}
h_mat[mat_stride + m * n_ins + n] = accum;
}
@@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += __half2float(in[idx]) - mean;
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
@@ -33,7 +33,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -51,15 +51,15 @@ static void cpu_add_bias(__half *top, const __half *bias, int m, int n) {
static void cpu_reverse_add_bias(__half *bias_grad, const __half *top, int m, int n) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const
int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (middle[i * n + j] < 0) {
if (__half2float(middle[i * n + j]) < 0) {
middle[i * n + j] = 0.0f;
} else {
middle[i * n + j] = top[i * n + j];
@@ -64,15 +64,15 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *middle, const

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += middle[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -34,7 +34,7 @@ static void cpu_mm(__half *c, const __half *a, bool transpose_a, const __half *b
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = sum;
}
@@ -46,7 +46,7 @@ static void cpu_add_bias_and_re(__half *top, __half *middle, const __half *bias,
for (int j = 0; j < n; ++j) {
__half t = top[i * n + j] + bias[j];
middle[i * n + j] = t;
top[i * n + j] = t < 0 ? __float2half(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t;
}
}
}
@@ -55,7 +55,7 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __
int m, int n) {
for (int i = 0; i < m; ++i)
for (int j = 0; j < n; ++j) {
if (top[i * n + j] < 0) {
if (__half2float(top[i * n + j]) < 0) {
top[i * n + j] = 0.0f;
} else {
top[i * n + j] = bprop_out[i * n + j];
@@ -64,15 +64,15 @@ static void cpu_reverse_add_bias_and_re(__half *bias_grad, __half *top, const __

for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += top[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]);
bias_grad[i] = sum;
}
}

static float compare_array(const __half *arr1, const __half *arr2, size_t n, float threshold) {
size_t m = 0;
for (size_t i = 0; i < n; i++) {
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
m++;
}
}
@@ -92,7 +92,7 @@ void layer_norm_fprop_cpu<__half>(const __half* gamma, const __half* beta, const
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps);
out[idx] = __float2half(gamma[j] * in_norm + beta[j]);
out[idx] = gamma[j] * __float2half(in_norm) + beta[j];
}
}
}
@@ -176,23 +176,24 @@ void layer_norm_bprop_cpu<__half>(const __half* gamma, const __half* out, __half*
float d_var = 0.0f;
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean);
float val = __half2float(out[idx] * gamma[j]) * (__half2float(in[idx]) - mean);
d_var += val;
}
d_var *= (-0.5f) * pow(inv_std, 3);

float d_mu = 0.0f;
for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
d_mu += __half2float(out[idx]) * gamma[j] * inv_std;
d_mu += __half2float(out[idx] * gamma[j]) * inv_std;
}
d_mu *= (-1.0f / num_feature);

for (int j = 0; j < num_feature; j++) {
int idx = i * num_feature + j;
gamma_grad[j] = gamma_grad[j] + out[idx] * (in[idx] - mean) * inv_std;
gamma_grad[j] =
gamma_grad[j] + out[idx] * (in[idx] - __float2half(mean)) * __float2half(inv_std);
beta_grad[j] = beta_grad[j] + out[idx];
in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std +
in[idx] = __float2half(__half2float(out[idx] * gamma[j]) * inv_std +
d_var * (2.0 / num_feature) * (__half2float(in[idx]) - mean) + d_mu);
}
}
17 changes: 9 additions & 8 deletions test/utest/core23_dev_test/trainable_layers/mlp_test.cpp
@@ -65,7 +65,7 @@ static void cpu_mm(T* c, const T* a, bool transpose_a, const T* b, bool transpos
for (int kk = 0; kk < k; ++kk) {
int ai = transpose_a ? kk * m + i : i * k + kk;
int bi = transpose_b ? j * k + kk : kk * n + j;
sum += a[ai] * b[bi];
sum += __half2float(a[ai] * b[bi]);
}
c[i * n + j] = static_cast<half>(beta * static_cast<float>(c[i * n + j]) + sum);
}
@@ -80,7 +80,7 @@ static void cpu_add_bias_and_re(T* top, T* middle, const T* bias, bool is_relu,
T t = top[i * n + j] + (use_bias ? bias[j] : T(0.0f));
middle[i * n + j] = t;
if (is_relu)
top[i * n + j] = t < 0 ? T(0.0f) : t;
top[i * n + j] = __half2float(t) < 0 ? T(0.0f) : t;
else
top[i * n + j] = t;
}
@@ -92,7 +92,8 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const
int m, int n, bool is_tail, bool use_bias) {
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
if ((middle[i * n + j] <= 0 && is_tail) || (middle[i * n + j] < 0 && !is_tail)) {
if ((__half2float(middle[i * n + j]) <= 0 && is_tail) ||
(__half2float(middle[i * n + j]) < 0 && !is_tail)) {
dRelu[i * n + j] = 0.0f;
} else {
dRelu[i * n + j] = bprop_out[i * n + j];
@@ -102,7 +103,7 @@ static void cpu_reverse_add_bias_and_re(T* bias_grad, T* dRelu, T* middle, const
if (use_bias) {
for (int i = 0; i < n; ++i) {
float sum = 0.0f;
for (int j = 0; j < m; ++j) sum += dRelu[j * n + i];
for (int j = 0; j < m; ++j) sum += __half2float(dRelu[j * n + i]);
bias_grad[i] = sum;
}
}
@@ -135,11 +136,11 @@ static float compare_array(const T* arr1, const T* arr2, size_t n, float thresho
HCTR_LOG(INFO, WORLD, "Nan or Inf Error\n");
return INT_MAX;
}
if (fabs(arr1[i] - arr2[i]) > threshold) {
if (arr2[i] == 0 && fabs(arr1[i]) > threshold) {
if (fabs(__half2float(arr1[i] - arr2[i])) > threshold) {
if (__half2float(arr2[i]) == 0 && fabs(__half2float(arr1[i])) > threshold) {
HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]);
m++;
} else if (fabs(arr1[i] - arr2[i]) / arr2[i] > threshold) {
} else if (fabs(__half2float(arr1[i] - arr2[i])) / __half2float(arr2[i]) > threshold) {
HCTR_LOG(INFO, WORLD, "%ld, %f, %f\n", i, (float)arr1[i], (float)arr2[i]);
m++;
}
@@ -484,7 +485,7 @@ static void mlp_test(std::vector<Layer_t> network,
for (uint32_t col = 0; col < fc_out_dims[i]; col++) {
float sum = 0.0;
for (uint32_t row = 0; row < batch_size; row++) {
sum = sum + p.h_top_grad[i].get()[row * fc_out_dims[i] + col];
sum = sum + __half2float(p.h_top_grad[i].get()[row * fc_out_dims[i] + col]);
}
p.h_bias_grad[i].get()[col] = sum;
}
@@ -185,7 +185,7 @@ class MultiCrossLayerTest {
// column of A is rowA
acc = acc + (A[r * rowB + k] * B[c * rowB + k]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
} else if (transA) {
@@ -197,7 +197,7 @@ class MultiCrossLayerTest {
// column of A is rowA
acc = acc + (A[k * rowA + r] * B[k * colB + c]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
} else {
@@ -208,7 +208,7 @@ class MultiCrossLayerTest {
// column of A is rowB
acc = acc + (A[r * rowB + k] * B[k * colB + c]);
}
C[r * colB + c] = C[r * colB + c] * beta + acc;
C[r * colB + c] = __float2half(__half2float(C[r * colB + c]) * beta + __half2float(acc));
}
}
}
2 changes: 1 addition & 1 deletion test/utest/layers/add_layer_test.cpp
@@ -49,7 +49,7 @@ void add_cpu(Vector input, T *output, size_t size, size_t num) {
for (auto i = 0; i < size; i++) {
float tmp = 0.f;
for (size_t j = 0; j < num; j++) {
tmp += input[j][i];
tmp += (float)input[j][i];
}
output[i] = tmp;
}
4 changes: 2 additions & 2 deletions test/utest/layers/batch_norm_layer_test.cpp
@@ -158,7 +158,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
float var = 0.0f;
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
float diff = in[idx] - mean;
float diff = __half2float(in[idx]) - mean;
var += (diff * diff);
}
var /= batch_size;
@@ -178,7 +178,7 @@ void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half*
for (int i = 0; i < batch_size; i++) {
int idx = i * num_feature + j;
val1 += (__half2float(out[idx]) * gamma[j]);
val2 += __half2float(in[idx] - mean);
val2 += __half2float(in[idx]) - mean;
}
val1 *= (-inv_std);
val2 *= (d_var / batch_size) * -2;
(Diffs for the remaining changed files were not loaded in this view.)
