cuda : speed-up by using CUBLAS_COMPUTE_32F instead of CUBLAS_COMPUTE_16F #3816

Closed
wants to merge 1 commit
72 changes: 53 additions & 19 deletions ggml-cuda.cu
@@ -7579,8 +7579,7 @@ static void ggml_cuda_op_mul_mat_cublas(

const int compute_capability = g_device_caps[id].cc;

-if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-    //printf("this branch\n");
+if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
cuda_pool_alloc<half> src0_as_f16;
if (src0->type != GGML_TYPE_F16) {
@@ -7601,23 +7600,44 @@ static void ggml_cuda_op_mul_mat_cublas(
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
}
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
-
-const half alpha_f16 = 1.0f;
-const half beta_f16 = 0.0f;
-
-CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-CUBLAS_CHECK(
-    cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-            row_diff, src1_ncols, ne10,
-            &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
-            src1_ptr, CUDA_R_16F, ne10,
-            &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+switch (dst->op_params[0]) {
+    case GGML_PREC_DEFAULT:
+        {
+            cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
+
+            const half alpha_f16 = 1.0f;
+            const half beta_f16 = 0.0f;
+
+            CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
+                        src1_ptr, CUDA_R_16F, ne10,
+                        &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
+                        CUBLAS_COMPUTE_16F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+            to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        } break;
+    case GGML_PREC_F32:
+        {
+            const float alpha_f32 = 1.0f;
+            const float beta_f32 = 0.0f;
+
+            CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f32, src0_ptr, CUDA_R_16F, ne00,
+                        src1_ptr, CUDA_R_16F, ne10,
+                        &beta_f32, dst_dd_i, CUDA_R_32F, ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+        } break;
+}
@ggerganov (Owner, Author) commented on Jan 2, 2024:
We might want to merge this particular change to ggml_cuda_op_mul_mat_cublas, since it uses less memory than cublasSgemm and still performs the compute in F32, which is needed for models like Phi-2.
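For reference, a minimal standalone sketch of the call pattern the GGML_PREC_F32 branch relies on (the sizes, buffer names and the driver main below are made up for illustration and are not code from this PR): the inputs stay in F16, cuBLAS accumulates in F32 and writes an F32 result directly, so no F32 copy of the weights is needed (as cublasSgemm would require) and no temporary F16 dst plus to_fp32_cuda pass (as the default branch needs).

```cpp
// Illustrative only: F16 inputs, F32 compute type, F32 output written in place.
// Build with: nvcc -o gemm_f32_acc gemm_f32_acc.cu -lcublas
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main() {
    const int m = 128, n = 64, k = 256;   // arbitrary sizes for illustration

    half  *A_f16, *B_f16;                 // F16 inputs (left uninitialized: only the call pattern matters here)
    float *C_f32;                         // F32 result, no post-hoc conversion needed
    cudaMalloc((void **) &A_f16, sizeof(half)  * (size_t) m * k);
    cudaMalloc((void **) &B_f16, sizeof(half)  * (size_t) k * n);
    cudaMalloc((void **) &C_f32, sizeof(float) * (size_t) m * n);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f, beta = 0.0f;   // F32 scalars, matching CUBLAS_COMPUTE_32F

    // C (m x n) = A^T (m x k) * B (k x n), accumulated in F32
    cublasStatus_t status =
        cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                     m, n, k,
                     &alpha, A_f16, CUDA_R_16F, k,
                             B_f16, CUDA_R_16F, k,
                     &beta,  C_f32, CUDA_R_32F, m,
                     CUBLAS_COMPUTE_32F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    printf("cublasGemmEx returned %d\n", (int) status);

    cublasDestroy(handle);
    cudaFree(A_f16); cudaFree(B_f16); cudaFree(C_f32);
    return 0;
}
```

Compared to the cublasSgemm fallback further down in this function, the only extra allocation in this pattern is the F16 copy of whichever input was not already F16, which is half the size of the F32 copy that the cublasSgemm path would need.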

@slaren (Collaborator) commented on Jan 2, 2024:
I am not sure if I am following all the logic, but I would be concerned about down-converting F32 src0/src1 to F16 despite the user requesting GGML_PREC_F32. In the long run, I think it would be better to always respect the user types and do all the type conversions in the graph (ggerganov/ggml#455), since that would give users more control and simplify the code in the backends. It would also move the temporary buffer from the pool to the compute buffer, resulting in a more accurate estimate of the VRAM needed to run a model. It should also help with the issue of to_fp32 and to_fp16 in the CUDA backend being unable to handle non-contiguous tensors, since the conversion would be done in a ggml_cpy instead.
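To make the "conversions in the graph" idea concrete, here is a hypothetical graph-construction sketch against the public ggml API (illustrative only, not an implementation of ggerganov/ggml#455): the F32 activations are converted by an explicit ggml_cpy node into an F16 tensor that is part of the graph, so the matmul sees exactly the types that were requested and the temporary lives in the compute buffer instead of the backend pool.

```cpp
// Graph construction only; sizes are arbitrary. Assumes the existing public
// ggml.h API (ggml_cpy, ggml_mul_mat, ggml_new_graph, ggml_build_forward_expand).
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // F16 weight and F32 activations, as in the cuBLAS branch above
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 256, 128);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 32);

    // explicit cast node: copy b into a freshly created F16 tensor
    struct ggml_tensor * b_f16 = ggml_cpy(ctx, b,
            ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 256, 32));

    // the matmul now receives the types the user chose; no hidden down-conversion
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b_f16);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);   // the graph now contains the CPY node explicitly

    printf("result: %d x %d, type %s\n",
           (int) c->ne[0], (int) c->ne[1], ggml_type_name(c->type));

    ggml_free(ctx);
    return 0;
}
```

With the cast expressed as a graph node, a graph allocator can account for the F16 temporary in the compute buffer size, which is the VRAM-estimation benefit mentioned above.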

} else {
cuda_pool_alloc<float> src0_ddq_as_f32;
cuda_pool_alloc<float> src1_ddq_as_f32;
@@ -7635,7 +7655,7 @@ static void ggml_cuda_op_mul_mat_cublas(
to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
}

-const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

const float alpha = 1.0f;
@@ -9234,6 +9254,20 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
}

void ggml_cuda_free_data(struct ggml_tensor * tensor) {
+// print current mem usage using cudaMemGetInfo
+// TODO: this is a hack - need better solution
+{
+    size_t free;
+    size_t total;
+    CUDA_CHECK(cudaMemGetInfo(&free, &total));
+
+    static size_t used = 0;
+    if (used < total - free) {
+        printf("CUDA: used %zu MB, free %zu MB\n", (total - free)/1024/1024, free/1024/1024);
+        used = total - free;
+    }
+}
+
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}
6 changes: 6 additions & 0 deletions ggml.c
@@ -4077,6 +4077,12 @@ struct ggml_tensor * ggml_mul_mat(
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

+// TMP: force f32 precision
+{
+    const int32_t prec_i32 = GGML_PREC_F32;
+    ggml_set_op_params_i32(result, 0, prec_i32);
+}
+
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
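The hunk above hard-codes GGML_PREC_F32 for every mul_mat, which is what the "TMP" comment signals: a blunt way to route everything through the new CUBLAS_COMPUTE_32F branch. The per-operation way to get the same effect would look roughly like the hypothetical sketch below; it assumes the ggml_mul_mat_set_prec / GGML_PREC_F32 API declared in ggml.h, which writes the same op_params[0] slot that the CUDA code above reads.

```cpp
#include "ggml.h"
#include <stdio.h>

// request F32 accumulation for one matmul instead of forcing it for all of them
static struct ggml_tensor * mul_mat_f32_prec(struct ggml_context * ctx,
                                             struct ggml_tensor  * a,
                                             struct ggml_tensor  * b) {
    struct ggml_tensor * cur = ggml_mul_mat(ctx, a, b);
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);   // sets op_params[0], read by ggml_cuda_op_mul_mat_cublas
    return cur;
}

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 8*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);

    struct ggml_tensor * c = mul_mat_f32_prec(ctx, a, b);
    printf("mul_mat op_params[0] = %d (GGML_PREC_F32 = %d)\n",
           (int) c->op_params[0], (int) GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}
```

That keeps the default F16 fast path for everything else while still sending the precision-sensitive matmuls (e.g. the ones needed for Phi-2, as mentioned in the comments above) through the CUBLAS_COMPUTE_32F branch.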