Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

fix nf4 performance in hybrid CPU #120

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion neural_speed/application/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,16 @@ ne_ftype quant_params_to_ftype(const quant_params& params) {
return NE_FTYPE_MOSTLY_Q8_0;
}
} else {
return NE_FTYPE_MOSTLY_Q_BTLA;
if (params.weight_dtype == "int4")
return NE_FTYPE_MOSTLY_Q4_J;
else if (params.weight_dtype == "int8")
return NE_FTYPE_MOSTLY_Q8_J;
else if (params.weight_dtype == "fp8" || params.weight_dtype == "fp8_e5m2")
return NE_FTYPE_MOSTLY_F8;
else if (params.weight_dtype == "fp4")
return NE_FTYPE_MOSTLY_F4;
else if (params.weight_dtype == "nf4")
return NE_FTYPE_MOSTLY_NF4;
}
return NE_FTYPE_UNKNOWN;
}
Expand Down
6 changes: 5 additions & 1 deletion neural_speed/core/data_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ enum ne_ftype {
NE_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
NE_FTYPE_MOSTLY_Q_BTLA = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q4_J = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q8_J = 11, // except 1d tensors
NE_FTYPE_MOSTLY_F8 = 12, // except 1d tensors
NE_FTYPE_MOSTLY_NF4 = 13, // except 1d tensors
NE_FTYPE_MOSTLY_F4 = 14, // except 1d tensors
};

#define QK4_0 32
Expand Down
15 changes: 15 additions & 0 deletions neural_speed/core/layers/ne_bestla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@
using namespace bestla; // NOLINT
using namespace ne_bestla; // NOLINT

// Returns true when the CPU reports a hybrid (heterogeneous) core topology,
// e.g. a mix of performance (P) and efficiency (E) cores.
// NOTE(review): GetCPUDevice() is a project macro that presumably brings the
// CPU-device handle `_cd` into scope — confirm against bestla headers.
bool bestla_is_hybrid() {
GetCPUDevice();
return _cd->isHybrid();
}

// Returns the number of performance (P) cores reported by the CPU device.
// Callers in the model eval paths use this to cap the thread count for
// kernels (e.g. NF4) that perform poorly on E-cores.
int bestla_get_Pcore_number() {
GetCPUDevice();
return _cd->getPcoreNum();
}

// Returns the number of efficiency (E) cores reported by the CPU device.
int bestla_get_Ecore_number() {
GetCPUDevice();
return _cd->getEcoreNum();
}

void bestla_init() {
GetCPUDevice();
if (_cd->AMX_BF16() || _cd->AMX_INT8()) {
Expand Down
6 changes: 6 additions & 0 deletions neural_speed/core/ne_bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ void bestla_timer(bool _init);

int bestla_set_threads(int _nth);

// True when the CPU has a hybrid core topology (mixed P/E cores).
bool bestla_is_hybrid();

// Number of performance (P) cores on the current CPU.
int bestla_get_Pcore_number();

// Number of efficiency (E) cores on the current CPU.
int bestla_get_Ecore_number();

void* bestla_get_thread_handle();

void bestla_init();
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/baichuan/baichuan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}
yuchengliu1 marked this conversation as resolved.
Show resolved Hide resolved

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/bloom/bloom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/chatglm/chatglm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/chatglm/chatglm2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/falcon/falcon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/gptj/gptj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

// no padding input for optimized MHA kernel
const bool run_mha_reordered = (kv_self.k->type == NE_TYPE_BTLA);
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/gptneox/gptneox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/llama/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {0, 0};
Expand Down
8 changes: 7 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,8 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
}
if (params.bits == quant_bits::nf4) {
quant_type = BTLA_DTYPE::F4_NF4;
if (bestla_is_hybrid())
printf("Warning: Not recommend NF4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::fp8_e4m3) {
quant_type = BTLA_DTYPE::F8_E4M3;
Expand Down Expand Up @@ -476,7 +478,11 @@ bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_f
case NE_FTYPE_MOSTLY_Q4_0:
qtype = NE_TYPE_Q4_0;
break;
case NE_FTYPE_MOSTLY_Q_BTLA:
case NE_FTYPE_MOSTLY_Q4_J:
case NE_FTYPE_MOSTLY_Q8_J:
case NE_FTYPE_MOSTLY_F8:
case NE_FTYPE_MOSTLY_NF4:
case NE_FTYPE_MOSTLY_F4:
qtype = NE_TYPE_BTLA;
break;
case NE_FTYPE_MOSTLY_F16: {
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/mpt/mpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
8 changes: 8 additions & 0 deletions neural_speed/models/opt/opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "core/ne_bestla.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/util.h"

Expand Down Expand Up @@ -86,6 +87,13 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/phi/phi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/qwen/qwen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/starcoder/starcoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
Loading