
Commit

subtract the logic
yuchengliu1 committed Feb 6, 2024
1 parent e2e84e5 commit c512026
Showing 16 changed files with 73 additions and 89 deletions.
12 changes: 5 additions & 7 deletions neural_speed/core/layers/ne_bestla.cpp
@@ -21,14 +21,12 @@ bool bestla_is_hybrid() {
   return _cd->isHybrid();
 }
 
-int bestla_get_Pcore_number() {
+int bestla_get_best_thread_number(bool is_support_Ecore) {
   GetCPUDevice();
-  return _cd->getPcoreNum();
-}
-
-int bestla_get_Ecore_number() {
-  GetCPUDevice();
-  return _cd->getEcoreNum();
+  if (_cd->isHybrid())
+    return is_support_Ecore ? _cd->getThreads() : _cd->getPcoreNum();
+  else
+    return _cd->getThreads();
 }
 
 void bestla_init() {
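On a hybrid CPU the new helper returns every hardware thread when the data type runs acceptably on E-cores, and only the P-core thread count otherwise; on a non-hybrid CPU it always returns all threads. A minimal, self-contained sketch of that selection (CpuInfo is a hypothetical stand-in for the _cd device object queried above, with made-up core counts):

#include <cstdio>

// Hypothetical stand-in for the BesTLA CPU device (_cd) used above.
struct CpuInfo {
  bool hybrid;        // P-core/E-core part, e.g. a client hybrid CPU
  int threads;        // all hardware threads
  int pcore_threads;  // threads backed by P-cores only
};

// Same decision as bestla_get_best_thread_number: cap at P-cores only when
// the data type is a poor fit for E-cores on a hybrid CPU.
int best_thread_number(const CpuInfo& cd, bool is_support_Ecore) {
  if (cd.hybrid) return is_support_Ecore ? cd.threads : cd.pcore_threads;
  return cd.threads;
}

int main() {
  CpuInfo hybrid_cpu{true, 20, 12};
  std::printf("E-core-friendly type: %d threads\n", best_thread_number(hybrid_cpu, true));   // 20
  std::printf("NF4/FP4 on hybrid:    %d threads\n", best_thread_number(hybrid_cpu, false));  // 12
}
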
4 changes: 1 addition & 3 deletions neural_speed/core/ne_bestla.h
@@ -24,9 +24,7 @@ int bestla_set_threads(int _nth);
 
 bool bestla_is_hybrid();
 
-int bestla_get_Pcore_number();
-
-int bestla_get_Ecore_number();
+int bestla_get_best_thread_number(bool is_support_Ecore);
 
 void* bestla_get_thread_handle();
 
11 changes: 5 additions & 6 deletions neural_speed/models/baichuan/baichuan.cpp
@@ -106,12 +106,11 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/bloom/bloom.cpp
@@ -88,12 +88,11 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
11 changes: 5 additions & 6 deletions neural_speed/models/chatglm/chatglm.cpp
@@ -99,12 +99,11 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
12 changes: 5 additions & 7 deletions neural_speed/models/chatglm/chatglm2.cpp
@@ -110,13 +110,11 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // for big prochatglms, if BLAS is enabled, it is better to use only one thread
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/falcon/falcon.cpp
@@ -92,12 +92,11 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/gptj/gptj.cpp
@@ -133,12 +133,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   // no padding input for optimized MHA kernel
11 changes: 5 additions & 6 deletions neural_speed/models/gptneox/gptneox.cpp
@@ -128,12 +128,11 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/llama/llama.cpp
@@ -118,12 +118,11 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/quant_utils.cpp
@@ -285,6 +285,8 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
   }
   if (params.bits == quant_bits::fp4_e2m1) {
     quant_type = BTLA_DTYPE::F4_E2M1;
+    if (bestla_is_hybrid())
+      printf("Warning: FP4 is not recommended on client CPUs. Please use Int4 for better performance.\n");
   }
   if (params.bits == quant_bits::nf4) {
     quant_type = BTLA_DTYPE::F4_NF4;
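The quantize-time warning complements the eval-time thread cap above: FP4 E2M1 still quantizes on a hybrid (client) CPU, but the user is nudged toward Int4. A sketch of that dispatch under the same assumptions (choose_quant_type and the stub are illustrative, not part of the library; F4_E2M1 and F4_NF4 mirror the hunk, while the int4 name is a placeholder):

#include <cstdio>

// F4_E2M1/F4_NF4 mirror the hunk; S4_CLIP stands in for the int4 default.
enum class quant_bits { q4, fp4_e2m1, nf4 };
enum class BTLA_DTYPE { S4_CLIP, F4_E2M1, F4_NF4 };

bool is_hybrid_stub() { return true; }  // stand-in for bestla_is_hybrid()

BTLA_DTYPE choose_quant_type(quant_bits bits) {
  if (bits == quant_bits::fp4_e2m1) {
    if (is_hybrid_stub())  // warn, but still honor the requested type
      std::printf("Warning: FP4 is not recommended on client CPUs. Please use Int4 for better performance.\n");
    return BTLA_DTYPE::F4_E2M1;
  }
  if (bits == quant_bits::nf4) return BTLA_DTYPE::F4_NF4;
  return BTLA_DTYPE::S4_CLIP;  // int4 default path
}
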
11 changes: 5 additions & 6 deletions neural_speed/models/mpt/mpt.cpp
@@ -89,12 +89,11 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/opt/opt.cpp
@@ -87,12 +87,11 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
11 changes: 5 additions & 6 deletions neural_speed/models/phi/phi.cpp
@@ -111,12 +111,11 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/qwen/qwen.cpp
@@ -118,12 +118,11 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/starcoder/starcoder.cpp
@@ -91,12 +91,11 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
