diff --git a/neural_speed/core/layers/ne_bestla.cpp b/neural_speed/core/layers/ne_bestla.cpp
index cecf61916..ac64ccd5c 100644
--- a/neural_speed/core/layers/ne_bestla.cpp
+++ b/neural_speed/core/layers/ne_bestla.cpp
@@ -21,14 +21,12 @@ bool bestla_is_hybrid() {
   return _cd->isHybrid();
 }
 
-int bestla_get_Pcore_number() {
+int bestla_get_best_thread_number(bool is_support_Ecore) {
   GetCPUDevice();
-  return _cd->getPcoreNum();
-}
-
-int bestla_get_Ecore_number() {
-  GetCPUDevice();
-  return _cd->getEcoreNum();
+  if (_cd->isHybrid())
+    return is_support_Ecore ? _cd->getThreads() : _cd->getPcoreNum();
+  else
+    return _cd->getThreads();
 }
 
 void bestla_init() {
diff --git a/neural_speed/core/ne_bestla.h b/neural_speed/core/ne_bestla.h
index cdb33c703..452026266 100644
--- a/neural_speed/core/ne_bestla.h
+++ b/neural_speed/core/ne_bestla.h
@@ -24,9 +24,7 @@ int bestla_set_threads(int _nth);
 
 bool bestla_is_hybrid();
 
-int bestla_get_Pcore_number();
-
-int bestla_get_Ecore_number();
+int bestla_get_best_thread_number(bool is_support_Ecore);
 
 void* bestla_get_thread_handle();
diff --git a/neural_speed/models/baichuan/baichuan.cpp b/neural_speed/models/baichuan/baichuan.cpp
index d4386acf2..097037362 100644
--- a/neural_speed/models/baichuan/baichuan.cpp
+++ b/neural_speed/models/baichuan/baichuan.cpp
@@ -106,12 +106,11 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
diff --git a/neural_speed/models/bloom/bloom.cpp b/neural_speed/models/bloom/bloom.cpp
index 0acb15634..b2c683da0 100644
--- a/neural_speed/models/bloom/bloom.cpp
+++ b/neural_speed/models/bloom/bloom.cpp
@@ -88,12 +88,11 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
diff --git a/neural_speed/models/chatglm/chatglm.cpp b/neural_speed/models/chatglm/chatglm.cpp
index 7d6ab3c7c..c6fd35519 100644
--- a/neural_speed/models/chatglm/chatglm.cpp
+++ b/neural_speed/models/chatglm/chatglm.cpp
@@ -99,12 +99,11 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
diff --git a/neural_speed/models/chatglm/chatglm2.cpp b/neural_speed/models/chatglm/chatglm2.cpp
index 674cfba38..77da60f67 100644
--- a/neural_speed/models/chatglm/chatglm2.cpp
+++ b/neural_speed/models/chatglm/chatglm2.cpp
@@ -110,13 +110,12 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // for big prochatglms, if BLAS is enabled, it is better to use only one thread
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
diff --git a/neural_speed/models/falcon/falcon.cpp b/neural_speed/models/falcon/falcon.cpp
index 90be61478..8c5874154 100644
--- a/neural_speed/models/falcon/falcon.cpp
+++ b/neural_speed/models/falcon/falcon.cpp
@@ -92,12 +92,11 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
" - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp index e1d97e9eb..a466d395e 100644 --- a/neural_speed/models/gptj/gptj.cpp +++ b/neural_speed/models/gptj/gptj.cpp @@ -133,12 +133,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = n_threads; - if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) { - printf( - "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. " - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } // no padding input for optimized MHA kernel diff --git a/neural_speed/models/gptneox/gptneox.cpp b/neural_speed/models/gptneox/gptneox.cpp index 022f43421..c80874679 100644 --- a/neural_speed/models/gptneox/gptneox.cpp +++ b/neural_speed/models/gptneox/gptneox.cpp @@ -128,12 +128,11 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads; - if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) { - printf( - "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. " - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp index e65b451fb..e29f48888 100644 --- a/neural_speed/models/llama/llama.cpp +++ b/neural_speed/models/llama/llama.cpp @@ -118,12 +118,11 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
diff --git a/neural_speed/models/model_utils/quant_utils.cpp b/neural_speed/models/model_utils/quant_utils.cpp
index 49c8811d5..721fabb67 100644
--- a/neural_speed/models/model_utils/quant_utils.cpp
+++ b/neural_speed/models/model_utils/quant_utils.cpp
@@ -285,6 +285,8 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
   }
   if (params.bits == quant_bits::fp4_e2m1) {
     quant_type = BTLA_DTYPE::F4_E2M1;
+    if (bestla_is_hybrid())
+      printf("Warning: FP4 is not recommended on client CPUs. Please use Int4 for better performance.\n");
   }
   if (params.bits == quant_bits::nf4) {
     quant_type = BTLA_DTYPE::F4_NF4;
diff --git a/neural_speed/models/mpt/mpt.cpp b/neural_speed/models/mpt/mpt.cpp
index 255138c4f..c210c49c5 100644
--- a/neural_speed/models/mpt/mpt.cpp
+++ b/neural_speed/models/mpt/mpt.cpp
@@ -89,12 +89,11 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
diff --git a/neural_speed/models/opt/opt.cpp b/neural_speed/models/opt/opt.cpp
index 827b2e808..e7018457f 100644
--- a/neural_speed/models/opt/opt.cpp
+++ b/neural_speed/models/opt/opt.cpp
@@ -87,12 +87,11 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
" - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N); diff --git a/neural_speed/models/phi/phi.cpp b/neural_speed/models/phi/phi.cpp index b1fcda16a..ab02fda5b 100644 --- a/neural_speed/models/phi/phi.cpp +++ b/neural_speed/models/phi/phi.cpp @@ -111,12 +111,11 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads; - if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) { - printf( - "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. " - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp index bcdc6325b..8666e79fc 100644 --- a/neural_speed/models/qwen/qwen.cpp +++ b/neural_speed/models/qwen/qwen.cpp @@ -118,12 +118,11 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads; - if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) { - printf( - "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. " - "New thread number is: %d\n.", - bestla_get_Pcore_number()); - gf.n_threads = bestla_get_Pcore_number(); + const int best_thread_num = + bestla_get_best_thread_number(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4); + if (gf.n_threads > best_thread_num) { + printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", best_thread_num); + gf.n_threads = best_thread_num; } const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA; diff --git a/neural_speed/models/starcoder/starcoder.cpp b/neural_speed/models/starcoder/starcoder.cpp index 73882b382..27b02af8c 100644 --- a/neural_speed/models/starcoder/starcoder.cpp +++ b/neural_speed/models/starcoder/starcoder.cpp @@ -91,12 +91,11 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input* // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ne_cgraph gf = {}; gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(hparams.ftype != NE_FTYPE_MOSTLY_NF4 && hparams.ftype != NE_FTYPE_MOSTLY_F4);
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
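
For reference, the clamping pattern this patch standardizes across the model files can be exercised standalone. A minimal sketch, assuming a hybrid CPU with 8 P-cores and 8 E-cores; the helpers is_hybrid(), total_threads(), and pcore_threads() are illustrative stand-ins for the BesTLA CPU-device queries, not part of the patch:

// Sketch of the thread-clamping pattern introduced above. The stand-in
// best_thread_number() mirrors bestla_get_best_thread_number(): on a hybrid
// CPU, data types that run poorly on E-cores are limited to P-core threads.
#include <cstdio>

static bool is_hybrid() { return true; }    // stand-in for _cd->isHybrid()
static int total_threads() { return 16; }   // stand-in for _cd->getThreads()
static int pcore_threads() { return 8; }    // stand-in for _cd->getPcoreNum()

static int best_thread_number(bool is_support_Ecore) {
  if (is_hybrid()) return is_support_Ecore ? total_threads() : pcore_threads();
  return total_threads();
}

int main() {
  int n_threads = 16;            // user-requested thread count
  const bool is_f4_type = true;  // e.g. NE_FTYPE_MOSTLY_NF4 or NE_FTYPE_MOSTLY_F4
  // NF4/F4 kernels are poor on E-cores, so those types do not "support" them.
  const int best_thread_num = best_thread_number(!is_f4_type);
  if (n_threads > best_thread_num) {
    printf("WARNING: Thread number exceeds the limit. Actual thread number is %d now.\n", best_thread_num);
    n_threads = best_thread_num;
  }
  printf("Running inference with %d threads.\n", n_threads);
  return 0;
}

With these assumptions the sketch clamps to 8 threads for an F4-type model and keeps all 16 otherwise; the single entry point replaces the separate P-core/E-core getters that each model file previously had to combine by hand.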