
Commit

subtract the logic
yuchengliu1 committed Feb 6, 2024
1 parent e2e84e5 commit c512026
Showing 16 changed files with 73 additions and 89 deletions.
12 changes: 5 additions & 7 deletions neural_speed/core/layers/ne_bestla.cpp
@@ -21,14 +21,12 @@ bool bestla_is_hybrid() {
   return _cd->isHybrid();
 }
 
-int bestla_get_Pcore_number() {
+int bestla_get_best_thread_number(bool is_support_Ecore) {
   GetCPUDevice();
-  return _cd->getPcoreNum();
-}
-
-int bestla_get_Ecore_number() {
-  GetCPUDevice();
-  return _cd->getEcoreNum();
+  if (_cd->isHybrid())
+    return is_support_Ecore ? _cd->getThreads() : _cd->getPcoreNum();
+  else
+    return _cd->getThreads();
 }
 
 void bestla_init() {
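On a hybrid CPU the new helper returns every hardware thread when the data type runs acceptably on E-cores, and only the P-core thread count otherwise; on a non-hybrid CPU it always returns all threads. A minimal, self-contained sketch of that selection (CpuInfo is a hypothetical stand-in for the _cd device object queried above, with made-up core counts):

#include <cstdio>

// Hypothetical stand-in for the BesTLA CPU device (_cd) used above.
struct CpuInfo {
  bool hybrid;        // P-core/E-core part, e.g. a client hybrid CPU
  int threads;        // all hardware threads
  int pcore_threads;  // threads backed by P-cores only
};

// Same decision as bestla_get_best_thread_number: cap at P-cores only when
// the data type is a poor fit for E-cores on a hybrid CPU.
int best_thread_number(const CpuInfo& cd, bool is_support_Ecore) {
  if (cd.hybrid) return is_support_Ecore ? cd.threads : cd.pcore_threads;
  return cd.threads;
}

int main() {
  CpuInfo hybrid_cpu{true, 20, 12};
  std::printf("E-core-friendly type: %d threads\n", best_thread_number(hybrid_cpu, true));   // 20
  std::printf("NF4/FP4 on hybrid:    %d threads\n", best_thread_number(hybrid_cpu, false));  // 12
}
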
4 changes: 1 addition & 3 deletions neural_speed/core/ne_bestla.h
@@ -24,9 +24,7 @@ int bestla_set_threads(int _nth);
 
 bool bestla_is_hybrid();
 
-int bestla_get_Pcore_number();
-
-int bestla_get_Ecore_number();
+int bestla_get_best_thread_number(bool is_support_Ecore);
 
 void* bestla_get_thread_handle();
 
11 changes: 5 additions & 6 deletions neural_speed/models/baichuan/baichuan.cpp
@@ -106,12 +106,11 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/bloom/bloom.cpp
@@ -88,12 +88,11 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
11 changes: 5 additions & 6 deletions neural_speed/models/chatglm/chatglm.cpp
@@ -99,12 +99,11 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
12 changes: 5 additions & 7 deletions neural_speed/models/chatglm/chatglm2.cpp
@@ -110,13 +110,11 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   // for big prochatglms, if BLAS is enabled, it is better to use only one thread
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/falcon/falcon.cpp
@@ -92,12 +92,11 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/gptj/gptj.cpp
@@ -133,12 +133,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   // no padding input for optimized MHA kernel
11 changes: 5 additions & 6 deletions neural_speed/models/gptneox/gptneox.cpp
@@ -128,12 +128,11 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/llama/llama.cpp
@@ -118,12 +118,11 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/quant_utils.cpp
@@ -285,6 +285,8 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
   }
   if (params.bits == quant_bits::fp4_e2m1) {
     quant_type = BTLA_DTYPE::F4_E2M1;
+    if (bestla_is_hybrid())
+      printf("Warning: FP4 is not recommended on client CPUs. Please use Int4 for better performance.\n");
   }
   if (params.bits == quant_bits::nf4) {
     quant_type = BTLA_DTYPE::F4_NF4;
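The quantize-time warning complements the eval-time thread cap above: FP4 E2M1 still quantizes on a hybrid (client) CPU, but the user is nudged toward Int4. A sketch of that dispatch under the same assumptions (choose_quant_type and the stub are illustrative, not part of the library; F4_E2M1 and F4_NF4 mirror the hunk, while the int4 name is a placeholder):

#include <cstdio>

// F4_E2M1/F4_NF4 mirror the hunk; S4_CLIP stands in for the int4 default.
enum class quant_bits { q4, fp4_e2m1, nf4 };
enum class BTLA_DTYPE { S4_CLIP, F4_E2M1, F4_NF4 };

bool is_hybrid_stub() { return true; }  // stand-in for bestla_is_hybrid()

BTLA_DTYPE choose_quant_type(quant_bits bits) {
  if (bits == quant_bits::fp4_e2m1) {
    if (is_hybrid_stub())  // warn, but still honor the requested type
      std::printf("Warning: FP4 is not recommended on client CPUs. Please use Int4 for better performance.\n");
    return BTLA_DTYPE::F4_E2M1;
  }
  if (bits == quant_bits::nf4) return BTLA_DTYPE::F4_NF4;
  return BTLA_DTYPE::S4_CLIP;  // int4 default path
}
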
11 changes: 5 additions & 6 deletions neural_speed/models/mpt/mpt.cpp
@@ -89,12 +89,11 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/opt/opt.cpp
@@ -87,12 +87,11 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
11 changes: 5 additions & 6 deletions neural_speed/models/phi/phi.cpp
@@ -111,12 +111,11 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/qwen/qwen.cpp
@@ -118,12 +118,11 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
11 changes: 5 additions & 6 deletions neural_speed/models/starcoder/starcoder.cpp
@@ -91,12 +91,11 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
   // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
   ne_cgraph gf = {};
   gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
-  if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
-    printf(
-        "WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
-        "New thread number is: %d\n.",
-        bestla_get_Pcore_number());
-    gf.n_threads = bestla_get_Pcore_number();
+  const int best_thread_num =
+      bestla_get_best_thread_number(!(hparams.ftype == NE_FTYPE_MOSTLY_NF4 || hparams.ftype == NE_FTYPE_MOSTLY_F4));
+  if (gf.n_threads > best_thread_num) {
+    printf("WARNING: Thread number exceeds the limit. Actual thread number is now %d.\n", best_thread_num);
+    gf.n_threads = best_thread_num;
   }
 
   const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
