Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

fix nf4 performance in hybrid CPU #120

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion neural_speed/application/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,16 @@ ne_ftype quant_params_to_ftype(const quant_params& params) {
return NE_FTYPE_MOSTLY_Q8_0;
}
} else {
return NE_FTYPE_MOSTLY_Q_BTLA;
if (params.weight_dtype == "int4")
return NE_FTYPE_MOSTLY_Q4_J;
else if (params.weight_dtype == "int8")
return NE_FTYPE_MOSTLY_Q8_J;
else if (params.weight_dtype == "fp8" || params.weight_dtype == "fp8_e5m2")
return NE_FTYPE_MOSTLY_F8;
else if (params.weight_dtype == "fp4")
return NE_FTYPE_MOSTLY_F4;
else if (params.weight_dtype == "nf4")
return NE_FTYPE_MOSTLY_NF4;
}
return NE_FTYPE_UNKNOWN;
}
Expand Down
6 changes: 5 additions & 1 deletion neural_speed/core/data_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ enum ne_ftype {
NE_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
NE_FTYPE_MOSTLY_Q_BTLA = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q4_J = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q8_J = 11, // except 1d tensors
NE_FTYPE_MOSTLY_F8 = 12, // except 1d tensors
NE_FTYPE_MOSTLY_NF4 = 13, // except 1d tensors
NE_FTYPE_MOSTLY_F4 = 14, // except 1d tensors
};

#define QK4_0 32
Expand Down
15 changes: 15 additions & 0 deletions neural_speed/core/layers/ne_bestla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@
using namespace bestla; // NOLINT
using namespace ne_bestla; // NOLINT

// Returns true when the CPU reports a hybrid (heterogeneous) core topology,
// e.g. a mix of performance (P) and efficiency (E) cores.
// NOTE(review): GetCPUDevice() is a project macro that presumably brings the
// CPU-device handle `_cd` into scope — confirm against bestla headers.
bool bestla_is_hybrid() {
GetCPUDevice();
return _cd->isHybrid();
}

// Returns the number of performance (P) cores reported by the CPU device.
// Callers in the model eval paths use this to cap the thread count for
// kernels (e.g. NF4) that perform poorly on E-cores.
int bestla_get_Pcore_number() {
GetCPUDevice();
return _cd->getPcoreNum();
}

// Returns the number of efficiency (E) cores reported by the CPU device.
int bestla_get_Ecore_number() {
GetCPUDevice();
return _cd->getEcoreNum();
}

void bestla_init() {
GetCPUDevice();
if (_cd->AMX_BF16() || _cd->AMX_INT8()) {
Expand Down
6 changes: 6 additions & 0 deletions neural_speed/core/ne_bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ void bestla_timer(bool _init);

int bestla_set_threads(int _nth);

// True when the CPU has a hybrid core topology (mixed P/E cores).
bool bestla_is_hybrid();

// Number of performance (P) cores on the current CPU.
int bestla_get_Pcore_number();

// Number of efficiency (E) cores on the current CPU.
int bestla_get_Ecore_number();

void* bestla_get_thread_handle();

void bestla_init();
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/baichuan/baichuan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}
yuchengliu1 marked this conversation as resolved.
Show resolved Hide resolved

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/bloom/bloom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/chatglm/chatglm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/chatglm/chatglm2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/falcon/falcon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/gptj/gptj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

// no padding input for optimized MHA kernel
const bool run_mha_reordered = (kv_self.k->type == NE_TYPE_BTLA);
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/gptneox/gptneox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/llama/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {0, 0};
Expand Down
8 changes: 7 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,8 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
}
if (params.bits == quant_bits::nf4) {
quant_type = BTLA_DTYPE::F4_NF4;
if (bestla_is_hybrid())
printf("Warning: Not recommend NF4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::fp8_e4m3) {
quant_type = BTLA_DTYPE::F8_E4M3;
Expand Down Expand Up @@ -476,7 +478,11 @@ bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_f
case NE_FTYPE_MOSTLY_Q4_0:
qtype = NE_TYPE_Q4_0;
break;
case NE_FTYPE_MOSTLY_Q_BTLA:
case NE_FTYPE_MOSTLY_Q4_J:
case NE_FTYPE_MOSTLY_Q8_J:
case NE_FTYPE_MOSTLY_F8:
case NE_FTYPE_MOSTLY_NF4:
case NE_FTYPE_MOSTLY_F4:
qtype = NE_TYPE_BTLA;
break;
case NE_FTYPE_MOSTLY_F16: {
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/mpt/mpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
8 changes: 8 additions & 0 deletions neural_speed/models/opt/opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "core/ne_bestla.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/util.h"

Expand Down Expand Up @@ -86,6 +87,13 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/phi/phi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/qwen/qwen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
7 changes: 7 additions & 0 deletions neural_speed/models/starcoder/starcoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
if (hparams.ftype == NE_FTYPE_MOSTLY_NF4 && bestla_is_hybrid() && gf.n_threads > bestla_get_Pcore_number()) {
printf(
"WARNING: NF4 is poor at Ecore, only use P-core to inference. Not use the thread number according to settings. "
"New thread number is: %d\n.",
bestla_get_Pcore_number());
gf.n_threads = bestla_get_Pcore_number();
}

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
Expand Down
Loading