This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

fix nf4 performance in hybrid CPU #120

Closed
wants to merge 5 commits into from
46 changes: 10 additions & 36 deletions neural_speed/application/common.cpp
@@ -62,41 +62,6 @@
#define M_PI 3.14159265358979323846
#endif

int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (!siblings.empty()) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
// TODO(Yucheng): Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

bool isValidFilename(const std::string& filename) {
std::ifstream infile(filename.c_str());
return infile.good();
@@ -730,7 +695,16 @@ ne_ftype quant_params_to_ftype(const quant_params& params) {
return NE_FTYPE_MOSTLY_Q8_0;
}
} else {
return NE_FTYPE_MOSTLY_Q_BTLA;
if (params.weight_dtype == "int4")
return NE_FTYPE_MOSTLY_Q4_J;
else if (params.weight_dtype == "int8")
return NE_FTYPE_MOSTLY_Q8_J;
else if (params.weight_dtype == "fp8" || params.weight_dtype == "fp8_e5m2")
return NE_FTYPE_MOSTLY_F8;
else if (params.weight_dtype == "fp4")
return NE_FTYPE_MOSTLY_F4;
else if (params.weight_dtype == "nf4")
return NE_FTYPE_MOSTLY_NF4;
}
return NE_FTYPE_UNKNOWN;
}
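
Note: a minimal sketch, not part of the diff, of the weight_dtype-to-ftype mapping that the reworked branch above produces. Previously every BesTLA weight dtype collapsed into the single NE_FTYPE_MOSTLY_Q_BTLA value; after this change each dtype gets its own ftype, and an unrecognized dtype in this branch falls through to NE_FTYPE_UNKNOWN. The enumerators are the ones added to data_types.h later in this diff.

// Sketch only (not from the PR): the mapping written out as a lookup table for reference.
#include <string>
#include <utility>
#include <vector>
#include "core/data_types.h"  // ne_ftype enumerators added by this PR

static const std::vector<std::pair<std::string, ne_ftype>> kBtlaFtypeMap = {
    {"int4", NE_FTYPE_MOSTLY_Q4_J}, {"int8", NE_FTYPE_MOSTLY_Q8_J},
    {"fp8", NE_FTYPE_MOSTLY_F8},    {"fp8_e5m2", NE_FTYPE_MOSTLY_F8},
    {"fp4", NE_FTYPE_MOSTLY_F4},    {"nf4", NE_FTYPE_MOSTLY_NF4},
};  // any other weight_dtype in this branch maps to NE_FTYPE_UNKNOWN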
2 changes: 0 additions & 2 deletions neural_speed/application/common.h
@@ -41,8 +41,6 @@
// CLI argument parsing
//

int32_t get_num_physical_cores();

struct common_params {
int32_t n_threads = get_num_physical_cores();

6 changes: 5 additions & 1 deletion neural_speed/core/data_types.h
@@ -73,7 +73,11 @@ enum ne_ftype {
NE_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
NE_FTYPE_MOSTLY_Q_BTLA = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q4_J = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q8_J = 11, // except 1d tensors
NE_FTYPE_MOSTLY_F8 = 12, // except 1d tensors
NE_FTYPE_MOSTLY_NF4 = 13, // except 1d tensors
NE_FTYPE_MOSTLY_F4 = 14, // except 1d tensors
};

#define QK4_0 32
15 changes: 15 additions & 0 deletions neural_speed/core/layers/ne_bestla.cpp
@@ -16,6 +16,21 @@
using namespace bestla; // NOLINT
using namespace ne_bestla; // NOLINT

bool bestla_is_hybrid() {
GetCPUDevice();
return _cd->isHybrid();
}

int bestla_get_Pcore_number() {
GetCPUDevice();
return _cd->getPcoreNum();
}

int bestla_get_thread_number() {
GetCPUDevice();
return _cd->getThreads();
}

void bestla_init() {
GetCPUDevice();
if (_cd->AMX_BF16() || _cd->AMX_INT8()) {
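
Note: the three helpers above expose BesTLA's CPU-topology detection (hybrid check, P-core count, usable thread count) to the rest of neural_speed; get_best_thread in util.cpp further down uses them to keep NF4/FP4 work on the P-cores of hybrid client CPUs. A minimal usage sketch, not part of the diff, assuming it is compiled and linked as part of neural_speed:

// Sketch only: querying the new topology helpers.
#include <cstdio>
#include "core/ne_bestla.h"

int main() {
  if (bestla_is_hybrid()) {
    // hybrid client CPU (P-cores plus E-cores)
    printf("hybrid CPU: %d P-cores, %d usable threads\n",
           bestla_get_Pcore_number(), bestla_get_thread_number());
  } else {
    printf("non-hybrid CPU: %d usable threads\n", bestla_get_thread_number());
  }
  return 0;
}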
6 changes: 6 additions & 0 deletions neural_speed/core/ne_bestla.h
@@ -22,6 +22,12 @@ void bestla_timer(bool _init);

int bestla_set_threads(int _nth);

bool bestla_is_hybrid();

int bestla_get_Pcore_number();

int bestla_get_thread_number();

void* bestla_get_thread_handle();

void bestla_init();
8 changes: 8 additions & 0 deletions neural_speed/core/ne_layers.c
@@ -11960,6 +11960,14 @@ int ne_cpu_has_avx2(void) {
#endif
}

int ne_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
return 1;
#else
return 0;
#endif
}

int ne_cpu_has_avx512(void) {
#if defined(__AVX512F__)
return 1;
1 change: 1 addition & 0 deletions neural_speed/core/ne_layers.h
@@ -596,6 +596,7 @@ NE_API size_t ne_quantize_chunk(enum ne_type type, const float* src, void* dst,

NE_API int ne_cpu_has_avx(void);
NE_API int ne_cpu_has_avx2(void);
NE_API int ne_cpu_has_avx_vnni(void);
NE_API int ne_cpu_has_avx512(void);
NE_API int ne_cpu_has_avx512_vbmi(void);
NE_API int ne_cpu_has_avx512_vnni(void);
2 changes: 1 addition & 1 deletion neural_speed/models/baichuan/baichuan.cpp
@@ -105,7 +105,7 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/bloom/bloom.cpp
@@ -87,7 +87,7 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm.cpp
@@ -98,7 +98,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm2.cpp
@@ -110,7 +110,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/falcon/falcon.cpp
@@ -91,7 +91,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/gptj/gptj.cpp
@@ -132,7 +132,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

// no padding input for optimized MHA kernel
const bool run_mha_reordered = (kv_self.k->type == NE_TYPE_BTLA);
2 changes: 1 addition & 1 deletion neural_speed/models/gptneox/gptneox.cpp
@@ -127,7 +127,7 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/llama/llama.cpp
@@ -117,7 +117,7 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {0, 0};
1 change: 1 addition & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -1766,6 +1766,7 @@ const char* model_print_system_info(void) {
s = "";
s += "AVX = " + std::to_string(ne_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ne_cpu_has_avx2()) + " | ";
s += "AVX_VNNI = " + std::to_string(ne_cpu_has_avx_vnni()) + " | ";
s += "AVX512 = " + std::to_string(ne_cpu_has_avx512()) + " | ";
s += "AVX512_VBMI = " + std::to_string(ne_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ne_cpu_has_avx512_vnni()) + " | ";
10 changes: 9 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
@@ -285,9 +285,13 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
}
if (params.bits == quant_bits::fp4_e2m1) {
quant_type = BTLA_DTYPE::F4_E2M1;
if (bestla_is_hybrid())
printf("Warning: Not recommend FP4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::nf4) {
quant_type = BTLA_DTYPE::F4_NF4;
if (bestla_is_hybrid())
printf("Warning: Not recommend NF4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::fp8_e4m3) {
quant_type = BTLA_DTYPE::F8_E4M3;
@@ -476,7 +480,11 @@ bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_f
case NE_FTYPE_MOSTLY_Q4_0:
qtype = NE_TYPE_Q4_0;
break;
case NE_FTYPE_MOSTLY_Q_BTLA:
case NE_FTYPE_MOSTLY_Q4_J:
case NE_FTYPE_MOSTLY_Q8_J:
case NE_FTYPE_MOSTLY_F8:
case NE_FTYPE_MOSTLY_NF4:
case NE_FTYPE_MOSTLY_F4:
qtype = NE_TYPE_BTLA;
break;
case NE_FTYPE_MOSTLY_F16: {
20 changes: 20 additions & 0 deletions neural_speed/models/model_utils/util.cpp
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util.h"
#include "core/ne_bestla.h"
#include "core/ne_layers.h"

int32_t get_num_physical_cores() {
#ifdef __linux__
@@ -47,3 +49,21 @@ int32_t get_num_physical_cores() {
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

int get_best_thread(const ne_ftype ftype, const int n_threads, const int N) {
int res;
if (N >= 32 && ne_cpu_has_blas()) {
res = 1;
} else {
if ((ftype == NE_FTYPE_MOSTLY_NF4 || ftype == NE_FTYPE_MOSTLY_F4) && bestla_is_hybrid())
res = bestla_get_Pcore_number();
else
res = bestla_get_thread_number();
}
if (res < n_threads) {
printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", res);
} else {
res = n_threads;
}
return res;
}
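
Note: a minimal sketch, not part of the diff, of how get_best_thread resolves the thread count that the model eval functions above and below pass to gf.n_threads. It assumes the snippet is linked into neural_speed so the bestla_* helpers report the real topology; the requested value of 32 is only illustrative.

// Sketch only: exercising get_best_thread() from util.cpp above.
#include <cstdio>
#include "models/model_utils/util.h"  // declares get_best_thread and pulls in core/data_types.h

int main() {
  const int requested = 32;  // illustrative request, larger than most client CPUs offer
  // NF4 on a hybrid CPU: capped to the P-core count (the fix in this PR)
  const int nf4_threads = get_best_thread(NE_FTYPE_MOSTLY_NF4, requested, /*N=*/1);
  // INT4 (Q4_J): capped to the total usable thread count instead
  const int int4_threads = get_best_thread(NE_FTYPE_MOSTLY_Q4_J, requested, /*N=*/1);
  printf("nf4 -> %d threads, int4 -> %d threads\n", nf4_threads, int4_threads);
  return 0;
}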
3 changes: 3 additions & 0 deletions neural_speed/models/model_utils/util.h
@@ -31,6 +31,7 @@
#include <unordered_set>
#include <vector>
#include "bestla/bestla_utils.h" // borrow aligned_malloc
#include "core/data_types.h"

#ifdef __has_include
#if __has_include(<unistd.h>)
@@ -425,4 +426,6 @@ typedef model_buffer model_ctx_buffer;

int32_t get_num_physical_cores();

int get_best_thread(const ne_ftype ftype, const int n_threads, const int N);

#endif
2 changes: 1 addition & 1 deletion neural_speed/models/mpt/mpt.cpp
@@ -88,7 +88,7 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
3 changes: 2 additions & 1 deletion neural_speed/models/opt/opt.cpp
@@ -31,6 +31,7 @@
#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "core/ne_bestla.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/util.h"

@@ -85,7 +86,7 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/phi/phi.cpp
@@ -110,7 +110,7 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/qwen/qwen.cpp
@@ -117,7 +117,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/starcoder/starcoder.cpp
@@ -90,7 +90,7 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};