This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

fix nf4 performance in hybrid CPU #120

Closed
wants to merge 5 commits into from
46 changes: 10 additions & 36 deletions neural_speed/application/common.cpp
@@ -62,41 +62,6 @@
#define M_PI 3.14159265358979323846
#endif

int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (!siblings.empty()) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
// TODO(Yucheng): Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

bool isValidFilename(const std::string& filename) {
std::ifstream infile(filename.c_str());
return infile.good();
@@ -730,7 +695,16 @@ ne_ftype quant_params_to_ftype(const quant_params& params) {
return NE_FTYPE_MOSTLY_Q8_0;
}
} else {
return NE_FTYPE_MOSTLY_Q_BTLA;
if (params.weight_dtype == "int4")
return NE_FTYPE_MOSTLY_Q4_J;
else if (params.weight_dtype == "int8")
return NE_FTYPE_MOSTLY_Q8_J;
else if (params.weight_dtype == "fp8" || params.weight_dtype == "fp8_e5m2")
return NE_FTYPE_MOSTLY_F8;
else if (params.weight_dtype == "fp4")
return NE_FTYPE_MOSTLY_F4;
else if (params.weight_dtype == "nf4")
return NE_FTYPE_MOSTLY_NF4;
}
return NE_FTYPE_UNKNOWN;
}
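
Note: a minimal sketch, not part of the diff, of the weight_dtype-to-ftype mapping that the reworked branch above produces. Previously every BesTLA weight dtype collapsed into the single NE_FTYPE_MOSTLY_Q_BTLA value; after this change each dtype gets its own ftype, and an unrecognized dtype in this branch falls through to NE_FTYPE_UNKNOWN. The enumerators are the ones added to data_types.h later in this diff.

// Sketch only (not from the PR): the mapping written out as a lookup table for reference.
#include <string>
#include <utility>
#include <vector>
#include "core/data_types.h"  // ne_ftype enumerators added by this PR

static const std::vector<std::pair<std::string, ne_ftype>> kBtlaFtypeMap = {
    {"int4", NE_FTYPE_MOSTLY_Q4_J}, {"int8", NE_FTYPE_MOSTLY_Q8_J},
    {"fp8", NE_FTYPE_MOSTLY_F8},    {"fp8_e5m2", NE_FTYPE_MOSTLY_F8},
    {"fp4", NE_FTYPE_MOSTLY_F4},    {"nf4", NE_FTYPE_MOSTLY_NF4},
};  // any other weight_dtype in this branch maps to NE_FTYPE_UNKNOWN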
2 changes: 0 additions & 2 deletions neural_speed/application/common.h
@@ -41,8 +41,6 @@
// CLI argument parsing
//

int32_t get_num_physical_cores();

struct common_params {
int32_t n_threads = get_num_physical_cores();

6 changes: 5 additions & 1 deletion neural_speed/core/data_types.h
@@ -73,7 +73,11 @@ enum ne_ftype {
NE_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
NE_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
NE_FTYPE_MOSTLY_Q_BTLA = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q4_J = 10, // except 1d tensors
NE_FTYPE_MOSTLY_Q8_J = 11, // except 1d tensors
NE_FTYPE_MOSTLY_F8 = 12, // except 1d tensors
NE_FTYPE_MOSTLY_NF4 = 13, // except 1d tensors
NE_FTYPE_MOSTLY_F4 = 14, // except 1d tensors
};

#define QK4_0 32
15 changes: 15 additions & 0 deletions neural_speed/core/layers/ne_bestla.cpp
@@ -16,6 +16,21 @@
using namespace bestla; // NOLINT
using namespace ne_bestla; // NOLINT

bool bestla_is_hybrid() {
GetCPUDevice();
return _cd->isHybrid();
}

int bestla_get_Pcore_number() {
GetCPUDevice();
return _cd->getPcoreNum();
}

int bestla_get_thread_number() {
GetCPUDevice();
return _cd->getThreads();
}

void bestla_init() {
GetCPUDevice();
if (_cd->AMX_BF16() || _cd->AMX_INT8()) {
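
Note: the three helpers above expose BesTLA's CPU-topology detection (hybrid check, P-core count, usable thread count) to the rest of neural_speed; get_best_thread in util.cpp further down uses them to keep NF4/FP4 work on the P-cores of hybrid client CPUs. A minimal usage sketch, not part of the diff, assuming it is compiled and linked as part of neural_speed:

// Sketch only: querying the new topology helpers.
#include <cstdio>
#include "core/ne_bestla.h"

int main() {
  if (bestla_is_hybrid()) {
    // hybrid client CPU (P-cores plus E-cores)
    printf("hybrid CPU: %d P-cores, %d usable threads\n",
           bestla_get_Pcore_number(), bestla_get_thread_number());
  } else {
    printf("non-hybrid CPU: %d usable threads\n", bestla_get_thread_number());
  }
  return 0;
}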
6 changes: 6 additions & 0 deletions neural_speed/core/ne_bestla.h
@@ -22,6 +22,12 @@ void bestla_timer(bool _init);

int bestla_set_threads(int _nth);

bool bestla_is_hybrid();

int bestla_get_Pcore_number();

int bestla_get_thread_number();

void* bestla_get_thread_handle();

void bestla_init();
8 changes: 8 additions & 0 deletions neural_speed/core/ne_layers.c
@@ -11960,6 +11960,14 @@ int ne_cpu_has_avx2(void) {
#endif
}

int ne_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
return 1;
#else
return 0;
#endif
}

int ne_cpu_has_avx512(void) {
#if defined(__AVX512F__)
return 1;
1 change: 1 addition & 0 deletions neural_speed/core/ne_layers.h
@@ -596,6 +596,7 @@ NE_API size_t ne_quantize_chunk(enum ne_type type, const float* src, void* dst,

NE_API int ne_cpu_has_avx(void);
NE_API int ne_cpu_has_avx2(void);
NE_API int ne_cpu_has_avx_vnni(void);
NE_API int ne_cpu_has_avx512(void);
NE_API int ne_cpu_has_avx512_vbmi(void);
NE_API int ne_cpu_has_avx512_vnni(void);
2 changes: 1 addition & 1 deletion neural_speed/models/baichuan/baichuan.cpp
@@ -105,7 +105,7 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/bloom/bloom.cpp
@@ -87,7 +87,7 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm.cpp
@@ -98,7 +98,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N * batch_size);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm2.cpp
@@ -110,7 +110,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = model.layers[0].k_cache->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/falcon/falcon.cpp
@@ -91,7 +91,7 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/gptj/gptj.cpp
@@ -132,7 +132,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

// no padding input for optimized MHA kernel
const bool run_mha_reordered = (kv_self.k->type == NE_TYPE_BTLA);
2 changes: 1 addition & 1 deletion neural_speed/models/gptneox/gptneox.cpp
@@ -127,7 +127,7 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/llama/llama.cpp
@@ -117,7 +117,7 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {0, 0};
1 change: 1 addition & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -1766,6 +1766,7 @@ const char* model_print_system_info(void) {
s = "";
s += "AVX = " + std::to_string(ne_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ne_cpu_has_avx2()) + " | ";
s += "AVX_VNNI = " + std::to_string(ne_cpu_has_avx_vnni()) + " | ";
s += "AVX512 = " + std::to_string(ne_cpu_has_avx512()) + " | ";
s += "AVX512_VBMI = " + std::to_string(ne_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ne_cpu_has_avx512_vnni()) + " | ";
10 changes: 9 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
@@ -285,9 +285,13 @@ size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_inte
}
if (params.bits == quant_bits::fp4_e2m1) {
quant_type = BTLA_DTYPE::F4_E2M1;
if (bestla_is_hybrid())
printf("Warning: Not recommend FP4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::nf4) {
quant_type = BTLA_DTYPE::F4_NF4;
if (bestla_is_hybrid())
printf("Warning: Not recommend NF4 in client CPU. Please use Int4 to get better performance.\n");
}
if (params.bits == quant_bits::fp8_e4m3) {
quant_type = BTLA_DTYPE::F8_E4M3;
@@ -476,7 +480,11 @@ bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_f
case NE_FTYPE_MOSTLY_Q4_0:
qtype = NE_TYPE_Q4_0;
break;
case NE_FTYPE_MOSTLY_Q_BTLA:
case NE_FTYPE_MOSTLY_Q4_J:
case NE_FTYPE_MOSTLY_Q8_J:
case NE_FTYPE_MOSTLY_F8:
case NE_FTYPE_MOSTLY_NF4:
case NE_FTYPE_MOSTLY_F4:
qtype = NE_TYPE_BTLA;
break;
case NE_FTYPE_MOSTLY_F16: {
20 changes: 20 additions & 0 deletions neural_speed/models/model_utils/util.cpp
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util.h"
#include "core/ne_bestla.h"
#include "core/ne_layers.h"

int32_t get_num_physical_cores() {
#ifdef __linux__
@@ -47,3 +49,21 @@ int32_t get_num_physical_cores() {
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

int get_best_thread(const ne_ftype ftype, const int n_threads, const int N) {
int res;
if (N >= 32 && ne_cpu_has_blas()) {
res = 1;
} else {
if ((ftype == NE_FTYPE_MOSTLY_NF4 || ftype == NE_FTYPE_MOSTLY_F4) && bestla_is_hybrid())
res = bestla_get_Pcore_number();
else
res = bestla_get_thread_number();
}
if (res < n_threads) {
printf("WARNING: Thread number exceed the limit. Actual thread number is: %d\n now.", res);
} else {
res = n_threads;
}
return res;
}
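
Note: a minimal sketch, not part of the diff, of how get_best_thread resolves the thread count that the model eval functions above and below pass to gf.n_threads. It assumes the snippet is linked into neural_speed so the bestla_* helpers report the real topology; the requested value of 32 is only illustrative.

// Sketch only: exercising get_best_thread() from util.cpp above.
#include <cstdio>
#include "models/model_utils/util.h"  // declares get_best_thread and pulls in core/data_types.h

int main() {
  const int requested = 32;  // illustrative request, larger than most client CPUs offer
  // NF4 on a hybrid CPU: capped to the P-core count (the fix in this PR)
  const int nf4_threads = get_best_thread(NE_FTYPE_MOSTLY_NF4, requested, /*N=*/1);
  // INT4 (Q4_J): capped to the total usable thread count instead
  const int int4_threads = get_best_thread(NE_FTYPE_MOSTLY_Q4_J, requested, /*N=*/1);
  printf("nf4 -> %d threads, int4 -> %d threads\n", nf4_threads, int4_threads);
  return 0;
}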
3 changes: 3 additions & 0 deletions neural_speed/models/model_utils/util.h
@@ -31,6 +31,7 @@
#include <unordered_set>
#include <vector>
#include "bestla/bestla_utils.h" // borrow aligned_malloc
#include "core/data_types.h"

#ifdef __has_include
#if __has_include(<unistd.h>)
@@ -425,4 +426,6 @@ typedef model_buffer model_ctx_buffer;

int32_t get_num_physical_cores();

int get_best_thread(const ne_ftype ftype, const int n_threads, const int N);

#endif
2 changes: 1 addition & 1 deletion neural_speed/models/mpt/mpt.cpp
@@ -88,7 +88,7 @@ static bool mpt_model_eval_internal(model_context* ctx, const model_input* input
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
3 changes: 2 additions & 1 deletion neural_speed/models/opt/opt.cpp
@@ -31,6 +31,7 @@
#include "core/data_types.h"
#include "core/ne.h"
#include "core/ne_layers.h"
#include "core/ne_bestla.h"
#include "models/model_utils/model_utils.h"
#include "models/model_utils/util.h"

@@ -85,7 +86,7 @@ static bool opt_model_eval_internal(model_context* ctx, const model_input* input
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

struct ne_tensor* embd = d_ne_new_tensor_1d(ctx0, NE_TYPE_I32, N);
ne_set_name(embd, "embd");
2 changes: 1 addition & 1 deletion neural_speed/models/phi/phi.cpp
@@ -110,7 +110,7 @@ static bool phi2_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/qwen/qwen.cpp
@@ -117,7 +117,7 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};
2 changes: 1 addition & 1 deletion neural_speed/models/starcoder/starcoder.cpp
@@ -90,7 +90,7 @@ static bool starcoder_model_eval_internal(model_context* ctx, const model_input*
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ne_cgraph gf = {};
gf.n_threads = N >= 32 && ne_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = get_best_thread(hparams.ftype, n_threads, N);

const bool run_mha_reordered = kv_self.k->type == NE_TYPE_BTLA;
kv_cache_info_t kv_cache_info = {};