Improvements Targeting Windows #136

Merged 8 commits on Feb 22, 2024

(This repository was archived by the owner on Aug 30, 2024 and is now read-only.)

Changes from all commits
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -54,17 +54,17 @@ option(NS_SANITIZE_UNDEFINED "neural_speed: enable undefined sanitizer"
# instruction set specific
option(NS_AVX "neural_speed: enable AVX" ON)
option(NS_AVX2 "neural_speed: enable AVX2" ON)
-option(NS_F16C "neural_speed: enable F16C" ON)
option(NS_AVX512 "neural_speed: enable AVX512" OFF)
option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI" OFF)
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
+option(NS_F16C "neural_speed: enable F16C" ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
-option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)
+option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)


if(NS_BUILD_TESTS)
@@ -101,6 +101,7 @@ if (MSVC)
endif()
endif()

+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) # default to false so that pybind11 will not try to use IPO
if (NS_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
1 change: 0 additions & 1 deletion neural_speed/application/audio_run.cpp
@@ -39,7 +39,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
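
Note on the #define NOMINMAX removals repeated in this PR: unless NOMINMAX is defined before including <windows.h>, that header defines min and max as function-style macros, which breaks std::min/std::max and std::numeric_limits<T>::max(). Dropping the per-file define only works if the macro is supplied some other way, presumably project-wide by the build system. The snippet below is an illustrative sketch of the collision and the usual call-site workaround, not code from this PR.

// Hypothetical Windows-only example of what NOMINMAX guards against.
#include <windows.h>  // without NOMINMAX, this defines min()/max() macros
#include <algorithm>

int smaller(int a, int b) {
  // The extra parentheses prevent macro expansion; equivalent to std::min(a, b).
  return (std::min)(a, b);
}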
1 change: 0 additions & 1 deletion neural_speed/application/common.cpp
@@ -48,7 +48,6 @@

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
5 changes: 2 additions & 3 deletions neural_speed/application/main_pybind.cpp
@@ -49,7 +49,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -528,7 +527,7 @@ const std::vector<float>& Model::evaluate_(const std::vector<std::vector<model_t
fprintf(stderr, "%s: error: prompt confliction\n", __func__);
return empty_ret;
} else if (input_id_cb.size() > n_ctx - 4) { // long input_id_cb and empty curr_input_ids[bs]
-fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
+fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_id_cb.size(), n_ctx - 4);
curr_input_ids[bs].resize(n_ctx - 4);
std::copy(input_id_cb.end() - n_ctx - 8, input_id_cb.end(), curr_input_ids[bs].begin() + 4);
@@ -643,7 +642,7 @@ std::vector<std::vector<model_token>> Model::generate_tokens(const std::vector<s

if (curr_input_ids[STATIC_INPUT_HEAD_IDX].empty()) {
if (input_ids[STATIC_INPUT_HEAD_IDX].size() > n_ctx - 4) {
-fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
+fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - 4);
curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - 4);
std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 8, input_ids[STATIC_INPUT_HEAD_IDX].end(),
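
Context for the %d to %zu changes in the two truncation warnings above: std::vector::size() returns size_t, and on 64-bit Windows (LLP64) long is only 32 bits, so %d or %ld paired with a size_t argument is undefined behavior and a compiler warning. %zu is the standard C99/C++11 length modifier for size_t, supported by MSVC since VS2015. A minimal standalone sketch:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> input_ids(5000);  // stand-in for a tokenized prompt
  // %zu matches size_t on every mainstream platform; %d and %ld may not.
  std::printf("prompt is too long (%zu tokens, max %d), will be truncated\n",
              input_ids.size(), 2048 - 4);
  return 0;
}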
1 change: 0 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
20 changes: 17 additions & 3 deletions neural_speed/application/pybind_gptj.cpp
@@ -57,6 +57,9 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval,
return true;
}

+static const char* memory_dtype =
+    (getenv("NE_MEM_DTYPE") != nullptr && strlen(getenv("NE_MEM_DTYPE")) > 0) ? getenv("NE_MEM_DTYPE") : "auto";
+
extern "C" {
void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, float temp, float repeat_penalty,
bool perplexity, int n_ctx, const char* model_file, bool beam_search = false, int beam_size = 4,
@@ -79,7 +82,17 @@ void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, fl
params.batch_size = batch_size;
params.beam_search = beam_search;
params.beam_size = beam_size;
-if (batch_size > 1) params.memory_type = KV_MEM_TYPE_F16; // TODO(Yi): NO MHA IN MULTI-BATCH
+if (batch_size > 1) // TODO(Yi): NO MHA IN MULTI-BATCH
+  params.memory_type = KV_MEM_TYPE_F16;
+else if (strcmp(memory_dtype, "f32") == 0)
+  params.memory_type = KV_MEM_TYPE_F32;
+else if (strcmp(memory_dtype, "f16") == 0)
+  params.memory_type = KV_MEM_TYPE_F16;
+else if (strcmp(memory_dtype, "auto") == 0)
+  params.memory_type = KV_MEM_TYPE_AUTO;
+else
+  fprintf(stderr, "Unexpected memory dtype!");
+
// params.use_mmap = false;
// params.use_mlock= true;
model_init_backend();
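
The new memory_dtype global reads NE_MEM_DTYPE once at startup and drives the KV-cache type selection above. A sketch of the same getenv-with-default idiom factored into a helper so getenv runs once instead of three times (illustrative refactor; env_or is not part of this PR):

#include <cstdlib>
#include <cstring>

// Returns an environment variable's value, or a fallback when it is unset or
// empty; mirrors the memory_dtype initializer in this file.
static const char* env_or(const char* name, const char* fallback) {
  const char* v = std::getenv(name);
  return (v != nullptr && std::strlen(v) > 0) ? v : fallback;
}

// Usage equivalent to the diff: env_or("NE_MEM_DTYPE", "auto")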
@@ -238,6 +251,7 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa

char* res_c_str = new char[res.size() + 1];
std::strncpy(res_c_str, res.c_str(), res.size());
+res_c_str[res.size()] = '\0';
return res_c_str;
}
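
The added res_c_str[res.size()] = '\0' line fixes a classic pitfall: strncpy does not write a terminator when the source is at least as long as the count, so the returned buffer was not guaranteed to be a valid C string. A self-contained sketch of the corrected pattern:

#include <cstring>
#include <string>

char* to_owned_c_str(const std::string& res) {
  char* out = new char[res.size() + 1];
  std::strncpy(out, res.c_str(), res.size());  // copies res.size() chars, adds no '\0'
  out[res.size()] = '\0';                      // the terminator must be written by hand
  return out;                                  // caller releases it with delete[]
}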

@@ -254,7 +268,7 @@ int main(int argc, char* argv[]) {
}

auto gptj_in_all_bs =
-init_gptj(1234, 32, 32, 40, 1.0, 0.8, 1.02, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
+init_gptj(1234, 32, 32, 40, 1.f, 0.8f, 1.02f, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
std::vector<void*> ctxs = {gptj_in_all_bs};
for (auto gptj_in_all : ctxs) {
auto res = eval_gptj_char(
@@ -341,7 +355,7 @@ int main(int argc, char* argv[]) {
"out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple "
"HUD from the start, especially one that indicated each team's colors and general state of the game without "
"the need for zooming in and out. Development Development went fast.",
-128, 40, 1.0, 0.8, 2048);
+128, 40, 1.0f, 0.8f, 2048);
std::cout << res << std::endl;
exit_gptj(gptj_in_all);
delete[] res;
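
The 1.0 to 1.f and 0.8 to 0.8f edits in main() matter because the corresponding init_gptj and eval_gptj_char parameters are float: a double literal narrows implicitly at every call and draws conversion warnings such as MSVC C4244. A small sketch (takes_float is a hypothetical stand-in):

static void takes_float(float top_p, float temp) { (void)top_p; (void)temp; }

void call_site() {
  takes_float(1.0, 0.8);    // double literals: implicit narrowing at the call
  takes_float(1.0f, 0.8f);  // float literals: exact type match, no warning
}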
3 changes: 1 addition & 2 deletions neural_speed/application/whisper_pybind.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -446,7 +445,7 @@ void Model::inference(const std::string& fname_inp) {
}

if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-fprintf(stderr, "%s: failed to process audio\n", fname_inp);
+fprintf(stderr, "%s: failed to process audio\n", fname_inp.c_str());
return;
}
}
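
The whisper fix above swaps fname_inp for fname_inp.c_str(): passing a non-trivially-copyable class object such as std::string through a C variadic function is undefined behavior, and %s requires a NUL-terminated char pointer. Sketch:

#include <cstdio>
#include <string>

void report_failure(const std::string& fname_inp) {
  // std::fprintf(stderr, "%s: ...\n", fname_inp);  // UB: %s needs a C string
  std::fprintf(stderr, "%s: failed to process audio\n", fname_inp.c_str());
}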
29 changes: 25 additions & 4 deletions neural_speed/cmake/Common.cmake
@@ -14,19 +14,40 @@

function(warning_check TARGET)
# TODO(hengyu): add warning check
-# if (MSVC)
+if (MSVC)
# target_compile_definitions(${TARGET} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
-# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
+target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /sdl>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/sdl>")
-# else()
+
+# Use public to affect pybind targets
+target_compile_options(${TARGET} PUBLIC /wd4244 /wd4267) # possible loss of data
+target_compile_options(${TARGET} PUBLIC /wd4305) # truncation from 'double' to 'float'
+target_compile_options(${TARGET} PUBLIC /wd4018) # '>': signed/unsigned mismatch
+target_compile_options(${TARGET} PUBLIC /wd4334) # '<<': result of 32-bit shift implicitly converted to 64 bits
+
+# 'std::codecvt_utf8<wchar_t,1114111,(std::codecvt_mode)0>': warning STL4017: std::wbuffer_convert,
+# std::wstring_convert, and the <codecvt> header (containing std::codecvt_mode, std::codecvt_utf8,
+# std::codecvt_utf16, and std::codecvt_utf8_utf16) are deprecated in C++17. (The std::codecvt class template is NOT
+# deprecated.) The C++ Standard doesn't provide equivalent non-deprecated functionality; consider using
+# MultiByteToWideChar() and WideCharToMultiByte() from <Windows.h> instead. You can define
+# _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING or _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS to suppress this
+# warning.
+target_compile_definitions(${TARGET} PUBLIC _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+
+# Microsoft renamed some POSIX and Microsoft-specific library functions in the CRT to conform with C99 and C++03
+# constraints on reserved and global implementation-defined names. If you need to use the existing function names
+# for portability reasons, you can turn off these warnings. The functions are still available in the library under
+# their original names.
+target_compile_definitions(${TARGET} PUBLIC _CRT_NONSTDC_NO_WARNINGS)
+else()
# # Enable warning
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wall>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wall>")
# target_compile_options(${TARGET} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wextra>")
# if(NOT CMAKE_BUILD_TYPE MATCHES "[Dd]ebug")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Werror>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Werror>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wno-error=deprecated-declarations>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wno-error=deprecated-declarations>")
# endif()
-# endif()
+endif()
endfunction()

function(add_executable_w_warning TARGET)
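
For reference, the kinds of code that trigger the four MSVC warning numbers suppressed above (representative examples written for this note, not taken from the repository):

#include <cstdint>

void msvc_warning_examples() {
  int64_t wide = 1;
  int narrow = wide;           // C4244: conversion may lose data
  float f = 3.14159;           // C4305: truncation from 'double' to 'float'
  unsigned u = 10;
  int s = -1;
  if (s < u) {}                // C4018: signed/unsigned mismatch
  int sh = 20;
  int64_t shifted = 1 << sh;   // C4334: 32-bit shift implicitly converted to 64 bits
  (void)narrow; (void)f; (void)shifted;
}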
6 changes: 4 additions & 2 deletions neural_speed/core/ne_layers.c
@@ -3463,8 +3463,10 @@ static void ne_compute_forward_dump_tensor(const struct ne_compute_params* param
const int64_t ne03 = src0->ne[3];
const int64_t nr = ne_nrows(src0);

-fprintf(file, "Total element is %ld\n", ne_nelements(src0));
-fprintf(file, "ne[0] size is %ld ne[1] size is %ld ne[2] size is %ld ne[3] size is %ld \n", ne00, ne01, ne02, ne03);
+fprintf(file, "Total element is %" PRId64 "\n", ne_nelements(src0));
+fprintf(file,
+        "ne[0] size is %" PRId64 " ne[1] size is %" PRId64 " ne[2] size is %" PRId64 " ne[3] size is %" PRId64 " \n",
+        ne00, ne01, ne02, ne03);
switch (src0->type) {
case NE_TYPE_F32: {
for (int64_t ir = 0; ir < nr; ++ir) {
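
The dump-tensor fix swaps %ld for the PRId64 macro: ne_nelements and the ne[] dimensions are int64_t, which is long long rather than long on Windows, so %ld truncates there. The <inttypes.h> / <cinttypes> macros expand to the correct conversion specifier per platform, and the PRIu64 changes in scheduler.cpp later in this diff follow the same logic for uint64_t. Standalone sketch:

#include <cinttypes>
#include <cstdio>

int main() {
  int64_t total = 6442450944LL;  // deliberately larger than 32 bits
  // PRId64 expands to "ld" or "lld" as the platform requires.
  std::printf("Total element is %" PRId64 "\n", total);
  return 0;
}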
4 changes: 2 additions & 2 deletions neural_speed/models/baichuan/baichuan_utils.cpp
@@ -145,8 +145,8 @@ void BAICHUAN::load(model_context* ctx, model_progress_callback progress_callbac
layer.ffn[2] =
ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, uint32_t(model.hparams.inner_hidden_size)}, backend);

-layer.v_cache == nullptr;
-layer.k_cache == nullptr;
+layer.v_cache = nullptr;
+layer.k_cache = nullptr;
}

// print memory requirements
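
The baichuan change fixes a classic typo: == compares and discards the result instead of assigning, so k_cache and v_cache were never actually set here. Compilers can flag this pattern (MSVC C4553, Clang -Wunused-comparison). Minimal sketch:

struct LayerSketch {  // hypothetical stand-in for the model layer type
  void* k_cache;
  void* v_cache;
};

void reset_caches(LayerSketch& layer) {
  // layer.k_cache == nullptr;  // the old bug: a no-op comparison
  layer.k_cache = nullptr;      // the fix: an actual assignment
  layer.v_cache = nullptr;
}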
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/arg_parse.cpp
@@ -323,7 +323,7 @@ bool gpt_params_parse(int argc, char** argv, gpt_params& params) { // NOLINT
} else {
throw std::exception();
}
-} catch (const std::exception& e) {
+} catch (const std::exception&) {
invalid_param = true;
break;
}
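
Dropping the unused e from the catch clause silences unreferenced-variable diagnostics (for example MSVC C4101) without changing behavior, since an exception can be caught by type alone. Sketch:

#include <exception>

bool parse_arg(bool bad) {
  try {
    if (bad) throw std::exception();
    return true;
  } catch (const std::exception&) {  // unnamed: nothing left unreferenced
    return false;
  }
}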
8 changes: 4 additions & 4 deletions neural_speed/models/model_utils/model_files.h
@@ -1059,10 +1059,10 @@ struct model_file_loader {
char gguf_magic[4];
const size_t n = fread(&gguf_magic, 1, sizeof(gguf_magic), file.fp);
bool ok = true;
-ok = ok & gguf_magic[0] == 'G';
-ok = ok & gguf_magic[1] == 'G';
-ok = ok & gguf_magic[2] == 'U';
-ok = ok & gguf_magic[3] == 'F';
+ok &= gguf_magic[0] == 'G';
+ok &= gguf_magic[1] == 'G';
+ok &= gguf_magic[2] == 'U';
+ok &= gguf_magic[3] == 'F';

if (ok) {
model_magic = GGUF;
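
On the magic check: the old ok = ok & gguf_magic[0] == 'G' form did parse as intended, because == binds tighter than &, but it invites parenthesization warnings and reads poorly; ok &= makes the intent explicit. An equivalent and arguably simpler four-byte check (a sketch, not the project's API):

#include <cstring>

bool is_gguf_magic(const char gguf_magic[4]) {
  return std::memcmp(gguf_magic, "GGUF", 4) == 0;
}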
4 changes: 2 additions & 2 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -99,7 +99,7 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
const auto wtype_alloc = wtype == NE_TYPE_BTLA ? NE_TYPE_I8 : wtype;

if (model) { // non-null param of model for kv-cache as components of model->layers[il]
-for (int il = 0; il < hparams.n_layer; ++il) {
+for (int il = 0; il < n_layer; ++il) {
auto& k_cache = model->layers[il].k_cache;
auto& v_cache = model->layers[il].v_cache;
if (wtype == NE_TYPE_F16) { // chatglm does not support fp32 kv-cache in original impl of chatglm_util.cpp
@@ -2693,7 +2693,7 @@ bool beam_search_flow::step_update_beams_and_kv_cache() {
std::vector<beam_next_token> next_tokens =
beam_top_k_next_tokens(ctx, beams_score, num_beams, beam_indices, sample_scale);
if (next_tokens.size() != num_sample_k) {
-fprintf(stderr, "%s: error: sampled next tokens size is %ld which is not equal to %d.\n", __func__,
+fprintf(stderr, "%s: error: sampled next tokens size is %zu which is not equal to %d.\n", __func__,
next_tokens.size(), num_sample_k);
return false;
}
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/model_utils.h
@@ -307,7 +307,7 @@ struct beam {
const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; }

void print() const {
printf("length: %ld, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
printf("length: %zu, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
score, eos(), request_idx, beam_idx, done);
for (const auto& id : token_ids) {
printf("%d: %s, ", id, model_token_to_str(ctx, id));
1 change: 0 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
@@ -422,7 +422,6 @@ void ne_common_quantize(const int nthread, const quant_params_internal& params,
}
printf("size = %8.2f MB -> %8.2f MB\n", tensor.size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);

-__WRITE_FILE:
size_org += tensor.size;
size_new += new_size;
saver.write_tensor(tensor, new_type, new_data, new_size);
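
The deleted __WRITE_FILE: label had no goto referring to it any longer, so it only produced unreferenced-label diagnostics (MSVC C4102, GCC/Clang -Wunused-label); identifiers starting with a double underscore are also reserved for the implementation. A sketch of when a label is and is not justified:

void quantize_and_write(bool skip_quant) {
  if (skip_quant) goto write_file;  // a label is justified only by a goto like this
  /* ... quantize the tensor ... */
write_file:
  /* ... write tensor data, as the code after the deleted label still does ... */
  return;
}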
20 changes: 11 additions & 9 deletions neural_speed/models/model_utils/scheduler.cpp
@@ -168,7 +168,8 @@ std::vector<sequence> Iter_level_scheduler::pop_completed_requests() {
return std::vector<sequence>();
}
if (log_level == 0) {
-fprintf(stdout, "%s: info: tokens generation time of sequence (query_id %lu, request_idx: %d) is %8.2fms.\n",
+fprintf(stdout,
+        "%s: info: tokens generation time of sequence (query_id %" PRIu64 ", request_idx: %d) is %8.2fms.\n",
__func__, ret_seqs[l].query_id, ret_seqs[l].request_idx,
(ret_seqs[l].end_time - ret_seqs[l].receive_time) / 1000.0);
}
@@ -213,7 +214,8 @@ bool Cont_batch_gen_scheduler::add_request(sequence seq) {
seq.status = seq_status::WAITING;
seq.request_idx = waiting_free_req_idx_seqs_num > 0 ? -1 : query_free_req_idx();
if (log_level == 0) {
-fprintf(stdout, "%s: info: added seq query_id: %lu, request_idx: %d \n", __func__, seq.query_id, seq.request_idx);
+fprintf(stdout, "%s: info: added seq query_id: %" PRIu64 ", request_idx: %d \n", __func__, seq.query_id,
+        seq.request_idx);
}
if (seq.request_idx == -1) waiting_free_req_idx_seqs_num++;
return waiting_pool.add(seq);
@@ -246,7 +248,7 @@ bool Cont_batch_gen_scheduler::prepare_seqs() {
}
executed_seqs[cur_running_num + np].request_idx = fidx;
if (log_level == 0) {
-fprintf(stdout, "%s: info: updated seq query_id: %lu, request_idx: %d \n", __func__,
+fprintf(stdout, "%s: info: updated seq query_id: %" PRIu64 ", request_idx: %d \n", __func__,
executed_seqs[cur_running_num + np].query_id, executed_seqs[cur_running_num + np].request_idx);
}
waiting_free_req_idx_seqs_num--;
@@ -320,15 +322,15 @@ bool Cont_batch_gen_scheduler::update_pools() {
finished_pool.add(executed_seqs[ns]);
free_req_idx[executed_seqs[ns].request_idx] = true;
if (log_level == 0) {
-fprintf(stdout, "%s: info: seq query_id: %lu, request_idx: %d finished.\n", __func__,
+fprintf(stdout, "%s: info: seq query_id: %" PRIu64 ", request_idx: %d finished.\n", __func__,
executed_seqs[ns].query_id, executed_seqs[ns].request_idx);
}
} else {
-fprintf(
-    stderr,
-    "%s: error: wrong seq status: %d of seq query_id: %lu, request_idx: %d, should be in DECODING OR FINISHED.\n",
-    __func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
-    executed_seqs[ns].request_idx);
+fprintf(stderr,
+        "%s: error: wrong seq status: %d of seq query_id: %" PRIu64
+        ", request_idx: %d, should be in DECODING OR FINISHED.\n",
+        __func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
+        executed_seqs[ns].request_idx);
return false;
}
}