Improvements Targeting Windows #136

Merged 8 commits on Feb 22, 2024

(This repository was archived by the owner on Aug 30, 2024 and is now read-only.)

Changes from all commits
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -54,17 +54,17 @@ option(NS_SANITIZE_UNDEFINED "neural_speed: enable undefined sanitizer"
# instruction set specific
option(NS_AVX "neural_speed: enable AVX" ON)
option(NS_AVX2 "neural_speed: enable AVX2" ON)
-option(NS_F16C "neural_speed: enable F16C" ON)
option(NS_AVX512 "neural_speed: enable AVX512" OFF)
option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI" OFF)
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
+option(NS_F16C "neural_speed: enable F16C" ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
-option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)
+option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)


if(NS_BUILD_TESTS)
@@ -101,6 +101,7 @@ if (MSVC)
endif()
endif()

+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) # default to false so that pybind11 will not try to use IPO
if (NS_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
1 change: 0 additions & 1 deletion neural_speed/application/audio_run.cpp
@@ -39,7 +39,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
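
Note on the #define NOMINMAX removals repeated in this PR: unless NOMINMAX is defined before including <windows.h>, that header defines min and max as function-style macros, which breaks std::min/std::max and std::numeric_limits<T>::max(). Dropping the per-file define only works if the macro is supplied some other way, presumably project-wide by the build system. The snippet below is an illustrative sketch of the collision and the usual call-site workaround, not code from this PR.

// Hypothetical Windows-only example of what NOMINMAX guards against.
#include <windows.h>  // without NOMINMAX, this defines min()/max() macros
#include <algorithm>

int smaller(int a, int b) {
  // The extra parentheses prevent macro expansion; equivalent to std::min(a, b).
  return (std::min)(a, b);
}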
1 change: 0 additions & 1 deletion neural_speed/application/common.cpp
@@ -48,7 +48,6 @@

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
5 changes: 2 additions & 3 deletions neural_speed/application/main_pybind.cpp
@@ -49,7 +49,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -528,7 +527,7 @@ const std::vector<float>& Model::evaluate_(const std::vector<std::vector<model_t
fprintf(stderr, "%s: error: prompt confliction\n", __func__);
return empty_ret;
} else if (input_id_cb.size() > n_ctx - 4) { // long input_id_cb and empty curr_input_ids[bs]
-fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
+fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_id_cb.size(), n_ctx - 4);
curr_input_ids[bs].resize(n_ctx - 4);
std::copy(input_id_cb.end() - n_ctx - 8, input_id_cb.end(), curr_input_ids[bs].begin() + 4);
@@ -643,7 +642,7 @@ std::vector<std::vector<model_token>> Model::generate_tokens(const std::vector<s

if (curr_input_ids[STATIC_INPUT_HEAD_IDX].empty()) {
if (input_ids[STATIC_INPUT_HEAD_IDX].size() > n_ctx - 4) {
-fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
+fprintf(stderr, "\n%s: Warning: prompt is too long (%zu tokens, max %d), will be truncated\n", __func__,
input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - 4);
curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - 4);
std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 8, input_ids[STATIC_INPUT_HEAD_IDX].end(),
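
Context for the %d to %zu changes in the two truncation warnings above: std::vector::size() returns size_t, and on 64-bit Windows (LLP64) long is only 32 bits, so %d or %ld paired with a size_t argument is undefined behavior and a compiler warning. %zu is the standard C99/C++11 length modifier for size_t, supported by MSVC since VS2015. A minimal standalone sketch:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> input_ids(5000);  // stand-in for a tokenized prompt
  // %zu matches size_t on every mainstream platform; %d and %ld may not.
  std::printf("prompt is too long (%zu tokens, max %d), will be truncated\n",
              input_ids.size(), 2048 - 4);
  return 0;
}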
1 change: 0 additions & 1 deletion neural_speed/application/main_run.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
20 changes: 17 additions & 3 deletions neural_speed/application/pybind_gptj.cpp
@@ -57,6 +57,9 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval,
return true;
}

+static const char* memory_dtype =
+    (getenv("NE_MEM_DTYPE") != nullptr && strlen(getenv("NE_MEM_DTYPE")) > 0) ? getenv("NE_MEM_DTYPE") : "auto";
+
extern "C" {
void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, float temp, float repeat_penalty,
bool perplexity, int n_ctx, const char* model_file, bool beam_search = false, int beam_size = 4,
@@ -79,7 +82,17 @@ void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, fl
params.batch_size = batch_size;
params.beam_search = beam_search;
params.beam_size = beam_size;
-if (batch_size > 1) params.memory_type = KV_MEM_TYPE_F16; // TODO(Yi): NO MHA IN MULTI-BATCH
+if (batch_size > 1) // TODO(Yi): NO MHA IN MULTI-BATCH
+  params.memory_type = KV_MEM_TYPE_F16;
+else if (strcmp(memory_dtype, "f32") == 0)
+  params.memory_type = KV_MEM_TYPE_F32;
+else if (strcmp(memory_dtype, "f16") == 0)
+  params.memory_type = KV_MEM_TYPE_F16;
+else if (strcmp(memory_dtype, "auto") == 0)
+  params.memory_type = KV_MEM_TYPE_AUTO;
+else
+  fprintf(stderr, "Unexpected memory dtype!");
+
// params.use_mmap = false;
// params.use_mlock= true;
model_init_backend();
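
The new memory_dtype global reads NE_MEM_DTYPE once at startup and drives the KV-cache type selection above. A sketch of the same getenv-with-default idiom factored into a helper so getenv runs once instead of three times (illustrative refactor; env_or is not part of this PR):

#include <cstdlib>
#include <cstring>

// Returns an environment variable's value, or a fallback when it is unset or
// empty; mirrors the memory_dtype initializer in this file.
static const char* env_or(const char* name, const char* fallback) {
  const char* v = std::getenv(name);
  return (v != nullptr && std::strlen(v) > 0) ? v : fallback;
}

// Usage equivalent to the diff: env_or("NE_MEM_DTYPE", "auto")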
@@ -238,6 +251,7 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa

char* res_c_str = new char[res.size() + 1];
std::strncpy(res_c_str, res.c_str(), res.size());
+res_c_str[res.size()] = '\0';
return res_c_str;
}
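
The added res_c_str[res.size()] = '\0' line fixes a classic pitfall: strncpy does not write a terminator when the source is at least as long as the count, so the returned buffer was not guaranteed to be a valid C string. A self-contained sketch of the corrected pattern:

#include <cstring>
#include <string>

char* to_owned_c_str(const std::string& res) {
  char* out = new char[res.size() + 1];
  std::strncpy(out, res.c_str(), res.size());  // copies res.size() chars, adds no '\0'
  out[res.size()] = '\0';                      // the terminator must be written by hand
  return out;                                  // caller releases it with delete[]
}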

@@ -254,7 +268,7 @@ int main(int argc, char* argv[]) {
}

auto gptj_in_all_bs =
-init_gptj(1234, 32, 32, 40, 1.0, 0.8, 1.02, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
+init_gptj(1234, 32, 32, 40, 1.f, 0.8f, 1.02f, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true);
std::vector<void*> ctxs = {gptj_in_all_bs};
for (auto gptj_in_all : ctxs) {
auto res = eval_gptj_char(
@@ -341,7 +355,7 @@ int main(int argc, char* argv[]) {
"out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple "
"HUD from the start, especially one that indicated each team's colors and general state of the game without "
"the need for zooming in and out. Development Development went fast.",
-128, 40, 1.0, 0.8, 2048);
+128, 40, 1.0f, 0.8f, 2048);
std::cout << res << std::endl;
exit_gptj(gptj_in_all);
delete[] res;
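
The 1.0 to 1.f and 0.8 to 0.8f edits in main() matter because the corresponding init_gptj and eval_gptj_char parameters are float: a double literal narrows implicitly at every call and draws conversion warnings such as MSVC C4244. A small sketch (takes_float is a hypothetical stand-in):

static void takes_float(float top_p, float temp) { (void)top_p; (void)temp; }

void call_site() {
  takes_float(1.0, 0.8);    // double literals: implicit narrowing at the call
  takes_float(1.0f, 0.8f);  // float literals: exact type match, no warning
}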
3 changes: 1 addition & 2 deletions neural_speed/application/whisper_pybind.cpp
@@ -42,7 +42,6 @@
#include <unistd.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
#include <signal.h>
#include <windows.h>
#endif
@@ -446,7 +445,7 @@ void Model::inference(const std::string& fname_inp) {
}

if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-fprintf(stderr, "%s: failed to process audio\n", fname_inp);
+fprintf(stderr, "%s: failed to process audio\n", fname_inp.c_str());
return;
}
}
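
The whisper fix above swaps fname_inp for fname_inp.c_str(): passing a non-trivially-copyable class object such as std::string through a C variadic function is undefined behavior, and %s requires a NUL-terminated char pointer. Sketch:

#include <cstdio>
#include <string>

void report_failure(const std::string& fname_inp) {
  // std::fprintf(stderr, "%s: ...\n", fname_inp);  // UB: %s needs a C string
  std::fprintf(stderr, "%s: failed to process audio\n", fname_inp.c_str());
}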
29 changes: 25 additions & 4 deletions neural_speed/cmake/Common.cmake
@@ -14,19 +14,40 @@

function(warning_check TARGET)
# TODO(hengyu): add warning check
-# if (MSVC)
+if (MSVC)
# target_compile_definitions(${TARGET} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
-# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
+target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/utf-8>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options /sdl>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:/sdl>")
-# else()
+
+# Use public to affect pybind targets
+target_compile_options(${TARGET} PUBLIC /wd4244 /wd4267) # possible loss of data
+target_compile_options(${TARGET} PUBLIC /wd4305) # truncation from 'double' to 'float'
+target_compile_options(${TARGET} PUBLIC /wd4018) # '>': signed/unsigned mismatch
+target_compile_options(${TARGET} PUBLIC /wd4334) # '<<': result of 32-bit shift implicitly converted to 64 bits
+
+# 'std::codecvt_utf8<wchar_t,1114111,(std::codecvt_mode)0>': warning STL4017: std::wbuffer_convert,
+# std::wstring_convert, and the <codecvt> header (containing std::codecvt_mode, std::codecvt_utf8,
+# std::codecvt_utf16, and std::codecvt_utf8_utf16) are deprecated in C++17. (The std::codecvt class template is NOT
+# deprecated.) The C++ Standard doesn't provide equivalent non-deprecated functionality; consider using
+# MultiByteToWideChar() and WideCharToMultiByte() from <Windows.h> instead. You can define
+# _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING or _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS to suppress this
+# warning.
+target_compile_definitions(${TARGET} PUBLIC _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+
+# Microsoft renamed some POSIX and Microsoft-specific library functions in the CRT to conform with C99 and C++03
+# constraints on reserved and global implementation-defined names. If you need to use the existing function names
+# for portability reasons, you can turn off these warnings. The functions are still available in the library under
+# their original names.
+target_compile_definitions(${TARGET} PUBLIC _CRT_NONSTDC_NO_WARNINGS)
+else()
# # Enable warning
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wall>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wall>")
# target_compile_options(${TARGET} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wextra>")
# if(NOT CMAKE_BUILD_TYPE MATCHES "[Dd]ebug")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Werror>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Werror>")
# target_compile_options(${TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:DPCPP>:SHELL:--compiler-options -Wno-error=deprecated-declarations>" "$<$<NOT:$<COMPILE_LANGUAGE:DPCPP>>:-Wno-error=deprecated-declarations>")
# endif()
-# endif()
+endif()
endfunction()

function(add_executable_w_warning TARGET)
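
For reference, the kinds of code that trigger the four MSVC warning numbers suppressed above (representative examples written for this note, not taken from the repository):

#include <cstdint>

void msvc_warning_examples() {
  int64_t wide = 1;
  int narrow = wide;           // C4244: conversion may lose data
  float f = 3.14159;           // C4305: truncation from 'double' to 'float'
  unsigned u = 10;
  int s = -1;
  if (s < u) {}                // C4018: signed/unsigned mismatch
  int sh = 20;
  int64_t shifted = 1 << sh;   // C4334: 32-bit shift implicitly converted to 64 bits
  (void)narrow; (void)f; (void)shifted;
}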
6 changes: 4 additions & 2 deletions neural_speed/core/ne_layers.c
@@ -3463,8 +3463,10 @@ static void ne_compute_forward_dump_tensor(const struct ne_compute_params* param
const int64_t ne03 = src0->ne[3];
const int64_t nr = ne_nrows(src0);

-fprintf(file, "Total element is %ld\n", ne_nelements(src0));
-fprintf(file, "ne[0] size is %ld ne[1] size is %ld ne[2] size is %ld ne[3] size is %ld \n", ne00, ne01, ne02, ne03);
+fprintf(file, "Total element is %" PRId64 "\n", ne_nelements(src0));
+fprintf(file,
+        "ne[0] size is %" PRId64 " ne[1] size is %" PRId64 " ne[2] size is %" PRId64 " ne[3] size is %" PRId64 " \n",
+        ne00, ne01, ne02, ne03);
switch (src0->type) {
case NE_TYPE_F32: {
for (int64_t ir = 0; ir < nr; ++ir) {
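
The dump-tensor fix swaps %ld for the PRId64 macro: ne_nelements and the ne[] dimensions are int64_t, which is long long rather than long on Windows, so %ld truncates there. The <inttypes.h> / <cinttypes> macros expand to the correct conversion specifier per platform, and the PRIu64 changes in scheduler.cpp later in this diff follow the same logic for uint64_t. Standalone sketch:

#include <cinttypes>
#include <cstdio>

int main() {
  int64_t total = 6442450944LL;  // deliberately larger than 32 bits
  // PRId64 expands to "ld" or "lld" as the platform requires.
  std::printf("Total element is %" PRId64 "\n", total);
  return 0;
}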
4 changes: 2 additions & 2 deletions neural_speed/models/baichuan/baichuan_utils.cpp
@@ -145,8 +145,8 @@ void BAICHUAN::load(model_context* ctx, model_progress_callback progress_callbac
layer.ffn[2] =
ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, uint32_t(model.hparams.inner_hidden_size)}, backend);

-layer.v_cache == nullptr;
-layer.k_cache == nullptr;
+layer.v_cache = nullptr;
+layer.k_cache = nullptr;
}

// print memory requirements
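
The baichuan change fixes a classic typo: == compares and discards the result instead of assigning, so k_cache and v_cache were never actually set here. Compilers can flag this pattern (MSVC C4553, Clang -Wunused-comparison). Minimal sketch:

struct LayerSketch {  // hypothetical stand-in for the model layer type
  void* k_cache;
  void* v_cache;
};

void reset_caches(LayerSketch& layer) {
  // layer.k_cache == nullptr;  // the old bug: a no-op comparison
  layer.k_cache = nullptr;      // the fix: an actual assignment
  layer.v_cache = nullptr;
}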
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/arg_parse.cpp
@@ -323,7 +323,7 @@ bool gpt_params_parse(int argc, char** argv, gpt_params& params) { // NOLINT
} else {
throw std::exception();
}
-} catch (const std::exception& e) {
+} catch (const std::exception&) {
invalid_param = true;
break;
}
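
Dropping the unused e from the catch clause silences unreferenced-variable diagnostics (for example MSVC C4101) without changing behavior, since an exception can be caught by type alone. Sketch:

#include <exception>

bool parse_arg(bool bad) {
  try {
    if (bad) throw std::exception();
    return true;
  } catch (const std::exception&) {  // unnamed: nothing left unreferenced
    return false;
  }
}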
8 changes: 4 additions & 4 deletions neural_speed/models/model_utils/model_files.h
@@ -1059,10 +1059,10 @@ struct model_file_loader {
char gguf_magic[4];
const size_t n = fread(&gguf_magic, 1, sizeof(gguf_magic), file.fp);
bool ok = true;
-ok = ok & gguf_magic[0] == 'G';
-ok = ok & gguf_magic[1] == 'G';
-ok = ok & gguf_magic[2] == 'U';
-ok = ok & gguf_magic[3] == 'F';
+ok &= gguf_magic[0] == 'G';
+ok &= gguf_magic[1] == 'G';
+ok &= gguf_magic[2] == 'U';
+ok &= gguf_magic[3] == 'F';

if (ok) {
model_magic = GGUF;
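
On the magic check: the old ok = ok & gguf_magic[0] == 'G' form did parse as intended, because == binds tighter than &, but it invites parenthesization warnings and reads poorly; ok &= makes the intent explicit. An equivalent and arguably simpler four-byte check (a sketch, not the project's API):

#include <cstring>

bool is_gguf_magic(const char gguf_magic[4]) {
  return std::memcmp(gguf_magic, "GGUF", 4) == 0;
}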
4 changes: 2 additions & 2 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -99,7 +99,7 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
const auto wtype_alloc = wtype == NE_TYPE_BTLA ? NE_TYPE_I8 : wtype;

if (model) { // non-null param of model for kv-cache as components of model->layers[il]
-for (int il = 0; il < hparams.n_layer; ++il) {
+for (int il = 0; il < n_layer; ++il) {
auto& k_cache = model->layers[il].k_cache;
auto& v_cache = model->layers[il].v_cache;
if (wtype == NE_TYPE_F16) { // chatglm does not support fp32 kv-cache in original impl of chatglm_util.cpp
@@ -2693,7 +2693,7 @@ bool beam_search_flow::step_update_beams_and_kv_cache() {
std::vector<beam_next_token> next_tokens =
beam_top_k_next_tokens(ctx, beams_score, num_beams, beam_indices, sample_scale);
if (next_tokens.size() != num_sample_k) {
-fprintf(stderr, "%s: error: sampled next tokens size is %ld which is not equal to %d.\n", __func__,
+fprintf(stderr, "%s: error: sampled next tokens size is %zu which is not equal to %d.\n", __func__,
next_tokens.size(), num_sample_k);
return false;
}
2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/model_utils.h
@@ -307,7 +307,7 @@ struct beam {
const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; }

void print() const {
printf("length: %ld, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
printf("length: %zu, score: %12.6f, eos: %d, request_idx: %d, beam_idx: %d, done: %d, tokens:\n", token_ids.size(),
score, eos(), request_idx, beam_idx, done);
for (const auto& id : token_ids) {
printf("%d: %s, ", id, model_token_to_str(ctx, id));
1 change: 0 additions & 1 deletion neural_speed/models/model_utils/quant_utils.cpp
@@ -422,7 +422,6 @@ void ne_common_quantize(const int nthread, const quant_params_internal& params,
}
printf("size = %8.2f MB -> %8.2f MB\n", tensor.size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);

-__WRITE_FILE:
size_org += tensor.size;
size_new += new_size;
saver.write_tensor(tensor, new_type, new_data, new_size);
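
The deleted __WRITE_FILE: label had no goto referring to it any longer, so it only produced unreferenced-label diagnostics (MSVC C4102, GCC/Clang -Wunused-label); identifiers starting with a double underscore are also reserved for the implementation. A sketch of when a label is and is not justified:

void quantize_and_write(bool skip_quant) {
  if (skip_quant) goto write_file;  // a label is justified only by a goto like this
  /* ... quantize the tensor ... */
write_file:
  /* ... write tensor data, as the code after the deleted label still does ... */
  return;
}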
20 changes: 11 additions & 9 deletions neural_speed/models/model_utils/scheduler.cpp
@@ -168,7 +168,8 @@ std::vector<sequence> Iter_level_scheduler::pop_completed_requests() {
return std::vector<sequence>();
}
if (log_level == 0) {
-fprintf(stdout, "%s: info: tokens generation time of sequence (query_id %lu, request_idx: %d) is %8.2fms.\n",
+fprintf(stdout,
+        "%s: info: tokens generation time of sequence (query_id %" PRIu64 ", request_idx: %d) is %8.2fms.\n",
__func__, ret_seqs[l].query_id, ret_seqs[l].request_idx,
(ret_seqs[l].end_time - ret_seqs[l].receive_time) / 1000.0);
}
@@ -213,7 +214,8 @@ bool Cont_batch_gen_scheduler::add_request(sequence seq) {
seq.status = seq_status::WAITING;
seq.request_idx = waiting_free_req_idx_seqs_num > 0 ? -1 : query_free_req_idx();
if (log_level == 0) {
-fprintf(stdout, "%s: info: added seq query_id: %lu, request_idx: %d \n", __func__, seq.query_id, seq.request_idx);
+fprintf(stdout, "%s: info: added seq query_id: %" PRIu64 ", request_idx: %d \n", __func__, seq.query_id,
+        seq.request_idx);
}
if (seq.request_idx == -1) waiting_free_req_idx_seqs_num++;
return waiting_pool.add(seq);
@@ -246,7 +248,7 @@ bool Cont_batch_gen_scheduler::prepare_seqs() {
}
executed_seqs[cur_running_num + np].request_idx = fidx;
if (log_level == 0) {
-fprintf(stdout, "%s: info: updated seq query_id: %lu, request_idx: %d \n", __func__,
+fprintf(stdout, "%s: info: updated seq query_id: %" PRIu64 ", request_idx: %d \n", __func__,
executed_seqs[cur_running_num + np].query_id, executed_seqs[cur_running_num + np].request_idx);
}
waiting_free_req_idx_seqs_num--;
@@ -320,15 +322,15 @@ bool Cont_batch_gen_scheduler::update_pools() {
finished_pool.add(executed_seqs[ns]);
free_req_idx[executed_seqs[ns].request_idx] = true;
if (log_level == 0) {
-fprintf(stdout, "%s: info: seq query_id: %lu, request_idx: %d finished.\n", __func__,
+fprintf(stdout, "%s: info: seq query_id: %" PRIu64 ", request_idx: %d finished.\n", __func__,
executed_seqs[ns].query_id, executed_seqs[ns].request_idx);
}
} else {
-fprintf(
-    stderr,
-    "%s: error: wrong seq status: %d of seq query_id: %lu, request_idx: %d, should be in DECODING OR FINISHED.\n",
-    __func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
-    executed_seqs[ns].request_idx);
+fprintf(stderr,
+        "%s: error: wrong seq status: %d of seq query_id: %" PRIu64
+        ", request_idx: %d, should be in DECODING OR FINISHED.\n",
+        __func__, static_cast<int>(executed_seqs[ns].status), executed_seqs[ns].query_id,
+        executed_seqs[ns].request_idx);
return false;
}
}