Skip to content

Commit

Permalink
llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
Browse files Browse the repository at this point in the history
* llama : enable mmap in quantize on Linux -> 31% faster

* also enable mmap on Windows

---------

Co-authored-by: Georgi Gerganov <[email protected]>
  • Loading branch information
cebtenzzre and ggerganov authored Sep 29, 2023
1 parent 0a4a4a0 commit 2777a84
Showing 1 changed file with 17 additions and 4 deletions.
21 changes: 17 additions & 4 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
nthread = std::thread::hardware_concurrency();
}

llama_model_loader ml(fname_inp, /*use_mmap*/ false);
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif

llama_model_loader ml(fname_inp, use_mmap);
if (ml.use_mmap) {
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
}

llama_model model;
llm_load_arch(ml, model);
Expand Down Expand Up @@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

const std::string name = ggml_get_name(tensor);

if (read_data.size() < ggml_nbytes(tensor)) {
read_data.resize(ggml_nbytes(tensor));
if (!ml.use_mmap) {
if (read_data.size() < ggml_nbytes(tensor)) {
read_data.resize(ggml_nbytes(tensor));
}
tensor->data = read_data.data();
}
tensor->data = read_data.data();
ml.load_data_for(tensor);

LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
Expand Down

0 comments on commit 2777a84

Please sign in to comment.