Commit 72d11a9

add mem enlarge scale for gptj

Signed-off-by: Yu, Zhentao <[email protected]>
zhentaoyu committed Jan 23, 2024
1 parent eb639e4

Showing 7 changed files with 37 additions and 21 deletions.
20 changes: 13 additions & 7 deletions neural_speed/application/main_pybind.cpp
@@ -82,7 +82,8 @@ void init_gpt_params(gpt_params* params, const std::string& model_path, int max_
float temperature = 0.8, int min_new_tokens = 0, float length_penalty = 1.0f,
bool early_stopping = false, int n_keep = 0, int n_discard = -1, bool shift_roped_k = false,
int batch_size = 1, model_vocab::id pad_token = -1, const std::string& memory_dtype = "auto",
const bool& continuous_batching = false, const int& max_request_num = MODEL_MAX_REQUEST_NUM) {
const bool& continuous_batching = false, const int& max_request_num = MODEL_MAX_REQUEST_NUM,
const int& model_scratch_enlarge_scale = 1) {
MODEL_ASSERT(params != nullptr);
#ifdef MODEL_NAME
params->model_name = MODEL_NAME;
@@ -121,6 +122,7 @@ void init_gpt_params(gpt_params* params, const std::string& model_path, int max_
params->min_new_tokens = min_new_tokens;
params->length_penalty = length_penalty;
params->do_early_stopping = early_stopping;
params->model_scratch_enlarge_scale = model_scratch_enlarge_scale;

printf(
"beam_size: %d, do_sample: %d, top_k: %d, top_p: %f, continuous_batching: %d, max_request_num: %d, "
@@ -136,7 +138,8 @@ class ModelServer {
int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping,
int n_keep, int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num,
const std::string& policy, const bool& print_log, const std::function<void()>& init_cb)
const int& model_scratch_enlarge_scale, const std::string& policy, const bool& print_log,
const std::function<void()>& init_cb)
: response(response),
waiting(),
running(true),
@@ -154,7 +157,7 @@ class ModelServer {
this->InitServerParams(model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty,
num_beams, do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty,
early_stopping, n_keep, n_discard, shift_roped_k, batch_size, pad_token, memory_dtype,
true, max_request_num);
true, max_request_num, model_scratch_enlarge_scale);
cbg_scheduler scheduler(this->params, policy, print_log ? 0 : 1);
std::vector<sequence> added_seqs;
while (running) {
@@ -255,11 +258,12 @@ class ModelServer {
int threads, float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p,
float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep,
int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num) {
const std::string& memory_dtype, const bool& continuous_batching, const int& max_request_num,
const int& model_scratch_enlarge_scale) {
init_gpt_params(&params, model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty,
num_beams, do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty, early_stopping,
n_keep, n_discard, shift_roped_k, batch_size, pad_token, memory_dtype, continuous_batching,
max_request_num);
max_request_num, model_scratch_enlarge_scale);
}

~ModelServer() {
@@ -913,15 +917,17 @@ PYBIND11_MODULE(qwen_cpp, m)
py::class_<ModelServer>(m, "ModelServer", py::module_local())
.def(py::init<const ResponseCallback&, const std::string&, bool, int, int, int, int, int, float, int, bool, int,
float, float, int, float, bool, int, int, bool, int, model_vocab::id, const std::string&,
const bool&, const int&, const std::string&, const bool&, const std::function<void()>&>(),
const bool&, const int&, const int&, const std::string&, const bool&,
const std::function<void()>&>(),
py::arg("response"), py::arg("model_path"), py::arg("return_prompt") = false, py::arg("max_new_tokens") = -1,
py::arg("n_batch") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1, py::arg("threads") = 8,
py::arg("repetition_penalty") = 1.1f, py::arg("num_beams") = 1, py::arg("do_sample") = false,
py::arg("top_k") = 40, py::arg("top_p") = 0.95, py::arg("temperature") = 0.8, py::arg("min_new_tokens") = 0,
py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false, py::arg("n_keep") = 0,
py::arg("n_discard") = -1, py::arg("shift_roped_k") = false, py::arg("batch_size") = 1,
py::arg("pad_token") = -1, py::arg("memory_dtype") = "auto", py::arg("continuous_batching") = true,
py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM, py::arg("policy") = "fcfs", py::arg("print_log") = false,
py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM, py::arg("model_scratch_enlarge_scale") = 1,
py::arg("policy") = "fcfs", py::arg("print_log") = false,
py::arg("init_cb") = std::function<void()>{[]() {}})
.def("issueQuery", &ModelServer::issueQuery, "desc placeholder", py::arg("qs"))
.def("Empty", &ModelServer::Empty, "No more queries to execute");
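For reference, the binding change above follows a simple pattern: add a constructor argument with a default value so existing Python call sites keep working. Below is a minimal pybind11 sketch of that pattern; the ScratchDemo class and scratch_demo module are hypothetical stand-ins, not part of neural_speed.

#include <pybind11/pybind11.h>
namespace py = pybind11;

// Toy stand-in for ModelServer that stores only the new scratch-enlarge scale.
class ScratchDemo {
 public:
  explicit ScratchDemo(int model_scratch_enlarge_scale = 1) : scale_(model_scratch_enlarge_scale) {}
  int scale() const { return scale_; }

 private:
  int scale_;
};

PYBIND11_MODULE(scratch_demo, m) {
  // A defaulted py::arg keeps old call sites valid: ScratchDemo() behaves like
  // ScratchDemo(model_scratch_enlarge_scale=1).
  py::class_<ScratchDemo>(m, "ScratchDemo")
      .def(py::init<int>(), py::arg("model_scratch_enlarge_scale") = 1)
      .def("scale", &ScratchDemo::scale);
}

In the real binding the new argument is placed before policy, print_log, and init_cb so the py::arg list mirrors the C++ constructor order.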
4 changes: 2 additions & 2 deletions neural_speed/models/gptj/gptj.h
@@ -26,11 +26,11 @@ enum gptj_model {
GPTJ_65B,
};

static const model_scratch gptj_mem_req(int n_layers) {
static const model_scratch gptj_mem_req(int n_layers, int enlarge_scale = 1) {
switch (n_layers) {
case 28:
// should be enough for batch=8 * beam=4
return {3072ull * MB, 2048ull * MB, 3072ull * MB};
return {3072ull * enlarge_scale * MB, 2048ull * enlarge_scale * MB, 3072ull * enlarge_scale * MB};
default:
MODEL_ASSERT(false);
}
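The enlarge scale simply multiplies each scratch buffer size for the 28-layer (GPT-J 6B) configuration. Here is a self-contained sketch of the arithmetic, assuming hypothetical field names for model_scratch:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

constexpr uint64_t MB = 1024ull * 1024ull;

// Hypothetical field names; the real model_scratch struct lives in neural_speed's model headers.
struct scratch_sizes {
  uint64_t buf0, buf1, buf2;
};

static scratch_sizes gptj_mem_req_sketch(int n_layers, int enlarge_scale = 1) {
  switch (n_layers) {
    case 28:  // GPT-J 6B; the baseline is sized for roughly batch=8 * beam=4
      return {3072ull * enlarge_scale * MB, 2048ull * enlarge_scale * MB, 3072ull * enlarge_scale * MB};
    default:
      return {0, 0, 0};  // the real code asserts on unsupported layer counts
  }
}

int main() {
  for (int scale : {1, 2, 4}) {
    scratch_sizes s = gptj_mem_req_sketch(28, scale);
    std::printf("scale=%d -> %" PRIu64 " + %" PRIu64 " + %" PRIu64 " MB\n",
                scale, s.buf0 / MB, s.buf1 / MB, s.buf2 / MB);
  }
  return 0;
}

At scale=1 the three buffers total 8192 MB; scale=2 doubles that to 16384 MB, which is the headroom larger batch or beam settings may need.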
2 changes: 1 addition & 1 deletion neural_speed/models/gptj/gptj_utils.cpp
@@ -75,7 +75,7 @@ void GPTJ::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
scratch = gptj_mem_req(n_layer);
scratch = gptj_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
model.scratchs = scratch;
}

2 changes: 2 additions & 0 deletions neural_speed/models/model_utils/model_config.h
@@ -102,6 +102,8 @@ struct gpt_params {
uint32_t min_new_tokens = 0; // min new tokens for beam search generation
float length_penalty = 1.0f; // exponential penalty to the length in beam search generation
bool do_early_stopping = false; // early stopping in beam search generation

int model_scratch_enlarge_scale = 1; // model memory scratch enlarge scale
};

bool gpt_params_parse(int argc, char** argv, gpt_params& params);
4 changes: 4 additions & 0 deletions neural_speed/models/model_utils/model_types.h
@@ -302,6 +302,8 @@ struct model_context {

size_t mem_per_token = 0;

int model_scratch_enlarge_scale = 1; // model memory scratch enlarge scale

// decode output (3-dimensional array: [batch_size] [n_tokens] [n_vocab])
std::vector<float> logits;
bool logits_all = false;
@@ -420,6 +422,8 @@ struct model_context_params {
int max_request_num; // maximum num of bearable requests in current env
// global generation config
generation_config gen_conf;
// model memory scratch enlarge scale
int model_scratch_enlarge_scale;

// called with a progress value between 0 and 1, pass nullptr to disable
model_progress_callback progress_callback;
3 changes: 3 additions & 0 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -188,6 +188,7 @@ struct model_context_params model_context_default_params() {
/*cont_batching =*/false,
/*.max_request_num =*/1,
/*.gen_conf =*/generation_config(),
/*model_scratch_enlarge_scale =*/1,
/*.progress_callback =*/nullptr,
/*.progress_callback_user_data =*/nullptr,
};
@@ -910,6 +911,7 @@ struct model_context* model_init_from_file(const char* path_model, struct model_
}
ctx->cont_batching = params.cont_batching;
ctx->generation_conf = params.gen_conf;
ctx->model_scratch_enlarge_scale = params.model_scratch_enlarge_scale;
const model_archs arch = params.arch;

// the type so that kv-cache allocated according to this type must be large enough
@@ -1282,6 +1284,7 @@ struct model_context* model_init_from_gpt_params(const gpt_params& params) {
lparams.gen_conf.min_new_tokens = params.min_new_tokens;
lparams.gen_conf.length_penalty = params.length_penalty;
lparams.gen_conf.do_early_stopping = params.do_early_stopping;
lparams.model_scratch_enlarge_scale = params.model_scratch_enlarge_scale;

NE_ASSERT(("Start size cannot be greater than the maximun context size!", lparams.n_keep < lparams.n_ctx));

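Taken together, the remaining changes just thread one integer through the configuration layers: gpt_params -> model_context_params -> model_context, where GPTJ::init finally reads it. A simplified sketch of that flow, using trimmed stand-in structs with only the new field shown:

// Stand-ins for gpt_params, model_context_params and model_context.
struct gpt_params_lite { int model_scratch_enlarge_scale = 1; };
struct model_context_params_lite { int model_scratch_enlarge_scale = 1; };
struct model_context_lite { int model_scratch_enlarge_scale = 1; };

// Mirrors model_init_from_gpt_params: copy the user-facing setting into the context params.
model_context_params_lite make_context_params(const gpt_params_lite& params) {
  model_context_params_lite lparams{};
  lparams.model_scratch_enlarge_scale = params.model_scratch_enlarge_scale;
  return lparams;
}

// Mirrors model_init_from_file: copy the setting onto the context, which
// GPTJ::init later passes to gptj_mem_req(n_layer, ...).
model_context_lite make_context(const model_context_params_lite& lparams) {
  model_context_lite ctx{};
  ctx.model_scratch_enlarge_scale = lparams.model_scratch_enlarge_scale;
  return ctx;
}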
23 changes: 12 additions & 11 deletions scripts/python_api_example_for_model_server.py
@@ -36,17 +36,18 @@ def f_response(res, working):

model_path = "/home/zhentao/ils/ns/q4_j.bin" # please set your corresponding local neural_speed low-bits model file
added_count = 0
s = cpp.ModelServer(f_response, # reponse function (deliver generation results and current reamin working size in server)
model_path, # model_path
max_new_tokens=128, # global query max generation token length
num_beams=4, # global beam search related generation parameters
min_new_tokens=30, # global beam search related generation parameters (default: 0)
early_stopping=True, # global beam search related generation parameters (default: False)
continuous_batching=True, # turn on continuous batching mechanism (default: True)
return_prompt=True, # also return prompt token ids in generation results (default: False)
threads=56, # number of threads in model evaluate process (please bind cores if need)
max_request_num=8, # maximum number of running requests (or queries, default: 8)
print_log=True, # print server running logs (default: False)
s = cpp.ModelServer(f_response, # response function (delivers generation results and the current remaining working size in the server)
model_path, # model_path
max_new_tokens=128, # global query max generation token length
num_beams=4, # global beam-search-related generation parameter
min_new_tokens=30, # global beam-search-related generation parameter (default: 0)
early_stopping=True, # global beam-search-related generation parameter (default: False)
continuous_batching=True, # turn on the continuous batching mechanism (default: True)
return_prompt=True, # also return prompt token ids in generation results (default: False)
threads=56, # number of threads for model evaluation (please bind cores if needed)
max_request_num=8, # maximum number of running requests (or queries, default: 8)
print_log=True, # print server running logs (default: False)
model_scratch_enlarge_scale=1, # model memory scratch enlarge scale (default: 1)
)
for i in range(len(prompts)):
p_token_ids = tokenizer(prompts[i], return_tensors='pt').input_ids.tolist()
