gguf refactor

intel · Jan 9, 2024 · 428d4c7 · 428d4c7
1 parent af60e9a
commit 428d4c7
Show file tree

Hide file tree

Showing 2 changed files with 195 additions and 163 deletions.
diff --git a/neural_speed/application/main_run.cpp b/neural_speed/application/main_run.cpp
@@ -404,10 +404,11 @@ int main(int argc, char** argv) {  // NOLINT
   const float mirostat_eta = params.mirostat_eta;
   const bool penalize_nl = params.penalize_nl;
   model_token id = 0;
-
-  if (params.warmup) {
+  if (ns_log_level() >= 0 && params.warmup) {
+    // The warmup phase is used to generate static objects (e.g. JIT kernels).
+    int constexpr WarmUpPromptLen = 32;
     {
-      const std::vector<model_token> tmp(32, ctx->vocab.bos_token_id);
+      const std::vector<model_token> tmp(WarmUpPromptLen, ctx->vocab.bos_token_id);
       std::vector<model_input> inputs = {model_input{
           /*.tokens              =*/tmp.data(),
           /*.n_tokens           =*/(uint32_t)tmp.size(),
@@ -430,8 +431,8 @@ int main(int argc, char** argv) {  // NOLINT
           /*.tokens              =*/tmp.data(),
           /*.n_tokens           =*/(uint32_t)tmp.size(),
           /*.n_prompt_tokens    =*/0,
-          /*.n_past             =*/(uint32_t)(params.n_predict - 1),
-          /*.n_total            =*/(uint32_t)(params.n_predict - 1),
+          /*.n_past             =*/WarmUpPromptLen,
+          /*.n_total            =*/WarmUpPromptLen,
           /*.request_idx        =*/0,
           /*.beam_idx           =*/0,
           /*.padding_side       =*/0,
@@ -771,4 +772,4 @@ int main(int argc, char** argv) {  // NOLINT
   model_free(ctx);
 
   return 0;
-}  // NOLINT
+}  // NOLINT