From 2ee2695abb4b5df6b358afe291b6d0b96e689516 Mon Sep 17 00:00:00 2001 From: "Ding, Yi1" Date: Thu, 18 Jan 2024 00:30:52 -0800 Subject: [PATCH] mha for beam search --- bestla/bestla/bestla_epilogue.h | 1 + neural_speed/core/layers/mha_dense.cpp | 316 +++++++++++++++++- neural_speed/core/layers/mha_dense.h | 2 + .../core/layers/ne_test_layers_utils.hpp | 2 +- neural_speed/core/ne_layers.c | 32 +- neural_speed/core/ne_layers.h | 15 + neural_speed/models/gptj/gptj.cpp | 73 ++-- 7 files changed, 380 insertions(+), 61 deletions(-) diff --git a/bestla/bestla/bestla_epilogue.h b/bestla/bestla/bestla_epilogue.h index f2228c22f..b2349013b 100644 --- a/bestla/bestla/bestla_epilogue.h +++ b/bestla/bestla/bestla_epilogue.h @@ -93,6 +93,7 @@ struct ParamAlphaBetaProcess { template class AlphaBetaProcessFp32 { public: + using DType = float; using Param = ParamAlphaBetaProcess; BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, diff --git a/neural_speed/core/layers/mha_dense.cpp b/neural_speed/core/layers/mha_dense.cpp index 5539bd3dd..6f9b717c3 100644 --- a/neural_speed/core/layers/mha_dense.cpp +++ b/neural_speed/core/layers/mha_dense.cpp @@ -80,6 +80,7 @@ struct attn_fwd_args_t { int step_k_bs, step_k_head_num, step_k_sl, step_k_head_size; int step_v_bs, step_v_head_num, step_v_sl, step_v_head_size; int step_dst_bs, step_dst_head_num, step_dst_sl; + int n_prompt; // caller grantees that K/V for first n_prompt tokens are identical among batches }; struct mha_problem_t { @@ -657,7 +658,7 @@ class mha_interface_t { const auto num_heads = p.batch_size * p.head_num; // Total number of heads device::CpuBase cb; // Note: DO NOT use cb.mNumThreads; use th.num_threads() instead - const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0; + const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0 && p.sl_q > 1; const bool is_alibi = (p.attn_flags & NE_ATTN_FLAG_IS_ALIBI8) != 0; assert(!is_causal || p.sl_q <= p.sl_kv); assert(("alibi not supported!", !is_alibi)); @@ -1338,7 +1339,7 @@ class mha_stable_interface_t { assert((p.V_layout != ATTN_FWD_LAYOUT_PLAIN || p.step_k_sl == 1)); const auto num_heads = p.batch_size * p.head_num; // Total number of heads device::CpuBase cb; // Note: DO NOT use cb.mNumThreads; use th.num_threads() instead - const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0; + const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0 && p.sl_q > 1; const bool is_alibi = (p.attn_flags & NE_ATTN_FLAG_IS_ALIBI8) != 0; assert(!is_causal || p.sl_q <= p.sl_kv); assert(("head_num must be a multiple of heads_kv!", p.head_num % p.heads_kv == 0)); @@ -1505,6 +1506,284 @@ class mha_stable_interface_t { return BTLA_CODE::Success; } + BTLA_CODE compute_beams(const attn_fwd_args_t& p, const parallel::IThreading& th) { + assert((std::is_same::value || p.Q_sc == 1)); + assert((std::is_same::value || p.K_sc == 1)); + assert((std::is_same::value || p.V_sc == 1)); + assert((std::is_same::value || p.dst_sc == 1)); + + assert((p.Q_layout == ATTN_FWD_LAYOUT_PLAIN && p.dst_layout == ATTN_FWD_LAYOUT_PLAIN)); + assert((p.K_layout == ATTN_FWD_LAYOUT_PLAIN || + (std::is_same::value && p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4) || + (std::is_same::value && p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2))); + assert((p.V_layout == ATTN_FWD_LAYOUT_PLAIN || + (std::is_same::value && p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4) || + (std::is_same::value && p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2))); + + 
assert((!std::is_same>::value) ||
+           p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4 ||
+           p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2);  // WeightForward can only be used with preprocessed layout
+    assert(
+        (!std::is_same>::value) ||
+        p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4 ||
+        p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2);  // WeightForward can only be used with preprocessed layout
+
+    assert((p.K_layout != ATTN_FWD_LAYOUT_PLAIN || p.step_v_head_size == 1));
+    assert((p.V_layout != ATTN_FWD_LAYOUT_PLAIN || p.step_k_sl == 1));
+    assert(p.sl_q == 1 && p.batch_size > 1);  // beam search next-token cases
+    device::CpuBase cb;  // Note: DO NOT use cb.mNumThreads; use th.num_threads() instead
+    const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0 && p.sl_q > 1;
+    const bool is_alibi = (p.attn_flags & NE_ATTN_FLAG_IS_ALIBI8) != 0;
+    assert(!is_causal || p.sl_q <= p.sl_kv);
+    assert(("head_num must be a multiple of heads_kv!", p.head_num % p.heads_kv == 0));
+    assert(("Not Implemented", !is_alibi));
+    assert(("Not Implemented", !is_causal));
+    const auto group_heads = p.head_num / p.heads_kv;
+    const auto sl_diff = p.sl_kv - p.sl_q;
+
+    // TP will need the real rank order of k
+    int32_t k_offset = 0;
+    int32_t log_head_num = p.head_num;
+#ifdef NS_TP_MODEL
+    NE_ASSERT(("Not implemented", false))
+#endif
+
+    // alibi slope
+    const int n_heads_log2_floor = 1 << static_cast<int>(floor(log2(log_head_num)));
+    const float m0 = powf(2.0f, -(8.f) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(8.f / 2.0f) / n_heads_log2_floor);
+
+    const auto m_tiles = updiv(p.batch_size, M_TILE);
+    assert(p.batch_size <= M_TILE && m_tiles == 1);
+    const auto num_tasks = p.head_num;
+
+    using Scheduler2D = bestla::parallel::Scheduler2D;
+    const Scheduler2D parl({th.num_threads(), {num_tasks, 1}, {1, 1}});  // main parallel scheduler
+
+    th.parallel_for([&](int tid) {
+      const int tmp_s_size = M_TILE * padto(padto(p.sl_kv, GemmQK::NTILE), GemmPV::KTILE);
+      const int tmp_p_size = tmp_s_size;
+      const int tmp_bytes = tmp_s_size * sizeof(float);  // S & exp
+      const auto tmp_s = reinterpret_cast<float*>(p.tmp + tid * tmp_bytes);
+      using PType = typename GemmPV::AType;
+      const auto tmp_p = reinterpret_cast<PType*>(tmp_s);  // overwrite tmp_s row-wise
+
+      // calculate mm + softmax + mm
+      {
+        typename parallel::ThreadProblem2D thdp{tid};
+        parl.getIndex(thdp);
+        const auto [task_start, _assert0] = thdp.loc;
+        auto [task_size, _assert_max1] = thdp.size;
+        assert(task_size == 0 || _assert0 == 0);
+        assert(task_size == 0 || _assert_max1 == 1 || _assert_max1 == 0);
+        if (_assert_max1 == 0 || !thdp.valid) task_size = 0;
+
+        for (int task_id = task_start; task_id < task_start + task_size; ++task_id) {
+          const int ihn = task_id;
+          const int ihkv = ihn / group_heads;
+
+          const auto alibi_ihn_m = 0.f;  // Alibi not implemented
+
+          float s_max[M_TILE]{};  // maximum for each row of the S matrix
+          std::fill_n(s_max, M_TILE, -INFINITY);
+
+          // ptr to Q / dst matrix of the current head
+          const auto head_q_bs0 = p.Q + ihn * p.step_q_head_num;
+          // const auto head_k = p.K + ibs * p.step_k_bs + ihkv * p.step_k_head_num;
+          // const auto head_v = p.V + ibs * p.step_v_bs + ihkv * p.step_v_head_num;
+          const auto head_k_bs0 = p.K + ihkv * p.step_k_head_num;  // bs here is beam
+          const auto head_v_bs0 = p.V + ihkv * p.step_v_head_num;  // bs here is beam
+          // const auto head_dst = p.dst + ibs * p.step_dst_bs + ihn * p.step_dst_head_num;
+          const auto head_dst_bs0 = p.dst + ihn * p.step_dst_head_num;
+
+          assert(!is_causal);
+          const auto
unmasked_size = p.sl_kv; + + const auto unmasked_size_pad_qk = std::min(p.sl_kv, padto(unmasked_size, GemmQK::NTILE)); + const auto unmasked_size_pad_pv = std::min(p.sl_kv, padto(unmasked_size, GemmPV::KTILE)); + const int ld_tmp_s = padto(padto(unmasked_size_pad_pv, GemmQK::NTILE), GemmPV::KTILE); + static_assert(sizeof(float) >= sizeof(PType), "PType exceeded float size!"); + const int ld_tmp_p = ld_tmp_s * sizeof(float) / sizeof(PType); + const auto qk_prok_ldb = p.step_k_sl == 1 ? p.step_k_head_size + : p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4 ? p.step_k_sl + : p.K_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2 ? p.step_k_sl + : (assert(0), 0); + + const auto n_prompt_le_n = padto_le(p.n_prompt, GemmQK::NTILE); + typename parallel::gemm::ThreadProblemBase tpQKBatch{ + /* ThreadProblem2D */ {tid, {}, {0, 0}, {p.batch_size, n_prompt_le_n}, true}, + /* .block = */ {M_TILE, GemmQK::NTILE, p.head_size}, + /* .stacksize = */ cb.mL2Cache, + /* .tmpcachesize = */ cb.mL2Cache, + }; + l_qk.run( // QxK => S ==exp==> P + QKArgs{ + utils::GemmProblem{ + /* .batch */ 1, + /* .M = */ p.batch_size, + /* .N = */ n_prompt_le_n, + /* .K = */ p.head_size, + }, + /* .paramA = */ + QKProQArgs{ + head_q_bs0, + p.step_q_bs, + }, + /* .paramB = */ + QKProKArgs{ + /* .B = */ head_k_bs0, + /* .ldb = */ qk_prok_ldb, + /* .is_padded = */ true, + }, // K should be pre-transposed + /* .paramC = */ + QKEpiArgs{ + /* .dst = */ tmp_s, + /* .dst_sum = */ s_max, + /* .ld_dst = */ ld_tmp_s, + /* .scale = */ p.QK_scale * p.Q_sc * p.K_sc, + /* .causal_offset = */ -1, + /* .alibi_slope = */ alibi_ihn_m, + }, + // /* .workspace = */ nullptr, + }, + tpQKBatch); + for (int ibs = 0; ibs < p.batch_size; ++ibs) { + typename parallel::gemm::ThreadProblemBase tpQKBeam{ + /* ThreadProblem2D */ {tid, {}, {ibs, n_prompt_le_n}, {1, p.sl_kv - n_prompt_le_n}, true}, + /* .block = */ {M_TILE, GemmQK::NTILE, p.head_size}, + /* .stacksize = */ cb.mL2Cache, + /* .tmpcachesize = */ cb.mL2Cache, + }; + l_qk.run( // QxK => S ==exp==> P + QKArgs{ + utils::GemmProblem{ + /* .batch */ 1, + /* .M = */ 1, + /* .N = */ p.sl_kv, + /* .K = */ p.head_size, + }, + /* .paramA = */ + QKProQArgs{ + head_q_bs0, + p.step_q_bs, + }, + /* .paramB = */ + QKProKArgs{ + /* .B = */ head_k_bs0 + ibs * p.step_k_bs, + /* .ldb = */ qk_prok_ldb, + /* .is_padded = */ true, + }, // K should be pre-transposed + /* .paramC = */ + QKEpiArgs{ + /* .dst = */ tmp_s, + /* .dst_sum = */ s_max, + /* .ld_dst = */ ld_tmp_s, + /* .scale = */ p.QK_scale * p.Q_sc * p.K_sc, + /* .causal_offset = */ is_causal ? sl_diff : -1, + /* .alibi_slope = */ alibi_ihn_m, + }, + // /* .workspace = */ nullptr, + }, + tpQKBeam); + } + + // softmax (with pre-computed row_max) + assert(!is_causal); + const auto unmasked_size_start = p.sl_kv; + float expsum[M_TILE]{}; // maximum for each row of the S matrix + const auto softmax_npad_size = padto(unmasked_size_pad_pv, GemmPV::KTILE); + inplace_precompute_max_softmax_t::forward( // + p.batch_size, unmasked_size_start, softmax_npad_size, // m / n + is_causal, tmp_s, tmp_p, s_max, expsum, ld_tmp_s, ld_tmp_p); // + + const auto pv_scale = expsum; + for (int i = 0; i < M_TILE; ++i) pv_scale[i] = p.V_sc / UINT8_MAX / expsum[i] / p.dst_sc; + + const auto pv_prov_ldb = p.step_v_head_size == 1 ? p.step_v_sl + : p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4 ? p.step_v_head_size + : p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2 ? 
p.step_v_head_size + : (assert(0), 0); + + const auto n_prompt_le_k = padto_le(p.n_prompt, GemmPV::KTILE); + typename parallel::gemm::ThreadProblemBase tpPVBatch{ + /* ThreadProblem2D */ {tid, {}, {0, 0}, {p.batch_size, p.head_size}, true}, + /* .block = */ {M_TILE, GemmPV::NTILE, n_prompt_le_k}, + /* .stacksize = */ cb.mL2Cache, + /* .tmpcachesize = */ cb.mL2Cache, + }; + l_pv.run( // PxV => O + PVArgs{ + utils::GemmProblem{ + /* .batch */ 1, + /* .M = */ p.batch_size, + /* .N = */ p.head_size, + /* .K = */ n_prompt_le_k, + }, + /* .paramA = */ PVProPArgs{tmp_p, ld_tmp_p}, + /* .paramB = */ + PVProVArgs{ + /* .B = */ head_v_bs0, + /* .ldb = */ pv_prov_ldb, + /* .is_padded = */ true, + }, + /* .paramC = */ + PVEpiArgs{ + /* .C = */ head_dst_bs0, + /* .D = */ head_dst_bs0, + /* .ldc = */ p.step_dst_bs, + /* .ldd = */ p.step_dst_bs, + /* .alpha = */ 1.f, + /* .beta = */ 0, + }, + // /* .workspace = */ nullptr, + }, + tpPVBatch); + for (int ibs = 0; ibs < p.batch_size; ++ibs) { + if constexpr (std::is_same_v::Param>) { + typename parallel::gemm::ThreadProblemBase tpPVBeam{ + /* ThreadProblem2D */ {tid, {}, {ibs, 0}, {1, p.head_size}, true}, + /* .block = */ {M_TILE, GemmPV::NTILE, unmasked_size_pad_pv - n_prompt_le_k}, + /* .stacksize = */ cb.mL2Cache, + /* .tmpcachesize = */ cb.mL2Cache, + }; + l_pv.run( // PxV => O + PVArgs{ + utils::GemmProblem{ + /* .batch */ 1, + /* .M = */ p.batch_size, + /* .N = */ p.head_size, + /* .K = */ unmasked_size_pad_pv - n_prompt_le_k, + }, + /* .paramA = */ PVProPArgs{tmp_p + n_prompt_le_k, ld_tmp_p}, + /* .paramB = */ + PVProVArgs{ + /* .B = */ head_v_bs0 + ibs * p.step_v_bs + n_prompt_le_k * GemmPV::NTILE, + /* .ldb = */ pv_prov_ldb, + /* .is_padded = */ true, + }, + /* .paramC = */ + PVEpiArgs{ + /* .C = */ head_dst_bs0, + /* .D = */ head_dst_bs0, + /* .ldc = */ p.step_dst_bs, + /* .ldd = */ p.step_dst_bs, + /* .alpha = */ 1.f, + /* .beta = */ 1.f, + }, + // /* .workspace = */ nullptr, + }, + tpPVBeam); + } else { + assert(("Not implemented", false)); + } + } + } + } + }); + return BTLA_CODE::Success; + } + protected: L_Max l_qk; L_Scale l_pv; @@ -1679,15 +1958,27 @@ void bestla_fusion_attn_forward(const attn_fwd_args_t< prologue_a::gemm::ActivationConverterFp32, // ::weight_forward_n_tile48_t, // ::ScaleTrackMaxFp32Fp32>; // - using GemmKernelBF16 = ::launcher_base_weight_t< // - BTLA_ISA::AMX_BF16, // - gemm::HCoreRowNAmxbf16<48, 16>, // - ::activation_identity_t, // pretty sure we have enough paddings for P-matrix - ::weight_forward_n_tile48_t, // - epilogue::gemm::AccumulatorWriteBackFp32>; // - static mha_stable_interface_t mha; - [[maybe_unused]] const auto ret = mha.compute(params, *pth); - assert(ret == BTLA_CODE::Success); + if (params.n_prompt > 0 && params.batch_size > 1) { // beam search optimization + using GemmKernelBF16 = ::launcher_base_weight_t< // + BTLA_ISA::AMX_BF16, // + gemm::HCoreRowNAmxbf16<48, 16>, // + ::activation_identity_t, // pretty sure we have enough paddings for P-matrix + ::weight_forward_n_tile48_t, // + epilogue::gemm::AlphaBetaProcessFp32>; // + static mha_stable_interface_t mha; + [[maybe_unused]] const auto ret = mha.compute_beams(params, *pth); + assert(ret == BTLA_CODE::Success); + } else { + using GemmKernelBF16 = ::launcher_base_weight_t< // + BTLA_ISA::AMX_BF16, // + gemm::HCoreRowNAmxbf16<48, 16>, // + ::activation_identity_t, // pretty sure we have enough paddings for P-matrix + ::weight_forward_n_tile48_t, // + epilogue::gemm::AccumulatorWriteBackFp32>; // + static mha_stable_interface_t mha; + 
[[maybe_unused]] const auto ret = mha.compute(params, *pth); + assert(ret == BTLA_CODE::Success); + } } else { assert(0); } @@ -1695,7 +1986,7 @@ void bestla_fusion_attn_forward(const attn_fwd_args_t< template void bestla_fusion_attn_forward_ref(const attn_fwd_args_t& p) { - const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0; + const bool is_causal = (p.attn_flags & NE_ATTN_FLAG_IS_CAUSAL) != 0 && p.sl_q > 1; const bool is_alibi = (p.attn_flags & NE_ATTN_FLAG_IS_ALIBI8) != 0; assert(!is_causal || p.sl_q <= p.sl_kv); assert(("head_num must be a multiple of heads_kv!", p.head_num % p.heads_kv == 0)); @@ -1937,6 +2228,7 @@ void bestla_reordered_attn_fp32_forward(const bestla_reordered_attn_fp32_fp32_fw /* .step_dst_bs = */ params->step_dst_bs, /* .step_dst_head_num = */ params->step_dst_head_num, /* .step_dst_sl = */ params->step_dst_sl, + /* .n_prompt = */ params->n_prompt, }; return bestla_fusion_attn_forward(bestla_params); } diff --git a/neural_speed/core/layers/mha_dense.h b/neural_speed/core/layers/mha_dense.h index af8581b5b..0683e2d72 100644 --- a/neural_speed/core/layers/mha_dense.h +++ b/neural_speed/core/layers/mha_dense.h @@ -80,6 +80,7 @@ typedef struct attn_fp32_fp16_fp16_fp32_fwd_args_t { int step_k_bs, step_k_head_num, step_k_sl, step_k_head_size; int step_v_bs, step_v_head_num, step_v_sl, step_v_head_size; int step_dst_bs, step_dst_head_num, step_dst_sl; + int n_prompt; // caller grantees that K/V for first n_prompt tokens are identical among batches } attn_fp32_fp16_fp16_fp32_fwd_args_t; void bestla_fusion_attn_bf16_forward(const attn_bf16_fwd_args_t* params); @@ -165,6 +166,7 @@ typedef struct bestla_reordered_attn_fp32_fp32_fwd_args_t { int stride_k_bs, stride_k_head_num, stride_k_sl, stride_k_head_size; int stride_v_bs, stride_v_head_num, stride_v_sl, stride_v_head_size; int step_dst_bs, step_dst_head_num, step_dst_sl; + int n_prompt; // caller grantees that K/V for first n_prompt tokens are identical among batches } bestla_reordered_attn_fp32_fp32_fwd_args_t; void bestla_reordered_attn_fp32_forward(const bestla_reordered_attn_fp32_fp32_fwd_args_t* params); diff --git a/neural_speed/core/layers/ne_test_layers_utils.hpp b/neural_speed/core/layers/ne_test_layers_utils.hpp index fce60c499..67027be07 100644 --- a/neural_speed/core/layers/ne_test_layers_utils.hpp +++ b/neural_speed/core/layers/ne_test_layers_utils.hpp @@ -19,7 +19,7 @@ #include #include -#include "bestla/jit_blas_utils.h" +#include "bestla/bestla_utils.h" #ifndef NS_TESTS static_assert(false, "Only include this header file for testing!"); diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index a6154bda2..400b81767 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -50,12 +50,6 @@ #include "ne.h" #include "ne_bestla.h" -// if C99 - static_assert is noop -// ref: https://stackoverflow.com/a/53923785/4039976 -#ifndef static_assert -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif - #if defined(_WIN32) #include @@ -3274,9 +3268,16 @@ struct ne_tensor* ne_conv_1d_ph(struct ne_context* ctx, struct ne_tensor* a, str } // ne_flash_attn - struct ne_tensor* ne_flash_attn(struct ne_context* ctx, struct ne_tensor* q, struct ne_tensor* k, struct ne_tensor* v, float scale, ne_attn_flags_t flags) { + const ne_attn_op_params_t attn_op_param = { + .flags = flags, + .scale = scale, + }; + return ne_flash_attn_with_params(ctx, q, k, v, &attn_op_param); +}; +struct ne_tensor* ne_flash_attn_with_params(struct ne_context* ctx, struct 
ne_tensor* q, struct ne_tensor* k, + struct ne_tensor* v, const ne_attn_op_params_t* op_params) { NE_ASSERT(ne_can_mul_mat(k, q)); int batch = q->ne[3]; int headnum = q->ne[2]; @@ -3303,8 +3304,7 @@ struct ne_tensor* ne_flash_attn(struct ne_context* ctx, struct ne_tensor* q, str result->src1 = k; result->opt[0] = v; result->opt[1] = tmp_t; - *(float*)result->padding = scale; - *(ne_attn_flags_t*)&result->padding[sizeof(scale)] = flags; + memcpy(result->op_params, op_params, sizeof(ne_attn_op_params_t)); return result; } @@ -8744,8 +8744,7 @@ static void ne_compute_forward_flash_attn_f32_f16_f16(const struct ne_compute_pa int step_v_head_size = v->nb[1] / veles; int step_v_head_num = v->nb[2] / veles; int step_v_bs = k->nb[3] / veles; - float scale = *(float*)dst->padding; - ne_attn_flags_t flags = *(bool*)&dst->padding[sizeof(scale)]; + const ne_attn_op_params_t* op_params = (ne_attn_op_params_t*)dst->op_params; attn_fp32_fp16_fp16_fp32_fwd_args_t args = { .Q = (float*)q->data, .K = (ne_fp16_t*)k->data, @@ -8756,8 +8755,8 @@ static void ne_compute_forward_flash_attn_f32_f16_f16(const struct ne_compute_pa .V_sc = 1.f, .dst_sc = 1.f, .tmp = tmp->data, - .QK_scale = scale, - .attn_flags = flags, + .QK_scale = op_params->scale, + .attn_flags = op_params->flags, .batch_size = batch, .head_num = headnum, .heads_kv = heads_kv, @@ -8782,6 +8781,7 @@ static void ne_compute_forward_flash_attn_f32_f16_f16(const struct ne_compute_pa .step_dst_bs = seq_cur * embedsize, .step_dst_head_num = headsize, .step_dst_sl = embedsize, + .n_prompt = op_params->n_prompt, }; bestla_fusion_attn_fp32_fp16_fp16_fp32_forward(&args); } @@ -8801,8 +8801,9 @@ static void ne_compute_forward_flash_attn_reordered(const struct ne_compute_para const int64_t dst_ele_size = ne_element_size(dst); // const int64_t seq_past = seq_all - seq_cur; - float scale = *(float*)dst->padding; - ne_attn_flags_t flags = *(ne_attn_flags_t*)&dst->padding[sizeof(scale)]; + const ne_attn_op_params_t* op_params = (ne_attn_op_params_t*)dst->op_params; + float scale = op_params->scale; + ne_attn_flags_t flags = op_params->flags; NE_ASSERT(k->type == NE_TYPE_BTLA && v->type == NE_TYPE_BTLA); ATTN_FWD_LAYOUT K_layout = *(ATTN_FWD_LAYOUT*)(&k->nb[0]); @@ -8848,6 +8849,7 @@ static void ne_compute_forward_flash_attn_reordered(const struct ne_compute_para .step_dst_bs = dst->nb[3] / dst_ele_size, .step_dst_head_num = dst->nb[1] / dst_ele_size, .step_dst_sl = dst->nb[2] / dst_ele_size, + .n_prompt = op_params->n_prompt, }; bestla_reordered_attn_fp32_forward(&args); } diff --git a/neural_speed/core/ne_layers.h b/neural_speed/core/ne_layers.h index 032283696..ad1c3b5aa 100644 --- a/neural_speed/core/ne_layers.h +++ b/neural_speed/core/ne_layers.h @@ -37,6 +37,12 @@ #include "core/data_types.h" #include "layers/layers.h" +// if C99 - static_assert is noop +// ref: https://stackoverflow.com/a/53923785/4039976 +#if !defined(static_assert) && (!defined(__cplusplus) || __cplusplus < 201103L) +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif + #define NE_QNT_VERSION 2 // bump this on quantization format changes #define NE_QNT_VERSION_FACTOR 1000 // do not change this @@ -69,6 +75,13 @@ typedef enum NE_ATTN_FLAG { } NE_ATTN_FLAG; typedef uint32_t ne_attn_flags_t; +typedef struct ne_attn_op_params_t { + ne_attn_flags_t flags; + float scale; + int n_prompt; +} ne_attn_op_params_t; +static_assert(sizeof(ne_attn_op_params_t) <= NE_MAX_OP_PARAMS, "ATTN OP PARAM too large!"); + // convert FP16 <-> FP32 NE_API float ne_fp16_to_fp32(ne_fp16_t x); 
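A minimal usage sketch of the ne_attn_op_params_t struct added above together with ne_flash_attn_with_params (declared further down in this header); it mirrors the gptj.cpp hunk at the end of this patch. The variables ctx0, q, k, v, head_size and n_prompt are assumed to exist in the caller's graph-building code:

    // Sketch only; ctx0/q/k/v/head_size/n_prompt are assumed from the surrounding caller code.
    const float attn_scale = 1.0f / sqrtf(static_cast<float>(head_size));
    const ne_attn_op_params_t attn_op_param = {
        /* .flags = */ NE_ATTN_FLAG_IS_CAUSAL,  // same flag values as ne_flash_attn
        /* .scale = */ attn_scale,
        /* .n_prompt = */ n_prompt,  // > 0 with batch_size > 1 selects the shared-prompt beam path; 0 keeps the regular path
    };
    struct ne_tensor* attn_out = ne_flash_attn_with_params(ctx0, q, k, v, &attn_op_param);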
NE_API ne_fp16_t ne_fp32_to_fp16(float x); @@ -440,6 +453,8 @@ NE_API struct ne_tensor* ne_conv_1d_ph(struct ne_context* ctx, struct ne_tensor* NE_API struct ne_tensor* ne_flash_attn(struct ne_context* ctx, struct ne_tensor* q, struct ne_tensor* k, struct ne_tensor* v, float scale, ne_attn_flags_t flags); +NE_API struct ne_tensor* ne_flash_attn_with_params(struct ne_context* ctx, struct ne_tensor* q, struct ne_tensor* k, + struct ne_tensor* v, const ne_attn_op_params_t* op_params); // set no_zeroing to true to prevent zeroing unaligned seq NE_API struct ne_tensor* ne_flash_attn_update_k(struct ne_context* ctx, struct ne_tensor* cache, struct ne_tensor* cur, int n_past, bool no_zeroing); diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp index 49709975b..1eeef540d 100644 --- a/neural_speed/models/gptj/gptj.cpp +++ b/neural_speed/models/gptj/gptj.cpp @@ -208,10 +208,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu if (lctx.cont_batching) { size_t off_sl = 0; // per_request rope - for (int gi = 0; gi < infer_groups.size(); ++gi) { - const int qk_bs = infer_groups[gi].size(); - const int qk_sl = n_tokens[infer_groups[gi].front()]; - const int qk_n_past = n_pasts[infer_groups[gi].front()]; + + for (const auto& curr_group : infer_groups) { + const int qk_bs = curr_group.size(); + const int qk_sl = n_tokens[curr_group.front()]; + const int qk_n_past = n_pasts[curr_group.front()]; struct ne_tensor* Qcur_req = ne_view_4d(ctx0, Qcur, head_size, n_head, qk_sl, qk_bs, ne_element_size(Qcur) * head_size, ne_element_size(Qcur) * head_size * n_head, ne_element_size(Qcur) * head_size * n_head * qk_sl, @@ -313,11 +314,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu const auto k_size = kv_cache_info.k_bytes; const auto v_size = kv_cache_info.v_bytes; size_t off_sl = 0; - for (int gi = 0; gi < infer_groups.size(); ++gi) { - const int update_bs = infer_groups[gi].size(); - const int update_sl = n_tokens[infer_groups[gi].front()]; - const int update_block_id = block_ids[infer_groups[gi].front()]; - const int update_n_past = n_pasts[infer_groups[gi].front()]; + for (const auto& curr_group : infer_groups) { + const int update_bs = curr_group.size(); + const int update_sl = n_tokens[curr_group.front()]; + const int update_block_id = block_ids[curr_group.front()]; + const int update_n_past = n_pasts[curr_group.front()]; struct ne_tensor* k_cache_g = ne_view_4d(ctx0, kv_self.k, // tensor head_size, n_ctx, n_head, update_bs, // ne 0, 0, k_size, // nb (bestla managed) @@ -345,20 +346,19 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu struct ne_tensor* KQV_merged_contiguous = ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC); size_t off_sl = 0; - for (int gi = 0; gi < infer_groups.size(); ++gi) { - const int attn_bs = infer_groups[gi].size(); - const int attn_sl = n_tokens[infer_groups[gi].front()]; - const int attn_block_id = block_ids[infer_groups[gi].front()]; - const int attn_n_past = n_pasts[infer_groups[gi].front()]; - const int attn_n_total = n_totals[infer_groups[gi].front()]; + for (const auto& curr_group : infer_groups) { + const int attn_bs = curr_group.size(); + const int attn_sl = n_tokens[curr_group.front()]; + const int attn_block_id = block_ids[curr_group.front()]; + const int attn_n_past = n_pasts[curr_group.front()]; + const int attn_n_total = n_totals[curr_group.front()]; struct ne_tensor* Q = ne_permute(ctx0, 
ne_view_4d(ctx0, Qcur, head_size, n_head, attn_sl, attn_bs, ne_element_size(Qcur) * head_size, ne_element_size(Qcur) * head_size * n_head, ne_element_size(Qcur) * head_size * n_head * attn_sl, off_sl * ne_element_size(Qcur)), 0, 2, 1, 3); - std::string suffix = std::to_string(gi); - ne_set_name(Q, std::string("Q_" + suffix).c_str()); + ne_set_name(Q, "Q"); struct ne_tensor *K, *V; const int n_cached_gi = shift_roped_k ? n_cached : attn_n_past + attn_sl; if (run_mha_reordered) { @@ -412,7 +412,7 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu } } else { std::vector attn_block_ids; - for (const auto& bsi : infer_groups[gi]) { + for (const auto& bsi : curr_group) { attn_block_ids.push_back(block_ids[bsi]); } K = model_kv_cache_seq_concat(&gf, &lctx, ctx0, head_size, n_cached_gi, n_head, attn_bs, attn_block_ids, il); @@ -431,20 +431,27 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu V = model_kv_cache_seq_concat(&gf, &lctx, ctx0, n_cached_gi, head_size, n_head, attn_bs, attn_block_ids, il, false); } - ne_set_name(K, std::string("K_" + suffix).c_str()); - ne_set_name(V, std::string("V_" + suffix).c_str()); + ne_set_name(K, "K"); + ne_set_name(V, "V"); struct ne_tensor* KQV_merged_gi; const float attn_scale = 1.0f / sqrtf(static_cast(head_size)); ne_attn_flags_t attn_flags = NE_ATTN_FLAG_NONE; +#ifndef NDEBUG + for (const auto bi : curr_group) { + NE_ASSERT(inputs[bi].n_prompt_tokens == inputs[curr_group[0]].n_prompt_tokens); + } +#endif + const int n_prompt = curr_group.size() == 1 ? 0 : inputs[curr_group[0]].n_prompt_tokens; if (attn_n_total == 0 || !shift_roped_k) attn_flags |= NE_ATTN_FLAG_IS_CAUSAL; // no causal mask on next-token cases - if (run_mha_reordered) { // reordered kv-cache bf16 mha must be used if run_mha_reordered - struct ne_tensor* KQV_Out = ne_flash_attn(ctx0, Q, K, V, attn_scale, attn_flags); + const auto attn_op_param = ne_attn_op_params_t{attn_flags, attn_scale, n_prompt}; + if (run_mha_reordered) { // reordered kv-cache bf16 mha must be used if run_mha_reordered + struct ne_tensor* KQV_Out = ne_flash_attn_with_params(ctx0, Q, K, V, &attn_op_param); KQV_merged_gi = ne_view_2d(ctx0, KQV_Out, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_Out), 0); } else if (run_mha_fp16) { // non-reordered kv-cache fp16 mha - struct ne_tensor* KQV_Out = ne_flash_attn(ctx0, Q, K, V, attn_scale, attn_flags); + struct ne_tensor* KQV_Out = ne_flash_attn_with_params(ctx0, Q, K, V, &attn_op_param); KQV_merged_gi = ne_view_2d(ctx0, KQV_Out, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_Out), 0); } else if (attn_n_total == 0 && run_mha_bf16_first) { @@ -462,37 +469,37 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu } else { // K * Q struct ne_tensor* KQ = ne_mul_mat(ctx0, K, Q); - ne_set_name(KQ, std::string("KQ_" + suffix).c_str()); + ne_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) struct ne_tensor* KQ_scale = ne_new_f32(ctx0, attn_scale); - ne_set_name(KQ_scale, std::string("1/sqrt(n_embd/n_head)_" + suffix).c_str()); + ne_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); // KQ_scaled shape [n_cached, N, n_head, 1] struct ne_tensor* KQ_scaled = ne_scale_inplace(ctx0, KQ, KQ_scale); - ne_set_name(KQ_scaled, std::string("KQ_scaled_" + suffix).c_str()); + ne_set_name(KQ_scaled, "KQ_scaled"); // KQ_scaled = mask_past(KQ_scaled) if (attn_n_total == 0 || !shift_roped_k || !no_padding) { - std::vector 
attn_n_padding(infer_groups[gi].size(), 0); - for (int npa = 0; !n_padding.empty() && npa < infer_groups[gi].size(); ++npa) { - attn_n_padding[npa] = n_padding[infer_groups[gi][npa]]; + std::vector attn_n_padding(curr_group.size(), 0); + for (int npa = 0; !n_padding.empty() && npa < curr_group.size(); ++npa) { + attn_n_padding[npa] = n_padding[curr_group[npa]]; } KQ_scaled = ne_diag_mask_inf_with_padding_inplace(ctx0, KQ_scaled, attn_n_past, attn_n_padding.data()); - ne_set_name(KQ_scaled, std::string("KQ_masked_" + suffix).c_str()); + ne_set_name(KQ_scaled, "KQ_masked"); } // KQ = soft_max(KQ_masked) struct ne_tensor* KQ_soft_max = ne_soft_max_inplace(ctx0, KQ_scaled); - ne_set_name(KQ_soft_max, std::string("KQ_soft_max_" + suffix).c_str()); + ne_set_name(KQ_soft_max, "KQ_soft_max"); struct ne_tensor* KQV = ne_mul_mat(ctx0, V, KQ_soft_max); - ne_set_name(KQV, std::string("KQV_" + suffix).c_str()); + ne_set_name(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) KQV_merged_gi = ne_permute(ctx0, KQV, 0, 2, 1, 3); } - ne_set_name(KQV_merged_gi, std::string("KQV_merged_" + suffix).c_str()); + ne_set_name(KQV_merged_gi, "KQV_merged"); ne_build_forward_expand(&gf, ne_cpy(ctx0, KQV_merged_gi, ne_view_2d(ctx0, KQV_merged_contiguous, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_merged_contiguous),
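To summarize the kernel change driving this diff: because the caller guarantees that the first n_prompt K/V tokens are identical across beams, compute_beams issues one batched Q·K GEMM for all beams against the shared prompt K (rounded down to a tile multiple), plus a small per-beam GEMM over the remaining KV positions; the P·V step is split the same way, with the shared-prompt GEMM writing the output (beta = 0) and the per-beam GEMMs accumulating into it through the AlphaBetaProcessFp32 epilogue (alpha = 1, beta = 1). Below is a scalar sketch of the Q·K split with illustrative names only, not the BesTLA kernel:

#include <cstddef>
#include <vector>

// q:        [n_beam][head_size]                   (sl_q == 1, one query token per beam)
// k_shared: [n_prompt][head_size]                 (prompt K, identical for every beam)
// k_beam:   [n_beam][sl_kv - n_prompt][head_size] (per-beam K generated after the prompt)
// returns attention scores s: [n_beam][sl_kv]
std::vector<std::vector<float>> qk_scores_beam_split(const std::vector<std::vector<float>>& q,
                                                     const std::vector<std::vector<float>>& k_shared,
                                                     const std::vector<std::vector<std::vector<float>>>& k_beam,
                                                     float scale) {
  const std::size_t n_beam = q.size();
  const std::size_t head_size = n_beam ? q[0].size() : 0;
  const std::size_t n_prompt = k_shared.size();
  const std::size_t sl_kv = n_prompt + (k_beam.empty() ? 0 : k_beam[0].size());
  std::vector<std::vector<float>> s(n_beam, std::vector<float>(sl_kv, 0.f));
  for (std::size_t ib = 0; ib < n_beam; ++ib) {
    for (std::size_t j = 0; j < n_prompt; ++j)  // shared-prompt part: one batched GEMM in the real kernel
      for (std::size_t d = 0; d < head_size; ++d) s[ib][j] += scale * q[ib][d] * k_shared[j][d];
    for (std::size_t j = n_prompt; j < sl_kv; ++j)  // per-beam tail: K differs between beams here
      for (std::size_t d = 0; d < head_size; ++d) s[ib][j] += scale * q[ib][d] * k_beam[ib][j - n_prompt][d];
  }
  return s;
}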