From 590d65bf6a00d22326449dbddbeb2ba79b5a746b Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Fri, 1 Mar 2024 16:50:42 +0800
Subject: [PATCH] add ffn silu support assert

Signed-off-by: intellinjun
---
 neural_speed/core/ne_layers.c       | 46 ++++++++++++++---------------
 neural_speed/models/llama/llama.cpp |  5 +++-
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c
index de22ae8b5..791c94076 100644
--- a/neural_speed/core/ne_layers.c
+++ b/neural_speed/core/ne_layers.c
@@ -7159,29 +7159,28 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa
   const int n_as = dst->op_params[1];
   // char * wdata_src1_end = (char *)params->wdata;
   // int64_t wdata_src1_end = 0;
-  int64_t matrix_row_counts[100];  // [n_as]
-  int64_t matrix_rows[30000];      // [n_as][ne11]
-#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   // nb01 >= nb00 - src0 is not transposed
   // compute by src0 rows

   if (params->type == NE_TASK_INIT) {
-    memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
-    memset(matrix_rows, -1, 30000 * sizeof(int64_t));
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-      const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
-      NE_ASSERT(row_id >= 0 && row_id < n_as);
-      mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
-      matrix_row_counts[row_id] += 1;
-    }
-
     return;
   }

   if (params->type == NE_TASK_FINALIZE) {
     return;
   }
+  int64_t matrix_row_counts[100];  // [n_as]
+  int64_t matrix_rows[30000];      // [n_as][ne11]
+#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+  memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
+  memset(matrix_rows, -1, 30000 * sizeof(int64_t));
+  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+    const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
+    NE_ASSERT(row_id >= 0 && row_id < n_as);
+    mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
+    matrix_row_counts[row_id] += 1;
+  }
   for (int cur_a = 0; cur_a < n_as; ++cur_a) {
     const int64_t cne1 = matrix_row_counts[cur_a];
     if (cne1 == 0) {
@@ -7292,22 +7291,11 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
   const int n_as = dst->op_params[1];
   // char * wdata_src1_end = (char *)params->wdata;
   // int64_t wdata_src1_end = 0;
-  int64_t matrix_row_counts[100];  // [n_as]
-  int64_t matrix_rows[30000];      // [n_as][ne11]
-#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   // nb01 >= nb00 - src0 is not transposed
   // compute by src0 rows

   if (params->type == NE_TASK_INIT) {
-    memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
-    memset(matrix_rows, -1, 30000 * sizeof(int64_t));
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-      const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
-      NE_ASSERT(row_id >= 0 && row_id < n_as);
-      mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
-      matrix_row_counts[row_id] += 1;
-    }

     ne_fp16_t* const wdata = params->wdata;
     size_t id = 0;
@@ -7330,6 +7318,17 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
   if (params->type == NE_TASK_FINALIZE) {
     return;
   }
+  int64_t matrix_row_counts[100];  // [n_as]
+  int64_t matrix_rows[30000];      // [n_as][ne11]
+#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+  memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
+  memset(matrix_rows, -1, 30000 * sizeof(int64_t));
+  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+    const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
+    NE_ASSERT(row_id >= 0 && row_id < n_as);
+    mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
+    matrix_row_counts[row_id] += 1;
+  }
   for (int cur_a = 0; cur_a < n_as; ++cur_a) {
     const int64_t cne1 = matrix_row_counts[cur_a];
     if (cne1 == 0) {
@@ -11342,6 +11341,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) {
        }
        if (wei->type == NE_TYPE_BTLA) {
          cur = bestla_f32f32_get_workspace_size(node->src1->ne[1], wei->ne[1], node->src1->ne[0], wei->data);
+          node->n_tasks = 1;
        } else if (wei->type == NE_TYPE_F16 && node->src1->type == NE_TYPE_F32) {
          cur = NE_TYPE_SIZE[NE_TYPE_F16] * ne_nelements(node->src1);
        } else if (wei->type == NE_TYPE_F32 && node->src1->type == NE_TYPE_F32) {
diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index ac4ccbac0..41aedf08d 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -379,7 +379,10 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp

      for (int i = 0; i < n_expert_used; ++i) {
        ne_tensor* cur_expert;
-        if (N == 1 && ctx0, model.layers[il].ffn_down_exp[0]->type == NE_TYPE_BTLA) {
+        if (N == 1 && bestla_fusion_FFN_SiLu_f32f32_support(
+                          model.layers[il].ffn_gate_exp[0]->data, model.layers[il].ffn_down_exp[0]->data,
+                          model.layers[il].ffn_up_exp[0]->data, N, cur->ne[0],
+                          model.layers[il].ffn_gate_exp[0]->ne[1], model.layers[il].ffn_down_exp[0]->ne[1])) {
          cur_expert = ne_mul_id_ffn_silu(ctx0, model.layers[il].ffn_down_exp, model.layers[il].ffn_gate_exp,
                                          model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
        } else {