From 590d65bf6a00d22326449dbddbeb2ba79b5a746b Mon Sep 17 00:00:00 2001
From: intellinjun
Date: Fri, 1 Mar 2024 16:50:42 +0800
Subject: [PATCH] add ffn silu support assert

Signed-off-by: intellinjun
---
 neural_speed/core/ne_layers.c       | 46 ++++++++++++++---------------
 neural_speed/models/llama/llama.cpp |  5 +++-
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c
index de22ae8b5..791c94076 100644
--- a/neural_speed/core/ne_layers.c
+++ b/neural_speed/core/ne_layers.c
@@ -7159,29 +7159,28 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa
   const int n_as = dst->op_params[1];
   // char * wdata_src1_end = (char *)params->wdata;
   // int64_t wdata_src1_end = 0;
-  int64_t matrix_row_counts[100];  // [n_as]
-  int64_t matrix_rows[30000];      // [n_as][ne11]
-#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   // nb01 >= nb00 - src0 is not transposed
   // compute by src0 rows

   if (params->type == NE_TASK_INIT) {
-    memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
-    memset(matrix_rows, -1, 30000 * sizeof(int64_t));
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-      const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
-      NE_ASSERT(row_id >= 0 && row_id < n_as);
-      mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
-      matrix_row_counts[row_id] += 1;
-    }
-
     return;
   }

   if (params->type == NE_TASK_FINALIZE) {
     return;
   }
+  int64_t matrix_row_counts[100];  // [n_as]
+  int64_t matrix_rows[30000];      // [n_as][ne11]
+#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+  memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
+  memset(matrix_rows, -1, 30000 * sizeof(int64_t));
+  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+    const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
+    NE_ASSERT(row_id >= 0 && row_id < n_as);
+    mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
+    matrix_row_counts[row_id] += 1;
+  }
   for (int cur_a = 0; cur_a < n_as; ++cur_a) {
     const int64_t cne1 = matrix_row_counts[cur_a];
     if (cne1 == 0) {
@@ -7292,22 +7291,11 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
   const int n_as = dst->op_params[1];
   // char * wdata_src1_end = (char *)params->wdata;
   // int64_t wdata_src1_end = 0;
-  int64_t matrix_row_counts[100];  // [n_as]
-  int64_t matrix_rows[30000];      // [n_as][ne11]
-#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   // nb01 >= nb00 - src0 is not transposed
   // compute by src0 rows

   if (params->type == NE_TASK_INIT) {
-    memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
-    memset(matrix_rows, -1, 30000 * sizeof(int64_t));
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-      const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
-      NE_ASSERT(row_id >= 0 && row_id < n_as);
-      mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
-      matrix_row_counts[row_id] += 1;
-    }

     ne_fp16_t* const wdata = params->wdata;
     size_t id = 0;
@@ -7330,6 +7318,17 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
   if (params->type == NE_TASK_FINALIZE) {
     return;
   }
+  int64_t matrix_row_counts[100];  // [n_as]
+  int64_t matrix_rows[30000];      // [n_as][ne11]
+#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+  memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
+  memset(matrix_rows, -1, 30000 * sizeof(int64_t));
+  for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+    const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
+    NE_ASSERT(row_id >= 0 && row_id < n_as);
+    mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
+    matrix_row_counts[row_id] += 1;
+  }
   for (int cur_a = 0; cur_a < n_as; ++cur_a) {
     const int64_t cne1 = matrix_row_counts[cur_a];
     if (cne1 == 0) {
@@ -11342,6 +11341,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) {
        }
        if (wei->type == NE_TYPE_BTLA) {
          cur = bestla_f32f32_get_workspace_size(node->src1->ne[1], wei->ne[1], node->src1->ne[0], wei->data);
+          node->n_tasks = 1;
        } else if (wei->type == NE_TYPE_F16 && node->src1->type == NE_TYPE_F32) {
          cur = NE_TYPE_SIZE[NE_TYPE_F16] * ne_nelements(node->src1);
        } else if (wei->type == NE_TYPE_F32 && node->src1->type == NE_TYPE_F32) {
diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index ac4ccbac0..41aedf08d 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -379,7 +379,10 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp

      for (int i = 0; i < n_expert_used; ++i) {
        ne_tensor* cur_expert;
-        if (N == 1 && ctx0, model.layers[il].ffn_down_exp[0]->type == NE_TYPE_BTLA) {
+        if (N == 1 && bestla_fusion_FFN_SiLu_f32f32_support(
+                          model.layers[il].ffn_gate_exp[0]->data, model.layers[il].ffn_down_exp[0]->data,
+                          model.layers[il].ffn_up_exp[0]->data, N, cur->ne[0],
+                          model.layers[il].ffn_gate_exp[0]->ne[1], model.layers[il].ffn_down_exp[0]->ne[1])) {
          cur_expert = ne_mul_id_ffn_silu(ctx0, model.layers[il].ffn_down_exp, model.layers[il].ffn_gate_exp,
                                          model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
        } else {