This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

add ffn silu support assert
Signed-off-by: intellinjun <[email protected]>
intellinjun committed Mar 1, 2024
1 parent affe69c commit 590d65b
Showing 2 changed files with 27 additions and 24 deletions.
46 changes: 23 additions & 23 deletions neural_speed/core/ne_layers.c
@@ -7159,29 +7159,28 @@ static void ne_compute_forward_mul_mat_id_f32(const struct ne_compute_params* pa
const int n_as = dst->op_params[1];
// char * wdata_src1_end = (char *)params->wdata;
// int64_t wdata_src1_end = 0;
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

if (params->type == NE_TASK_INIT) {
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
NE_ASSERT(row_id >= 0 && row_id < n_as);
mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}

return;
}

if (params->type == NE_TASK_FINALIZE) {
return;
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
NE_ASSERT(row_id >= 0 && row_id < n_as);
mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
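
For readers following the hunk above: the relocated matrix_rows / matrix_row_counts setup buckets src1 rows by the expert that the ids tensor selects for them, with the NE_ASSERT guarding the expert index. A minimal standalone sketch of that bookkeeping (simplified types and illustrative parameters, not the library code):

#include <stdint.h>
#include <string.h>

/* Sketch: route each src1 row i01 to the expert row_id chosen in ids.
 * matrix_rows[row_id * ne11 + k] stores the k-th row assigned to that expert,
 * matrix_row_counts[row_id] counts how many rows the expert received. */
static void bucket_rows_by_expert(const int32_t* ids, int64_t ne11, int64_t n_as,
                                  int64_t* matrix_row_counts, /* [n_as] */
                                  int64_t* matrix_rows) {     /* [n_as * ne11] */
  memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
  memset(matrix_rows, -1, n_as * ne11 * sizeof(int64_t));
  for (int64_t i01 = 0; i01 < ne11; i01++) {
    const int32_t row_id = ids[i01];
    /* the real code asserts: NE_ASSERT(row_id >= 0 && row_id < n_as) */
    if (row_id < 0 || row_id >= n_as) continue;
    matrix_rows[(int64_t)row_id * ne11 + matrix_row_counts[row_id]] = i01;
    matrix_row_counts[row_id] += 1;
  }
}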
@@ -7292,22 +7291,11 @@ static void ne_compute_forward_mul_mat_id_f16_f32(const struct ne_compute_params
const int n_as = dst->op_params[1];
// char * wdata_src1_end = (char *)params->wdata;
// int64_t wdata_src1_end = 0;
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

if (params->type == NE_TASK_INIT) {
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
NE_ASSERT(row_id >= 0 && row_id < n_as);
mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}
ne_fp16_t* const wdata = params->wdata;

size_t id = 0;
@@ -7330,6 +7318,17 @@
if (params->type == NE_TASK_FINALIZE) {
return;
}
int64_t matrix_row_counts[100]; // [n_as]
int64_t matrix_rows[30000]; // [n_as][ne11]
#define mmid_matrix_row(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
memset(matrix_rows, -1, 30000 * sizeof(int64_t));
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t*)((const char*)ids->data + i01 * ids->nb[1] + id * ids->nb[0]);
NE_ASSERT(row_id >= 0 && row_id < n_as);
mmid_matrix_row(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
@@ -11342,6 +11341,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) {
}
if (wei->type == NE_TYPE_BTLA) {
cur = bestla_f32f32_get_workspace_size(node->src1->ne[1], wei->ne[1], node->src1->ne[0], wei->data);
node->n_tasks = 1;
} else if (wei->type == NE_TYPE_F16 && node->src1->type == NE_TYPE_F32) {
cur = NE_TYPE_SIZE[NE_TYPE_F16] * ne_nelements(node->src1);
} else if (wei->type == NE_TYPE_F32 && node->src1->type == NE_TYPE_F32) {
5 changes: 4 additions & 1 deletion neural_speed/models/llama/llama.cpp
@@ -379,7 +379,10 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp

for (int i = 0; i < n_expert_used; ++i) {
ne_tensor* cur_expert;
if (N == 1 && ctx0, model.layers[il].ffn_down_exp[0]->type == NE_TYPE_BTLA) {
if (N == 1 && bestla_fusion_FFN_SiLu_f32f32_support(
                  model.layers[il].ffn_gate_exp[0]->data, model.layers[il].ffn_down_exp[0]->data,
                  model.layers[il].ffn_up_exp[0]->data, N, cur->ne[0],
                  model.layers[il].ffn_gate_exp[0]->ne[1], model.layers[il].ffn_down_exp[0]->ne[1])) {
      cur_expert = ne_mul_id_ffn_silu(ctx0, model.layers[il].ffn_down_exp, model.layers[il].ffn_gate_exp,
                                      model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
    } else {

Review thread on the bestla_fusion_FFN_SiLu_f32f32_support check:

luoyu-intel (Contributor) commented on Mar 1, 2024:
Why FFN fusion? You should check matmul+silu fusion.

intellinjun (Author, Contributor) replied on Mar 2, 2024:
I fused SiLU with the three mul_mat_id ops, and ne_mul_id_ffn_silu uses the ne_mul_ffn_silu fusion.
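
For context on the fusion being discussed: per expert, the SiLU-gated FFN computes out = W_down * (silu(W_gate * x) * (W_up * x)), with an elementwise product between the SiLU-activated gate projection and the up projection; the fused ne_mul_id_ffn_silu path evaluates this in one call instead of separate mul_mat_id, silu, and mul ops. A minimal reference sketch of that math in plain C, with illustrative names only (not the neural_speed API):

#include <math.h>
#include <stddef.h>

/* y = W * x for a row-major [rows x cols] matrix. */
static void matvec(const float* W, const float* x, float* y, size_t rows, size_t cols) {
  for (size_t r = 0; r < rows; r++) {
    float acc = 0.0f;
    for (size_t c = 0; c < cols; c++) acc += W[r * cols + c] * x[c];
    y[r] = acc;
  }
}

/* One expert's SiLU-gated FFN: out = W_down * (silu(W_gate * x) * (W_up * x)).
 * W_gate, W_up are [d_ff x d_model], W_down is [d_model x d_ff];
 * gate_buf and up_buf are caller-provided scratch of size d_ff. */
static void ffn_silu_reference(const float* W_gate, const float* W_up, const float* W_down,
                               const float* x, float* out, size_t d_model, size_t d_ff,
                               float* gate_buf, float* up_buf) {
  matvec(W_gate, x, gate_buf, d_ff, d_model);
  matvec(W_up, x, up_buf, d_ff, d_model);
  for (size_t i = 0; i < d_ff; i++) {
    const float g = gate_buf[i];
    gate_buf[i] = (g / (1.0f + expf(-g))) * up_buf[i]; /* silu(gate) times up, elementwise */
  }
  matvec(W_down, gate_buf, out, d_model, d_ff);
}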

