diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c
index 49541ae56..4245e4879 100644
--- a/neural_speed/core/ne_layers.c
+++ b/neural_speed/core/ne_layers.c
@@ -1131,44 +1131,38 @@ struct ne_tensor* ne_new_tensor_impl(struct ne_context* ctx, enum ne_type type,
   return result;
 }
 
-struct ne_tensor* ne_new_tensor(struct ne_context* ctx, enum ne_type type, int n_dims, const int64_t* ne, size_t size) {
-  return ne_new_tensor_impl(ctx, type, n_dims, ne, NULL, size, NE_BACKEND_CPU);
+struct ne_tensor* ne_new_tensor(struct ne_context* ctx, enum ne_type type, int n_dims, const int64_t* ne, size_t size,
+                                enum ne_backend bk) {
+  return ne_new_tensor_impl(ctx, type, n_dims, ne, NULL, size, bk);
 }
 
-struct ne_tensor* ne_new_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size) {
-  return ne_new_tensor(ctx, type, 1, &ne0, size);
+struct ne_tensor* ne_new_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size,
+                                   enum ne_backend bk) {
+  return ne_new_tensor(ctx, type, 1, &ne0, size, bk);
 }
 
-struct ne_tensor* ne_new_tensor_2d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1, size_t size) {
+struct ne_tensor* ne_new_tensor_2d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1, size_t size,
+                                   enum ne_backend bk) {
   const int64_t ne[2] = {ne0, ne1};
-  return ne_new_tensor(ctx, type, 2, ne, size);
+  return ne_new_tensor(ctx, type, 2, ne, size, bk);
 }
 
 struct ne_tensor* ne_new_tensor_3d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1, int64_t ne2,
-                                   size_t size) {
+                                   size_t size, enum ne_backend bk) {
   const int64_t ne[3] = {ne0, ne1, ne2};
-  return ne_new_tensor(ctx, type, 3, ne, size);
+  return ne_new_tensor(ctx, type, 3, ne, size, bk);
 }
 
 struct ne_tensor* ne_new_tensor_4d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1, int64_t ne2,
-                                   int64_t ne3, size_t size) {
+                                   int64_t ne3, size_t size, enum ne_backend bk) {
   const int64_t ne[4] = {ne0, ne1, ne2, ne3};
-  return ne_new_tensor(ctx, type, 4, ne, size);
-}
-
-struct ne_tensor* ne_new_dev_tensor(struct ne_context* ctx, enum ne_type type, int n_dims, const int64_t* ne,
-                                    size_t size) {
-  return ne_new_tensor_impl(ctx, type, n_dims, ne, NULL, size, NE_BACKEND_SYCL);
-}
-
-struct ne_tensor* ne_new_dev_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size) {
-  return ne_new_dev_tensor(ctx, type, 1, &ne0, size);
+  return ne_new_tensor(ctx, type, 4, ne, size, bk);
 }
 
 struct ne_tensor* ne_new_i32(struct ne_context* ctx, int32_t value) {
   ne_scratch_save(ctx);
 
-  struct ne_tensor* result = ne_new_tensor_1d(ctx, NE_TYPE_I32, 1, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor_1d(ctx, NE_TYPE_I32, 1, NE_SIZE_CALC, NE_BACKEND_CPU);
 
   ne_scratch_load(ctx);
 
@@ -1180,7 +1174,7 @@ struct ne_tensor* ne_new_i32(struct ne_context* ctx, int32_t value) {
 struct ne_tensor* ne_new_f32(struct ne_context* ctx, float value) {
   ne_scratch_save(ctx);
 
-  struct ne_tensor* result = ne_new_tensor_1d(ctx, NE_TYPE_F32, 1, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor_1d(ctx, NE_TYPE_F32, 1, NE_SIZE_CALC, NE_BACKEND_CPU);
 
   ne_scratch_load(ctx);
 
@@ -1564,7 +1558,7 @@ struct ne_tensor* ne_acc_impl(struct ne_context* ctx, struct ne_tensor* a, struc
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* c = ne_new_tensor_1d(ctx, NE_TYPE_I32, 5, NE_SIZE_CALC);
+  struct ne_tensor* c = ne_new_tensor_1d(ctx, NE_TYPE_I32, 5, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)c->data)[0] = nb1;
   ((int32_t*)c->data)[1] = nb2;
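Note on the hunk above: every ne_new_tensor* entry point now takes an explicit enum ne_backend, and the SYCL-only ne_new_dev_tensor/ne_new_dev_tensor_1d wrappers are removed in favour of passing NE_BACKEND_SYCL directly. A minimal call-site sketch (not part of the patch; it assumes an initialized struct ne_context* obtained from the existing API, and the sizes are made up):

static void alloc_example(struct ne_context* ctx) {
  // previously, ne_new_tensor_1d(ctx, NE_TYPE_F32, 1024, NE_SIZE_CALC) implied the CPU backend,
  // and device tensors needed the separate ne_new_dev_tensor_1d() helper.
  struct ne_tensor* t_cpu = ne_new_tensor_1d(ctx, NE_TYPE_F32, 1024, NE_SIZE_CALC, NE_BACKEND_CPU);
  struct ne_tensor* t_dev = ne_new_tensor_1d(ctx, NE_TYPE_I32, 1024, NE_SIZE_CALC, NE_BACKEND_SYCL);
  (void)t_cpu;  // placeholders: a real caller would build graph ops on these tensors
  (void)t_dev;
}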
@@ -1908,7 +1902,7 @@ struct ne_tensor* ne_sum(struct ne_context* ctx, struct ne_tensor* a) {
     is_node = true;
   }
 
-  struct ne_tensor* result = ne_new_tensor_1d(ctx, a->type, 1, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor_1d(ctx, a->type, 1, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_SUM;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -1932,7 +1926,7 @@ struct ne_tensor* ne_sum_rows(struct ne_context* ctx, struct ne_tensor* a) {
     ne[i] = a->ne[i];
   }
 
-  struct ne_tensor* result = ne_new_tensor(ctx, a->type, a->n_dims, ne, a->size);
+  struct ne_tensor* result = ne_new_tensor(ctx, a->type, a->n_dims, ne, a->size, a->backend);
 
   result->op = NE_OP_SUM_ROWS;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -1953,7 +1947,7 @@ struct ne_tensor* ne_mean(struct ne_context* ctx, struct ne_tensor* a) {
   }
 
   int64_t ne[NE_MAX_DIMS] = {1, a->ne[1], a->ne[2], a->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, a->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, a->n_dims, ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_MEAN;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -1978,7 +1972,7 @@ struct ne_tensor* ne_repeat(struct ne_context* ctx, struct ne_tensor* a, struct
     return a;
   }
 
-  struct ne_tensor* result = ne_new_tensor(ctx, a->type, b->n_dims, b->ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, a->type, b->n_dims, b->ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_REPEAT;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2255,7 +2249,7 @@ struct ne_tensor* ne_mul_mat(struct ne_context* ctx, struct ne_tensor* a, struct
   }
 
   const int64_t ne[4] = {a->ne[1], b->ne[1], b->ne[2], b->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, MAX(a->n_dims, b->n_dims), ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, MAX(a->n_dims, b->n_dims), ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_MUL_MAT;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2279,7 +2273,7 @@ struct ne_tensor* ne_mul_mat_with_bias(struct ne_context* ctx, struct ne_tensor*
   }
 
   const int64_t ne[4] = {w->ne[1], a->ne[1], w->ne[2], a->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, a->type, MIN(w->n_dims, a->n_dims), ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, a->type, MIN(w->n_dims, a->n_dims), ne, NE_SIZE_CALC, w->backend);
 
   result->op = NE_OP_MUL_MAT_BIAS;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
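The recurring change in these operator constructors is that the output (and any temporaries) is allocated on the backend of one of its operands instead of implicitly on the CPU: a->backend for element-wise and reduction ops, w->backend (and as[0]->backend, q->backend further below) where a weight or query operand decides placement. A condensed, hypothetical constructor showing the convention (ne_example_unary is not part of the patch):

struct ne_tensor* ne_example_unary(struct ne_context* ctx, struct ne_tensor* a) {
  // the result inherits the type, shape, and backend of its operand
  struct ne_tensor* result = ne_new_tensor(ctx, a->type, a->n_dims, a->ne, NE_SIZE_CALC, a->backend);
  result->src0 = a;   // a real op would also set result->op and result->grad here
  result->src1 = NULL;
  return result;
}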
@@ -2305,7 +2299,8 @@ struct ne_tensor* ne_mul_mat_id(struct ne_context* ctx, struct ne_tensor* const
   }
 
   const int64_t ne[4] = {as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne, NE_SIZE_CALC);
+  struct ne_tensor* result =
+      ne_new_tensor(ctx, NE_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne, NE_SIZE_CALC, as[0]->backend);
   int params[] = {id, n_as};
   ne_set_op_params(result, &params, sizeof(params));
   result->op = NE_OP_MUL_MAT_ID;
@@ -2345,10 +2340,10 @@ struct ne_tensor* ne_mul_id_ffn_silu(struct ne_context* ctx, struct ne_tensor* c
     is_node = true;
   }
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w2->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
-  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
+  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
   int params[] = {id, n_as};
   ne_set_op_params(result, &params, sizeof(params));
   result->op = NE_OP_MUL_ID_FFN_SILU;
@@ -2390,10 +2385,10 @@ struct ne_tensor* ne_mul_id_ffn_gelu(struct ne_context* ctx, struct ne_tensor* c
     is_node = true;
   }
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w2->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
-  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
+  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
   int params[] = {id, n_as};
   ne_set_op_params(result, &params, sizeof(params));
   result->op = NE_OP_MUL_ID_FFN_GELU;
@@ -2416,7 +2411,7 @@ struct ne_tensor* ne_mul_id_ffn_gelu(struct ne_context* ctx, struct ne_tensor* c
 struct ne_tensor* ne_argsort(struct ne_context* ctx, struct ne_tensor* a) {
   bool is_node = false;
 
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_I32, NE_MAX_DIMS, a->ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_I32, NE_MAX_DIMS, a->ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_ARGSORT;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2455,7 +2450,8 @@ struct ne_tensor* ne_mul_qkv(struct ne_context* ctx, struct ne_tensor* qw, struc
   }
 
   const int64_t ne[4] = {qw->ne[1], src->ne[1], src->ne[2] * 3, src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, MIN(src->n_dims, qw->n_dims), ne, NE_SIZE_CALC);
+  struct ne_tensor* result =
+      ne_new_tensor(ctx, NE_TYPE_F32, MIN(src->n_dims, qw->n_dims), ne, NE_SIZE_CALC, qw->backend);
 
   result->op = NE_OP_MUL_QKV;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2481,10 +2477,10 @@ struct ne_tensor* ne_ffn_silu(struct ne_context* ctx, struct ne_tensor* w1, stru
   }
 
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w1->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
-  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
+  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
 
   result->op = NE_OP_MUL_FFN_SILU;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2508,9 +2504,9 @@ struct ne_tensor* ne_ffn_add_gelu(struct ne_context* ctx, struct ne_tensor* w1,
   }
 
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w2->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
 
   result->op = NE_OP_MUL_FFN_ADD_GELU;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2534,9 +2530,9 @@ struct ne_tensor* ne_ffn_gelu(struct ne_context* ctx, struct ne_tensor* w1, stru
   }
 
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w2->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
 
   result->op = NE_OP_MUL_FFN_GELU;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2559,10 +2555,10 @@ struct ne_tensor* ne_ffn_gelu_mul(struct ne_context* ctx, struct ne_tensor* w1,
   }
 
   const int64_t ne[4] = {w2->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, ne, NE_SIZE_CALC, w2->backend);
   const int64_t tne[4] = {w1->ne[1], src->ne[1], src->ne[2], src->ne[3]};
-  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
-  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC);
+  struct ne_tensor* tmp = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
+  struct ne_tensor* tmp1 = ne_new_tensor(ctx, NE_TYPE_F32, src->n_dims, tne, NE_SIZE_CALC, w1->backend);
 
   result->op = NE_OP_MUL_FFN_GELU_MUL;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -2621,7 +2617,7 @@ struct ne_tensor* ne_set_impl(struct ne_context* ctx, struct ne_tensor* a, struc
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* c = ne_new_tensor_1d(ctx, NE_TYPE_I32, 5, NE_SIZE_CALC);
+  struct ne_tensor* c = ne_new_tensor_1d(ctx, NE_TYPE_I32, 5, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)c->data)[0] = nb1;
   ((int32_t*)c->data)[1] = nb2;
@@ -3052,7 +3048,8 @@ struct ne_tensor* ne_get_rows(struct ne_context* ctx, struct ne_tensor* a, struc
   }
 
   // TODO: implement non F32 return
   // struct ne_tensor * result = ne_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-  struct ne_tensor* result = ne_new_tensor_4d(ctx, NE_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2], NE_SIZE_CALC);
+  struct ne_tensor* result =
+      ne_new_tensor_4d(ctx, NE_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2], NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_GET_ROWS;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3077,7 +3074,7 @@ struct ne_tensor* ne_get_rows_back(struct ne_context* ctx, struct ne_tensor* a,
 
   // TODO: implement non F32 return
   // struct ne_tensor * result = ne_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-  struct ne_tensor* result = ne_new_tensor_2d(ctx, NE_TYPE_F32, c->ne[0], c->ne[1], NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor_2d(ctx, NE_TYPE_F32, c->ne[0], c->ne[1], NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_GET_ROWS_BACK;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3099,7 +3096,7 @@ struct ne_tensor* ne_diag(struct ne_context* ctx, struct ne_tensor* a) {
   }
 
   const int64_t ne[4] = {a->ne[0], a->ne[0], a->ne[2], a->ne[3]};
-  struct ne_tensor* result = ne_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_DIAG;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3125,7 +3122,7 @@ struct ne_tensor* ne_diag_mask_inf_impl(struct ne_context* ctx, struct ne_tensor
   ne_scratch_save(ctx);
 
   const int bs = a->ne[3];
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2 + bs, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2 + bs, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)b->data)[0] = n_past;
   ((int32_t*)b->data)[1] = inplace ? 1 : 0;
@@ -3178,7 +3175,7 @@ struct ne_tensor* ne_diag_mask_zero_impl(struct ne_context* ctx, struct ne_tenso
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 2, NE_SIZE_CALC, a->backend);
   ne_set_name(b, "n_past, inplace");
 
   ((int32_t*)b->data)[0] = n_past;
@@ -3223,7 +3220,7 @@ struct ne_tensor* ne_padding_mask_inf_impl(struct ne_context* ctx, struct ne_ten
 #define PM_PADDING_IDX 2
   const int bs = a->ne[3];
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, PM_PARAMS_NUM + bs, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, PM_PARAMS_NUM + bs, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)b->data)[PM_NPAST_IDX] = n_past;
   ((int32_t*)b->data)[PM_INPLACE_IDX] = inplace ? 1 : 0;
@@ -3306,7 +3303,7 @@ struct ne_tensor* ne_rope_impl(struct ne_context* ctx, struct ne_tensor* a, int
 #define ROPE_PADDING_IDX 5
   const int bs = a->ne[3];
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, ROPE_PARAMS_NUM + bs, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, ROPE_PARAMS_NUM + bs, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)b->data)[ROPE_NPAST_IDX] = n_past;
   ((int32_t*)b->data)[ROPE_NDIMS_IDX] = n_dims;
@@ -3389,7 +3386,7 @@ struct ne_tensor* ne_rope_back(struct ne_context* ctx, struct ne_tensor* a, int
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC, a->backend);
   ne_set_name(b, "n_past, n_dims, mode");
 
   ((int32_t*)b->data)[0] = n_past;
@@ -3442,7 +3439,7 @@ struct ne_tensor* ne_alibi(struct ne_context* ctx, struct ne_tensor* a, int n_pa
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC, a->backend);
 
   ((int32_t*)b->data)[0] = n_past;
   ((int32_t*)b->data)[1] = n_head;
@@ -3474,7 +3471,7 @@ struct ne_tensor* ne_clamp(struct ne_context* ctx, struct ne_tensor* a, float mi
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC);
+  struct ne_tensor* b = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC, a->backend);
 
   ((float*)b->data)[0] = min;
   ((float*)b->data)[1] = max;
@@ -3508,7 +3505,7 @@ struct ne_tensor* ne_conv_1d_1s(struct ne_context* ctx, struct ne_tensor* a, str
       1,
       1,
   };
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_CONV_1D_1S;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3537,7 +3534,7 @@ struct ne_tensor* ne_conv_1d_2s(struct ne_context* ctx, struct ne_tensor* a, str
       1,
       1,
   };
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_CONV_1D_2S;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3569,7 +3566,7 @@ NE_API struct ne_tensor* ne_conv_1d(struct ne_context* ctx, struct ne_tensor* a,
       1,
       1,
   };
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 2, ne, NE_SIZE_CALC, a->backend);
 
   int32_t params[] = {s0, p0, d0};
   ne_set_op_params(result, params, sizeof(params));
@@ -3608,10 +3605,11 @@ struct ne_tensor* ne_flash_attn(struct ne_context* ctx, struct ne_tensor* q, str
   NE_ASSERT(batch == k->ne[3]);
   NE_ASSERT(batch == v->ne[3]);
   bool is_node = true;
-  struct ne_tensor* result = ne_new_tensor_4d(ctx, NE_TYPE_F32, headsize, headnum, seq_cur, batch, NE_SIZE_CALC);
+  struct ne_tensor* result =
+      ne_new_tensor_4d(ctx, NE_TYPE_F32, headsize, headnum, seq_cur, batch, NE_SIZE_CALC, q->backend);
   attn_shape_t atte_shape = {batch, headnum, headsize, seq_cur, seq_all};
   size_t tmpsize = bestla_fusion_attn_workspace_size(&atte_shape);
-  struct ne_tensor* tmp_t = ne_new_tensor_1d(ctx, NE_TYPE_I8, tmpsize, NE_SIZE_CALC);
+  struct ne_tensor* tmp_t = ne_new_tensor_1d(ctx, NE_TYPE_I8, tmpsize, NE_SIZE_CALC, q->backend);
   result->op = NE_OP_FLASH_ATTN;
   result->grad = NULL;
   result->src0 = q;
@@ -3632,7 +3630,7 @@ struct ne_tensor* ne_flash_attn_kv_update(struct ne_context* ctx, struct ne_tens
 
   ne_scratch_save(ctx);
 
-  struct ne_tensor* params = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC);
+  struct ne_tensor* params = ne_new_tensor_1d(ctx, NE_TYPE_I32, 3, NE_SIZE_CALC, cache->backend);
 
   ((int32_t*)params->data)[0] = n_past;
   ((int32_t*)params->data)[1] = (int)is_v;
@@ -3671,7 +3669,7 @@ struct ne_tensor* ne_flash_ff(struct ne_context* ctx, struct ne_tensor* a, struc
   }
 
   // struct ne_tensor * result = ne_dup_tensor(ctx, a);
-  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 4, a->ne, NE_SIZE_CALC);
+  struct ne_tensor* result = ne_new_tensor(ctx, NE_TYPE_F32, 4, a->ne, NE_SIZE_CALC, a->backend);
 
   result->op = NE_OP_FLASH_FF;
   result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
@@ -3694,7 +3692,8 @@ struct ne_tensor* ne_map_unary_impl_f32(struct ne_context* ctx, struct ne_tensor
     is_node = true;
   }
 
-  struct ne_tensor* addr_tensor = ne_new_tensor_1d(ctx, NE_TYPE_I32, sizeof(void*) / sizeof(int32_t), NE_SIZE_CALC);
+  struct ne_tensor* addr_tensor =
+      ne_new_tensor_1d(ctx, NE_TYPE_I32, sizeof(void*) / sizeof(int32_t), NE_SIZE_CALC, a->backend);
   *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
 
   struct ne_tensor* result = inplace ? ne_view_tensor(ctx, a) : ne_dup_tensor(ctx, a);
@@ -3726,7 +3725,8 @@ struct ne_tensor* ne_map_binary_impl_f32(struct ne_context* ctx, struct ne_tenso
     is_node = true;
   }
 
-  struct ne_tensor* addr_tensor = ne_new_tensor_1d(ctx, NE_TYPE_I32, sizeof(void*) / sizeof(int32_t), NE_SIZE_CALC);
+  struct ne_tensor* addr_tensor =
+      ne_new_tensor_1d(ctx, NE_TYPE_I32, sizeof(void*) / sizeof(int32_t), NE_SIZE_CALC, a->backend);
   *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
 
   struct ne_tensor* result = inplace ? ne_view_tensor(ctx, a) : ne_dup_tensor(ctx, a);
@@ -10971,7 +10971,8 @@ static void ne_compute_backward(struct ne_context* ctx, struct ne_tensor* tensor
         int64_t ne[4] = {nc0, ncr, nr0, nrr};
 
         struct ne_tensor* F00 = tensor->grad;
-        struct ne_tensor* F01 = ne_reshape(ctx, F00, ne_new_tensor(ctx, tensor->grad->type, 4, ne, NE_SIZE_CALC));
+        struct ne_tensor* F01 =
+            ne_reshape(ctx, F00, ne_new_tensor(ctx, tensor->grad->type, 4, ne, NE_SIZE_CALC, tensor->grad->backend));
         struct ne_tensor* F02 = ne_permute(ctx, F01, 0, 2, 1, 3);
         struct ne_tensor* F03 = ne_cont(ctx, F02);
         struct ne_tensor* F04 = ne_reshape_2d(ctx, F03, nc0 * nr0, ncr * nrr);
@@ -11772,7 +11773,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) {
       cgraph->work_size = work_size + CACHE_LINE_SIZE * (n_threads - 1);
 
       NE_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-      cgraph->work = ne_new_tensor_1d(ctx, NE_TYPE_I8, cgraph->work_size, NE_SIZE_CALC);
+      cgraph->work = ne_new_tensor_1d(ctx, NE_TYPE_I8, cgraph->work_size, NE_SIZE_CALC, NE_BACKEND_CPU);
     }
   }
 
@@ -12135,15 +12136,17 @@ static enum ne_opt_result ne_opt_adam(struct ne_context* ctx, struct ne_opt_para
   const float beta2 = params.adam.beta2;
   const float eps = params.adam.eps;
 
-  float* x = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // view of the parameters
-  float* g1 = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // gradient
-  float* g2 = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // gradient squared
-  float* m = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // first moment
-  float* v = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // second moment
-  float* mh = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // first moment hat
-  float* vh = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // second moment hat
-
-  float* pf = params.past > 0 ? (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, params.past, NE_SIZE_CALC)->data
+  float* x =
+      (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // view of the parameters
+  float* g1 = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // gradient
+  float* g2 = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // gradient squared
+  float* m = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;   // first moment
+  float* v = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;   // second moment
+  float* mh = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // first moment hat
+  float* vh = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // second moment hat
+
+  float* pf = params.past > 0
+                  ? (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, params.past, NE_SIZE_CALC, NE_BACKEND_CPU)->data
                   : NULL;  // past function values
 
   // initialize
@@ -12414,13 +12417,15 @@ static enum ne_opt_result ne_opt_lbfgs(struct ne_context* ctx, struct ne_opt_par
     }
   }
 
-  float* x = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // current parameters
-  float* xp = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // previous parameters
-  float* g = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // current gradient
-  float* gp = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;  // previous gradient
-  float* d = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;   // search direction
+  float* x = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // current parameters
+  float* xp =
+      (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // previous parameters
+  float* g = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;   // current gradient
+  float* gp = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;  // previous gradient
+  float* d = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;   // search direction
 
-  float* pf = params.past > 0 ? (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, params.past, NE_SIZE_CALC)->data
+  float* pf = params.past > 0
+                  ? (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, params.past, NE_SIZE_CALC, NE_BACKEND_CPU)->data
                   : NULL;  // past function values
 
   float fx = 0.0f;  // cost function value
@@ -12438,8 +12443,8 @@ static enum ne_opt_result ne_opt_lbfgs(struct ne_context* ctx, struct ne_opt_par
   for (int i = 0; i < m; ++i) {
     lm[i].alpha = 0.0f;
     lm[i].ys = 0.0f;
-    lm[i].s = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;
-    lm[i].y = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC)->data;
+    lm[i].s = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;
+    lm[i].y = (float*)ne_new_tensor_1d(ctx, NE_TYPE_F32, nx, NE_SIZE_CALC, NE_BACKEND_CPU)->data;
   }
 
   // evaluate the function value and its gradient
diff --git a/neural_speed/core/ne_layers.h b/neural_speed/core/ne_layers.h
index 1dd386a7f..12ede6bcd 100644
--- a/neural_speed/core/ne_layers.h
+++ b/neural_speed/core/ne_layers.h
@@ -115,29 +115,25 @@ NE_API size_t ne_used_mem(const struct ne_context* ctx);
 NE_API size_t ne_set_scratch(struct ne_context* ctx, struct ne_scratch scratch);
 
 NE_API struct ne_tensor* ne_new_tensor(struct ne_context* ctx, enum ne_type type, int n_dims, const int64_t* ne,
-                                       size_t size);
+                                       size_t size, enum ne_backend bk);
 
-NE_API struct ne_tensor* ne_new_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size);
+NE_API struct ne_tensor* ne_new_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size,
+                                          enum ne_backend bk);
 
 NE_API struct ne_tensor* ne_new_tensor_2d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1,
-                                          size_t size);
+                                          size_t size, enum ne_backend bk);
 
 NE_API struct ne_tensor* ne_new_tensor_3d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1,
-                                          int64_t ne2, size_t size);
+                                          int64_t ne2, size_t size, enum ne_backend bk);
 
 NE_API struct ne_tensor* ne_new_tensor_4d(struct ne_context* ctx, enum ne_type type, int64_t ne0, int64_t ne1,
-                                          int64_t ne2, int64_t ne3, size_t size);
+                                          int64_t ne2, int64_t ne3, size_t size, enum ne_backend bk);
 
-NE_API struct ne_tensor* ne_new_dev_tensor(struct ne_context* ctx, enum ne_type type, int n_dims, const int64_t* ne,
-                                           size_t size);
-
-NE_API struct ne_tensor* ne_new_dev_tensor_1d(struct ne_context* ctx, enum ne_type type, int64_t ne0, size_t size);
-
-#define d_ne_new_tensor(...) ne_new_tensor(__VA_ARGS__, NE_SIZE_CALC)
-#define d_ne_new_tensor_1d(...) ne_new_tensor_1d(__VA_ARGS__, NE_SIZE_CALC)
-#define d_ne_new_tensor_2d(...) ne_new_tensor_2d(__VA_ARGS__, NE_SIZE_CALC)
-#define d_ne_new_tensor_3d(...) ne_new_tensor_3d(__VA_ARGS__, NE_SIZE_CALC)
-#define d_ne_new_tensor_4d(...) ne_new_tensor_4d(__VA_ARGS__, NE_SIZE_CALC)
+#define d_ne_new_tensor(...) ne_new_tensor(__VA_ARGS__, NE_SIZE_CALC, NE_BACKEND_CPU)
+#define d_ne_new_tensor_1d(...) ne_new_tensor_1d(__VA_ARGS__, NE_SIZE_CALC, NE_BACKEND_CPU)
+#define d_ne_new_tensor_2d(...) ne_new_tensor_2d(__VA_ARGS__, NE_SIZE_CALC, NE_BACKEND_CPU)
+#define d_ne_new_tensor_3d(...) ne_new_tensor_3d(__VA_ARGS__, NE_SIZE_CALC, NE_BACKEND_CPU)
+#define d_ne_new_tensor_4d(...) ne_new_tensor_4d(__VA_ARGS__, NE_SIZE_CALC, NE_BACKEND_CPU)
 
 NE_API struct ne_tensor* ne_new_i32(struct ne_context* ctx, int32_t value);
 NE_API struct ne_tensor* ne_new_f32(struct ne_context* ctx, float value);
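With the header change above, the d_ne_new_tensor* convenience macros preserve the old defaults by appending both NE_SIZE_CALC and NE_BACKEND_CPU, so CPU-only call sites that go through the macros keep compiling unchanged; device tensors are now requested by spelling out the backend instead of calling the removed ne_new_dev_tensor* helpers. Illustrative expansion (ctx, ne0, and ne1 are placeholders, not identifiers from the patch):

// d_ne_new_tensor_2d(ctx, NE_TYPE_F32, ne0, ne1)
//   expands to:
// ne_new_tensor_2d(ctx, NE_TYPE_F32, ne0, ne1, NE_SIZE_CALC, NE_BACKEND_CPU)

// a former ne_new_dev_tensor_1d(ctx, NE_TYPE_I32, ne0, NE_SIZE_CALC) call site becomes:
struct ne_tensor* dev = ne_new_tensor_1d(ctx, NE_TYPE_I32, ne0, NE_SIZE_CALC, NE_BACKEND_SYCL);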
diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index 3aef6a560..d449cab99 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -173,12 +173,12 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
     bestla_reordered_attn_fp32_batch_kv_info(&kv_shape, &kv_cache_info);
   }
 
-  struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC);
+  struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_CPU);
   ne_set_name(embd, "embd");
 
 #ifdef NS_SYCL
   size_t memsize = size_t(seq_len_sum) * sizeof(model_token);
-  struct ne_tensor* embd_dev = ne_new_dev_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC);
+  struct ne_tensor* embd_dev = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_SYCL);
   int cpy_off = 0;
   for (int i = 0; i < batch_size; ++i) {
     bestla_device_memcpy_sync(static_cast(embd_dev->data) + cpy_off, inputs[i].tokens,
@@ -271,7 +271,7 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
     // self-attention
     const float attn_scale = 1.0f / sqrtf(static_cast(head_size));
     struct ne_tensor* KQV_merged_contiguous =
-        ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC);
+        ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_CPU);
     if (!run_mha_reordered) {
       // store key and value to memory
       // important:
@@ -527,7 +527,8 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
         }
       } else {
        // for-loop MOE (deal with sequence one by one)
-        struct ne_tensor* moe_out = ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC);
+        struct ne_tensor* moe_out =
+            ne_new_tensor_2d(ctx0, NE_TYPE_F32, head_size * n_head, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_CPU);
         size_t off_sl = 0;
         for (int bi = 0; bi < batch_size; ++bi) {
           const int moe_sl = n_tokens[bi];
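For the llama.cpp change above: under NS_SYCL the token ids now live in two tensors, a host-side embd (NE_BACKEND_CPU) and a device-side embd_dev created through the same unified API with NE_BACKEND_SYCL and filled with bestla_device_memcpy_sync. A call site can also select the backend once and reuse it; a rough sketch of that shape (illustration only, not the patch's code — token_ids is a hypothetical name, and the hunk above instead keeps both the host and device tensors):

#ifdef NS_SYCL
  const enum ne_backend input_bk = NE_BACKEND_SYCL;  // token ids are consumed on the device
#else
  const enum ne_backend input_bk = NE_BACKEND_CPU;
#endif
  struct ne_tensor* token_ids = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC, input_bk);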
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index 444d28976..df48590bd 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -1456,13 +1456,13 @@ struct model_model_loader {
     struct ne_tensor* tensor;
     if (lt.ne.size() == 2) {
       if (lt.type == NE_TYPE_BTLA) {
-        tensor = ne_new_tensor_2d(ne_ctx, lt.type, lt.ne.at(0), lt.ne.at(1), lt.size);
+        tensor = ne_new_tensor_2d(ne_ctx, lt.type, lt.ne.at(0), lt.ne.at(1), lt.size, backend);
       } else {
-        tensor = ne_new_tensor_2d(ne_ctx, lt.type, lt.ne.at(0), lt.ne.at(1), NE_SIZE_CALC);
+        tensor = ne_new_tensor_2d(ne_ctx, lt.type, lt.ne.at(0), lt.ne.at(1), NE_SIZE_CALC, backend);
       }
     } else {
       MODEL_ASSERT(lt.ne.size() == 1);
-      tensor = ne_new_tensor_1d(ne_ctx, lt.type, lt.ne.at(0), NE_SIZE_CALC);
+      tensor = ne_new_tensor_1d(ne_ctx, lt.type, lt.ne.at(0), NE_SIZE_CALC, backend);
     }
     ne_set_name(tensor, lt.name.c_str());
     MODEL_ASSERT(lt.ne_tensor == nullptr);  // if this fails, we called get_tensor twice on the same tensor
diff --git a/neural_speed/models/model_utils/model_utils.cpp b/neural_speed/models/model_utils/model_utils.cpp
index aea4e7b87..9d75c4e25 100644
--- a/neural_speed/models/model_utils/model_utils.cpp
+++ b/neural_speed/models/model_utils/model_utils.cpp
@@ -109,11 +109,11 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
       k_cache = d_ne_new_tensor_4d(model->ctx, NE_TYPE_F16, head_size, n_ctx, heads_kv, batch_size * beam_size);
       v_cache = d_ne_new_tensor_4d(model->ctx, NE_TYPE_F16, n_ctx, head_size, heads_kv, batch_size * beam_size);
     } else if (wtype == NE_TYPE_BTLA) {
-      k_cache = ne_new_tensor_1d(model->ctx, wtype_alloc, layer_ne_k + NE_ALIGNMENT, NE_SIZE_CALC);
+      k_cache = ne_new_tensor_1d(model->ctx, wtype_alloc, layer_ne_k + NE_ALIGNMENT, NE_SIZE_CALC, NE_BACKEND_CPU);
       const auto k_align_off = reinterpret_cast(k_cache->data) % NE_ALIGNMENT;
       k_cache = ne_view_1d(model->ctx, k_cache, layer_ne_k, NE_ALIGNMENT - k_align_off);
       k_cache->type = wtype;
-      v_cache = ne_new_tensor_1d(model->ctx, wtype_alloc, layer_ne_v + NE_ALIGNMENT, NE_SIZE_CALC);
+      v_cache = ne_new_tensor_1d(model->ctx, wtype_alloc, layer_ne_v + NE_ALIGNMENT, NE_SIZE_CALC, NE_BACKEND_CPU);
       const auto v_align_off = reinterpret_cast(v_cache->data) % NE_ALIGNMENT;
       v_cache = ne_view_1d(model->ctx, v_cache, layer_ne_v, NE_ALIGNMENT - v_align_off);
       v_cache->type = wtype;
@@ -126,11 +126,13 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
     const bool run_mha_reordered = model->layers[0].k_cache->type == NE_TYPE_BTLA;
     fprintf(stderr, "%s: run_mha_reordered = %d\n", __func__, run_mha_reordered);
   } else {
-    cache.k = ne_new_tensor_1d(cache.ctx, wtype_alloc, n_layer * layer_ne_k + NE_ALIGNMENT, NE_SIZE_CALC);
+    cache.k =
+        ne_new_tensor_1d(cache.ctx, wtype_alloc, n_layer * layer_ne_k + NE_ALIGNMENT, NE_SIZE_CALC, NE_BACKEND_CPU);
     const auto k_align_off = reinterpret_cast(cache.k->data) % NE_ALIGNMENT;
     cache.k = ne_view_1d(cache.ctx, cache.k, n_layer * layer_ne_k, NE_ALIGNMENT - k_align_off);
     cache.k->type = wtype;
-    cache.v = ne_new_tensor_1d(cache.ctx, wtype_alloc, n_layer * layer_ne_v + NE_ALIGNMENT, NE_SIZE_CALC);
+    cache.v =
+        ne_new_tensor_1d(cache.ctx, wtype_alloc, n_layer * layer_ne_v + NE_ALIGNMENT, NE_SIZE_CALC, NE_BACKEND_CPU);
     const auto v_align_off = reinterpret_cast(cache.v->data) % NE_ALIGNMENT;
     cache.v = ne_view_1d(cache.ctx, cache.v, n_layer * layer_ne_v, NE_ALIGNMENT - v_align_off);
     cache.v->type = wtype;
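The kv_cache_init hunks above keep the existing over-allocate-then-realign idiom and only add the explicit NE_BACKEND_CPU argument: the raw buffer is NE_ALIGNMENT bytes larger than needed, and a 1d view is taken at the offset that puts the data pointer on an NE_ALIGNMENT boundary. Annotated sketch of that idiom (the integer type in the elided reinterpret_cast is an assumption; the hunk above does not show it):

// 1) over-allocate so an aligned region of layer_ne_k bytes always fits
k_cache = ne_new_tensor_1d(model->ctx, wtype_alloc, layer_ne_k + NE_ALIGNMENT, NE_SIZE_CALC, NE_BACKEND_CPU);
// 2) distance of the raw pointer from the previous alignment boundary (uintptr_t assumed here)
const auto k_align_off = reinterpret_cast<uintptr_t>(k_cache->data) % NE_ALIGNMENT;
// 3) view of exactly layer_ne_k bytes, shifted forward so its data pointer is aligned
k_cache = ne_view_1d(model->ctx, k_cache, layer_ne_k, NE_ALIGNMENT - k_align_off);
k_cache->type = wtype;  // the view is then re-tagged with the real KV dtype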
@@ -140,7 +142,7 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
   if (shift_roped_k) {  // prepare rope helper for fused-attention
     const auto cossin_dtype = wtype == NE_TYPE_BTLA ? NE_TYPE_F16 : wtype;
-    cache.cossin = ne_new_tensor_1d(cache.ctx, cossin_dtype, head_size, NE_SIZE_CALC);
+    cache.cossin = ne_new_tensor_1d(cache.ctx, cossin_dtype, head_size, NE_SIZE_CALC, NE_BACKEND_CPU);
     ne_set_name(cache.cossin, "cossin(-1)");
 
     float freq_base = hparams.freq_base;
     float theta = -1 * hparams.freq_scale;
@@ -1161,7 +1163,7 @@ int model_apply_lora_from_file_internal(struct model_context* ctx, const char* p
     }
     ne_tensor* lora_tensor;
     if (n_dims == 2) {
-      lora_tensor = ne_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1], NE_SIZE_CALC);
+      lora_tensor = ne_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1], NE_SIZE_CALC, NE_BACKEND_CPU);
     } else {
       fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
       return 1;
@@ -1490,11 +1492,13 @@ size_t model_copy_state_data(struct model_context* ctx, uint8_t* dst) {
     ne_cgraph gf{};
     gf.n_threads = 1;
 
-    ne_tensor* kout3d = ne_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer, NE_SIZE_CALC);
+    ne_tensor* kout3d =
+        ne_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer, NE_SIZE_CALC, NE_BACKEND_CPU);
     kout3d->data = out;
     out += ne_nbytes(kout3d);
 
-    ne_tensor* vout3d = ne_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer, NE_SIZE_CALC);
+    ne_tensor* vout3d =
+        ne_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer, NE_SIZE_CALC, NE_BACKEND_CPU);
     vout3d->data = out;
     out += ne_nbytes(vout3d);
@@ -1603,11 +1607,13 @@ size_t model_set_state_data(struct model_context* ctx, uint8_t* src) {
     ne_cgraph gf{};
     gf.n_threads = 1;
 
-    ne_tensor* kin3d = ne_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer, NE_SIZE_CALC);
+    ne_tensor* kin3d =
+        ne_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer, NE_SIZE_CALC, NE_BACKEND_CPU);
     kin3d->data = reinterpret_cast(inp);
     inp += ne_nbytes(kin3d);
 
-    ne_tensor* vin3d = ne_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer, NE_SIZE_CALC);
+    ne_tensor* vin3d =
+        ne_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer, NE_SIZE_CALC, NE_BACKEND_CPU);
     vin3d->data = reinterpret_cast(inp);
     inp += ne_nbytes(vin3d);
@@ -1957,7 +1963,7 @@ static ne_tensor* ne_model_kv_cache_seq_concat(struct ne_cgraph* cgraph, struct
       continue;
     } else {
      if (dst == nullptr) {
-        dst = ne_new_tensor_4d(nectx, cache->type, ne0, ne1, ne2, ne3, NE_SIZE_CALC);
+        dst = ne_new_tensor_4d(nectx, cache->type, ne0, ne1, ne2, ne3, NE_SIZE_CALC, NE_BACKEND_CPU);
      }
      struct ne_tensor* dst_i = ne_view_4d(nectx, dst, ne0, ne1, ne2, cont_bs, elem_size * ne0, elem_size * ne0 * ne1,
                                           elem_size * ne0 * ne1 * ne2, dst_off);
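A note on the state (de)serialization hunks just above: kout3d/vout3d (and kin3d/vin3d) are created only for their shape metadata, their data pointers are immediately re-targeted at the caller's buffer, and a small graph then copies the KV cache through them. Annotated restatement of the write side (same calls as the @@ -1490 hunk; out, cpy_ctx, kv_self, n_embd, kv_ntok, and n_layer are the surrounding locals):

// shape-only tensor describing the serialized K-cache layout, kept on the CPU backend
ne_tensor* kout3d = ne_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer, NE_SIZE_CALC, NE_BACKEND_CPU);
kout3d->data = out;        // write straight into the destination buffer
out += ne_nbytes(kout3d);  // advance by the bytes this tensor occupies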