From 11772b9b9bf8522ed1ab1b136b186968ecdb7a99 Mon Sep 17 00:00:00 2001
From: ThanatosShinji
Date: Wed, 29 May 2024 22:28:44 +0800
Subject: [PATCH] fix sync

---
 neural_speed/models/llama/llama.cpp | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp
index 97118f68d..9e6646537 100644
--- a/neural_speed/models/llama/llama.cpp
+++ b/neural_speed/models/llama/llama.cpp
@@ -176,24 +176,12 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
 
   struct ne_tensor* embd = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_CPU);
   ne_set_name(embd, "embd");
-#ifdef NS_SYCL
-  size_t memsize = size_t(seq_len_sum) * sizeof(model_token);
-  struct ne_tensor* embd_dev = ne_new_tensor_1d(ctx0, NE_TYPE_I32, seq_len_sum, NE_SIZE_CALC, NE_BACKEND_SYCL);
-  int cpy_off = 0;
-  for (int i = 0; i < batch_size; ++i) {
-    bestla_device_memcpy_sync(static_cast<model_token*>(embd_dev->data) + cpy_off, inputs[i].tokens,
-                              n_tokens[i] * ne_element_size(embd), ctx->device_queue);
-    cpy_off += n_tokens[i];
-  }
-  bestla_device_memcpy_sync(static_cast<void*>(embd->data), static_cast<void*>(embd_dev->data), memsize,
-                            ctx->device_queue);
-#else
   int cpy_off = 0;
   for (int i = 0; i < batch_size; ++i) {
     memcpy(static_cast<model_token*>(embd->data) + cpy_off, inputs[i].tokens, n_tokens[i] * ne_element_size(embd));
     cpy_off += n_tokens[i];
   }
-#endif
+
 #ifdef NS_TP_MODEL
   if (enable_tp) {
     // need to broadcast the ids
@@ -643,6 +631,7 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp
   lctx.model.kv_self.n = n_cached;
   float* logptr = NULL;
   if (inpL->backend == NE_BACKEND_SYCL) {
+    bestla_device_sync(ctx0->dev_queue);
     struct ne_tensor* inpL_host = ne_new_tensor_1d(ctx0, NE_TYPE_I8, inpL->size, NE_SIZE_CALC, NE_BACKEND_CPU);
     logptr = (float*)inpL_host->data;
     bestla_device_memcpy_sync(logptr, inpL->data, inpL->size, ctx0->dev_queue);
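
The patch does two things: the first hunk drops the NS_SYCL round-trip that staged the input ids in a device buffer and keeps the plain host-side memcpy packing, which needs no synchronization; the second hunk inserts bestla_device_sync(ctx0->dev_queue) so that all queued device work has finished before the logits are copied back to the host. The sketch below shows that read-back pattern in plain SYCL, not the BesTLA device API; it assumes bestla_device_sync behaves like a standard queue wait, and all names and sizes are illustrative.

#include <sycl/sycl.hpp>

#include <cstdio>
#include <vector>

int main() {
  sycl::queue q;  // out-of-order by default: completion order is not guaranteed
  constexpr size_t n = 1024;

  float* dev_logits = sycl::malloc_device<float>(n, q);  // stands in for the logits tensor
  std::vector<float> host_logits(n, 0.0f);

  // Asynchronous device work, standing in for the final graph op that
  // produces the logits on the SYCL backend.
  q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { dev_logits[i] = 0.5f * i[0]; });

  // This wait plays the role of the added bestla_device_sync(ctx0->dev_queue):
  // without it, the copy below could race with the kernel above.
  q.wait();

  // Device-to-host copy, synchronized before the host dereferences the buffer.
  q.memcpy(host_logits.data(), dev_logits, n * sizeof(float)).wait();

  std::printf("logit[1] = %.2f\n", host_logits[1]);  // safe: device work completed
  sycl::free(dev_logits, q);
  return 0;
}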