From 2f37e21d1fd90828f40d4b27bb87c0a70b85ee08 Mon Sep 17 00:00:00 2001
From: mingfeima
Date: Thu, 10 Oct 2024 23:30:03 -0700
Subject: [PATCH] change ggml_backend_buffer_is_host to return false for amx
 backend

ggml-ci
---
 ggml/src/ggml-amx.cpp   |  2 +-
 ggml/src/ggml-backend.c |  2 ++
 src/llama.cpp           | 21 ++++++++-------------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp
index 68ff674a51bb6b..b633b4861c77cb 100644
--- a/ggml/src/ggml-amx.cpp
+++ b/ggml/src/ggml-amx.cpp
@@ -111,7 +111,7 @@ GGML_CALL static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend
 }
 
 GGML_CALL static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
+    return false;
 
     GGML_UNUSED(buft);
 }
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 88eabbe2ba44f5..2106aff0f5d7a8 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1199,10 +1199,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+#ifndef GGML_USE_AMX
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
         GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
     }
+#endif
 
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
diff --git a/src/llama.cpp b/src/llama.cpp
index b85e8acdc50f51..1d91306905e6b8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3462,8 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
 #elif defined(GGML_USE_CANN)
     buft = ggml_backend_cann_buffer_type(local_gpu);
-#elif defined(GGML_USE_AMX)
-    buft = ggml_backend_amx_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -5087,12 +5085,7 @@ struct llama_model_loader {
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
-#if defined(GGML_USE_AMX)
-                const bool can_use_mmap = false;
-#else
-                const bool can_use_mmap = true;
-#endif
-                if (ggml_backend_buffer_is_host(cur->buffer) && can_use_mmap) {
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, n_size);
                     if (check_tensors) {
@@ -6865,7 +6858,14 @@ static bool llm_load_tensors(
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(true)
+        };
+#else
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -18587,11 +18587,6 @@ struct llama_model_params llama_model_default_params() {
     result.n_gpu_layers = 999;
 #endif
 
-#ifdef GGML_USE_AMX
-    // by default offload all layers to AMX
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }
 
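
Note on the mechanism: flipping is_host to false is what lets the GGML_USE_AMX can_use_mmap special case in llama_model_loader above be dropped. ggml_backend_buffer_is_host() resolves through the buffer type's is_host callback, so once the AMX buffer type reports false, the loader stops reading the file straight into cur->data and instead stages the bytes in a temporary buffer and uploads them with ggml_backend_tensor_set(), which gives the AMX buffer a chance to repack the weights into its own layout. A rough sketch of that dispatch, paraphrased from ggml-backend.c (the exact bodies in the tree this patch targets may differ slightly):

    GGML_CALL bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
        // defer to the buffer type's is_host callback; default to "not host"
        if (buft->iface.is_host) {
            return buft->iface.is_host(buft);
        }
        return false;
    }

    GGML_CALL bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
        // a buffer counts as host memory iff its buffer type says so
        return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
    }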