change ggml_backend_buffer_is_host to return false for amx backend
ggml-ci
mingfeima committed Oct 11, 2024
1 parent 5685eb1 commit 2f37e21
Showing 3 changed files with 11 additions and 14 deletions.
ggml/src/ggml-amx.cpp (2 changes: 1 addition & 1 deletion)
@@ -111,7 +111,7 @@ GGML_CALL static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend
 }
 
 GGML_CALL static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
+    return false;
 
     GGML_UNUSED(buft);
 }
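Note on the mechanism (not part of the diff): ggml_backend_buffer_is_host resolves through the buffer type's is_host callback, so after this change any caller that asks whether an AMX buffer is plain host memory gets false and must go through the backend copy API instead of touching tensor->data directly. A rough sketch of that dispatch, paraphrased from memory rather than quoted from ggml-backend.c:

    GGML_CALL bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
        // forward to the buffer type's callback; the AMX type now answers false here
        if (buft->iface.is_host) {
            return buft->iface.is_host(buft);
        }
        return false;
    }

    GGML_CALL bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
        return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
    }

This is what lets the llama.cpp loader change further down drop its GGML_USE_AMX special case: the generic is_host check now gives the right answer for AMX buffers.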
ggml/src/ggml-backend.c (2 changes: 2 additions & 0 deletions)
@@ -1199,10 +1199,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+#ifndef GGML_USE_AMX
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
         GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
     }
+#endif
 
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
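Why the abort is compiled out under GGML_USE_AMX (my reading of the change, not stated in the commit): with the llama.cpp change below, layer weights are pre-allocated in AMX buffers, and since those buffers no longer report as host memory the scheduler can meet a pre-allocated tensor whose owning backend is not the one chosen to run the op; the hard abort would then fire during scheduling, so AMX builds disable it as a workaround. The guarded condition is simply "does this tensor (or its view source) already own a buffer", restated as a standalone helper (the name is mine, not ggml's):

    #include "ggml.h"

    // hypothetical restatement of the condition checked by the guarded abort above
    static bool tensor_is_preallocated(const struct ggml_tensor * tensor) {
        return tensor->buffer != NULL ||
               (tensor->view_src != NULL && tensor->view_src->buffer != NULL);
    }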
src/llama.cpp (21 changes: 8 additions & 13 deletions)
@@ -3462,8 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
 #elif defined(GGML_USE_CANN)
     buft = ggml_backend_cann_buffer_type(local_gpu);
-#elif defined(GGML_USE_AMX)
-    buft = ggml_backend_amx_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -5087,12 +5085,7 @@ struct llama_model_loader {
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
-#if defined(GGML_USE_AMX)
-                const bool can_use_mmap = false;
-#else
-                const bool can_use_mmap = true;
-#endif
-                if (ggml_backend_buffer_is_host(cur->buffer) && can_use_mmap) {
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, n_size);
                     if (check_tensors) {
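The deleted build-time flag forced every buffer in AMX builds onto the staging path, even genuinely host ones; now that is_host reports false specifically for AMX buffers, the generic check routes each buffer correctly on its own. For reference, the branch an AMX-resident weight now falls into is the loader's existing non-host path, which (paraphrased from memory, not verbatim llama.cpp) stages the data on the host and uploads it through the backend API:

    // sketch of the non-host fallback, assuming a plain byte vector for staging
    std::vector<uint8_t> read_buf(n_size);
    file->seek(weight->offs, SEEK_SET);
    file->read_raw(read_buf.data(), n_size);
    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);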
@@ -6865,7 +6858,14 @@ static bool llm_load_tensors(
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(true)
+        };
+#else
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
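The brace initializer works because (going from memory of llama.cpp's model structures, exact names may differ) each entry of model.buft_layer holds two buffer types, one used for matrix weights and one for everything else. Under GGML_USE_AMX the matrix weights therefore land in AMX buffers while the remaining per-layer tensors keep the regular CPU buffer type, roughly:

    // approximate shape of the per-layer entry (field names from memory, may differ)
    struct layer_buft {
        ggml_backend_buffer_type_t buft_matrix; // matrices only, e.g. mul_mat weights -> AMX type here
        ggml_backend_buffer_type_t buft;        // everything else -> regular CPU buffer type
    };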
@@ -18587,11 +18587,6 @@ struct llama_model_params llama_model_default_params() {
     result.n_gpu_layers = 999;
 #endif
 
-#ifdef GGML_USE_AMX
-    // by default offload all layers to AMX
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }

