From 2f37e21d1fd90828f40d4b27bb87c0a70b85ee08 Mon Sep 17 00:00:00 2001
From: mingfeima
Date: Thu, 10 Oct 2024 23:30:03 -0700
Subject: [PATCH] change ggml_backend_buffer_is_host to return false for amx
 backend

ggml-ci
---
 ggml/src/ggml-amx.cpp   |  2 +-
 ggml/src/ggml-backend.c |  2 ++
 src/llama.cpp           | 21 ++++++++-------------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp
index 68ff674a51bb6b..b633b4861c77cb 100644
--- a/ggml/src/ggml-amx.cpp
+++ b/ggml/src/ggml-amx.cpp
@@ -111,7 +111,7 @@ GGML_CALL static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend
 }
 
 GGML_CALL static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
+    return false;
 
     GGML_UNUSED(buft);
 }
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 88eabbe2ba44f5..2106aff0f5d7a8 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1199,10 +1199,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+#ifndef GGML_USE_AMX
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
         GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
     }
+#endif
 
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
diff --git a/src/llama.cpp b/src/llama.cpp
index b85e8acdc50f51..1d91306905e6b8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3462,8 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
 #elif defined(GGML_USE_CANN)
     buft = ggml_backend_cann_buffer_type(local_gpu);
-#elif defined(GGML_USE_AMX)
-    buft = ggml_backend_amx_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -5087,12 +5085,7 @@ struct llama_model_loader {
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
-#if defined(GGML_USE_AMX)
-                const bool can_use_mmap = false;
-#else
-                const bool can_use_mmap = true;
-#endif
-                if (ggml_backend_buffer_is_host(cur->buffer) && can_use_mmap) {
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, n_size);
                     if (check_tensors) {
@@ -6865,7 +6858,14 @@ static bool llm_load_tensors(
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(true)
+        };
+#else
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -18587,11 +18587,6 @@ struct llama_model_params llama_model_default_params() {
     result.n_gpu_layers = 999;
 #endif
 
-#ifdef GGML_USE_AMX
-    // by default offload all layers to AMX
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }
 
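
Note on the mechanism: flipping is_host to false is what lets the GGML_USE_AMX can_use_mmap special case in llama_model_loader above be dropped. ggml_backend_buffer_is_host() resolves through the buffer type's is_host callback, so once the AMX buffer type reports false, the loader stops reading the file straight into cur->data and instead stages the bytes in a temporary buffer and uploads them with ggml_backend_tensor_set(), which gives the AMX buffer a chance to repack the weights into its own layout. A rough sketch of that dispatch, paraphrased from ggml-backend.c (the exact bodies in the tree this patch targets may differ slightly):

    GGML_CALL bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
        // defer to the buffer type's is_host callback; default to "not host"
        if (buft->iface.is_host) {
            return buft->iface.is_host(buft);
        }
        return false;
    }

    GGML_CALL bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
        // a buffer counts as host memory iff its buffer type says so
        return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
    }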