change ggml_backend_buffer_is_host to return false for amx backend
ggml-ci
mingfeima committed Oct 11, 2024
1 parent 5685eb1 commit 2f37e21
Showing 3 changed files with 11 additions and 14 deletions.
ggml/src/ggml-amx.cpp (2 changes: 1 addition & 1 deletion)
@@ -111,7 +111,7 @@ GGML_CALL static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend
 }
 
 GGML_CALL static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
+    return false;
 
     GGML_UNUSED(buft);
 }
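Note on the mechanism (not part of the diff): ggml_backend_buffer_is_host resolves through the buffer type's is_host callback, so after this change any caller that asks whether an AMX buffer is plain host memory gets false and must go through the backend copy API instead of touching tensor->data directly. A rough sketch of that dispatch, paraphrased from memory rather than quoted from ggml-backend.c:

    GGML_CALL bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
        // forward to the buffer type's callback; the AMX type now answers false here
        if (buft->iface.is_host) {
            return buft->iface.is_host(buft);
        }
        return false;
    }

    GGML_CALL bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
        return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
    }

This is what lets the llama.cpp loader change further down drop its GGML_USE_AMX special case: the generic is_host check now gives the right answer for AMX buffers.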
ggml/src/ggml-backend.c (2 changes: 2 additions & 0 deletions)
@@ -1199,10 +1199,12 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         }
     }
 
+#ifndef GGML_USE_AMX
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
         GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
     }
+#endif
 
     // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
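Why the abort is compiled out under GGML_USE_AMX (my reading of the change, not stated in the commit): with the llama.cpp change below, layer weights are pre-allocated in AMX buffers, and since those buffers no longer report as host memory the scheduler can meet a pre-allocated tensor whose owning backend is not the one chosen to run the op; the hard abort would then fire during scheduling, so AMX builds disable it as a workaround. The guarded condition is simply "does this tensor (or its view source) already own a buffer", restated as a standalone helper (the name is mine, not ggml's):

    #include "ggml.h"

    // hypothetical restatement of the condition checked by the guarded abort above
    static bool tensor_is_preallocated(const struct ggml_tensor * tensor) {
        return tensor->buffer != NULL ||
               (tensor->view_src != NULL && tensor->view_src->buffer != NULL);
    }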
src/llama.cpp (21 changes: 8 additions & 13 deletions)
@@ -3462,8 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     }
 #elif defined(GGML_USE_CANN)
     buft = ggml_backend_cann_buffer_type(local_gpu);
-#elif defined(GGML_USE_AMX)
-    buft = ggml_backend_amx_buffer_type();
 #endif
 
     if (buft == nullptr) {
@@ -5087,12 +5085,7 @@ struct llama_model_loader {
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
-#if defined(GGML_USE_AMX)
-                const bool can_use_mmap = false;
-#else
-                const bool can_use_mmap = true;
-#endif
-                if (ggml_backend_buffer_is_host(cur->buffer) && can_use_mmap) {
+                if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, n_size);
                     if (check_tensors) {
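The deleted build-time flag forced every buffer in AMX builds onto the staging path, even genuinely host ones; now that is_host reports false specifically for AMX buffers, the generic check routes each buffer correctly on its own. For reference, the branch an AMX-resident weight now falls into is the loader's existing non-host path, which (paraphrased from memory, not verbatim llama.cpp) stages the data on the host and uploads it through the backend API:

    // sketch of the non-host fallback, assuming a plain byte vector for staging
    std::vector<uint8_t> read_buf(n_size);
    file->seek(weight->offs, SEEK_SET);
    file->read_raw(read_buf.data(), n_size);
    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);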
@@ -6865,7 +6858,14 @@ static bool llm_load_tensors(
 
     // assign cpu layers
     for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
+        model.buft_layer[i] = {
+            ggml_backend_amx_buffer_type(),
+            llama_default_buffer_type_cpu(true)
+        };
+#else
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+#endif
     }
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
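The brace initializer works because (going from memory of llama.cpp's model structures, exact names may differ) each entry of model.buft_layer holds two buffer types, one used for matrix weights and one for everything else. Under GGML_USE_AMX the matrix weights therefore land in AMX buffers while the remaining per-layer tensors keep the regular CPU buffer type, roughly:

    // approximate shape of the per-layer entry (field names from memory, may differ)
    struct layer_buft {
        ggml_backend_buffer_type_t buft_matrix; // matrices only, e.g. mul_mat weights -> AMX type here
        ggml_backend_buffer_type_t buft;        // everything else -> regular CPU buffer type
    };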
@@ -18587,11 +18587,6 @@ struct llama_model_params llama_model_default_params() {
     result.n_gpu_layers = 999;
 #endif
 
-#ifdef GGML_USE_AMX
-    // by default offload all layers to AMX
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }

