From 73afe681aa76e818733fc1f30de082c1d6910bcd Mon Sep 17 00:00:00 2001
From: "Gilad S." <7817232+giladgd@users.noreply.github.com>
Date: Thu, 17 Oct 2024 01:36:51 +0300
Subject: [PATCH] fix: use `vm_allocate` to allocate CPU backend buffer on
 macOS (#9875)

* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages work

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
---
 ggml/src/ggml-backend.cpp |  8 ++---
 ggml/src/ggml-impl.h      |  8 +++++
 ggml/src/ggml.c           | 74 +++++++++++++++++++++++++++------------
 3 files changed, 63 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 15d650150a5f3..6d6ffeb4efe12 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index d3f4bad8c0a84..65c4f81195be3 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3f01092d9f59a..779b38d1213f2 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -35,10 +35,6 @@
 #include <syscall.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.no_alloc_save      =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
                        __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
+                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
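
Not part of the patch, but worth spelling out for callers: the allocator is now a
pair with an asymmetric signature. `ggml_aligned_free()` takes the allocation size
because the macOS path releases memory with `vm_deallocate()`, which unmaps an
explicit region length; every other path ignores the argument via
`GGML_UNUSED(size)`. A minimal usage sketch (the `example_*` helpers are
hypothetical, not from the patch):

    #include "ggml-impl.h"
    #include <string.h>

    // hypothetical caller: allocate a zeroed, aligned scratch buffer
    static void * example_buffer_alloc(size_t size) {
        void * buf = ggml_aligned_malloc(size); // vm_allocate on macOS, posix_memalign elsewhere
        if (buf != NULL) {
            memset(buf, 0, size);               // failure is already logged by ggml_aligned_malloc
        }
        return buf;
    }

    // hypothetical caller: size must match the original allocation, since the
    // macOS path unmaps exactly [buf, buf + size) via vm_deallocate()
    static void example_buffer_free(void * buf, size_t size) {
        ggml_aligned_free(buf, size);
    }

This is also why `ggml_backend_cpu_buffer_free_buffer()` above now passes
`buffer->size` instead of calling plain `free()`.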
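
The bullet about switching to `posix_memalign` rests on a POSIX guarantee that is
easy to miss: a pointer returned by `posix_memalign()` may be released with plain
`free()`, unlike `_aligned_malloc()` on Windows, whose result must go through
`_aligned_free()`. That property is what kept the existing `free()` call sites
working while the allocator was being reworked. A standalone sketch (POSIX only,
not from the patch):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    int main(void) {
        void * p = NULL;
        // 32 matches TENSOR_ALIGNMENT: gguf only guarantees 32-byte alignment for mmap
        if (posix_memalign(&p, 32, 4096) != 0) {
            return 1;
        }
        printf("32-byte aligned: %s\n", (uintptr_t) p % 32 == 0 ? "yes" : "no");
        free(p); // legal for posix_memalign() results; old free() usages keep working
        return 0;
    }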
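
Note the gate on the new branch: `#elif TARGET_OS_OSX`, not `defined(__APPLE__)`,
so `vm_allocate` is used on macOS proper while iOS and the other Apple targets
fall through to a `posix_memalign` branch (the "iOS page size" bullets track that
part of the iteration). A small sketch of the `TargetConditionals.h` idiom; it
builds only against an Apple SDK and is an illustration, not part of the patch:

    #include <stdio.h>
    #include <TargetConditionals.h> // defines TARGET_OS_OSX, TARGET_OS_IPHONE, ...

    int main(void) {
    #if TARGET_OS_OSX
        // TARGET_OS_OSX is 1 only when compiling for macOS; on iOS/tvOS/watchOS
        // it is 0, so an #elif TARGET_OS_OSX branch is compiled out there
        printf("macOS build: vm_allocate path\n");
    #else
        printf("other Apple platform: posix_memalign fallback\n");
    #endif
        return 0;
    }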