fix: use vm_allocate to allocate CPU backend buffer on macOS (#9875)
* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages working

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
giladgd authored Oct 16, 2024
1 parent 9e04102 commit 73afe68
Showing 3 changed files with 63 additions and 27 deletions.
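
Before the diffs, a note on the core idea: `vm_allocate` hands back whole, zero-filled, page-aligned pages from the Mach kernel, and the matching `vm_deallocate` must be told the size of the region being released. That size requirement is why `ggml_aligned_free` gains a `size` parameter throughout this commit. A minimal standalone sketch of that contract (macOS only, illustrative, not code from this commit):

// Sketch of the vm_allocate/vm_deallocate contract this commit adopts.
#include <stdio.h>
#include <unistd.h>
#include <mach/mach.h>

int main(void) {
    vm_address_t addr = 0;
    const size_t size = 1 << 20; // 1 MiB

    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "vm_allocate failed: %d\n", kr);
        return 1;
    }

    // Pages are zero-filled and page-aligned, which comfortably exceeds
    // the 32-byte TENSOR_ALIGNMENT that gguf requires.
    const long page_size = sysconf(_SC_PAGESIZE);
    printf("allocated %zu bytes at %p (page-aligned: %s)\n",
           size, (void *) addr,
           (addr % (vm_address_t) page_size == 0) ? "yes" : "no");

    // Unlike free(), the kernel must be told how large the region was.
    vm_deallocate(mach_task_self(), addr, size);
    return 0;
}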
ggml/src/ggml-backend.cpp (8 changes: 3 additions & 5 deletions)

@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
ggml/src/ggml-impl.h (8 changes: 8 additions & 0 deletions)

@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
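
The header now exposes the pair as plain functions rather than macros, so C++ translation units like ggml-backend.cpp can link against them too. A hedged sketch of the calling convention (`my_buffer` is a hypothetical caller, not part of ggml): the caller must remember the allocation size and hand it back on free, since the macOS branch forwards it to `vm_deallocate`.

// Sketch of the new calling convention from ggml-impl.h. Unlike free(),
// the deallocator needs the original allocation size, so callers carry
// it alongside the pointer.
#include "ggml-impl.h"

struct my_buffer {     // hypothetical caller-side bookkeeping
    void * data;
    size_t size;
};

static struct my_buffer my_buffer_alloc(size_t size) {
    struct my_buffer buf = { ggml_aligned_malloc(size), size };
    return buf;
}

static void my_buffer_free(struct my_buffer * buf) {
    ggml_aligned_free(buf->data, buf->size); // size must match the allocation
    buf->data = NULL;
    buf->size = 0;
}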
ggml/src/ggml.c (74 changes: 52 additions & 22 deletions)

@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr) free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
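
One detail in the `GGML_USE_METAL` branch above: `posix_memalign` requires an alignment that is a power of two and a multiple of `sizeof(void *)`. Both `TENSOR_ALIGNMENT` (32) and the system page size satisfy that, so `MAX(TENSOR_ALIGNMENT, page_size)` is always a valid argument while still guaranteeing at least the 32-byte alignment gguf expects. A quick standalone sanity check, assuming only POSIX (not part of the commit):

// Verifies that MAX(TENSOR_ALIGNMENT, page_size) is accepted by
// posix_memalign and that the returned pointer honors the alignment.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

#define TENSOR_ALIGNMENT 32
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    const long page_size = sysconf(_SC_PAGESIZE);
    const size_t alignment = MAX((size_t) TENSOR_ALIGNMENT, (size_t) page_size);

    void * ptr = NULL;
    if (posix_memalign(&ptr, alignment, 1 << 20) != 0) {
        perror("posix_memalign");
        return 1;
    }
    printf("page size %ld, alignment %zu, ptr %% alignment = %zu\n",
           page_size, alignment, (size_t) ((uintptr_t) ptr % alignment));
    free(ptr);
    return 0;
}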
Expand Down Expand Up @@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
Expand Down Expand Up @@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
__func__, i, ggml_used_mem(ctx));

if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}

found = true;
Expand Down Expand Up @@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
if (!threadpool) return;

const int n_threads = threadpool->n_threads_max;

#ifndef GGML_USE_OPENMP
struct ggml_compute_state* workers = threadpool->workers;
const int n_threads = threadpool->n_threads_max;

ggml_mutex_lock(&threadpool->mutex);

Expand All @@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
ggml_cond_destroy(&threadpool->cond);
#endif // GGML_USE_OPENMP

GGML_ALIGNED_FREE(threadpool->workers);
GGML_ALIGNED_FREE(threadpool);
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
ggml_aligned_free(threadpool->workers, workers_size);
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
}

#ifndef GGML_USE_OPENMP
Expand Down Expand Up @@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_cplan * cplan) {

struct ggml_threadpool * threadpool =
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
{
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
Expand All @@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(

// Allocate and init workers state
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);

memset(workers, 0, workers_size);
for (int j = 0; j < tpp->n_threads; j++) {
Expand Down
