Commit 73afe68

fix: use vm_allocate to allocate CPU backend buffer on macOS (#9875)
* fix: use `vm_allocate` to allocate CPU backend buffer on macOS
* fix: switch to `posix_memalign` to keep existing `free()` usages working
* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS
* style: formatting
* fix: move const outside of `#ifndef`
* style: formatting
* fix: unused var
* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`
* fix: unused var
* fix: page align to `GGUF_DEFAULT_ALIGNMENT`
* fix: page align to `TENSOR_ALIGNMENT`
* fix: convert `TENSOR_ALIGNMENT` to a macro
* fix: increase page size to `32` on iOS
* fix: iOS page size
* fix: `hbw_posix_memalign` alignment
1 parent 9e04102 commit 73afe68
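
For context on the headline change: on macOS the commit allocates CPU backend memory with the Mach call `vm_allocate`, which returns zero-filled, page-aligned memory, and releases it with `vm_deallocate`, which, unlike `free()`, must be told the size of the region. A minimal standalone sketch of that pairing (not part of the diff; macOS only):

#include <mach/mach.h>
#include <stdio.h>

int main(void) {
    vm_address_t addr = 0;
    const vm_size_t size = 1 << 20; // 1 MiB

    // VM_FLAGS_ANYWHERE lets the kernel pick the address; the result is
    // page-aligned, which more than satisfies gguf's 32-byte requirement.
    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "vm_allocate failed: %d\n", kr);
        return 1;
    }
    printf("allocated %lu bytes at %p\n", (unsigned long) size, (void *) addr);

    // Unlike free(), deallocation needs the region size back.
    vm_deallocate(mach_task_self(), addr, size);
    return 0;
}

This size requirement is why `ggml_aligned_free()` in the diff below takes a `size` parameter that the old `GGML_ALIGNED_FREE(ptr)` macro did not need.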

File tree

3 files changed: +63 -27 lines changed

ggml/src/ggml-backend.cpp

Lines changed: 3 additions & 5 deletions
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
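
The two hunks above are two sides of the same contract: buffers are now obtained from `ggml_aligned_malloc()` (already aligned, so the old `size += TENSOR_ALIGNMENT` over-allocation trick is gone) and must be released through `ggml_aligned_free()` together with their size, since the macOS path ends in `vm_deallocate`. A minimal usage sketch, assuming a build where the internal header `ggml-impl.h` is on the include path:

#include <stddef.h>
#include "ggml-impl.h"

int main(void) {
    const size_t size = 4096;
    void * data = ggml_aligned_malloc(size);
    if (data == NULL) {
        return 1;
    }
    // ... use the buffer ...
    ggml_aligned_free(data, size); // the size is required on the vm_deallocate path
    return 0;
}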

ggml/src/ggml-impl.h

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
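
`TENSOR_ALIGNMENT` now lives here as a macro so ggml.c and the backends share one definition. To illustrate what the default (non-Apple, non-HBM) branch of the new allocator does with it, a standalone sketch (not from the diff) using `posix_memalign`, whose result, unlike the `vm_allocate` one, can be handed straight back to `free()`:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TENSOR_ALIGNMENT 32 // same value as the header above

int main(void) {
    void * p = NULL;
    int result = posix_memalign(&p, TENSOR_ALIGNMENT, 1024);
    if (result != 0) {
        fprintf(stderr, "posix_memalign: %s\n", strerror(result));
        return 1;
    }
    // the address is a multiple of 32, as mmap-ed gguf tensors require
    printf("aligned to 32 bytes: %s\n", ((uintptr_t) p % TENSOR_ALIGNMENT == 0) ? "yes" : "no");
    free(p);
    return 0;
}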

ggml/src/ggml.c

Lines changed: 52 additions & 22 deletions
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr) free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size         =*/ mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
         /*.no_alloc_save    =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
                     __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
+                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
                 struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan  = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
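
One detail worth calling out in the hunk at `@@ -386,22 +384,40 @@`: `vm_allocate` reports failure through a Mach `kern_return_t`, so the new code folds it into the `errno`-style `result` that the shared error-logging path below it already understands. The same mapping restated as a standalone helper (a sketch for reference; the helper name is hypothetical, the committed code inlines the switch):

#include <errno.h>
#include <mach/mach.h>

// Map a Mach allocation status onto the errno-style codes that the shared
// failure path in ggml_aligned_malloc expects (0 means success).
static int kern_alloc_status_to_errno(kern_return_t status) {
    switch (status) {
        case KERN_SUCCESS:         return 0;
        case KERN_INVALID_ADDRESS: return EINVAL;
        case KERN_NO_SPACE:        return ENOMEM;
        default:                   return EFAULT;
    }
}

int main(void) {
    // trivial smoke test: success maps to 0
    return kern_alloc_status_to_errno(KERN_SUCCESS);
}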
