fix: use vm_allocate to allocate CPU backend buffer on macOS #9875

Merged: 15 commits, Oct 16, 2024
8 changes: 3 additions & 5 deletions ggml/src/ggml-backend.cpp
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
+
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
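
For context on the padding hack removed above: as the old comment notes, malloc may return an address that is not suitably aligned (it only guarantees alignment for the basic C types), so over-allocating by TENSOR_ALIGNMENT still did not promise a 32-byte boundary. A minimal standalone sketch (not part of the PR) contrasting that with posix_memalign, which does guarantee the requested boundary:

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    // malloc: alignment is only suitable for basic types (commonly 16 bytes),
    // so a 32-byte tensor alignment is not guaranteed
    void * p = malloc(1);

    // posix_memalign: the returned pointer is a multiple of the requested
    // alignment (32 must be a power of two and a multiple of sizeof(void *))
    void * q = NULL;
    if (posix_memalign(&q, 32, 1) != 0) {
        return 1;
    }

    printf("malloc         %% 32 = %u\n", (unsigned)((uintptr_t) p % 32));
    printf("posix_memalign %% 32 = %u\n", (unsigned)((uintptr_t) q % 32)); // always 0

    free(p);
    free(q);
    return 0;
}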
8 changes: 8 additions & 0 deletions ggml/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
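
A quick sketch of the contract behind this new internal pair (hypothetical caller, not code from the PR): the size passed to ggml_aligned_free must match the size originally allocated, because the vm_deallocate path on macOS frees by address and length rather than by pointer alone.

#include "ggml-impl.h" // internal header declaring the pair

static void example(void) {
    const size_t size = 1024 * 1024;

    void * buf = ggml_aligned_malloc(size);
    if (buf == NULL) {
        return; // allocation failure is already logged by ggml
    }

    // ... use buf ...

    // must pass the same size that was allocated
    ggml_aligned_free(buf, size);
}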
74 changes: 52 additions & 22 deletions ggml/src/ggml.c
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif

@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define GGML_ALIGNED_FREE(ptr) free(ptr)
-#endif
+    free(ptr);
+#endif
+}
 
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
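
For readers unfamiliar with the Mach calls used above, a small standalone macOS program (a sketch, not code from the PR) demonstrating the two properties the new path relies on: vm_allocate returns zero-filled, page-aligned memory (which trivially satisfies the 32-byte TENSOR_ALIGNMENT), and vm_deallocate unmaps by address and length, which is why ggml_aligned_free now takes a size argument.

// build on macOS: cc vm_alloc_demo.c
#include <stdio.h>
#include <mach/mach.h>

int main(void) {
    vm_address_t addr = 0;
    const vm_size_t size = 1 << 20; // 1 MiB

    // vm_allocate hands back fresh pages from the kernel
    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "vm_allocate failed: %d\n", kr);
        return 1;
    }

    printf("page size   = %lu\n", (unsigned long) vm_page_size);
    printf("addr %% page = %lu\n", (unsigned long) (addr % vm_page_size)); // always 0
    printf("first byte  = %d\n", ((unsigned char *) addr)[0]);             // always 0

    // unlike free(), the kernel needs the region length to unmap it
    kr = vm_deallocate(mach_task_self(), addr, size);
    return kr == KERN_SUCCESS ? 0 : 1;
}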
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.no_alloc_save      =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
                 __func__, i, ggml_used_mem(ctx));
 
         if (ctx->mem_buffer_owned) {
-            GGML_ALIGNED_FREE(ctx->mem_buffer);
+            ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
         }
 
         found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
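
Note why n_threads is hoisted above the #ifndef in the previous hunk: the sized frees at the bottom of the function run on every build, including OpenMP ones, so workers_size must be recomputable outside the pthreads-only block. Where recomputing the size at the free site is inconvenient, a caller can carry the size next to the pointer instead; a hypothetical wrapper (not part of ggml) sketching that pattern:

#include <stddef.h>
#include "ggml-impl.h"

// pairs the pointer with its allocation size so the sized
// ggml_aligned_free() can always be called correctly
typedef struct {
    void * ptr;
    size_t size;
} sized_buf;

static sized_buf sized_buf_alloc(size_t size) {
    sized_buf b = { ggml_aligned_malloc(size), size };
    return b;
}

static void sized_buf_free(sized_buf * b) {
    if (b->ptr != NULL) {
        ggml_aligned_free(b->ptr, b->size);
        b->ptr  = NULL;
        b->size = 0;
    }
}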
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
                struct ggml_cplan  * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan  = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {