Draft: Metal max buffer workaround #1825

Closed
wants to merge 2 commits into from
1 change: 0 additions & 1 deletion examples/metal/metal.cpp
@@ -70,7 +70,6 @@ int main(int argc, char ** argv) {
// debug output
{
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);

float * ptr = (float *) ggml_get_data(logits);

9 changes: 0 additions & 9 deletions ggml-metal.h
@@ -13,9 +13,6 @@
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

@@ -48,12 +45,6 @@ bool ggml_metal_add_buffer(
void * data,
size_t size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

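With ggml_metal_set_tensor()/ggml_metal_get_tensor() gone, the intended usage of this header is to register the host allocations once with ggml_metal_add_buffer() and then read results directly from host memory after ggml_metal_graph_compute(), as the examples/metal/metal.cpp change above already does. A minimal sketch of that flow (illustrative only: ggml_metal_init() and the graph/allocation setup come from the rest of ggml-metal.h / ggml.h, and the helper name is made up):

#include "ggml.h"
#include "ggml-metal.h"

// Hypothetical helper: evaluate an already-built graph on Metal and return a
// pointer to the last node's data. `data`/`size` describe the host allocation
// that backs the graph's tensors.
static float * eval_on_metal(struct ggml_cgraph * gf, void * data, size_t size) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init();

    // Map the host allocation into device memory once. The Metal buffers are
    // created as shared, no-copy views of the same host pointers, so no
    // explicit host<->device copies are needed afterwards.
    if (!ggml_metal_add_buffer(ctx_metal, "data", data, size)) {
        return NULL;
    }

    ggml_metal_graph_compute(ctx_metal, gf);

    // Results are readable straight from host memory - this replaces the
    // ggml_metal_get_tensor() call removed from examples/metal/metal.cpp.
    struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
    return (float *) ggml_get_data(out);
}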
71 changes: 31 additions & 40 deletions ggml-metal.m
@@ -224,56 +224,47 @@ bool ggml_metal_add_buffer(
}

size_t page_size = getpagesize();
size_t aligned_size = size;
if ((aligned_size % page_size) != 0) {
aligned_size += (page_size - (aligned_size % page_size));
}

ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;
size_t sys_max_buffer_size = 2ul * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
Contributor Author:

Note: this is an artificial 2GB limit that I had in place to test this out, given I don't actually bump into maxBufferLength on my M1 Max.

Should be switched back to ctx->device.maxBufferLength once issues are worked out.


if (ctx->device.maxBufferLength < aligned_size) {
fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
return false;
// Make sure total size is page-aligned
size_t total_aligned_size = size;
if ((total_aligned_size % page_size) != 0) {
total_aligned_size += (page_size - (total_aligned_size % page_size));
}
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
return false;
} else {
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
// Make sure chunk size is page-aligned
size_t max_chunk_size = sys_max_buffer_size / 2;
if ((max_chunk_size % page_size) != 0) {
max_chunk_size += (page_size - (max_chunk_size % page_size));
}

++ctx->n_buffers;
size_t chunk_offset = 0;
while (total_aligned_size > 0) {
size_t chunk_logical_size = (max_chunk_size > total_aligned_size) ? total_aligned_size : max_chunk_size;
size_t sys_buffer_size = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;
void *chunk = (uint8_t *) data + chunk_offset;
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = chunk;
ctx->buffers[ctx->n_buffers].size = chunk_logical_size;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:chunk length:sys_buffer_size options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name,
sys_buffer_size / 1024.0 / 1024.0);
return false;
} else {
fprintf(stderr, "%s: allocated '%-16s' buffer, sys_size = %8.2f MB, size = %8.2f MB, max: %zu\n", __func__, name,
sys_buffer_size / 1024.0 / 1024.0, chunk_logical_size / 1024.0 / 1024.0, sys_max_buffer_size);
}
++ctx->n_buffers;
total_aligned_size -= chunk_logical_size;
chunk_offset += chunk_logical_size;
}
}

return true;
}

void ggml_metal_set_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);

size_t offs;
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);

memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
}

void ggml_metal_get_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);

size_t offs;
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);

memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
}

void ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
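To restate the splitting scheme above in isolation: the registered allocation is carved into logical chunks of at most half the maximum buffer length (page-aligned), while each chunk's Metal buffer is mapped with up to the full maximum length, presumably so that a tensor crossing a logical chunk boundary can still be addressed through a single overlapping Metal buffer. A standalone dry run of that size arithmetic, mirroring the patch under that reading (no Metal calls; the function names and printout are illustrative only):

#include <stdio.h>
#include <stddef.h>
#include <unistd.h>   // getpagesize()

// Dry run of the chunking in ggml_metal_add_buffer() above: print the
// (offset, logical size, mapped length) of each chunk for a given total size.
static void print_chunks(size_t size, size_t sys_max_buffer_size) {
    const size_t page_size = (size_t) getpagesize();

    // Page-align the total size, as the patch does.
    size_t total_aligned_size = size;
    if ((total_aligned_size % page_size) != 0) {
        total_aligned_size += page_size - (total_aligned_size % page_size);
    }

    // Logical chunk size: half the maximum buffer length, page-aligned.
    size_t max_chunk_size = sys_max_buffer_size / 2;
    if ((max_chunk_size % page_size) != 0) {
        max_chunk_size += page_size - (max_chunk_size % page_size);
    }

    size_t chunk_offset = 0;
    while (total_aligned_size > 0) {
        // Advance by the logical size, but map up to the full maximum length,
        // so consecutive mapped regions overlap.
        const size_t chunk_logical_size = (max_chunk_size      > total_aligned_size) ? total_aligned_size : max_chunk_size;
        const size_t mapped_size        = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;

        printf("offset = %10zu, logical = %10zu, mapped = %10zu\n",
               chunk_offset, chunk_logical_size, mapped_size);

        total_aligned_size -= chunk_logical_size;
        chunk_offset       += chunk_logical_size;
    }
}

int main(void) {
    // Example: a 5 GiB allocation against the artificial 2 GiB limit from the draft.
    print_chunks(5ul * 1024ul * 1024ul * 1024ul, 2ul * 1024ul * 1024ul * 1024ul);
    return 0;
}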
10 changes: 0 additions & 10 deletions llama.cpp
@@ -1575,7 +1575,6 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else {
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1584,15 +1583,6 @@
//
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
// But for now, we have focused only on Matrix x Vector Metal multiplication.
//
// TODO: avoid these syncs via shared memory (ref #1696)
//
if (lctx.ctx_metal) {
// We need to sync the GPU KV cache with the CPU KV cache
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}

ggml_graph_compute(ctx0, &gf);
}
#else