Skip to content

Commit ce2c7d7

Browse files
authored
metal : handle buffers larger than device's maxBufferLength (#1826)
* metal : handle buffers larger than device's maxBufferLength
* metal : print more verbose device info + handle errors
* metal : fix prints for overlapping views
* metal : minimize view overlap to try to utilize device memory better
1 parent 57cd694 commit ce2c7d7

File tree

6 files changed

+125
-35
lines changed

6 files changed

+125
-35
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ $(info )
252252
ggml.o: ggml.c ggml.h ggml-cuda.h
253253
$(CC) $(CFLAGS) -c $< -o $@
254254

255-
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
255+
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
256256
$(CXX) $(CXXFLAGS) -c $< -o $@
257257

258258
common.o: examples/common.cpp examples/common.h

ggml-metal.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
4141
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
4242
// - the mapping is used during computation to determine the arguments of the compute kernels
4343
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
44+
// - max_size specifies the maximum size of a tensor and is used to create shared views such
45+
// that it is guaranteed that the tensor will fit in at least one of the views
4446
//
4547
bool ggml_metal_add_buffer(
4648
struct ggml_metal_context * ctx,
4749
const char * name,
4850
void * data,
49-
size_t size);
51+
size_t size,
52+
size_t max_size);
5053

5154
// set data from host memory into the device
5255
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

ggml-metal.m

Lines changed: 79 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,14 @@ @implementation GGMLMetalClass
183183
#undef GGML_METAL_ADD_KERNEL
184184
}
185185

186+
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
187+
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
188+
if (ctx->device.maxTransferRate != 0) {
189+
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
190+
} else {
191+
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
192+
}
193+
186194
return ctx;
187195
}
188196

@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
199207
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
200208
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
201209

210+
const int64_t tsize = ggml_nbytes(t);
211+
212+
// find the view that contains the tensor fully
202213
for (int i = 0; i < ctx->n_buffers; ++i) {
203214
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
204215

205-
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
216+
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
206217
*offs = (size_t) ioffs;
207218

208219
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
220231
struct ggml_metal_context * ctx,
221232
const char * name,
222233
void * data,
223-
size_t size) {
234+
size_t size,
235+
size_t max_size) {
224236
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
225237
fprintf(stderr, "%s: too many buffers\n", __func__);
226238
return false;
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
237249
}
238250
}
239251

240-
size_t page_size = getpagesize();
241-
size_t aligned_size = size;
242-
if ((aligned_size % page_size) != 0) {
243-
aligned_size += (page_size - (aligned_size % page_size));
252+
const size_t size_page = getpagesize();
253+
254+
size_t size_aligned = size;
255+
if ((size_aligned % size_page) != 0) {
256+
size_aligned += (size_page - (size_aligned % size_page));
244257
}
245258

246-
ctx->buffers[ctx->n_buffers].name = name;
247-
ctx->buffers[ctx->n_buffers].data = data;
248-
ctx->buffers[ctx->n_buffers].size = size;
259+
// the buffer fits into the max buffer size allowed by the device
260+
if (size_aligned <= ctx->device.maxBufferLength) {
261+
ctx->buffers[ctx->n_buffers].name = name;
262+
ctx->buffers[ctx->n_buffers].data = data;
263+
ctx->buffers[ctx->n_buffers].size = size;
249264

250-
if (ctx->device.maxBufferLength < aligned_size) {
251-
fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
252-
return false;
253-
}
254-
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
265+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
266+
267+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
268+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
269+
return false;
270+
}
271+
272+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
273+
274+
++ctx->n_buffers;
275+
} else {
276+
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
277+
// one of the views
278+
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
279+
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
280+
const size_t size_view = ctx->device.maxBufferLength;
281+
282+
for (size_t i = 0; i < size; i += size_step) {
283+
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
255284

256-
if (ctx->buffers[ctx->n_buffers].metal == nil) {
257-
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
258-
return false;
285+
ctx->buffers[ctx->n_buffers].name = name;
286+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
287+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
288+
289+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
290+
291+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
292+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
293+
return false;
294+
}
295+
296+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
297+
if (i + size_step < size) {
298+
fprintf(stderr, "\n");
299+
}
300+
301+
++ctx->n_buffers;
302+
}
259303
}
260304

261-
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
305+
fprintf(stderr, ", (%8.2f / %8.2f)",
306+
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
307+
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
262308

263-
++ctx->n_buffers;
309+
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
310+
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
311+
} else {
312+
fprintf(stderr, "\n");
313+
}
264314
}
265315

266316
return true;
@@ -909,4 +959,14 @@ void ggml_metal_graph_compute(
909959
dispatch_barrier_sync(queue, ^{});
910960

911961
[command_buffers[n_cb - 1] waitUntilCompleted];
962+
963+
// check status of command buffers
964+
// needed to detect if the device ran out-of-memory for example (#1881)
965+
for (int i = 0; i < n_cb; i++) {
966+
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
967+
if (status != MTLCommandBufferStatusCompleted) {
968+
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
969+
GGML_ASSERT(false);
970+
}
971+
}
912972
}

ggml.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4154,14 +4154,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
41544154
ctx->no_alloc = no_alloc;
41554155
}
41564156

4157-
void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4157+
void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
41584158
return ctx->mem_buffer;
41594159
}
41604160

4161-
size_t ggml_get_mem_size(struct ggml_context * ctx) {
4161+
size_t ggml_get_mem_size(const struct ggml_context * ctx) {
41624162
return ctx->mem_size;
41634163
}
41644164

4165+
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4166+
size_t max_size = 0;
4167+
4168+
struct ggml_object * obj = ctx->objects_begin;
4169+
4170+
while (obj != NULL) {
4171+
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4172+
4173+
const size_t size = ggml_nbytes(tensor);
4174+
4175+
if (max_size < size) {
4176+
max_size = size;
4177+
}
4178+
4179+
obj = obj->next;
4180+
}
4181+
4182+
return max_size;
4183+
}
4184+
41654185
// IMPORTANT:
41664186
// when creating "opt" tensors, always save and load the scratch buffer
41674187
// this is an error prone process, but it is necessary to support inplace

ggml.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -500,8 +500,9 @@ extern "C" {
500500
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
501501
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
502502

503-
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
504-
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
503+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
504+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
505+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
505506

506507
GGML_API struct ggml_tensor * ggml_new_tensor(
507508
struct ggml_context * ctx,

llama.cpp

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2696,29 +2696,35 @@ struct llama_context * llama_init_from_file(
26962696
// this allocates all Metal resources and memory buffers
26972697
ctx->ctx_metal = ggml_metal_init();
26982698

2699-
void *data_ptr = NULL;
2699+
void * data_ptr = NULL;
27002700
size_t data_size = 0;
2701+
27012702
if (params.use_mmap) {
2702-
data_ptr = ctx->model.mapping->addr;
2703-
data_size= ctx->model.mapping->size;
2703+
data_ptr = ctx->model.mapping->addr;
2704+
data_size = ctx->model.mapping->size;
27042705
} else {
2705-
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2706-
data_size= ggml_get_mem_size(ctx->model.ctx);
2706+
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2707+
data_size = ggml_get_mem_size (ctx->model.ctx);
27072708
}
27082709

2710+
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
2711+
2712+
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
2713+
27092714
#define LLAMA_METAL_CHECK_BUF(result) \
27102715
if (!(result)) { \
27112716
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
27122717
llama_free(ctx); \
27132718
return NULL; \
27142719
}
27152720

2716-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
2717-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
2721+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
2722+
2723+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
2724+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
27182725

2719-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
2720-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
2721-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
2726+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
2727+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
27222728
#undef LLAMA_METAL_CHECK_BUF
27232729
}
27242730
#endif

0 commit comments

Comments (0)