@@ -183,6 +183,14 @@ @implementation GGMLMetalClass
183
183
#undef GGML_METAL_ADD_KERNEL
184
184
}
185
185
186
+ fprintf (stderr, " %s : recommendedMaxWorkingSetSize = %8.2f MB\n " , __func__, ctx->device .recommendedMaxWorkingSetSize / 1024.0 / 1024.0 );
187
+ fprintf (stderr, " %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
188
+ if (ctx->device .maxTransferRate != 0 ) {
189
+ fprintf (stderr, " %s : maxTransferRate = %8.2f MB/s\n " , __func__, ctx->device .maxTransferRate / 1024.0 / 1024.0 );
190
+ } else {
191
+ fprintf (stderr, " %s : maxTransferRate = built-in GPU\n " , __func__);
192
+ }
193
+
186
194
return ctx;
187
195
}
188
196
@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
199
207
static id <MTLBuffer > ggml_metal_get_buffer (struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
200
208
// fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
201
209
210
+ const int64_t tsize = ggml_nbytes (t);
211
+
212
+ // find the view that contains the tensor fully
202
213
for (int i = 0 ; i < ctx->n_buffers ; ++i) {
203
214
const int64_t ioffs = (int64_t ) t->data - (int64_t ) ctx->buffers [i].data ;
204
215
205
- if (ioffs >= 0 && ioffs < (int64_t ) ctx->buffers [i].size ) {
216
+ if (ioffs >= 0 && ioffs + tsize <= (int64_t ) ctx->buffers [i].size ) {
206
217
*offs = (size_t ) ioffs;
207
218
208
219
// fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
220
231
struct ggml_metal_context * ctx,
221
232
const char * name,
222
233
void * data,
223
- size_t size) {
234
+ size_t size,
235
+ size_t max_size) {
224
236
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
225
237
fprintf (stderr, " %s : too many buffers\n " , __func__);
226
238
return false ;
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
237
249
}
238
250
}
239
251
240
- size_t page_size = getpagesize ();
241
- size_t aligned_size = size;
242
- if ((aligned_size % page_size) != 0 ) {
243
- aligned_size += (page_size - (aligned_size % page_size));
252
+ const size_t size_page = getpagesize ();
253
+
254
+ size_t size_aligned = size;
255
+ if ((size_aligned % size_page) != 0 ) {
256
+ size_aligned += (size_page - (size_aligned % size_page));
244
257
}
245
258
246
- ctx->buffers [ctx->n_buffers].name = name;
247
- ctx->buffers [ctx->n_buffers].data = data;
248
- ctx->buffers [ctx->n_buffers].size = size;
259
+ // the buffer fits into the max buffer size allowed by the device
260
+ if (size_aligned <= ctx->device .maxBufferLength ) {
261
+ ctx->buffers [ctx->n_buffers].name = name;
262
+ ctx->buffers [ctx->n_buffers].data = data;
263
+ ctx->buffers [ctx->n_buffers].size = size;
249
264
250
- if (ctx->device .maxBufferLength < aligned_size) {
251
- fprintf (stderr, " %s : buffer '%s ' size %zu is larger than buffer maximum of %zu \n " , __func__, name, aligned_size, ctx->device .maxBufferLength );
252
- return false ;
253
- }
254
- ctx->buffers [ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy: data length: aligned_size options: MTLResourceStorageModeShared deallocator: nil ];
265
+ ctx->buffers [ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy: data length: size_aligned options: MTLResourceStorageModeShared deallocator: nil ];
266
+
267
+ if (ctx->buffers [ctx->n_buffers].metal == nil ) {
268
+ fprintf (stderr, " %s : failed to allocate '%-16s ' buffer, size = %8.2f MB\n " , __func__, name, size_aligned / 1024.0 / 1024.0 );
269
+ return false ;
270
+ }
271
+
272
+ fprintf (stderr, " %s : allocated '%-16s ' buffer, size = %8.2f MB" , __func__, name, size_aligned / 1024.0 / 1024.0 );
273
+
274
+ ++ctx->n_buffers ;
275
+ } else {
276
+ // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
277
+ // one of the views
278
+ const size_t size_ovlp = ((max_size + size_page - 1 ) / size_page + 1 ) * size_page; // round-up 2 pages just in case
279
+ const size_t size_step = ctx->device .maxBufferLength - size_ovlp;
280
+ const size_t size_view = ctx->device .maxBufferLength ;
281
+
282
+ for (size_t i = 0 ; i < size; i += size_step) {
283
+ const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
255
284
256
- if (ctx->buffers [ctx->n_buffers].metal == nil ) {
257
- fprintf (stderr, " %s : failed to allocate '%-16s ' buffer, size = %8.2f MB\n " , __func__, name, aligned_size / 1024.0 / 1024.0 );
258
- return false ;
285
+ ctx->buffers [ctx->n_buffers].name = name;
286
+ ctx->buffers [ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
287
+ ctx->buffers [ctx->n_buffers].size = size_step_aligned;
288
+
289
+ ctx->buffers [ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy: (void *) ((uint8_t *) data + i) length: size_step_aligned options: MTLResourceStorageModeShared deallocator: nil ];
290
+
291
+ if (ctx->buffers [ctx->n_buffers].metal == nil ) {
292
+ fprintf (stderr, " %s : failed to allocate '%-16s ' buffer, size = %8.2f MB\n " , __func__, name, size_step_aligned / 1024.0 / 1024.0 );
293
+ return false ;
294
+ }
295
+
296
+ fprintf (stderr, " %s : allocated '%-16s ' buffer, size = %8.2f MB, offs = %12ld " , __func__, name, size_step_aligned / 1024.0 / 1024.0 , i);
297
+ if (i + size_step < size) {
298
+ fprintf (stderr, " \n " );
299
+ }
300
+
301
+ ++ctx->n_buffers ;
302
+ }
259
303
}
260
304
261
- fprintf (stderr, " %s : allocated '%-16s ' buffer, size = %8.2f MB\n " , __func__, name, aligned_size / 1024.0 / 1024.0 );
305
+ fprintf (stderr, " , (%8.2f / %8.2f )" ,
306
+ ctx->device .currentAllocatedSize / 1024.0 / 1024.0 ,
307
+ ctx->device .recommendedMaxWorkingSetSize / 1024.0 / 1024.0 );
262
308
263
- ++ctx->n_buffers ;
309
+ if (ctx->device .currentAllocatedSize > ctx->device .recommendedMaxWorkingSetSize ) {
310
+ fprintf (stderr, " , warning: current allocated size is greater than the recommended max working set size\n " );
311
+ } else {
312
+ fprintf (stderr, " \n " );
313
+ }
264
314
}
265
315
266
316
return true ;
@@ -909,4 +959,14 @@ void ggml_metal_graph_compute(
909
959
dispatch_barrier_sync (queue, ^{});
910
960
911
961
[command_buffers[n_cb - 1 ] waitUntilCompleted ];
962
+
963
+ // check status of command buffers
964
+ // needed to detect if the device ran out-of-memory for example (#1881)
965
+ for (int i = 0 ; i < n_cb; i++) {
966
+ MTLCommandBufferStatus status = (MTLCommandBufferStatus ) [command_buffers[i] status ];
967
+ if (status != MTLCommandBufferStatusCompleted ) {
968
+ fprintf (stderr, " %s : command buffer %d failed with status %lu \n " , __func__, i, status);
969
+ GGML_ASSERT (false );
970
+ }
971
+ }
912
972
}
0 commit comments