@@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) {
558
558
559
559
[desc release ];
560
560
561
+ // GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]);
562
+
561
563
ggml_metal_heap_reset (heap);
562
564
563
565
return true ;
564
566
}
565
567
566
- static id <MTLBuffer > ggml_metal_heap_alloc (struct ggml_metal_heap * heap, size_t size) {
567
- const size_t alignment = 1024 *1024 ;
568
+ static id <MTLBuffer > ggml_metal_heap_alloc (struct ggml_metal_heap * heap, size_t size, bool no_alloc) {
569
+ // note: this is probably more than needed, but just in case
570
+ const size_t alignment = 1024 ;
568
571
569
572
const size_t size_aligned = GGML_PAD (size, alignment);
570
573
574
+ // GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail);
575
+
571
576
heap->need += size_aligned;
572
577
578
+ if (no_alloc) {
579
+ return nil ;
580
+ }
581
+
573
582
if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment: alignment]) {
574
583
heap->fail = 1 ;
575
584
}
@@ -883,7 +892,7 @@ @implementation GGMLMetalClass
883
892
for (int i = 0 ; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
884
893
ctx->cmd_bufs [i].obj = nil ;
885
894
886
- // create 1MB heaps per command buffer
895
+ // create initial small heaps per command buffer
887
896
// these can be resized during compute when necessary
888
897
ctx->cmd_bufs [i].heap = ggml_metal_heap_init (device, 32 );
889
898
}
@@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node(
1624
1633
GGML_ABORT (" unsupported op" );
1625
1634
}
1626
1635
1636
+ const bool no_alloc = no_compute;
1637
+
1638
+ // heap buffers for temporary data
1627
1639
id <MTLBuffer > h_src0 = nil ;
1640
+
1628
1641
switch (dst->op ) {
1629
1642
case GGML_OP_SOFT_MAX:
1630
1643
{
1631
- h_src0 = ggml_metal_heap_alloc (heap, ggml_nbytes (src0));
1632
- if (!h_src0) {
1633
- // GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n",
1634
- // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]);
1644
+ h_src0 = ggml_metal_heap_alloc (heap, ggml_nbytes (src0), no_alloc );
1645
+ if (!no_alloc && ! h_src0) {
1646
+ GGML_LOG_ERROR (" %s : failed to allocate buffer, idx = %4d , size = %8zu , need = %8zu , max available = %9zu , heap size = %9zu , heap used = %zu , fail = %d \n " ,
1647
+ __func__, idx, ggml_nbytes (src0), heap->need , [heap->obj maxAvailableSizeWithAlignment: 0 ], [heap->obj size ], [heap->obj usedSize ], heap-> fail );
1635
1648
return false ;
1636
- } else {
1637
- // GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0));
1638
1649
}
1639
1650
} break ;
1640
1651
default :
@@ -4707,16 +4718,13 @@ static enum ggml_status ggml_metal_graph_compute(
4707
4718
// number of threads in addition to the main thread
4708
4719
const int n_cb = ctx->n_cb ;
4709
4720
4710
- int n_try = 2 ;
4711
-
4712
4721
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
4713
4722
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
4714
4723
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
4715
4724
// each thread creates its own command buffer and enqueues the ops in parallel
4716
4725
//
4717
4726
// tests on M1 Pro and M2 Ultra using LLaMA models show that optimal values for n_cb are 1 or 2
4718
4727
4719
- while (n_try-- > 0 ) {
4720
4728
@autoreleasepool {
4721
4729
ctx->gf = gf;
4722
4730
@@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute(
4834
4842
}
4835
4843
}
4836
4844
4837
- bool retry = false ;
4838
-
4839
- // check heap statuses
4840
- for (int i = 0 ; i <= n_cb; ++i) {
4841
- struct ggml_metal_heap * heap = ctx->cmd_bufs [i].heap ;
4842
-
4843
- const size_t need = heap->need ;
4844
-
4845
- // printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]);
4846
-
4847
- if (heap->fail == 0 ) {
4848
- ggml_metal_heap_reset (ctx->cmd_bufs [i].heap );
4849
- [heap->obj setPurgeableState: MTLPurgeableStateEmpty ];
4850
-
4851
- continue ;
4852
- }
4853
-
4854
- if (heap->fail == 2 ) {
4855
- GGML_LOG_ERROR (" %s : command buffer %d , MTLHeap ran out of buffers, max = %d \n " , __func__, i, heap->n );
4856
- return GGML_STATUS_ALLOC_FAILED;
4857
- }
4858
-
4859
- if (heap->fail == 3 ) {
4860
- GGML_LOG_ERROR (" %s : command buffer %d , MTLHeap failed to allocate buffer, max = %d \n " , __func__, i, heap->n );
4861
- return GGML_STATUS_ALLOC_FAILED;
4862
- }
4863
-
4864
- // GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need);
4865
-
4866
- if (!ggml_metal_heap_resize (heap, need)) {
4867
- GGML_LOG_ERROR (" %s : failed to increase heap size to %zu \n " , __func__, need);
4868
- return GGML_STATUS_ALLOC_FAILED;
4869
- }
4870
-
4871
- retry = true ;
4872
- }
4873
-
4874
- if (!retry) {
4875
- break ;
4876
- }
4877
-
4878
- // printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n");
4879
-
4880
- if (n_try == 0 ) {
4881
- GGML_LOG_ERROR (" %s : failed to allocate heap memory\n " , __func__);
4882
- return GGML_STATUS_ALLOC_FAILED;
4883
- }
4884
- }
4885
-
4886
4845
return GGML_STATUS_SUCCESS;
4887
4846
}
4888
4847
@@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
5257
5216
5258
5217
const bool should_capture = ctx->capture_next_compute ;
5259
5218
5260
- bool no_compute = false ;
5219
+ ggml_metal_heap_reset (heap) ;
5261
5220
5262
5221
for (int idx = node_start; idx < node_end; ++idx) {
5263
- if (should_capture) {
5264
- [encoder pushDebugGroup: [NSString stringWithCString: ggml_op_desc (ggml_graph_node (ctx->gf, idx)) encoding: NSUTF8StringEncoding]];
5265
- }
5222
+ ggml_metal_encode_node (backend, idx, encoder, heap, true );
5223
+ }
5224
+
5225
+ bool can_compute = true ;
5266
5226
5267
- const bool res = ggml_metal_encode_node (backend, idx, encoder, heap, no_compute);
5227
+ if (heap->need > [heap->obj size ]) {
5228
+ const size_t need = heap->need ;
5268
5229
5269
- if (should_capture) {
5270
- [encoder popDebugGroup ];
5230
+ if (!ggml_metal_heap_resize (heap, need)) {
5231
+ GGML_LOG_ERROR (" %s : failed to resize MTLHeap, need = %zu \n " , __func__, need);
5232
+ can_compute = false ;
5271
5233
}
5234
+ }
5235
+
5236
+ if (can_compute) {
5237
+ for (int idx = node_start; idx < node_end; ++idx) {
5238
+ if (should_capture) {
5239
+ [encoder pushDebugGroup: [NSString stringWithCString: ggml_op_desc (ggml_graph_node (ctx->gf, idx)) encoding: NSUTF8StringEncoding]];
5240
+ }
5241
+
5242
+ const bool res = ggml_metal_encode_node (backend, idx, encoder, heap, false );
5272
5243
5273
- if (!res) {
5274
- no_compute = true ;
5244
+ if (should_capture) {
5245
+ [encoder popDebugGroup ];
5246
+ }
5247
+
5248
+ if (!res) {
5249
+ break ;
5250
+ }
5275
5251
}
5276
5252
}
5277
5253
0 commit comments