Single allocation of encode_async block with non-ARC capture in ggml-metal.m #9747

Merged · 4 commits · Oct 7, 2024
92 changes: 46 additions & 46 deletions ggml/src/ggml-metal.m
@@ -239,8 +239,6 @@
struct ggml_cgraph * gf;

// the callback given to the thread pool
// TODO: ideally, this should be created once, utilizing the command buffer state above
// for some reason, doing it like this leads to a crash
void (^encode_async)(size_t ith);

// n_cb command buffers + 1 used by the main thread
@@ -683,6 +681,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
[ctx->kernels[i].pipeline release];
}

Block_release(ctx->encode_async);

[ctx->queue release];
[ctx->device release];

@@ -3000,46 +3000,6 @@ static enum ggml_status ggml_metal_graph_compute(
}
}

// TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes.
ctx->encode_async = ^(size_t iter) {
const int cb_idx = iter;
const int n_cb_l = ctx->n_cb;

const int n_nodes_0 = ctx->n_nodes_0;
const int n_nodes_1 = ctx->n_nodes_1;

const int n_nodes_per_cb = ctx->n_nodes_per_cb;

id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];

int node_start = 0;
int node_end = n_nodes_0;

if (cb_idx < n_cb_l) {
node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
}

for (int idx = node_start; idx < node_end; ++idx) {
if (should_capture) {
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]];
}

ggml_metal_encode_node(ctx, idx, encoder);

if (should_capture) {
[encoder popDebugGroup];
}
}

[encoder endEncoding];

if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer commit];
}
};

// the main thread commits the first few commands immediately
// command_buffer[n_cb]
{
@@ -3468,10 +3428,50 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
}
}

// TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why?
//ctx->encode_async = ^(size_t iter) {
// ...
//};
if (ctx->encode_async) {
Block_release(ctx->encode_async);
}

ctx->encode_async = Block_copy(^(size_t iter) {
@ggerganov (Member) commented on Oct 7, 2024:

In theory, we might want to call ggml_backend_metal_set_n_cb() multiple times in the future (although unlikely). So to make this more correct in that case, we should check if ctx->encode_async == nil here and, if not, call Block_release() first.

The author (Collaborator) replied:

Ah, that had crossed my mind indeed, but I didn't want to add more logic there than needed for now - worried about scope creep. I agree this makes total sense. Will add 👍

const int cb_idx = iter;
const int n_cb_l = ctx->n_cb;

const int n_nodes_0 = ctx->n_nodes_0;
const int n_nodes_1 = ctx->n_nodes_1;

const int n_nodes_per_cb = ctx->n_nodes_per_cb;

id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];

int node_start = 0;
int node_end = n_nodes_0;

if (cb_idx < n_cb_l) {
node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
}

const bool should_capture = ctx->capture_next_compute;

for (int idx = node_start; idx < node_end; ++idx) {
if (should_capture) {
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
}

ggml_metal_encode_node(ctx, idx, encoder);

if (should_capture) {
[encoder popDebugGroup];
}
}

[encoder endEncoding];

if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer commit];
}
});
}
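
For reference, the Block lifecycle pattern discussed in the review thread above can be sketched in isolation as follows. This is an illustrative sketch, not code from the PR: demo_ctx, demo_set_n_cb, and demo_free are hypothetical names, and it assumes compilation with clang's Blocks extension and without ARC, matching how ggml-metal.m manages the block by hand.

// Standalone sketch (not from the PR) of the non-ARC Block lifecycle used above:
// copy the block once when (re)configuring, release any previous copy first so the
// setter can be called repeatedly, and release again on teardown.
#include <stdio.h>
#include <stddef.h>
#include <Block.h>

struct demo_ctx {                        // hypothetical stand-in for the backend context
    int n_cb;
    void (^encode_async)(size_t ith);    // heap copy owned by the context
};

static void demo_set_n_cb(struct demo_ctx * ctx, int n_cb) {
    ctx->n_cb = n_cb;

    // as suggested in the review: release any previous copy before installing a new one
    if (ctx->encode_async) {
        Block_release(ctx->encode_async);
    }

    // Block_copy moves the stack block to the heap; ctx is captured as a plain pointer
    ctx->encode_async = Block_copy(^(size_t ith) {
        printf("encode command buffer %zu of %d\n", ith, ctx->n_cb);
    });
}

static void demo_free(struct demo_ctx * ctx) {
    Block_release(ctx->encode_async);    // balances the single Block_copy in demo_set_n_cb
    ctx->encode_async = NULL;
}

The nil check before Block_release is what makes repeated calls to the setter safe, and the release in demo_free balances the single Block_copy, mirroring the Block_release added to ggml_metal_free in this PR.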

static struct ggml_backend_i ggml_backend_metal_i = {