@@ -17181,7 +17181,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
17181
17181
}
17182
17182
}
17183
17183
17184
- static void llama_graph_compute(
17184
+ static enum ggml_status llama_graph_compute(
17185
17185
llama_context & lctx,
17186
17186
ggml_cgraph * gf,
17187
17187
int n_threads,
@@ -17196,12 +17196,14 @@ static void llama_graph_compute(
17196
17196
set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
17197
17197
}
17198
17198
17199
- auto err = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
17200
- if (err != GGML_STATUS_SUCCESS) {
17201
- LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err );
17199
+ auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
17200
+ if (status != GGML_STATUS_SUCCESS) {
17201
+ LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status );
17202
17202
}
17203
17203
17204
17204
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
17205
+
17206
+ return status;
17205
17207
}
17206
17208
17207
17209
// decode a batch of tokens by evaluating the transformer
@@ -17387,7 +17389,18 @@ static int llama_decode_internal(
17387
17389
17388
17390
llama_set_inputs(lctx, ubatch);
17389
17391
17390
- llama_graph_compute(lctx, gf, n_threads, threadpool);
17392
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
17393
+ switch (compute_status) {
17394
+ case GGML_STATUS_SUCCESS:
17395
+ break;
17396
+ case GGML_STATUS_ABORTED:
17397
+ return 2;
17398
+ case GGML_STATUS_ALLOC_FAILED:
17399
+ return -2;
17400
+ case GGML_STATUS_FAILED:
17401
+ default:
17402
+ return -3;
17403
+ }
17391
17404
17392
17405
// update the kv ring buffer
17393
17406
{
@@ -17624,7 +17637,18 @@ static int llama_encode_internal(
17624
17637
17625
17638
llama_set_inputs(lctx, ubatch);
17626
17639
17627
- llama_graph_compute(lctx, gf, n_threads, threadpool);
17640
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
17641
+ switch (compute_status) {
17642
+ case GGML_STATUS_SUCCESS:
17643
+ break;
17644
+ case GGML_STATUS_ABORTED:
17645
+ return 2;
17646
+ case GGML_STATUS_ALLOC_FAILED:
17647
+ return -2;
17648
+ case GGML_STATUS_FAILED:
17649
+ default:
17650
+ return -3;
17651
+ }
17628
17652
17629
17653
// extract embeddings
17630
17654
if (embd) {
0 commit comments