@@ -17113,7 +17113,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
17113
17113
}
17114
17114
}
17115
17115
17116
- static void llama_graph_compute(
17116
+ static enum ggml_status llama_graph_compute(
17117
17117
llama_context & lctx,
17118
17118
ggml_cgraph * gf,
17119
17119
int n_threads,
@@ -17128,12 +17128,14 @@ static void llama_graph_compute(
17128
17128
set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
17129
17129
}
17130
17130
17131
- auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
17132
- if (err != GGML_STATUS_SUCCESS) {
17133
- LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err );
17131
+ auto status = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
17132
+ if (status != GGML_STATUS_SUCCESS) {
17133
+ LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status );
17134
17134
}
17135
17135
17136
17136
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
17137
+
17138
+ return status;
17137
17139
}
17138
17140
17139
17141
// decode a batch of tokens by evaluating the transformer
@@ -17315,7 +17317,18 @@ static int llama_decode_internal(
17315
17317
17316
17318
llama_set_inputs(lctx, ubatch);
17317
17319
17318
- llama_graph_compute(lctx, gf, n_threads, threadpool);
17320
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
17321
+ switch (compute_status) {
17322
+ case GGML_STATUS_SUCCESS:
17323
+ break;
17324
+ case GGML_STATUS_ABORTED:
17325
+ return 2;
17326
+ case GGML_STATUS_ALLOC_FAILED:
17327
+ return -2;
17328
+ case GGML_STATUS_FAILED:
17329
+ default:
17330
+ return -3;
17331
+ }
17319
17332
17320
17333
// update the kv ring buffer
17321
17334
{
@@ -17549,7 +17562,18 @@ static int llama_encode_internal(
17549
17562
17550
17563
llama_set_inputs(lctx, ubatch);
17551
17564
17552
- llama_graph_compute(lctx, gf, n_threads, threadpool);
17565
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
17566
+ switch (compute_status) {
17567
+ case GGML_STATUS_SUCCESS:
17568
+ break;
17569
+ case GGML_STATUS_ABORTED:
17570
+ return 2;
17571
+ case GGML_STATUS_ALLOC_FAILED:
17572
+ return -2;
17573
+ case GGML_STATUS_FAILED:
17574
+ default:
17575
+ return -3;
17576
+ }
17553
17577
17554
17578
// extract embeddings
17555
17579
if (embd) {
0 commit comments