@@ -16612,7 +16612,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }
 
-static void llama_graph_compute(
+static enum ggml_status llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads,
@@ -16634,9 +16634,11 @@ static void llama_graph_compute(
     }
 #endif
 
-    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    auto status = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
 }
 
 // decode a batch of tokens by evaluating the transformer
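
For context, llama_graph_compute now forwards the status reported by ggml_backend_sched_graph_compute_async instead of discarding it. A minimal reference sketch of the ggml_status enum it returns, as declared in ggml.h (numeric values reproduced here for illustration; check the header in your tree):

    // ggml_status, as declared in ggml.h (values shown for reference only)
    enum ggml_status {
        GGML_STATUS_ALLOC_FAILED = -2,  // compute buffer allocation failed
        GGML_STATUS_FAILED       = -1,  // generic graph-compute failure
        GGML_STATUS_SUCCESS      =  0,  // graph computed successfully
        GGML_STATUS_ABORTED      =  1,  // computation stopped by the abort callback
    };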
@@ -16818,7 +16820,18 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        switch (compute_status) {
+            case GGML_STATUS_SUCCESS:
+                break;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
+        }
 
         // update the kv ring buffer
         {
@@ -17038,7 +17051,18 @@ static int llama_encode_internal(
 
     llama_set_inputs(lctx, ubatch);
 
-    llama_graph_compute(lctx, gf, n_threads, threadpool);
+    const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+    switch (compute_status) {
+        case GGML_STATUS_SUCCESS:
+            break;
+        case GGML_STATUS_ABORTED:
+            return 2;
+        case GGML_STATUS_ALLOC_FAILED:
+            return -2;
+        case GGML_STATUS_FAILED:
+        default:
+            return -3;
+    }
 
     // extract embeddings
     if (embd) {
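
With these changes, llama_decode_internal and llama_encode_internal surface compute failures to the caller instead of continuing silently: 2 for an abort triggered via the abort callback, -2 for a failed compute-buffer allocation, and -3 for any other graph-compute failure. A hypothetical caller-side sketch, assuming the public llama_decode wrapper keeps forwarding the internal return value unchanged (run_decode and the log messages are illustrative, not part of this change):

    #include "llama.h"
    #include <stdio.h>

    // Hypothetical helper: map the new llama_decode return codes to log messages.
    static int run_decode(struct llama_context * ctx, struct llama_batch batch) {
        const int ret = llama_decode(ctx, batch);
        switch (ret) {
            case 0:
                break;                                                       // graph computed successfully
            case 2:
                fprintf(stderr, "decode aborted by the abort callback\n");   // GGML_STATUS_ABORTED
                break;
            case -2:
                fprintf(stderr, "compute buffer allocation failed\n");       // GGML_STATUS_ALLOC_FAILED
                break;
            case -3:
                fprintf(stderr, "graph computation failed\n");               // GGML_STATUS_FAILED
                break;
            default:
                fprintf(stderr, "llama_decode returned %d\n", ret);          // pre-existing codes (e.g. batch errors)
                break;
        }
        return ret;
    }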