Commit 58106a6

llama: propagating the results of graph_compute to the user interface
1 parent a89f75e commit 58106a6

File tree

1 file changed: +29 -5 lines


src/llama.cpp

Lines changed: 29 additions & 5 deletions
@@ -17113,7 +17113,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }
 
-static void llama_graph_compute(
+static enum ggml_status llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads,
@@ -17128,12 +17128,14 @@ static void llama_graph_compute(
         set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
     }
 
-    auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
-    if (err != GGML_STATUS_SUCCESS) {
+    auto status = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    if (status != GGML_STATUS_SUCCESS) {
         LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
     }
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -17315,7 +17317,18 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        switch (compute_status) {
+            case GGML_STATUS_SUCCESS:
+                break;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
+        }
 
         // update the kv ring buffer
         {
@@ -17549,7 +17562,18 @@ static int llama_encode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        switch (compute_status) {
+            case GGML_STATUS_SUCCESS:
+                break;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
+        }
 
         // extract embeddings
         if (embd) {
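
With this change the ggml_status produced by ggml_backend_sched_graph_compute_async is no longer swallowed inside llama_graph_compute: llama_decode_internal and llama_encode_internal translate it into the integer codes 2 (aborted), -2 (allocation failed) and -3 (backend failure) that reach the public llama_decode / llama_encode return values. Below is a minimal caller-side sketch, assuming llama_decode forwards these codes unchanged; the abort-callback wiring (my_abort_callback, should_stop) is an illustration of how GGML_STATUS_ABORTED can arise, not part of this commit.

// Sketch: reacting to the propagated graph-compute status on the user side.
// Assumes llama_decode() returns the codes introduced above in
// llama_decode_internal(); the abort callback and should_stop flag are
// application-side examples, not part of this commit.
#include "llama.h"

#include <atomic>
#include <cstdio>

static std::atomic<bool> should_stop{false};

// ggml_abort_callback: returning true asks ggml to abort the running graph,
// which now surfaces as GGML_STATUS_ABORTED and, in turn, llama_decode() == 2.
static bool my_abort_callback(void * /* user_data */) {
    return should_stop.load();
}

static int run_decode(llama_context * ctx, llama_batch batch) {
    llama_set_abort_callback(ctx, my_abort_callback, nullptr);

    const int ret = llama_decode(ctx, batch);
    if (ret == 2) {
        // graph compute was aborted via the abort callback (GGML_STATUS_ABORTED)
        fprintf(stderr, "decode aborted\n");
        return 0;
    }
    if (ret < 0) {
        // -2: graph allocation failed (GGML_STATUS_ALLOC_FAILED)
        // -3: graph compute failed    (GGML_STATUS_FAILED)
        // other negative values: pre-existing llama_decode error codes
        fprintf(stderr, "llama_decode failed with code %d\n", ret);
        return ret;
    }
    return 0; // 0 = success; other positive codes are non-fatal warnings
}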
