Skip to content

Commit e8b8d32

Browse files
authored
server : fix incorrect num_tokens_predicted (#3480)
1 parent 8f3a642 commit e8b8d32

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

examples/server/server.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -504,9 +504,11 @@ struct llama_server_context
 504  504              });
 505  505          }
 506  506
      507 +        bool tg = true;
 507  508          while (n_past < embd.size())
 508  509          {
 509  510              int n_eval = (int)embd.size() - n_past;
      511 +            tg = n_eval == 1;
 510  512              if (n_eval > params.n_batch)
 511  513              {
 512  514                  n_eval = params.n_batch;
@@ -633,7 +635,9 @@ struct llama_server_context
 633  635
 634  636              last_n_tokens.erase(last_n_tokens.begin());
 635  637              last_n_tokens.push_back(result.tok);
 636      -            num_tokens_predicted++;
      638 +            if (tg) {
      639 +                num_tokens_predicted++;
      640 +            }
 637  641          }
 638  642
 639  643          // add it to the context
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 1124 1128  {
 1125 1129      const auto timings = llama_get_timings(llama.ctx);
 1126 1130
 1127      -    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
 1128      -
 1129 1131      return json{
 1130 1132          {"prompt_n", timings.n_p_eval},
 1131 1133          {"prompt_ms", timings.t_p_eval_ms},

0 commit comments

Comments
 (0)