
Commit 0f86ae9

more performance info
1 parent 91e470f commit 0f86ae9

File tree: 1 file changed (+20, -8)

llama.cpp

Lines changed: 20 additions & 8 deletions
@@ -4410,19 +4410,31 @@ void llama_dump_result_info_yaml(FILE * stream, const llama_context * ctx, const
     fprintf(stream, "ftype: %u\n", ctx->model.hparams.ftype);
     fprintf(stream, "ftype_str: %s\n", llama_ftype_name(ctx->model.hparams.ftype));
     fprintf(stream, "model_type: %s\n", llama_model_type_name(ctx->model.type));
-    fprintf(stream, "n_eval: %d\n", ctx->n_eval);
-    fprintf(stream, "n_vocab: %d\n", ctx->model.hparams.n_vocab);
-    fprintf(stream, "n_p_eval: %d\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d\n", ctx->n_sample);
+    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", ctx->model.hparams.n_vocab);
+    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
     dump_string_yaml_multiline(stream, "output", output_str, false);
 
     const std::vector<int> output_token_vector(output_tokens, output_tokens + n_output_tokens);
     dump_vector_int_yaml(stream, "output_tokens", output_token_vector);
 
-    fprintf(stream, "t_eval_us: %ld\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %ld\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %ld\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %ld\n", ctx->t_sample_us);
+    fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %ld # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %ld # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
 }
 
 // For internal test use
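
For reference, the added fields derive milliseconds per token (mst_* = 1.0e-3 * t_*_us / n_*) and tokens per second (ts_* = 1.0e6 * n_* / t_*_us) from the existing microsecond totals and token counts. A hypothetical excerpt of the dumped YAML (values invented, shown only to illustrate the relationship between the fields) would look roughly like:

    mst_eval: 42.13 # ms / token during generation
    n_eval: 255 # number of tokens generated (excluding the first one)
    t_eval_us: 10743150 # total microseconds spent generating tokens
    ts_eval: 23.74 # tokens / second during generation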
