@@ -4410,19 +4410,31 @@ void llama_dump_result_info_yaml(FILE * stream, const llama_context * ctx, const
     fprintf(stream, "ftype: %u\n", ctx->model.hparams.ftype);
     fprintf(stream, "ftype_str: %s\n", llama_ftype_name(ctx->model.hparams.ftype));
     fprintf(stream, "model_type: %s\n", llama_model_type_name(ctx->model.type));
-    fprintf(stream, "n_eval: %d\n", ctx->n_eval);
-    fprintf(stream, "n_vocab: %d\n", ctx->model.hparams.n_vocab);
-    fprintf(stream, "n_p_eval: %d\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d\n", ctx->n_sample);
+    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", ctx->model.hparams.n_vocab);
+    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
 
     dump_string_yaml_multiline(stream, "output", output_str, false);
 
     const std::vector<int> output_token_vector(output_tokens, output_tokens + n_output_tokens);
     dump_vector_int_yaml(stream, "output_tokens", output_token_vector);
-    fprintf(stream, "t_eval_us: %ld\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %ld\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %ld\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %ld\n", ctx->t_sample_us);
+    fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %ld # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %ld # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
 }
 
 // For internal test use
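
For reference, the derived metrics added above are plain unit conversions of the raw counters: microseconds per token scaled by 1.0e-3 gives ms / token, and tokens divided by microseconds scaled by 1.0e6 gives tokens / second. The following is a minimal standalone sketch of those conversions; the helper names are hypothetical, and the zero guards are an added assumption (the diff itself divides by the raw counters directly):

    // Sketch of the derived timing metrics written by the diff above.
    // Helper names (mst, ts) are hypothetical; the n > 0 / t_us > 0 guards
    // are an assumption, not present in the original code.
    #include <cstdint>
    #include <cstdio>

    // ms / token: counters store microseconds, so scale by 1.0e-3.
    static double mst(int64_t t_us, int32_t n) {
        return n > 0 ? 1.0e-3 * t_us / n : 0.0;
    }

    // tokens / second: invert the rate and scale by 1.0e6.
    static double ts(int64_t t_us, int32_t n) {
        return t_us > 0 ? 1.0e6 * n / t_us : 0.0;
    }

    int main() {
        // Illustrative values: 128 generated tokens in 6.4 s of eval time.
        const int64_t t_eval_us = 6400000;
        const int32_t n_eval    = 128;
        printf("mst_eval: %.2f\n", mst(t_eval_us, n_eval)); // 50.00 ms / token
        printf("ts_eval: %.2f\n",  ts(t_eval_us, n_eval));  // 20.00 tokens / second
        return 0;
    }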