Skip to content

Commit 3f87a4c

Browse files
Di Xu (SWE) and facebook-github-bot
authored and committed
Support more breakdown of latency metrics/stats for Llama (#6072)
Summary: Support a finer-grained breakdown of latency metrics/stats for Llama. This is needed when debugging the Frame-LLM project across teams. Reviewed By: cccclai. Differential Revision: D64139460
1 parent 192ca82 commit 3f87a4c

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

extension/llm/runner/stats.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,14 @@ struct Stats {
2929
long model_load_end_ms;
3030
// inference_start_ms: Immediately after the model is loaded (or we check
3131
// for model load), measure the inference time.
32+
// NOTE: It's actually the tokenizer encode + model execution time.
3233
long inference_start_ms;
34+
// End of the tokenizer encode time.
35+
long token_encode_end_ms;
36+
// Start of the model execution (forward function) time.
37+
long model_execution_start_ms;
38+
// End of the model execution (forward function) time.
39+
long model_execution_end_ms;
3340
// prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
3441
// before the inference loop starts
3542
long prompt_eval_end_ms;
@@ -139,6 +146,23 @@ inline void print_report(const Stats& stats) {
139146
((double)(stats.first_token_ms - stats.inference_start_ms) /
140147
stats.SCALING_FACTOR_UNITS_PER_SECOND));
141148

149+
if (stats.token_encode_end_ms != 0) {
150+
ET_LOG(
151+
Info,
152+
"\tTokenizer encoding time:\t%f (seconds)",
153+
((double)(stats.token_encode_end_ms - stats.inference_start_ms) /
154+
stats.SCALING_FACTOR_UNITS_PER_SECOND));
155+
}
156+
157+
if (stats.model_execution_end_ms - stats.model_execution_start_ms > 0) {
158+
ET_LOG(
159+
Info,
160+
"\tForward/inference only time:\t%f (seconds)",
161+
((double)(stats.model_execution_end_ms -
162+
stats.model_execution_start_ms) /
163+
stats.SCALING_FACTOR_UNITS_PER_SECOND));
164+
}
165+
142166
ET_LOG(
143167
Info,
144168
"\tSampling time over %" PRIu64 " tokens:\t%f (seconds)",

0 commit comments

Comments
 (0)