
Commit d6aea3d

Di Xu (SWE) authored and facebook-github-bot committed
Support more breakdown of latency metrics/stats for Llama (#6072)
Summary:
Pull Request resolved: #6072

Support more breakdown of latency metrics/stats for Llama. This is needed when debugging the Frame-LLM project across teams.

Reviewed By: cccclai

Differential Revision: D64139460

fbshipit-source-id: ec92ee2e15621705e7b8aa28d53e54e66c45a7cc
1 parent 83c95df commit d6aea3d

File tree

1 file changed: +7 -0 lines changed


extension/llm/runner/stats.h

Lines changed: 7 additions & 0 deletions
@@ -29,7 +29,14 @@ struct Stats {
   long model_load_end_ms;
   // inference_start_ms: Immediately after the model is loaded (or we check
   // for model load), measure the inference time.
+  // NOTE: It's actually the tokenizer encode + model execution time.
   long inference_start_ms;
+  // End of the tokenizer encode time.
+  long token_encode_end_ms;
+  // Start of the model execution (forward function) time.
+  long model_execution_start_ms;
+  // End of the model execution (forward function) time.
+  long model_execution_end_ms;
   // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
   // before the inference loop starts
   long prompt_eval_end_ms;
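For context, the new fields split the overall inference window (which, per the added NOTE, covers tokenizer encode plus model execution) into two measurable phases. Below is a minimal C++ sketch of how a runner might populate these timestamps and derive the per-phase breakdown the commit message refers to. The now_ms() clock helper and the report_breakdown() function are illustrative assumptions, not part of the actual ExecuTorch runner; only the Stats field names come from the patch.

#include <chrono>
#include <cstdio>

// Mirror of the relevant timestamps from extension/llm/runner/stats.h.
// Field names match the patch; the rest of this file is a hypothetical sketch.
struct Stats {
  long inference_start_ms = 0;       // encode + execution window begins
  long token_encode_end_ms = 0;      // tokenizer encode finished
  long model_execution_start_ms = 0; // forward() begins
  long model_execution_end_ms = 0;   // forward() ends
};

// Hypothetical millisecond clock, assuming a steady_clock-based source.
static long now_ms() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(
             steady_clock::now().time_since_epoch())
      .count();
}

// Hypothetical reporting helper: computes each phase as the difference of
// its end and start timestamps.
static void report_breakdown(const Stats& s) {
  long encode_ms = s.token_encode_end_ms - s.inference_start_ms;
  long execute_ms = s.model_execution_end_ms - s.model_execution_start_ms;
  std::printf("tokenizer encode: %ld ms, model execution: %ld ms\n",
              encode_ms, execute_ms);
}

int main() {
  Stats stats;
  stats.inference_start_ms = now_ms();
  // ... tokenizer encode would run here ...
  stats.token_encode_end_ms = now_ms();
  stats.model_execution_start_ms = now_ms();
  // ... model forward() would run here ...
  stats.model_execution_end_ms = now_ms();
  report_breakdown(stats);
  return 0;
}

Note that both phases are sub-intervals of the existing inference window, so the old inference_start_ms semantics are unchanged; callers that only read the original fields are unaffected by this patch.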

0 commit comments
