@@ -31,12 +31,42 @@ class Runner {
       const std::string& tokenizer_path,
       const float temperature = 0.8f);
 
+  struct TimeStampsAndStats {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution
+    // model_load_start_ms: Start of model loading.
+    long model_load_start_ms;
+    // model_load_end_ms: End of model loading.
+    long model_load_end_ms;
+    // inference_start_ms: Immediately after the model is loaded (or we check
+    // for model load), measure the inference time.
+    long inference_start_ms;
+    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
+    // before the inference loop starts.
+    long prompt_eval_end_ms;
+    // first_token_ms: Timestamp when the first generated token is emitted.
+    long first_token_ms;
+    // inference_end_ms: End of inference/generation.
+    long inference_end_ms;
+    // Keep a running total of the time spent in sampling.
+    long aggregate_sampling_time_ms;
+    // Token count from the prompt.
+    int64_t num_prompt_tokens;
+    // Token count generated (total - prompt).
+    int64_t num_generated_tokens;
+
+    void printReport();
+    const std::string toJsonString();
+  };
+
   bool is_loaded() const;
   Error load();
   Error generate(
       const std::string& prompt,
       int32_t seq_len = 128,
-      std::function<void(const std::string&)> callback = {});
+      std::function<void(const std::string&)> on_token_generated_callback = {},
+      std::function<void(const TimeStampsAndStats&)> on_stats_callback = {});
   void stop();
 
  private:
@@ -68,36 +98,7 @@ class Runner {
   std::unique_ptr<Tokenizer> tokenizer_;
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};
-
-  struct TimeStamps {
-    // Scaling factor for timestamps - in this case, we use ms.
-    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
-    // Time stamps for the different stages of the execution
-    // model_load_start_ms: Start of model loading.
-    long model_load_start_ms;
-    // model_load_end_ms: End of model loading.
-    long model_load_end_ms;
-    // inference_start_ms: Immediately after the model is loaded (or we check
-    // for model load), measure the inference time.
-    long inference_start_ms;
-    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
-    // before the inference loop starts.
-    long prompt_eval_end_ms;
-    // first_token_ms: Timestamp when the first generated token is emitted.
-    long first_token_ms;
-    // inference_end_ms: End of inference/generation.
-    long inference_end_ms;
-    // Keep a running total of the time spent in sampling.
-    long aggregate_sampling_time_ms;
-
-    void printReport(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-    const std::string toJsonString(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-  };
-  TimeStamps timers_;
+  TimeStampsAndStats timers_;
 };
 
 } // namespace torch::executor
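
A minimal caller sketch of the changed API follows. It only exercises what this diff declares (the two generate() callbacks, the public TimeStampsAndStats struct, and its printReport()/toJsonString() helpers); the header path, the Runner constructor's first argument, and the model/tokenizer paths are assumptions for illustration, not part of the diff.

// Hypothetical usage sketch, not part of this PR.
#include <iostream>
#include <string>

#include "runner.h" // assumed include path for the Runner declaration

int main() {
  // Constructor arguments are assumed: model path, tokenizer path, temperature.
  torch::executor::Runner runner(
      "/path/to/model.pte",
      "/path/to/tokenizer.bin",
      /*temperature=*/0.8f);

  const auto status = runner.generate(
      "Tell me a story.",
      /*seq_len=*/128,
      // Invoked for each generated token as it is produced.
      [](const std::string& token) { std::cout << token << std::flush; },
      // Invoked with the collected timestamps and token counts.
      [](const torch::executor::Runner::TimeStampsAndStats& stats) {
        // printReport()/toJsonString() are declared non-const above,
        // so work on a copy of the stats struct.
        torch::executor::Runner::TimeStampsAndStats report = stats;
        report.printReport();
        std::cout << report.toJsonString() << std::endl;
      });
  (void)status;
  return 0;
}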