Commit caee336

Varun Puri authored and facebook-github-bot committed
Add time to first token for llama runner (#2141)
Summary:
Pull Request resolved: #2141

Add time to first generated token & other features - Since we're measuring the first token time, the token rate is measured both at the

* Model Load Time - just a timer around ET_CHECK_OK_OR_RETURN_ERROR(load());
* Total inference time - Immediately after model load until the end of the inference loop
  * First token time - From immediately after the model load until the first generated (not prompt) token is printed.
    * Prompt eval - (comparable to llama.cpp prompt_eval_time) prompt array allocation and tokenization. Ends right before the inference loop starts
  * Remaining tokens - immediately after the first token is outputted until the end of the inference loop
  * Net eval time - (comparable to llama.cpp eval_time) Total time spent generating tokens.
* Sample time - amount of time spent sampling per token (present in llama.cpp)

bypass-github-executorch-ci-checks bypass-github-pytorch-ci-checks

Reviewed By: digantdesai, Jack-Khuu

Differential Revision: D54223564

fbshipit-source-id: 3846903b56d20e2d4159fae63de3d471e5677c51
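To make the stage boundaries above concrete, here is a small, self-contained C++ sketch (not part of the commit) showing how the reported tokens-per-second rates fall out of millisecond timestamps like the ones this change records. The variable names mirror the new TimeStamps fields; the values are purely hypothetical.

// Illustrative only -- not part of the commit. Names mirror the TimeStamps
// fields added here; the numbers are made up.
#include <cstdio>

int main() {
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; // timestamps are in ms
  long inference_start_ms = 0;   // hypothetical: inference starts at t = 0
  long prompt_eval_end_ms = 150; // hypothetical: prompt tokenized/prefilled by t = 150 ms
  long inference_end_ms = 2150;  // hypothetical: generation loop ends at t = 2150 ms
  long num_prompt_tokens = 8;
  long num_generated_tokens = 64;

  // Prompt evaluation rate (comparable to llama.cpp prompt_eval_time).
  double prompt_eval_ms = (double)(prompt_eval_end_ms - inference_start_ms);
  double prompt_tps =
      num_prompt_tokens / prompt_eval_ms * SCALING_FACTOR_UNITS_PER_SECOND;

  // Net eval rate (comparable to llama.cpp eval_time): generated tokens over
  // the remainder of the inference loop.
  double eval_ms = (double)(inference_end_ms - prompt_eval_end_ms);
  double gen_tps =
      num_generated_tokens / eval_ms * SCALING_FACTOR_UNITS_PER_SECOND;

  printf("prompt eval: %.2f tok/s, generation: %.2f tok/s\n", prompt_tps, gen_tps);
  return 0;
}

This is the same arithmetic printReport() performs below, just isolated so the relationship between the timestamp deltas and the reported rates is easy to follow.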
1 parent 9e83fde commit caee336

File tree

2 files changed: +133 -17 lines changed


examples/models/llama2/runner/runner.cpp

Lines changed: 103 additions & 17 deletions
@@ -14,6 +14,7 @@

 #include <ctime>
 #include <memory>
+#include <sstream>

 #ifdef USE_ATEN_LIB
 #include <torch/torch.h>
@@ -161,8 +162,16 @@ Error Runner::generate(
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
-  ET_CHECK_OK_OR_RETURN_ERROR(load());
+  if (!is_loaded()) {
+    timers_.model_load_start_ms = util::time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    timers_.model_load_end_ms = util::time_in_ms();
+  }
+
+  // First token time only measures the time it takes to encode the prompt and
+  // return a response token.

+  timers_.inference_start_ms = util::time_in_ms();
   shouldStop_ = false;

   // encode the (string) prompt into tokens sequence
@@ -179,6 +188,7 @@ Error Runner::generate(
       append_eos_ ? n_eos_ : 0,
       prompt_tokens,
       &num_prompt_tokens);
+
   for (int i = 0; i < num_prompt_tokens; i++) {
     ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
   }
@@ -192,8 +202,6 @@ Error Runner::generate(
       "Sequence length exceeded - please increase the seq_len value passed to generate()");

   // start the main loop
-  long start =
-      0; // used to time our code, only initialized after first iteration
   int next; // will store the next token in the sequence
   int64_t pos = num_prompt_tokens - 1; // position in the sequence
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -254,6 +262,7 @@ Error Runner::generate(
           tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
     }
   }
+
   // create a 1xN int tensor with next as value
   while (pos < seq_len) {
     // ET_LOG(Info, "Generating step %d...", pos);
@@ -289,10 +298,14 @@ Error Runner::generate(
         outputs.size() > 0,
         "Expecting output to have at least one evalue. Got %zu",
         outputs.size());
-
+    if (pos == num_prompt_tokens) {
+      timers_.first_token_ms = util::time_in_ms();
+    } else if (pos == num_prompt_tokens - 1) {
+      timers_.prompt_eval_end_ms = util::time_in_ms();
+    }
     int32_t next_tok;
     exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();
-
+    long sample_start_time_ms = util::time_in_ms();
     switch (logits_tensor.scalar_type()) {
       case ScalarType::Float: {
         next_tok = logitsToToken<float>(logits_tensor, pos, 0);
@@ -308,6 +321,8 @@ Error Runner::generate(
             "Unsupported dtype output %hhd",
             static_cast<int8_t>(logits_tensor.scalar_type()));
     }
+    timers_.aggregate_sampling_time_ms +=
+        util::time_in_ms() - sample_start_time_ms;

     // advance the state machine
     if (pos < num_prompt_tokens - 1) {
@@ -339,16 +354,13 @@ Error Runner::generate(

     // data-dependent terminating condition: we have n_eos_ number of EOS
     if (pos >= num_prompt_tokens && next == eos_id_) {
-      ET_LOG(Info, "Reached to the end of generation");
+      printf("\n");
+      ET_LOG(Info, "\nReached to the end of generation");
       break;
     }

     token = next;

-    // init the timer here because the first iteration can be slower
-    if (start == 0) {
-      start = util::time_in_ms();
-    }
     if (use_kv_cache_) {
       // outputs: [k_cache, v_cache, logits, k_cache, v_cache]
       memcpy(
@@ -361,23 +373,97 @@ Error Runner::generate(
           v_data.size());
     }
   }
+  timers_.inference_end_ms = util::time_in_ms();
   printf("\n");

   if (pos == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
   }
-  // report achieved tok/s (pos-1 because the timer starts after first
-  // iteration)
-  if (pos >= 1) {
-    long end = util::time_in_ms();
-    ET_LOG(
-        Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000);
-  }
+
+  timers_.printReport(num_prompt_tokens, pos - num_prompt_tokens);

   delete[] prompt_tokens;
   return Error::Ok;
 }

+void Runner::TimeStamps::printReport(
+    const int64_t& num_prompt_tokens,
+    const int64_t& num_generated_tokens) {
+  printf(
+      "PyTorchObserver %s\n",
+      toJsonString(num_prompt_tokens, num_generated_tokens).c_str());
+
+  ET_LOG(
+      Info,
+      "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64,
+      num_prompt_tokens,
+      num_generated_tokens);
+
+  ET_LOG(
+      Info,
+      "\tModel Load Time:\t\t%f (seconds)",
+      ((double)(model_load_end_ms - model_load_start_ms) /
+       SCALING_FACTOR_UNITS_PER_SECOND));
+  double inference_time_ms = (double)(inference_end_ms - inference_start_ms);
+  ET_LOG(
+      Info,
+      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
+      inference_time_ms / SCALING_FACTOR_UNITS_PER_SECOND,
+
+      (num_generated_tokens) / (double)(inference_end_ms - inference_start_ms) *
+          SCALING_FACTOR_UNITS_PER_SECOND);
+  double prompt_eval_time = (double)(prompt_eval_end_ms - inference_start_ms);
+  ET_LOG(
+      Info,
+      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
+      prompt_eval_time / SCALING_FACTOR_UNITS_PER_SECOND,
+      (num_prompt_tokens) / prompt_eval_time * SCALING_FACTOR_UNITS_PER_SECOND);
+
+  double eval_time = (double)(inference_end_ms - prompt_eval_end_ms);
+  ET_LOG(
+      Info,
+      "\t\tGenerated %" PRIu64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
+      num_generated_tokens,
+      eval_time / SCALING_FACTOR_UNITS_PER_SECOND,
+      num_generated_tokens / eval_time * SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // Time to first token is measured from the start of inference, excluding
+  // model load time.
+  ET_LOG(
+      Info,
+      "\tTime to first generated token:\t%f (seconds)",
+      ((double)(first_token_ms - inference_start_ms) /
+       SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tSampling time over %" PRIu64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
+      num_prompt_tokens + num_generated_tokens,
+      (double)aggregate_sampling_time_ms / SCALING_FACTOR_UNITS_PER_SECOND,
+      (num_prompt_tokens + num_generated_tokens) /
+          (double)aggregate_sampling_time_ms * SCALING_FACTOR_UNITS_PER_SECOND);
+}
+
+const std::string Runner::TimeStamps::toJsonString(
+    const int64_t& num_prompt_tokens,
+    const int64_t& num_generated_tokens) {
+  std::stringstream ss;
+  ss << "{\"prompt_tokens\":" << num_prompt_tokens << ","
+     << "\"generated_tokens\":" << num_generated_tokens << ","
+     << "\"model_load_start_ms\":" << model_load_start_ms << ","
+     << "\"model_load_end_ms\":" << model_load_end_ms << ","
+     << "\"inference_start_ms\":" << inference_start_ms << ","
+     << "\"inference_end_ms\":" << inference_end_ms << ","
+     << "\"prompt_eval_end_ms\":" << prompt_eval_end_ms << ","
+     << "\"first_token_ms\":" << first_token_ms << ","
+     << "\"aggregate_sampling_time_ms\":" << aggregate_sampling_time_ms << ","
+     << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
+     << SCALING_FACTOR_UNITS_PER_SECOND << "}";
+  return ss.str();
+}
+
 void Runner::stop() {
   shouldStop_ = true;
 }
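A note on util::time_in_ms(): the helper used throughout this diff is defined elsewhere in the ExecuTorch runner utilities and is not part of this change. As a minimal sketch only, assuming POSIX clock_gettime() is available, a millisecond timer of this shape could look like:

#include <time.h>

// Hypothetical stand-in for util::time_in_ms(); the real helper in the
// ExecuTorch sources may differ.
long time_in_ms() {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic clock avoids wall-clock jumps
  return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

Since every reported metric is a difference between two such timestamps, any clock source works as long as the same one is used consistently.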

examples/models/llama2/runner/runner.h

Lines changed: 30 additions & 0 deletions
@@ -63,6 +63,36 @@ class Runner {
   std::unique_ptr<Tokenizer> tokenizer_;
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};
+
+  struct TimeStamps {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution
+    // model_load_start_ms: Start of model loading.
+    long model_load_start_ms;
+    // model_load_end_ms: End of model loading.
+    long model_load_end_ms;
+    // inference_start_ms: Immediately after the model is loaded (or we check
+    // for model load), measure the inference time.
+    long inference_start_ms;
+    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
+    // before the inference loop starts
+    long prompt_eval_end_ms;
+    // first_token: Timestamp when the first generated token is emitted
+    long first_token_ms;
+    // inference_end_ms: End of inference/generation.
+    long inference_end_ms;
+    // Keep a running total of the time spent in sampling.
+    long aggregate_sampling_time_ms;
+
+    void printReport(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+    const std::string toJsonString(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+  };
+  TimeStamps timers_;
 };

 } // namespace torch::executor
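For reference, printReport() in runner.cpp above also emits one machine-readable line built by the TimeStamps::toJsonString() declared here, presumably so benchmarking or log-scraping tools can pick it up without parsing the tab-formatted report. With purely illustrative, made-up timestamp values it would look roughly like:

PyTorchObserver {"prompt_tokens":8,"generated_tokens":64,"model_load_start_ms":100,"model_load_end_ms":1300,"inference_start_ms":1300,"inference_end_ms":3450,"prompt_eval_end_ms":1450,"first_token_ms":1460,"aggregate_sampling_time_ms":42,"SCALING_FACTOR_UNITS_PER_SECOND":1000}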
