Commit b4750be

JacobSzwejbka authored and facebook-github-bot committed
refactor runner to fix bugs (#2752)
Summary: Lots of bugs before with the kv cache implementation of the runner. Trying this out.

Differential Revision: D55496044
1 parent 292ea5f commit b4750be

File tree

5 files changed: +160 -96 lines changed

examples/models/llama2/runner/runner.cpp

Lines changed: 137 additions & 94 deletions
@@ -10,6 +10,7 @@
 // The module takes in a string as input and emits a string as output.
 
 #include <executorch/examples/models/llama2/runner/runner.h>
+#include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
 
 #include <ctime>
@@ -121,24 +122,6 @@ T Runner::getMetadataHelper(std::string method_name, T default_val) {
   return res;
 }
 
-std::vector<exec_aten::SizesType> Runner::getKVCacheShape() {
-  // shape: (n_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads,
-  // self.head_dim)
-  std::vector<std::string> methods = {
-      "get_n_layers",
-      "get_max_batch_size",
-      "get_max_seq_len",
-      "get_n_kv_heads",
-      "get_head_dim"};
-  std::vector<int64_t> default_values = {12, 1, 128, 32, 128};
-  std::vector<exec_aten::SizesType> result;
-  for (int i = 0; i < methods.size(); ++i) {
-    // convert from int64_t to int32_t
-    result.push_back(getMetadataHelper<int64_t>(methods[i], default_values[i]));
-  }
-  return result;
-}
-
 template <typename T>
 int32_t Runner::logitsToToken(
     const exec_aten::Tensor& logits_tensor,
@@ -155,6 +138,73 @@ int32_t Runner::logitsToToken(
   return sampler_->sample(logits_last);
 }
 
+// Given an input token, set up the inputs for the model and execute a single
+// step, returning the logits tensor.
+Result<torch::executor::Tensor> Runner::run_model_step(
+    int64_t input_token,
+    ManagedTensor& managed_tokens,
+    ManagedTensor& managed_start_pos,
+    size_t max_seq_len) {
+  // ET_LOG(Info, "Input token %" PRIu64, input_token);
+  if (use_kv_cache_) {
+    std::vector<EValue> inputs;
+    auto tokens = managed_tokens.get_aliasing_tensor();
+    auto start_pos = managed_start_pos.get_aliasing_tensor();
+
+    // When using kv-cache our input is always 1 token, so just update to the
+    // latest.
+    tokens.mutable_data_ptr<int64_t>()[0] = input_token;
+
+    // inputs:[tokens, start_pos]
+    inputs.push_back(tokens);
+    inputs.push_back(start_pos);
+
+    Result<std::vector<EValue>> outputs_res = module_->forward(inputs);
+    ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
+    ET_CHECK_MSG(
+        outputs_res.get().size() == 1,
+        "More than one output returned from executing LLM.");
+    ET_CHECK_MSG(
+        outputs_res.get()[0].isTensor(),
+        "Non Tensor Output returned from executing LLM");
+
+    // Bump start_pos by 1
+    start_pos.mutable_data_ptr<int64_t>()[0]++;
+
+    // Return the logits tensor
+    return outputs_res.get()[0].toTensor();
+  } else { // no kv cache
+    std::vector<EValue> inputs;
+    auto tokens = managed_tokens.get_aliasing_tensor();
+    auto start_pos = managed_start_pos.get_aliasing_tensor();
+
+    // When not using kv-cache our input is the entire history of tokens we have
+    // seen, so resize input to be 1 larger and append the new token to the end.
+    // TODO does this work in ATen mode?
+    tokens.mutable_data_ptr<int64_t>()[tokens.size(1) - 1] = input_token;
+
+    // inputs:[tokens]
+    inputs.push_back(tokens);
+
+    Result<std::vector<EValue>> outputs_res = module_->forward(inputs);
+    ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
+    ET_CHECK_MSG(
+        outputs_res.get().size() == 1,
+        "More than one output returned from executing LLM.");
+    ET_CHECK_MSG(
+        outputs_res.get()[0].isTensor(),
+        "Non Tensor Output returned from executing LLM");
+
+    if (tokens.size(1) < max_seq_len) {
+      // Resize the tokens tensor to be 1 larger for next step.
+      managed_tokens.resize({1, static_cast<int>(tokens.size(1) + 1)});
+    }
+
+    // Return the logits tensor
+    return outputs_res.get()[0].toTensor();
+  }
+}
+
 Error Runner::generate(
     const std::string& prompt,
     int32_t seq_len,
@@ -189,9 +239,6 @@ Error Runner::generate(
       prompt_tokens,
       &num_prompt_tokens);
 
-  for (int i = 0; i < num_prompt_tokens; i++) {
-    ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
-  }
   ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token");
   ET_CHECK_MSG(
       num_prompt_tokens < max_seq_len_,
@@ -202,89 +249,94 @@ Error Runner::generate(
       "Sequence length exceeded - please increase the seq_len value passed to generate()");
 
   // start the main loop
-  int next; // will store the next token in the sequence
-  int64_t pos = num_prompt_tokens - 1; // position in the sequence
-  int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
-  int logits_index = 0; // index of the logits tensor in the output
-  std::vector<exec_aten::SizesType> input_shape = {1, 1};
-  std::vector<exec_aten::SizesType> pos_shape = {1};
+  int64_t pos = 0; // position in the sequence
+
   std::vector<int64_t> token_data; // allocate space for the tokens
-  std::vector<int64_t> pos_data; // allocate space for the tokens
+  std::vector<exec_aten::SizesType> token_shape = {1, seq_len};
+
+  std::vector<int64_t> start_pos_data; // allocate space for the start position
+  std::vector<exec_aten::SizesType> start_pos_shape = {1};
 
   if (use_kv_cache_) {
-    // set pos to 0, refill token by token
-    pos = 0;
+    // hard code these to size 1 as kv cache is locked to static size right now.
    token_data.resize(1);
-    pos_data.resize(seq_len);
+    token_shape[1] = 1;
+    start_pos_data.resize(1);
+    start_pos_data[0] = 0;
  } else {
-    // reserve data for tokens, notice the size is still 0.
+    // reserve data for tokens, notice the size is still 0 but the capacity is
+    // seq_len.
     token_data.resize(seq_len);
   }
 
   // initialize tensor wrappers
-  ManagedTensor pos_managed(
-      pos_data.data(), pos_data.size(), pos_shape, ScalarType::Long);
-
-  // copy prompt tokens into data
-  for (int i = 0; i <= pos; ++i) {
-    // @lint-ignore CLANGTIDY facebook-hte-LocalUncheckedArrayBounds
-    token_data[i] = prompt_tokens[i];
-    if (i > 0) {
-      printf(
-          "%s",
-          ET_UNWRAP(
-              tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
+  ManagedTensor tokens_managed(
+      token_data.data(),
+      128, // TODO clean up unused 128 here as ManagedTensor ignores this arg in
+           // ctor
+      token_shape,
+      ScalarType::Long);
+  // Create with the max shape to appropriately set the capacity of this
+  // tensor, then resize back to 1 for first input.
+  tokens_managed.resize({1, 1});
+
+  ManagedTensor start_pos_managed(
+      start_pos_data.data(), 128, start_pos_shape, ScalarType::Long);
+
+  int64_t prev_token = -1;
+  int64_t cur_token = prompt_tokens[0];
+
+  // If we aren't using the kv cache then we can batch prefill the prompt
+  if (!use_kv_cache_) {
+    tokens_managed.resize({1, num_prompt_tokens});
+    for (int i = 0; i < num_prompt_tokens - 1; i++) {
+      tokens_managed.get_aliasing_tensor().mutable_data_ptr<int64_t>()[i] =
+          prompt_tokens[i];
+    }
+    // prefill tokens up to the last prompt token and then enter the loop with
+    // the last prompt token as the current token.
+    cur_token = prompt_tokens[num_prompt_tokens - 1];
+    pos = num_prompt_tokens - 1;
+
+    // Print the prompt for consistent output between single token prefill and
+    // batch prefill.
+    int prev = prompt_tokens[0];
+    int cur = -1;
+    for (int i = 1; i < num_prompt_tokens; i++) {
+      cur = prompt_tokens[i];
+      auto piece_res = tokenizer_->decode(prev, cur);
+      ET_CHECK_OK_OR_RETURN_ERROR(piece_res.error());
+      util::safe_printf(piece_res.get());
+      fflush(stdout);
+      prev = cur;
     }
   }
 
-  // create a 1xN int tensor with next as value
-  while (pos + 1 < seq_len) {
-    // ET_LOG(Info, "Generating step %d...", pos);
-    // set the current token in the tensor
-    std::vector<EValue> inputs;
-    if (use_kv_cache_) {
-      token_data[0] = token;
-      input_shape[1] = 1;
-      // inputs: [tokens, start_pos, k_cache, v_cache]
-      inputs.emplace_back(pos_managed.get_aliasing_tensor());
-    } else {
-      // @lint-ignore CLANGTIDY facebook-hte-LocalUncheckedArrayBounds
-      token_data[pos] = token;
-      input_shape[1] = pos + 1;
-    }
-    ManagedTensor token_managed(
-        token_data.data(), token_data.size(), input_shape, ScalarType::Long);
-    inputs.insert(inputs.begin(), token_managed.get_aliasing_tensor());
-    // For kv cache, inputs: [tokens, start_pos, k_cache, v_cache]
-    // Otherwise inputs: [tokens]
-    Result<std::vector<EValue>> outputs_res = module_->forward(inputs);
-    ET_CHECK_MSG(
-        outputs_res.ok(),
-        "Execution of method forward failed with status 0x%" PRIx32,
-        static_cast<int32_t>(outputs_res.error()));
-    // ET_LOG(Info, "Model executed successfully.");
+  // Generate our tokens
+  while (pos < seq_len - 1) {
+    // Run the model
+    Result<torch::executor::Tensor> logits_res =
+        run_model_step(cur_token, tokens_managed, start_pos_managed, seq_len);
 
-    std::vector<EValue> outputs = outputs_res.get();
-    // Check the outputs.
-    ET_CHECK_MSG(
-        outputs.size() == 1 && outputs.at(0).isTensor(),
-        "Expecting output to have exactly 1 tensor output. Got %zu outputs.",
-        outputs.size());
     if (pos == num_prompt_tokens) {
       timers_.first_token_ms = util::time_in_ms();
     } else if (pos == num_prompt_tokens - 1) {
       timers_.prompt_eval_end_ms = util::time_in_ms();
     }
-    int32_t next_tok;
-    exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();
+
+    ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error());
+    exec_aten::Tensor& logits_tensor = logits_res.get();
+
+    prev_token = cur_token;
+
     long sample_start_time_ms = util::time_in_ms();
     switch (logits_tensor.scalar_type()) {
       case ScalarType::Float: {
-        next_tok = logitsToToken<float>(logits_tensor, pos, 0);
+        cur_token = logitsToToken<float>(logits_tensor, pos, 0);
         break;
       }
      case ScalarType::Half: {
-        next_tok = logitsToToken<exec_aten::Half>(logits_tensor, pos, 0);
+        cur_token = logitsToToken<exec_aten::Half>(logits_tensor, pos, 0);
         break;
      }
      default:
@@ -299,19 +351,12 @@ Error Runner::generate(
     // advance the state machine
     if (pos < num_prompt_tokens - 1) {
       // prefill, force the next token to be the next prompt token
-      next = prompt_tokens[pos + 1];
-    } else {
-      // otherwise sample the next token from the logits
-      next = next_tok;
+      cur_token = prompt_tokens[pos + 1];
     }
-    // ET_LOG(Info, "Output saved, next = %d", next);
     pos++;
-    if (use_kv_cache_) {
-      pos_data.at(0) = pos;
-    }
 
     // print the token as string, decode it with the Tokenizer object
-    auto piece_res = tokenizer_->decode(token, next);
+    auto piece_res = tokenizer_->decode(prev_token, cur_token);
     ET_CHECK(piece_res.ok());
     const char* piece = piece_res.get();
 
@@ -328,22 +373,20 @@
     }
 
     // data-dependent terminating condition: we have n_eos_ number of EOS
-    if (pos >= num_prompt_tokens && next == eos_id_) {
+    if (pos >= num_prompt_tokens && cur_token == eos_id_) {
       printf("\n");
       ET_LOG(Info, "\nReached to the end of generation");
       break;
     }
-
-    token = next;
   }
   timers_.inference_end_ms = util::time_in_ms();
   printf("\n");
 
-  if (pos + 1 == seq_len) {
+  if (pos == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
   }
 
-  timers_.printReport(num_prompt_tokens, (pos + 1) - num_prompt_tokens);
+  timers_.printReport(num_prompt_tokens, pos - num_prompt_tokens);
 
   delete[] prompt_tokens;
   return Error::Ok;
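To make the refactored control flow easier to follow, here is a minimal, self-contained sketch (not ExecuTorch code) of the loop structure generate() now uses: prefill determines the starting position and current token, and a single while-loop performs one model step per position for both the kv-cache and non-kv-cache paths, forcing prompt tokens while prefilling and stopping on EOS afterwards. run_dummy_model_step() is a hypothetical stand-in for Runner::run_model_step(); the real method returns logits that are then sampled.

// Simplified, self-contained sketch of the refactored generation loop.
// run_dummy_model_step() is a hypothetical stand-in for run_model_step():
// it pretends the model always predicts (last token + 1) and bumps start_pos,
// mirroring the kv-cache branch above.
#include <cstdint>
#include <cstdio>
#include <vector>

int64_t run_dummy_model_step(int64_t input_token, int64_t& start_pos) {
  start_pos++; // "Bump start_pos by 1", as in the kv-cache branch
  return input_token + 1;
}

int main() {
  const std::vector<int64_t> prompt_tokens = {5, 6, 7};
  const int64_t num_prompt_tokens = static_cast<int64_t>(prompt_tokens.size());
  const int64_t seq_len = 10;
  const int64_t eos_id = 100;

  int64_t pos = 0; // position in the sequence
  int64_t start_pos = 0;
  int64_t cur_token = prompt_tokens[0];

  while (pos < seq_len - 1) {
    // One model invocation per step (run_model_step in the real runner).
    int64_t next_token = run_dummy_model_step(cur_token, start_pos);

    if (pos < num_prompt_tokens - 1) {
      // Still prefilling: force the next prompt token instead of the sample.
      next_token = prompt_tokens[pos + 1];
    }
    pos++;

    printf("pos=%lld token=%lld\n", (long long)pos, (long long)next_token);

    if (pos >= num_prompt_tokens && next_token == eos_id) {
      break; // data-dependent stop on EOS once the prompt is consumed
    }
    cur_token = next_token;
  }
  return 0;
}

The real loop additionally tracks prev_token so the tokenizer can decode each (prev, cur) pair for printing, and records the first-token and prompt-eval timestamps shown in the diff.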

examples/models/llama2/runner/runner.h

Lines changed: 6 additions & 1 deletion
@@ -20,6 +20,7 @@
 #include <executorch/examples/models/llama2/sampler/sampler.h>
 #include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
+#include <executorch/extension/runner_util/managed_tensor.h>
 
 namespace torch::executor {
 
@@ -45,7 +46,11 @@ class Runner {
   template <typename T>
   int32_t
   logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);
-  std::vector<exec_aten::SizesType> getKVCacheShape();
+  Result<torch::executor::Tensor> run_model_step(
+      int64_t input_token,
+      ManagedTensor& tokens,
+      ManagedTensor& start_pos,
+      size_t max_seq_len);
   // metadata
   int32_t vocab_size_;
   int32_t bos_id_;

examples/models/llama2/runner/targets.bzl

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,8 @@ def define_common_targets():
             "//executorch/extension/runner_util:managed_tensor" + aten_suffix,
             "//executorch/extension/module:module" + aten_suffix,
             "//executorch/kernels/quantized:generated_lib" + aten_suffix,
+            "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+            "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
         ] + (_get_operator_lib(aten)) + ([
             # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
             # Therefore enable it explicitly for now to avoid failing tests

extension/runner_util/managed_tensor.h

Lines changed: 14 additions & 1 deletion
@@ -8,6 +8,9 @@
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/platform/assert.h>
+
 #ifdef USE_ATEN_LIB
 #include <torch/torch.h>
 #else
@@ -56,10 +59,20 @@ class ManagedTensor {
         data_ptr_,
         dim_order_.data(),
         strides_.data(),
-        TensorShapeDynamism::STATIC);
+        TensorShapeDynamism::DYNAMIC_BOUND);
 #endif
   }
 
+  void resize(const std::vector<SizesType>& new_sizes) {
+    ET_CHECK_MSG(
+        new_sizes.size() == sizes_.size(),
+        "Cannot change rank of a managed tensor");
+    auto err = resize_tensor(
+        this->get_aliasing_tensor(),
+        exec_aten::ArrayRef<SizesType>(new_sizes.data(), new_sizes.size()));
+    ET_CHECK(err == Error::Ok);
+  }
+
   /**
    * Get the underlying Tensor object. This is assuming the copying is cheap.
    */
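The switch to TensorShapeDynamism::DYNAMIC_BOUND plus the new resize() helper is what lets the runner allocate the token tensor once at its maximum size and then grow or shrink its reported shape step by step. Below is a rough usage sketch, assuming the constructor signature used by the runner above (data pointer, a currently ignored numel argument, sizes, dtype) and the torch::executor namespace; it is an illustration, not part of this commit.

// Hypothetical usage sketch of ManagedTensor::resize(); names, headers, and
// namespaces are assumed from the runner code shown in this commit.
#include <executorch/extension/runner_util/managed_tensor.h>

#include <vector>

using namespace torch::executor;

void tokens_example(int32_t seq_len) {
  // Backing storage is allocated once at the maximum length.
  std::vector<int64_t> token_data(seq_len);
  std::vector<exec_aten::SizesType> token_shape = {1, seq_len};

  // Constructing with the max shape records seq_len as the capacity;
  // DYNAMIC_BOUND allows resizing anywhere up to that bound.
  ManagedTensor tokens_managed(
      token_data.data(), token_data.size(), token_shape, ScalarType::Long);

  // Shrink to a single token for the first step...
  tokens_managed.resize({1, 1});

  // ...then grow by one slot per generation step, as run_model_step() does.
  auto tokens = tokens_managed.get_aliasing_tensor();
  tokens_managed.resize({1, static_cast<int>(tokens.size(1) + 1)});
}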

extension/runner_util/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -38,5 +38,6 @@ def define_common_targets():
         ],
         deps = [
             "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+            "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
         ],
     )
