Revert changes to getKVCacheShape()

Varun Puri · facebook-github-bot · commit 758168e62c28 · 2024-02-27T16:25:43.000-08:00
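Summary: The KV cache does not support dynamic shapes. Do not change the size of the KV cache based on the sequence length: remove the seq_len parameter from getKVCacheShape() so the shape always comes from the model metadata.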

Reviewed By: kimishpatel, larryliu0820

Differential Revision: D54218307

fbshipit-source-id: 5a40093fd44db082a1de57126eab970bfc022b4b
diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
@@ -119,7 +119,7 @@ T Runner::getMetadataHelper(std::string method_name, T default_val) {
   return res;
 }
 
-std::vector<exec_aten::SizesType> Runner::getKVCacheShape(int32_t seq_len) {
+std::vector<exec_aten::SizesType> Runner::getKVCacheShape() {
   // shape: (n_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads,
   // self.head_dim)
   std::vector<std::string> methods = {
@@ -134,9 +134,6 @@ std::vector<exec_aten::SizesType> Runner::getKVCacheShape(int32_t seq_len) {
     // convert from int64_t to int32_t
     result.push_back(getMetadataHelper<int64_t>(methods[i], default_values[i]));
   }
-  // update seq_len if one is provided between 1 and max_seq_len
-  ET_CHECK_MSG(result.size() == 5, "KV cache shape must have 5 elements");
-  result[2] = (seq_len > 0 && seq_len <= result[2]) ? seq_len : result[2];
   return result;
 }
 
@@ -201,7 +198,7 @@ Error Runner::generate(
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
   int eos_counter = 0; // counter to capture EOS
   int logits_index = 0; // index of the logits tensor in the output
-  std::vector<exec_aten::SizesType> kv_cache_shape = getKVCacheShape(seq_len);
+  std::vector<exec_aten::SizesType> kv_cache_shape = getKVCacheShape();
   std::vector<exec_aten::SizesType> input_shape = {1, 1};
   std::vector<exec_aten::SizesType> pos_shape = {};
   std::vector<uint8_t> k_data;
diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
@@ -45,7 +45,7 @@ class Runner {
   template <typename T>
   int32_t
   logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);
-  std::vector<exec_aten::SizesType> getKVCacheShape(int32_t seq_len);
+  std::vector<exec_aten::SizesType> getKVCacheShape();
   // metadata
   int32_t vocab_size_;
   int32_t bos_id_;