Commit 5ed2284 (2 parents: 3375f85 + cb6b2bf)

Update on "[Executorch][llm] Enable leveraging ring kv cache via module swap"

This allows us to make some of the attention modules use a sliding-window KV cache, which will help enable models like Gemma 3.

Differential Revision: [D73891426](https://our.internmc.facebook.com/intern/diff/D73891426/)

[ghstack-poisoned]
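For context on the approach: "module swap" here means replacing an attention module's KV cache with a ring-buffer variant after model construction, so cache writes wrap around a fixed sliding window. The sketch below is a minimal illustration of that idea, not the actual ExecuTorch implementation; `RingKVCache`, the cache shapes, and the `kv_cache` attribute name are all assumptions for illustration.

```python
import torch


class RingKVCache(torch.nn.Module):
    """Illustrative sliding-window (ring) KV cache: positions wrap modulo
    the window size, which is exactly the kind of scattered write the new
    update_cache_with_indices op in this commit makes expressible."""

    def __init__(self, batch_size, window_size, n_heads, head_dim):
        super().__init__()
        self.window_size = window_size
        shape = (batch_size, window_size, n_heads, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape))
        self.register_buffer("v_cache", torch.zeros(shape))

    def update(self, start_pos, k_val, v_val):
        # k_val/v_val: [batch, seq_len, n_heads, head_dim]
        seq_len = k_val.size(1)
        # Wrapped destination slots for this chunk of tokens.
        indices = (start_pos + torch.arange(seq_len)) % self.window_size
        self.k_cache[:, indices] = k_val
        self.v_cache[:, indices] = v_val
        return self.k_cache, self.v_cache


def swap_in_ring_kv_cache(model, window_size):
    # Module swap: replace each attention layer's cache in place.
    # "kv_cache" is a hypothetical attribute name used only for this sketch.
    for module in model.modules():
        if hasattr(module, "kv_cache"):
            old = module.kv_cache
            b, _, h, d = old.k_cache.shape
            module.kv_cache = RingKVCache(b, window_size, h, d)
    return model
```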

File tree: 6 files changed (+225 −72)

examples/models/llama/source_transformation/custom_kv_cache.py

Lines changed: 58 additions & 22 deletions
@@ -110,24 +110,44 @@ def _quantize_and_update(self, input_pos, k_val, v_val, indices=None):
 
         if self.use_custom_update_cache_op:
             start_pos = input_pos[0].item()
-            _ = torch.ops.llama.update_cache(
-                quantized_k_val, self.k_cache, start_pos, indices
-            )
-            _ = torch.ops.llama.update_cache(
-                k_scales, self.k_cache_scales, start_pos, indices
-            )
-            _ = torch.ops.llama.update_cache(
-                k_zero_points, self.k_cache_zero_points, start_pos, indices
-            )
-            _ = torch.ops.llama.update_cache(
-                quantized_v_val, self.v_cache, start_pos, indices
-            )
-            _ = torch.ops.llama.update_cache(
-                v_scales, self.v_cache_scales, start_pos, indices
-            )
-            _ = torch.ops.llama.update_cache(
-                v_zero_points, self.v_cache_zero_points, start_pos, indices
-            )
+            if indices is not None:
+                _ = torch.ops.llama.update_cache_with_indices(
+                    quantized_k_val, self.k_cache, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    k_scales, self.k_cache_scales, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    k_zero_points, self.k_cache_zero_points, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    quantized_v_val, self.v_cache, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    v_scales, self.v_cache_scales, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    v_zero_points, self.v_cache_zero_points, start_pos, indices
+                )
+            else:
+                _ = torch.ops.llama.update_cache(
+                    quantized_k_val, self.k_cache, start_pos
+                )
+                _ = torch.ops.llama.update_cache(
+                    k_scales, self.k_cache_scales, start_pos
+                )
+                _ = torch.ops.llama.update_cache(
+                    k_zero_points, self.k_cache_zero_points, start_pos
+                )
+                _ = torch.ops.llama.update_cache(
+                    quantized_v_val, self.v_cache, start_pos
+                )
+                _ = torch.ops.llama.update_cache(
+                    v_scales, self.v_cache_scales, start_pos
+                )
+                _ = torch.ops.llama.update_cache(
+                    v_zero_points, self.v_cache_zero_points, start_pos
+                )
         else:
             assert indices is None, "Indices not supported for this path"
             # Following is also broken because in prefill input_pos = [0]

@@ -165,8 +185,16 @@ def _update_and_return_float_values(self, input_pos, k_val, v_val, indices=None)
         # instead of dequantized value.
         start_pos = input_pos[0].item()
         if self.use_custom_update_cache_op:
-            _ = torch.ops.llama.update_cache(k_val, k_out, start_pos, indices)
-            _ = torch.ops.llama.update_cache(v_val, v_out, start_pos, indices)
+            if indices is not None:
+                _ = torch.ops.llama.update_cache_with_indices(
+                    k_val, k_out, start_pos, indices
+                )
+                _ = torch.ops.llama.update_cache_with_indices(
+                    v_val, v_out, start_pos, indices
+                )
+            else:
+                _ = torch.ops.llama.update_cache(k_val, k_out, start_pos)
+                _ = torch.ops.llama.update_cache(v_val, v_out, start_pos)
         else:
             k_out[:, input_pos] = k_val
             v_out[:, input_pos] = v_val

@@ -310,8 +338,16 @@ def update(
         v_val = v_val.transpose(1, 2)
         start_pos = input_pos[0].item()
 
-        _ = torch.ops.llama.update_cache(k_val, self.k_cache, start_pos, indices)
-        _ = torch.ops.llama.update_cache(v_val, self.v_cache, start_pos, indices)
+        if indices is not None:
+            _ = torch.ops.llama.update_cache_with_indices(
+                k_val, self.k_cache, start_pos, indices
+            )
+            _ = torch.ops.llama.update_cache_with_indices(
+                v_val, self.v_cache, start_pos, indices
+            )
+        else:
+            _ = torch.ops.llama.update_cache(k_val, self.k_cache, start_pos)
+            _ = torch.ops.llama.update_cache(v_val, self.v_cache, start_pos)
 
         return (
             self.k_cache.transpose(1, 2),
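To make the new split concrete: `update_cache` now writes a contiguous slice starting at `start_pos`, while `update_cache_with_indices` scatters each token to an explicit per-batch slot. Below is a hedged eager-mode restatement of the two ops' semantics, a reference model for intuition rather than the custom kernels themselves:

```python
import torch


def update_cache_reference(value, cache, start_pos):
    # Contiguous write: value is [B, S, H, D]; rows land at
    # cache[:, start_pos : start_pos + S].
    seq_len = value.size(1)
    cache[:, start_pos : start_pos + seq_len] = value


def update_cache_with_indices_reference(value, cache, start_pos, indices):
    # Scattered write: indices is [B, S]; token (b, s) lands at
    # cache[b, indices[b, s]]. start_pos is kept only for schema parity.
    for b in range(value.size(0)):
        cache[b, indices[b]] = value[b]


# Tiny check: a ring-style write that wraps past the end of a 4-slot window.
cache = torch.zeros(1, 4, 2, 8)
value = torch.ones(1, 2, 2, 8)
update_cache_with_indices_reference(value, cache, 0, torch.tensor([[3, 0]]))
assert cache[0, 3].any() and cache[0, 0].any() and not cache[0, 1].any()
```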

extension/llm/custom_ops/custom_ops.py

Lines changed: 19 additions & 1 deletion
@@ -232,7 +232,25 @@ def update_cache_meta(
     value,
     cache,
     start_pos,
-    indices=None,
+):
+    _validate_update_cache_params(
+        value,
+        cache,
+        start_pos,
+    )
+
+    # update_cache doesn't really return anything, but I don't know a better
+    # workaround. Should we just return cache instead? But I am afraid that
+    # will result in extra memory allocation.
+    return torch.empty((1,), dtype=value.dtype, device="meta")
+
+
+@impl(custom_ops_lib, "update_cache_with_indices", "Meta")
+def update_cache_with_indices_meta(
+    value,
+    cache,
+    start_pos,
+    indices,
 ):
     _validate_update_cache_params(
         value,
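These meta ("fake tensor") kernels exist so that `torch.export` and shape propagation can trace the op without running the real kernel; since the op mutates `cache` in place and its declared return goes unused, a dummy `(1,)` tensor satisfies the schema. A minimal sketch of the registration pattern, using a hypothetical `demo` library name rather than the real `llama` one:

```python
import torch
from torch.library import Library, impl

# Hypothetical library for illustration; the real op lives in the "llama"
# library defined by the ExecuTorch extension.
demo_lib = Library("demo", "DEF")
demo_lib.define(
    "update_cache_with_indices(Tensor value, Tensor(a!) cache, "
    "SymInt start_pos, Tensor indices) -> Tensor"
)


@impl(demo_lib, "update_cache_with_indices", "Meta")
def update_cache_with_indices_meta(value, cache, start_pos, indices):
    # No data movement on the meta device: just report a shape/dtype so
    # export and fake-tensor tracing can proceed.
    return torch.empty((1,), dtype=value.dtype, device="meta")
```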

extension/llm/custom_ops/op_sdpa_aot.cpp

Lines changed: 56 additions & 9 deletions
@@ -122,14 +122,26 @@ Tensor& update_cache_out_no_context(
     const Tensor& value,
     Tensor& cache,
     const int64_t start_pos,
-    const std::optional<Tensor> indices,
     Tensor& output);
 
 at::Tensor update_cache_aten(
+    const at::Tensor& value,
+    at::Tensor& cache,
+    const int64_t start_pos);
+
+// New functions for update_cache_with_indices
+Tensor& update_cache_with_indices_out_no_context(
+    const Tensor& value,
+    Tensor& cache,
+    const int64_t start_pos,
+    const Tensor& indices,
+    Tensor& output);
+
+at::Tensor update_cache_with_indices_aten(
     const at::Tensor& value,
     at::Tensor& cache,
     const int64_t start_pos,
-    const std::optional<at::Tensor>& indices);
+    const at::Tensor& indices);
 
 Tensor& sdpa_with_kv_cache_out_no_context(
     const Tensor& q_projected,

@@ -326,20 +338,41 @@ Tensor& update_cache_out_no_context(
     const Tensor& value,
     Tensor& cache,
     const int64_t start_pos,
-    const std::optional<Tensor> indices,
     Tensor& output) {
   executorch::aten::RuntimeContext context{};
   return torch::executor::native::update_cache_out(
-      context, value, cache, start_pos, indices, output);
+      context, value, cache, start_pos, output);
 }
 
 at::Tensor update_cache_aten(
+    const at::Tensor& value,
+    at::Tensor& cache,
+    const int64_t start_pos) {
+  auto output = at::empty({1});
+  WRAP_TO_ATEN(update_cache_out_no_context, 3)
+  (value, cache, start_pos, output);
+  return output;
+}
+
+// Implementations for update_cache_with_indices
+Tensor& update_cache_with_indices_out_no_context(
+    const Tensor& value,
+    Tensor& cache,
+    const int64_t start_pos,
+    const Tensor& indices,
+    Tensor& output) {
+  executorch::aten::RuntimeContext context{};
+  return torch::executor::native::update_cache_with_indices_out(
+      context, value, cache, start_pos, indices, output);
+}
+
+at::Tensor update_cache_with_indices_aten(
     const at::Tensor& value,
     at::Tensor& cache,
     const int64_t start_pos,
-    const std::optional<at::Tensor>& indices) {
+    const at::Tensor& indices) {
   auto output = at::empty({1});
-  WRAP_TO_ATEN(update_cache_out_no_context, 4)
+  WRAP_TO_ATEN(update_cache_with_indices_out_no_context, 4)
   (value, cache, start_pos, indices, output);
   return output;
 }

@@ -367,10 +400,16 @@ TORCH_LIBRARY_FRAGMENT(llama, m) {
       "float? scale=None, *, Tensor(a!) out) -> Tensor(a!)");
   m.def(
       "update_cache(Tensor value, Tensor(a!) cache, "
-      "SymInt start_pos, Tensor? indices=None) -> Tensor");
+      "SymInt start_pos) -> Tensor");
   m.def(
       "update_cache.out(Tensor value, Tensor(a!) cache, "
-      "SymInt start_pos, Tensor? indices=None, *, Tensor(b!) out) -> Tensor(b!)");
+      "SymInt start_pos, *, Tensor(b!) out) -> Tensor(b!)");
+  m.def(
+      "update_cache_with_indices(Tensor value, Tensor(a!) cache, "
+      "SymInt start_pos, Tensor indices) -> Tensor");
+  m.def(
+      "update_cache_with_indices.out(Tensor value, Tensor(a!) cache, "
+      "SymInt start_pos, Tensor indices, *, Tensor(b!) out) -> Tensor(b!)");
   m.def(
       "custom_quantized_sdpa(Tensor query, Tensor key, Tensor value, SymInt start_pos, "
       "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "

@@ -400,7 +439,15 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) {
   m.impl("update_cache", torch::executor::native::update_cache_aten);
   m.impl(
       "update_cache.out",
-      WRAP_TO_ATEN(torch::executor::native::update_cache_out_no_context, 4));
+      WRAP_TO_ATEN(torch::executor::native::update_cache_out_no_context, 3));
+  m.impl(
+      "update_cache_with_indices",
+      torch::executor::native::update_cache_with_indices_aten);
+  m.impl(
+      "update_cache_with_indices.out",
+      WRAP_TO_ATEN(
+          torch::executor::native::update_cache_with_indices_out_no_context,
+          4));
   m.impl(
       "custom_quantized_sdpa",
       torch::executor::native::custom_quantized_sdpa_aten);
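With these schemas registered under `TORCH_LIBRARY_FRAGMENT(llama, ...)`, both variants become callable from Python through `torch.ops.llama`. A hedged usage sketch; it assumes the compiled custom-ops shared library has already been loaded (path elided):

```python
import torch

# torch.ops.load_library("<path to the ExecuTorch custom-ops .so>")  # assumed

B, S, H, D, MAX_S = 1, 2, 4, 16, 8
k_val = torch.randn(B, S, H, D)
k_cache = torch.zeros(B, MAX_S, H, D)

# Contiguous update at start_pos.
torch.ops.llama.update_cache(k_val, k_cache, 0)

# Scattered update: explicit per-token slots, e.g. a wrapped ring write.
indices = torch.tensor([[7, 0]], dtype=torch.long)
torch.ops.llama.update_cache_with_indices(k_val, k_cache, 0, indices)
```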

extension/llm/custom_ops/op_update_cache.cpp

Lines changed: 67 additions & 30 deletions
@@ -20,6 +20,7 @@ namespace executor {
 namespace native {
 
 namespace {
+// Helper function to validate cache parameters
 bool validate_cache_params(
     const Tensor& quantized_value,
     const Tensor& quantized_cache,

@@ -32,26 +33,8 @@
   ET_CHECK_OR_RETURN_FALSE(
       quantized_value.dim() == 4, "quantized_value must be a 4D tensor");
 
-  ET_CHECK_OR_RETURN_FALSE(
-      indices.has_value() || start_pos < quantized_cache.size(1),
-      "start_pos: %" PRId64 " must be less than cache size at dim 1: %zd",
-      start_pos,
-      quantized_cache.size(1));
-
-  ET_CHECK_OR_RETURN_FALSE(
-      indices.has_value() ||
-          (start_pos + seq_length) <= quantized_cache.size(1),
-      "start_post + seq_length must be less than max seq length supported by cache."
-      "start pos: %" PRId64 ", seq_length: %" PRId64
-      "."
-      "cache size: %zd",
-      start_pos,
-      seq_length,
-      quantized_cache.size(1));
-
-  // Validate indices tensor if provided
   if (indices.has_value()) {
-    const Tensor& indices_tensor = indices.value();
+    const auto& indices_tensor = indices.value();
     ET_CHECK_OR_RETURN_FALSE(
         indices_tensor.dim() == 2,
         "indices must be a 2D tensor [batch_size, seq_len]");

@@ -72,6 +55,22 @@
         is_contiguous_dim_order(
             indices_tensor.dim_order().data(), indices_tensor.dim()),
         "indices must be in contiguous dim order");
+  } else {
+    ET_CHECK_OR_RETURN_FALSE(
+        start_pos < quantized_cache.size(1),
+        "start_pos: %" PRId64 " must be less than cache size at dim 1: %zd",
+        start_pos,
+        quantized_cache.size(1));
+
+    ET_CHECK_OR_RETURN_FALSE(
+        (start_pos + seq_length) <= quantized_cache.size(1),
+        "start_pos + seq_length must be less than max seq length supported by cache."
+        "start pos: %" PRId64 ", seq_length: %" PRId64
+        "."
+        "cache size: %zd",
+        start_pos,
+        seq_length,
+        quantized_cache.size(1));
   }
 
   // Make sure they are in contiguous dim order

@@ -87,22 +86,16 @@
 
   return true;
 }
-} // anonymous namespace
 
-Tensor& update_cache_out(
+// Helper function for the actual update operation
+Tensor& update_cache_impl(
     RuntimeContext& ctx,
     const Tensor& value,
     Tensor& cache,
     const int64_t start_pos,
-    const optional<Tensor>& indices,
-    Tensor& output) {
+    Tensor& output,
+    const optional<Tensor>& indices = nullopt) {
   (void)ctx;
-  int64_t seq_len = value.size(1);
-  ET_KERNEL_CHECK(
-      ctx,
-      validate_cache_params(value, cache, start_pos, seq_len, indices),
-      InvalidArgument,
-      output);
 
   ET_CHECK_MSG(
       value.size(0) == cache.size(0),

@@ -151,7 +144,8 @@ Tensor& update_cache_out(
   if (indices.has_value()) {
     // Use the provided indices tensor for each batch and sequence position
     const Tensor& indices_tensor = indices.value();
-    const int64_t* indices_data = indices_tensor.const_data_ptr<int64_t>();
+    const int64_t* indices_data =
+        static_cast<const int64_t*>(indices_tensor.const_data_ptr());
     auto indices_strides = indices_tensor.strides();
     executorch::aten::StridesType indices_batch_stride = indices_strides[0];
     executorch::aten::StridesType indices_seq_stride = indices_strides[1];

@@ -211,6 +205,43 @@
   // No one uses output. Just a placeholder.
   return output;
 }
+} // anonymous namespace
+
+// Original update_cache_out function without indices parameter
+Tensor& update_cache_out(
+    RuntimeContext& ctx,
+    const Tensor& value,
+    Tensor& cache,
+    const int64_t start_pos,
+    Tensor& output) {
+  int64_t seq_len = value.size(1);
+  ET_KERNEL_CHECK(
+      ctx,
+      validate_cache_params(value, cache, start_pos, seq_len),
+      InvalidArgument,
+      output);
+
+  return update_cache_impl(ctx, value, cache, start_pos, output);
+}
+
+// New function that explicitly takes indices
+Tensor& update_cache_with_indices_out(
+    RuntimeContext& ctx,
+    const Tensor& value,
+    Tensor& cache,
+    const int64_t start_pos,
+    const Tensor& indices,
+    Tensor& output) {
+  int64_t seq_len = value.size(1);
+  ET_KERNEL_CHECK(
+      ctx,
+      validate_cache_params(value, cache, start_pos, seq_len, indices),
+      InvalidArgument,
+      output);
+
+  return update_cache_impl(ctx, value, cache, start_pos, output, indices);
+}
+
 } // namespace native
 } // namespace executor
 } // namespace torch

@@ -225,3 +256,9 @@ EXECUTORCH_LIBRARY(
     llama,
     "update_cache.out",
     torch::executor::native::update_cache_out);
+
+// Register the new update_cache_with_indices.out op
+EXECUTORCH_LIBRARY(
+    llama,
+    "update_cache_with_indices.out",
+    torch::executor::native::update_cache_with_indices_out);
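Note how the refactor repartitions validation: when `indices` is provided, the bounds checks on `start_pos` no longer apply (the indices themselves name the destination slots, so a ring cache can wrap), whereas the contiguous path still checks that `[start_pos, start_pos + seq_len)` fits in the cache. A hedged Python restatement of the checks visible in this diff:

```python
def validate_cache_params(value, cache, start_pos, seq_length, indices=None):
    # Illustrative restatement of the C++ checks shown above; the real
    # validate_cache_params does more (e.g. contiguous dim-order checks).
    assert value.dim() == 4, "value must be a 4D tensor"
    if indices is not None:
        assert indices.dim() == 2, "indices must be [batch_size, seq_len]"
    else:
        max_seq = cache.size(1)
        assert start_pos < max_seq, "start_pos must be within the cache"
        assert start_pos + seq_length <= max_seq, "chunk must fit in the cache"
    return True
```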
