Commit f482ca3

fix: Fix indexing into k_l for recurrent cache with filter
Branch: HybridCache
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 8f7034e commit f482ca3

File tree

1 file changed (+2, -2 lines)


src/llama-kv-cache.cpp

Lines changed: 2 additions & 2 deletions
@@ -1818,8 +1818,8 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
         ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
-        k_l.push_back(k);
-        v_l.push_back(v);
+        k_l[i] = k;
+        v_l[i] = v;
     }
 
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
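
The change replaces push_back with assignment at the layer index, so each cache tensor lands in the slot for its own layer even when a layer filter skips some layers. Below is a minimal, self-contained sketch of the failure mode, assuming (as the fix implies) that k_l/v_l are sized to the model's layer count up front and later looked up by layer index; the filter, the layer count, and the use of strings in place of tensors are hypothetical stand-ins, not the actual llama.cpp types.

```cpp
// Sketch only: illustrates why push_back misplaces per-layer cache entries
// when a layer filter skips some layers, while indexed assignment does not.
// `layer_filter`, `n_layer`, and the string "tensors" are hypothetical.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const int n_layer = 4;
    // Hypothetical filter: only layers 1 and 3 get a recurrent-cache tensor.
    auto layer_filter = [](int il) { return il == 1 || il == 3; };

    std::vector<std::string> k_buggy(n_layer); // filled with push_back (old code)
    std::vector<std::string> k_fixed(n_layer); // filled by index (this commit)

    for (int il = 0; il < n_layer; ++il) {
        if (!layer_filter(il)) {
            continue; // filtered-out layers create no cache tensor
        }
        const std::string k = "cache_k_l" + std::to_string(il);
        k_buggy.push_back(k); // appends past the pre-sized slots: wrong position
        k_fixed[il] = k;      // lands in the slot for layer il: correct
    }

    for (int il = 0; il < n_layer; ++il) {
        std::printf("layer %d: buggy='%s' fixed='%s'\n",
                    il, k_buggy[il].c_str(), k_fixed[il].c_str());
    }
    // The buggy vector leaves indices 1 and 3 empty (its entries ended up at
    // indices 4 and 5), so a later lookup by layer index finds nothing; the
    // fixed vector keeps the layer-index -> tensor mapping intact.
    return 0;
}
```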
