@@ -746,21 +746,25 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
             0);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const {
+ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * kv_idxs, int32_t il, uint32_t head_cur) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
     const int64_t n_tokens = k_cur->ne[2];
 
+    if (kv_idxs) {
+        return ggml_set_rows(ctx, k, ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens), kv_idxs);
+    }
+
     ggml_tensor * k_view = ggml_view_1d(ctx, k,
             n_tokens*hparams.n_embd_k_gqa(il),
             ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head_cur);
 
     return ggml_cpy(ctx, k_cur, k_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const {
+ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * kv_idxs, int32_t il, uint32_t head_cur) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
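The core of the change is visible above: when a kv_idxs tensor is supplied, the K/V store is built as ggml_set_rows, a row scatter whose destination row indices come from a graph input filled at evaluation time, instead of ggml_cpy into a ggml_view_1d whose byte offset is fixed in the graph from head_cur. A minimal sketch of what the new branch in cpy_k() constructs, with shapes taken from the diff (one cache row per token, row width k->ne[0]); this is an illustration, not part of the patch:

#include "ggml.h"

// Scatter the n_tokens rows of `cur` into the rows of the cache selected by `idxs`.
// `cache` is the per-layer K tensor [n_embd_k_gqa, kv_size], `cur` holds the current
// ubatch's K values, `idxs` holds one destination row index per token.
static ggml_tensor * scatter_rows_sketch(ggml_context * ctx, ggml_tensor * cache,
                                         ggml_tensor * cur, ggml_tensor * idxs, int64_t n_tokens) {
    // flatten to one row per token, matching the cache row width
    ggml_tensor * cur_rows = ggml_reshape_2d(ctx, cur, cache->ne[0], n_tokens);

    // row i of cur_rows is written into row idxs[i] of cache;
    // the old path instead copied into a view at a fixed offset of head_cur rows
    return ggml_set_rows(ctx, cache, cur_rows, idxs);
}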
@@ -772,10 +776,18 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     ggml_tensor * v_view = nullptr;
 
     if (!v_trans) {
+        if (kv_idxs) {
+            return ggml_set_rows(ctx, v, ggml_reshape_2d(ctx, v_cur, v->ne[0], n_tokens), kv_idxs);
+        }
+
         v_view = ggml_view_1d(ctx, v,
                 n_tokens*hparams.n_embd_v_gqa(il),
                 ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head_cur);
     } else {
+        if (kv_idxs) {
+            GGML_ABORT("TODO: implement kv_idxs for transposed V cache -- for now use flash attention");
+        }
+
         // note: the V cache is transposed when not using flash attention
         v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
                 (v->ne[1])*ggml_element_size(v),
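The else branch shows why the new path is gated: without flash attention the V cache is stored transposed, so a single token's values are strided across the tensor rather than forming one contiguous row, and a per-row scatter like ggml_set_rows cannot express that write; hence the temporary GGML_ABORT. A tiny stand-alone illustration of the two layouts (all sizes made up):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_v = 8;   // assumed V row width for one layer
    const int64_t kv_size  = 32;  // assumed number of cache slots
    const int64_t slot     = 5;   // destination slot for one token

    // non-transposed cache [n_embd_v, kv_size]: the token's values occupy one contiguous
    // row starting at element slot*n_embd_v, which ggml_set_rows can target directly
    printf("row layout:        first element = %lld, stride = 1\n", (long long)(slot*n_embd_v));

    // transposed cache [kv_size, n_embd_v]: the same values sit at elements
    // slot, slot + kv_size, slot + 2*kv_size, ... -- one element in each of n_embd_v rows
    printf("transposed layout: first element = %lld, stride = %lld\n", (long long)slot, (long long)kv_size);
    return 0;
}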
@@ -787,6 +799,17 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
+void llama_kv_cache_unified::set_input_kv_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, uint32_t head_cur) const {
+    const uint32_t n_tokens = ubatch->n_tokens;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    int32_t * data = (int32_t *) dst->data;
+
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        data[i] = head_cur + i;
+    }
+}
+
 void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const uint32_t n_tokens = ubatch->n_tokens;
 
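set_input_kv_idxs is the host-side counterpart: before the graph is evaluated it fills the kv_idxs input with the contiguous slot range starting at head_cur, so with this patch the scatter still lands on consecutive cache rows; the indirection is what later allows non-contiguous placements without rebuilding the graph. A stand-alone illustration of the index pattern it produces (head_cur and n_tokens values are made up):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t head_cur = 17;  // assumed slot returned by the cache's slot search
    const uint32_t n_tokens = 4;   // assumed ubatch size

    int32_t idxs[n_tokens];
    for (uint32_t i = 0; i < n_tokens; ++i) {
        idxs[i] = head_cur + i;    // same loop as the new set_input_kv_idxs()
    }

    for (uint32_t i = 0; i < n_tokens; ++i) {
        printf("token %u -> KV cache row %d\n", i, idxs[i]);  // rows 17, 18, 19, 20
    }
    return 0;
}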
@@ -1789,18 +1812,22 @@ ggml_tensor * llama_kv_cache_unified_state::get_v(ggml_context * ctx, int32_t il
     return kv->get_v(ctx, il, n_kv);
 }
 
-ggml_tensor * llama_kv_cache_unified_state::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
-    return kv->cpy_k(ctx, k_cur, il, head);
+ggml_tensor * llama_kv_cache_unified_state::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * kv_idxs, int32_t il) const {
+    return kv->cpy_k(ctx, k_cur, kv_idxs, il, head);
 }
 
-ggml_tensor * llama_kv_cache_unified_state::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
-    return kv->cpy_v(ctx, v_cur, il, head);
+ggml_tensor * llama_kv_cache_unified_state::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * kv_idxs, int32_t il) const {
+    return kv->cpy_v(ctx, v_cur, kv_idxs, il, head);
 }
 
 void llama_kv_cache_unified_state::set_input_k_shift(ggml_tensor * dst) const {
     kv->set_input_k_shift(dst);
 }
 
+void llama_kv_cache_unified_state::set_input_kv_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+    kv->set_input_kv_idxs(dst, ubatch, head);
+}
+
 void llama_kv_cache_unified_state::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     kv->set_input_kq_mask(dst, ubatch, causal_attn);
 }
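The llama_kv_cache_unified_state wrappers only thread the extra kv_idxs argument through and supply the cached head offset. A hypothetical caller sketch of how graph-building code could wire this up; the helper name is made up, and the I32 index type is an assumption that matches the int32_t fill in set_input_kv_idxs above:

// Assumes the internal llama KV-cache headers and a graph context `ctx` are available.
ggml_tensor * build_k_store_hyp(ggml_context * ctx,
                                const llama_kv_cache_unified_state * kv_state,
                                ggml_tensor * k_cur,  // K for the current ubatch
                                int32_t il, uint32_t n_tokens) {
    // graph input with one destination row per token; filled on the host before each
    // evaluation via kv_state->set_input_kv_idxs(kv_idxs, ubatch)
    ggml_tensor * kv_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    ggml_set_input(kv_idxs);

    // with a non-null kv_idxs, cpy_k() lowers to ggml_set_rows() instead of ggml_cpy()
    return kv_state->cpy_k(ctx, k_cur, kv_idxs, il);
}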