Skip to content

Commit bdbfb4e

Browse files
committed
kv-cache : fix non-consecutive token pos warning for recurrent models
The problem was apparently caused by how the tail cells were swapped.

* graph : simplify logic for recurrent state copies
1 parent d8430b9 commit bdbfb4e

File tree

2 files changed

+27
-38
lines changed

2 files changed

+27
-38
lines changed

src/llama-graph.cpp

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -242,23 +242,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
242242

243243
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
244244
for (uint32_t i = 0; i < n_kv; ++i) {
245-
const uint32_t cell_id = i + kv_self->head;
246-
247-
const auto & kv_cell = kv_self->cells[cell_id];
248-
249-
int32_t src = kv_cell.src0;
250-
251-
// prevent out-of-bound sources
252-
if (src < 0) {
253-
GGML_ASSERT(kv_self->rs_z >= 0); // Need a valid zero-ed cell as a source
254-
src = kv_self->rs_z;
255-
}
256-
if ((uint32_t) src >= kv_self->size) {
257-
// ignore out-of-bound sources
258-
src = cell_id;
259-
}
260-
261-
data[i] = src;
245+
data[i] = kv_self->cells[i + kv_self->head].src0;
262246
}
263247
}
264248
}
@@ -1442,7 +1426,7 @@ ggml_tensor * llm_graph_context::build_recurrent_state(
14421426
ggml_tensor * state_zero = ggml_view_1d(ctx0, states, n_state*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
14431427
ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
14441428

1445-
// copy states which won't be changed further (between n_seqs and n_kv)
1429+
// copy extra states which won't be changed further (between n_seqs and n_kv)
14461430
ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
14471431
ggml_build_forward_expand(gf,
14481432
ggml_cpy(ctx0,
@@ -1452,10 +1436,8 @@ ggml_tensor * llm_graph_context::build_recurrent_state(
14521436
if (!avoid_copies) {
14531437
// copy states
14541438
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
1455-
// this shrinks the tensors's ne[1] to n_kv
1439+
// this shrinks the tensors's ne[1] to n_seqs
14561440
states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
1457-
// the part of the states that will be used and modified
1458-
states = ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
14591441
}
14601442

14611443
return states;

src/llama-kv-cache.cpp

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2337,17 +2337,17 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
23372337
void llama_kv_cache_recurrent::set_full() {
23382338
n = size;
23392339
head = 0;
2340+
rs_z = 0;
23402341
}
23412342

23422343
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
2343-
const uint32_t n_tokens = ubatch.n_tokens;
2344-
const uint32_t n_seqs = ubatch.n_seqs;
2344+
const uint32_t n_seqs = ubatch.n_seqs;
23452345

23462346
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
23472347

23482348
// if we have enough unused cells before the current head ->
23492349
// better to start searching from the beginning of the cache, hoping to fill it
2350-
if (head > used + 2*n_tokens) {
2350+
if (head > used + 2*n_seqs) {
23512351
head = 0;
23522352
}
23532353

@@ -2443,16 +2443,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
24432443
empty_cell.src = orig_cell.src;
24442444
orig_cell.seq_id.erase(seq_id);
24452445
empty_cell.seq_id.insert(seq_id); // will be overwritten
2446+
GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
24462447
}
24472448
seq_meta.tail = next_empty_cell;
24482449
// find next empty cell
24492450
if (s + 1 < n_seqs) {
2450-
next_empty_cell += 1;
24512451
for (uint32_t i = 0; i < size; ++i) {
2452+
next_empty_cell += 1;
24522453
if (next_empty_cell >= size) { next_empty_cell -= size; }
24532454
kv_cell & cell = cells[next_empty_cell];
24542455
if (cell.is_empty()) { break; }
2455-
next_empty_cell += 1;
24562456
}
24572457
}
24582458
}
@@ -2472,12 +2472,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
24722472
std::swap(dst_cell.src, src_cell.src);
24732473
std::swap(dst_cell.seq_id, src_cell.seq_id);
24742474

2475-
// swap tails (assuming they NEVER overlap)
2476-
for (const llama_seq_id seq_id : src_cell.seq_id) {
2477-
cells[seq_id].tail = src_id;
2478-
}
2479-
for (const llama_seq_id seq_id : dst_cell.seq_id) {
2480-
cells[seq_id].tail = dst_id;
2475+
// swap tails
2476+
for (uint32_t i = 0; i < size; ++i) {
2477+
int32_t & tail = cells[i].tail;
2478+
if (tail == src_id) {
2479+
tail = dst_id;
2480+
} else if (tail == dst_id) {
2481+
tail = src_id;
2482+
}
24812483
}
24822484
}
24832485
}
@@ -2506,13 +2508,18 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
25062508
// Find first to-be-cleared cell
25072509
rs_z = -1;
25082510
for (int i = min; i <= max; ++i) {
2509-
if (rs_z < 0 && cells[i].src == -1) {
2510-
rs_z = i;
2511+
if (cells[i].src == -1) {
2512+
if (rs_z < 0) {
2513+
rs_z = i;
2514+
}
2515+
2516+
cells[i].src0 = rs_z;
2517+
} else {
2518+
// Stage the source ids for all used cells to allow correct seq_* behavior
2519+
// and still make these values available when setting the inputs
2520+
cells[i].src0 = cells[i].src;
25112521
}
2512-
// Stage the source ids for all used cells to allow correct seq_* behavior
2513-
// and still make these values available when setting the inputs
2514-
cells[i].src0 = cells[i].src;
2515-
cells[i].src = i;
2522+
cells[i].src = i; // avoid moving or clearing twice
25162523
}
25172524

25182525
// allow getting the range of used cells, from head to head + n

0 commit comments

Comments (0)