
Commit e3a9421

kv-cache : fix out-of-bounds view during reserve graph (#13547)
* kv-cache : fix reserve graph out-of-bounds access ggml-ci
* cont : add comment
* cont : fix comments [no ci]
* cont : more correct comment [no ci]
1 parent 5ab5d5f commit e3a9421

2 files changed (+12, -10 lines)

src/llama-kv-cache.cpp

Lines changed: 8 additions & 0 deletions
```diff
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    // setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
```
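To picture the out-of-bounds condition this change prevents, here is a minimal, self-contained sketch (illustration only; `kv_view` and `make_view` are invented names, not llama.cpp API): a K/V view covers the cells `[head, head + n)`, so once the reserve graph simulates a full cache with `n == size`, any non-zero `head` would push the view past the end of the tensor, and resetting `head` to 0 is the simplest way to keep it in bounds.

```cpp
// Minimal sketch (illustration only; kv_view/make_view are invented names,
// not llama.cpp API) of the out-of-bounds condition the commit fixes.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct kv_view {
    uint32_t offset; // first cell covered by the view
    uint32_t length; // number of cells covered by the view
};

// A view over the K (or V) tensor covers cells [head, head + n).
// It stays inside the tensor only if head + n <= size.
static kv_view make_view(uint32_t head, uint32_t n, uint32_t size) {
    assert(head + n <= size && "out-of-bounds K/V view");
    return { head, n };
}

int main() {
    const uint32_t size = 1024; // total number of cells in the cache

    // Normal decode: the batch sits at some head with head + n <= size.
    const kv_view ok = make_view(/*head =*/ 512, /*n =*/ 32, size);

    // Reserve graph with a simulated full cache: n == size.
    // Any head > 0 would push the view past the end of the tensor,
    // which is why set_full() now resets head to 0.
    const kv_view full = make_view(/*head =*/ 0, /*n =*/ size, size);

    std::printf("ok view: [%u, %u), full view: [%u, %u)\n",
                ok.offset, ok.offset + ok.length,
                full.offset, full.offset + full.length);
    return 0;
}
```

As the new comment notes, the tensor shapes in the compute graph depend only on `n`; `head` contributes only the view offsets, so the reserve graph can use any in-bounds value and 0 is the obvious choice.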

src/llama-kv-cache.h

Lines changed: 4 additions & 10 deletions
```diff
@@ -171,11 +171,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
```
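The new field comments point at `find_slot()`. As a rough illustration of the documented semantics (a simplified sketch under assumptions, not the actual implementation: `kv_cells`, `occupied`, and the linear scan are invented, and the real search is more involved), `find_slot()` chooses where the incoming batch will live and records that location in `head`, while `size` stays fixed as the total number of cells shared across all sequences.

```cpp
// Simplified sketch of the documented field semantics. kv_cells, occupied and
// the linear scan are invented for illustration; the real find_slot() in
// llama.cpp is more involved.
#include <cstdint>
#include <vector>

struct kv_cells {
    uint32_t head = 0;          // where the next batch will be placed (see find_slot())
    uint32_t size = 0;          // total number of cells, shared across all sequences
    uint32_t used = 0;          // cells holding at least one seq_id
    std::vector<bool> occupied; // hypothetical per-cell occupancy flags

    explicit kv_cells(uint32_t n_cells) : size(n_cells), occupied(n_cells, false) {}

    // Find a contiguous run of n_tokens free cells and record its start in head.
    bool find_slot(uint32_t n_tokens) {
        if (n_tokens == 0 || n_tokens > size) {
            return false;
        }
        for (uint32_t start = 0; start + n_tokens <= size; ++start) {
            bool free = true;
            for (uint32_t i = 0; i < n_tokens; ++i) {
                if (occupied[start + i]) { free = false; break; }
            }
            if (free) {
                head = start; // the batch will be placed at cells [head, head + n_tokens)
                for (uint32_t i = 0; i < n_tokens; ++i) {
                    occupied[start + i] = true;
                }
                used += n_tokens;
                return true;
            }
        }
        return false;
    }
};

int main() {
    kv_cells cache(8);
    cache.find_slot(3); // head == 0 : the first batch lands at the start
    cache.find_slot(2); // head == 3 : the next batch is placed right after it
    return 0;
}
```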
