@@ -171,11 +171,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
171
171
void state_write (llama_io_write_i & io, llama_seq_id seq_id = -1 ) const override ;
172
172
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1 ) override ;
173
173
174
- // Note: The value of head isn't only used to optimize searching
175
- // for a free KV slot. llama_decode_impl also uses it, so it
176
- // cannot be freely changed after a slot has been allocated.
177
- uint32_t head = 0 ;
178
- uint32_t size = 0 ;
174
+ uint32_t head = 0 ; // the location where the batch will be placed in the cache (see find_slot())
175
+ uint32_t size = 0 ; // total number of cells, shared across all sequences
179
176
uint32_t used = 0 ; // used cells (i.e. at least one seq_id)
180
177
181
178
// computed before each graph build
@@ -343,11 +340,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
343
340
void state_write (llama_io_write_i & io, llama_seq_id seq_id = -1 ) const override ;
344
341
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1 ) override ;
345
342
346
- // Note: The value of head isn't only used to optimize searching
347
- // for a free KV slot. llama_decode_impl also uses it, so it
348
- // cannot be freely changed after a slot has been allocated.
349
- uint32_t head = 0 ;
350
- uint32_t size = 0 ;
343
+ uint32_t head = 0 ; // the location where the batch will be placed in the cache (see find_slot())
344
+ uint32_t size = 0 ; // total number of cells, shared across all sequences
351
345
uint32_t used = 0 ; // used cells (i.e. at least one seq_id)
352
346
353
347
// computed before each graph build
0 commit comments