File tree Expand file tree Collapse file tree 2 files changed +7
-6
lines changed Expand file tree Collapse file tree 2 files changed +7
-6
lines changed Original file line number Diff line number Diff line change @@ -570,6 +570,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
570
570
571
571
bool found = true;
572
572
for (uint32_t i = 0; i < n_tokens; i++) {
573
+ const llama_pos pos = ubatch.pos[i];
573
574
const llama_seq_id seq_id = ubatch.seq_id[i][0];
574
575
575
576
// can we use this cell? either:
@@ -578,10 +579,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
578
579
const bool can_use =
579
580
cells.is_empty(head_cur + i) ||
580
581
(
581
- cells.pos_get(head_cur + i) <= ubatch.pos[i] && // causal mask
582
- cells.seq_has(head_cur + i, seq_id) && // sequence mask
583
- cells.seq_count(head_cur + i) == 1 &&
584
- is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
582
+ cells.seq_has(head_cur + i, seq_id) && // sequence mask
583
+ cells.seq_count(head_cur + i) == 1 &&
584
+ (
585
+ cells.pos_get(head_cur + i) >= pos || // causal mask
586
+ is_masked_swa(cells.pos_get(head_cur + i), ubatch.seq_pos_min[seq_id]) // SWA mask
587
+ )
585
588
);
586
589
587
590
if (!can_use) {
Original file line number Diff line number Diff line change @@ -33,7 +33,6 @@ struct llama_kv_cache : public llama_memory_i {
33
33
// process any pending defrag/shift/etc. operations
34
34
// optionally call once before processing a new batch
35
35
// return true if any operations were performed
36
- // will reserve a new worst-case graph if needed
37
36
virtual bool update(llama_context & lctx) = 0;
38
37
39
38
// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
@@ -240,7 +239,6 @@ class llama_kv_cache_unified : public llama_kv_cache {
240
239
241
240
// utilizes two instances of llama_kv_cache_unified
242
241
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
243
- // upon successful processing of the batch, the SWA cache removes old tokens outside the n_swa window
244
242
245
243
class llama_kv_cache_unified_iswa : public llama_kv_cache {
246
244
public:
You can’t perform that action at this time.
0 commit comments