
Commit e6a2809

add chunk attn mask
1 parent f8f1bd4 commit e6a2809

File tree

3 files changed: +16 -2 lines changed


src/llama-graph.cpp

Lines changed: 10 additions & 2 deletions
@@ -474,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
         }

         // may need to cut off old tokens for sliding window
+        // TODO @ngxson : the check for n_attn_chunk is temporary, need to optimize it
         if (data_swa) {
-            if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                f = -INFINITY;
+            if (hparams.n_attn_chunk) {
+                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                    f = -INFINITY;
+                }
+            } else {
+                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
+                    f = -INFINITY;
+                }
             }
             data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
         }
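
For context on what this hunk does: when hparams.n_attn_chunk is non-zero, the SWA mask slot is reused for chunked attention, and any cached token whose position lies before the start of the query's chunk, i.e. before (pos / n_attn_chunk) * n_attn_chunk, is masked with -INFINITY; otherwise the original sliding-window check still applies. The standalone sketch below only illustrates that chunk predicate; the helper name and the main driver are illustrative, not part of this commit, and causal masking of future tokens is assumed to be handled by the regular KQ mask as before.

#include <cstdint>
#include <cstdio>

// Illustrative helper (not from the patch): is a cached token at kv_pos visible
// to a query at pos under chunked attention with chunk size n_attn_chunk?
static bool chunk_mask_allows(int32_t pos, int32_t kv_pos, uint32_t n_attn_chunk) {
    // same arithmetic as the diff: start of the chunk that contains the query
    const int32_t pos_chunk_start = (pos / (int32_t) n_attn_chunk) * (int32_t) n_attn_chunk;
    // tokens before the chunk start would get f = -INFINITY in the mask
    return kv_pos >= pos_chunk_start && pos >= pos_chunk_start;
}

int main() {
    // with n_attn_chunk = 8192, a query at position 8200 can see 8195 but not 100
    printf("%d\n", (int) chunk_mask_allows(8200, 8195, 8192)); // prints 1
    printf("%d\n", (int) chunk_mask_allows(8200,  100, 8192)); // prints 0
    return 0;
}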

src/llama-hparams.h

Lines changed: 1 addition & 0 deletions
@@ -114,6 +114,7 @@ struct llama_hparams {
 
     uint32_t n_moe_layer_step = 0;
     bool use_kq_norm = true;
+    uint32_t n_attn_chunk = 0;
     // values below seems to be fixed on llama4
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;

src/llama-model.cpp

Lines changed: 5 additions & 0 deletions
@@ -557,6 +557,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
             ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+            // hack: we use SWA to store the chunked attn mask
+            // luckily, the n_swa_pattern is the same as chunked layer pattern: 3 chunked - 1 full
+            hparams.n_swa_pattern = 4;
+            hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+            hparams.n_swa = 1; // unused, added to trigger the SWA
 
             switch (hparams.n_expert) {
                 case 16: type = LLM_TYPE_17B_16E; break;
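
On the n_swa_pattern = 4 hack: the in-diff comment says the chunked-attention layers follow the same 3 chunked / 1 full interleave that the SWA pattern machinery already encodes. How n_swa_pattern is consumed is outside this commit, so the sketch below is only a hypothetical illustration of that 3:1 layer ratio; the helper name and the phase (which layer in each group of four gets full attention) are assumptions, not taken from llama.cpp.

#include <cstdint>
#include <cstdio>

// Hypothetical illustration (not from this commit): with n_swa_pattern = 4,
// three layers in every group of four use the chunked/SWA mask and the
// remaining one uses full attention. The phase chosen here is an assumption.
static bool layer_uses_chunked_attn(uint32_t il, uint32_t n_swa_pattern) {
    return (il + 1) % n_swa_pattern != 0; // layers 0,1,2 chunked; layer 3 full; repeat
}

int main() {
    for (uint32_t il = 0; il < 8; ++il) {
        printf("layer %u: %s\n", il, layer_uses_chunked_attn(il, 4) ? "chunked" : "full");
    }
    return 0;
}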
