1 file changed: +13 −1 lines changed

@@ -1823,7 +1823,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;

     bool causal_attn = true;
-    bool need_kq_pos = false;
+    bool need_kq_pos = false; // currently, we need KQ_pos data for ALiBi-based models

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
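For context, ALiBi replaces rotary position encoding with a head-specific linear bias on the raw KQ attention scores, which is why the graph needs the token positions (the KQ_pos data). Below is a minimal standalone sketch of that bias, assuming the slope schedule from the ALiBi paper; it is illustrative only and not the kernel llama.cpp actually runs:

// alibi_sketch.cpp - illustrative only; simplified from the ALiBi paper,
// not the code path used by llama.cpp
#include <cmath>
#include <cstdio>
#include <vector>

// head-specific slope: geometric sequence 2^(-8/n_head), 2^(-16/n_head), ...
static float alibi_slope(int head, int n_head) {
    return std::pow(2.0f, -8.0f*float(head + 1)/float(n_head));
}

// add the linear position bias to an [n_tokens x n_tokens] score matrix;
// kq_pos holds the absolute position of each token (what KQ_pos carries)
static void add_alibi_bias(std::vector<float> & scores, const std::vector<int> & kq_pos, int head, int n_head) {
    const int   n = (int) kq_pos.size();
    const float m = alibi_slope(head, n_head);
    for (int i = 0; i < n; ++i) {        // query index
        for (int j = 0; j <= i; ++j) {   // key index (causal)
            scores[i*n + j] -= m*float(kq_pos[i] - kq_pos[j]);
        }
    }
}

int main() {
    const int n = 4, n_head = 8, head = 0;
    std::vector<int>   pos = {0, 1, 2, 3};
    std::vector<float> scores(n*n, 0.0f); // pretend these are the raw K*Q scores
    add_alibi_bias(scores, pos, head, n_head);
    printf("bias of last query vs first key: %.4f\n", scores[(n - 1)*n + 0]);
}

In the regular path this bias can be folded into the softmax over KQ; the Flash Attention kernels currently have no input for it, which is what the assert and the context-creation check in the hunks below guard against.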
@@ -6311,6 +6311,8 @@ static struct ggml_tensor * llm_build_kqv(
     GGML_UNUSED(model);
     GGML_UNUSED(n_ctx);

+    // note: if this assert triggers, then some check has failed earlier
+    // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
     GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");

     // split cached v into n_head heads (not transposed)
@@ -15114,6 +15116,16 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    if (cparams.flash_attn && hparams.need_kq_pos) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
+    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
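The two hunks above follow a common pattern: downgrade the unsupported combination at context creation with a warning, and keep a hard assert at the point in the graph build where the incompatible data would actually be consumed. A minimal sketch of that pattern, with hypothetical names (apply_feature_guards and build_attention are not llama.cpp functions):

// feature_guard.cpp - illustrative pattern only, hypothetical names
#include <cassert>
#include <cstdio>

struct hparams_t { bool need_kq_pos = false; }; // model needs ALiBi position data
struct cparams_t { bool flash_attn  = false; }; // user requested Flash Attention

// context creation: detect the incompatible combination early and force it off
static void apply_feature_guards(cparams_t & cparams, const hparams_t & hparams) {
    if (cparams.flash_attn && hparams.need_kq_pos) {
        fprintf(stderr, "%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
        cparams.flash_attn = false;
    }
}

// graph build: the assert only documents an invariant the guard already enforced
static void build_attention(const cparams_t & cparams, const hparams_t & hparams) {
    if (cparams.flash_attn) {
        assert(!hparams.need_kq_pos && "ALiBi is not yet supported with Flash Attention");
        // ... flash attention path ...
    } else {
        // ... regular KQ path, which can apply the ALiBi bias ...
    }
}

int main() {
    hparams_t hparams; hparams.need_kq_pos = true; // an ALiBi-based model
    cparams_t cparams; cparams.flash_attn  = true; // user asked for Flash Attention
    apply_feature_guards(cparams, hparams);        // prints the warning, disables flash_attn
    build_attention(cparams, hparams);             // never reaches the assert
}

With this split, the assert is expected to be unreachable in practice; if it fires, it means a new code path reached the Flash Attention branch without going through the creation-time guard.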