
Commit ddab5e4

Force flash attention off for LLM_ARCH_DEEPSEEK2 - embedding too large
1 parent: 8c02442

File tree: 1 file changed (+5, −0 lines)

src/llama-context.cpp

Lines changed: 5 additions & 0 deletions
@@ -2278,6 +2278,11 @@ llama_context * llama_init_from_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
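
For context, below is a minimal sketch (not part of the commit) of how this fallback surfaces to a caller. It assumes the llama.h API of this era: a bool flash_attn field in llama_context_params, plus llama_model_load_from_file and llama_init_from_model; the model path is a placeholder.

// Sketch only: requests flash attention on a DeepSeek2 GGUF. After this
// change, llama_init_from_model() logs the warning from the diff and
// continues with flash_attn forced off rather than rejecting the request.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("deepseek2.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // requested, but forced off for LLM_ARCH_DEEPSEEK2

    llama_context * ctx = llama_init_from_model(model, cparams);
    // ... use ctx as usual; flash attention is simply disabled for this arch ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}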
