
Commit ddab5e4

Force flash attention off for LLM_ARCH_DEEPSEEK2 - embedding too large
1 parent: 8c02442

File tree: 1 file changed (+5, −0 lines)

src/llama-context.cpp

Lines changed: 5 additions & 0 deletions
@@ -2278,6 +2278,11 @@ llama_context * llama_init_from_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
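
For context, below is a minimal sketch (not part of the commit) of how this fallback surfaces to a caller. It assumes the llama.h API of this era: a bool flash_attn field in llama_context_params, plus llama_model_load_from_file and llama_init_from_model; the model path is a placeholder.

// Sketch only: requests flash attention on a DeepSeek2 GGUF. After this
// change, llama_init_from_model() logs the warning from the diff and
// continues with flash_attn forced off rather than rejecting the request.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("deepseek2.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // requested, but forced off for LLM_ARCH_DEEPSEEK2

    llama_context * ctx = llama_init_from_model(model, cparams);
    // ... use ctx as usual; flash attention is simply disabled for this arch ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}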
