vulkan: always use fp32 for scalar flash attention

jeffbolznv · jeffbolznv · commit 3a8d954e0cd2 · 2025-05-06T23:08:39.000-05:00
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5726,9 +5726,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     assert(k->type == v->type);
 
     vk_pipeline *pipelines;
-    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
-    bool f32acc = dst->op_params[3] == GGML_PREC_F32;
     bool scalar = !ctx->device->coopmat2;
+    // The scalar path always accumulates in f32; otherwise honor the requested precision. XXX TODO other backends may be changing accumulator precision to default to f32 soon
+    bool f32acc = scalar || dst->op_params[3] == GGML_PREC_F32;
     bool small_rows = N <= get_fa_num_small_rows(scalar);
 
     if (scalar) {