Skip to content

Commit ff0d3f6

Browse files
CUDA: fix Volta FlashAttention logic
1 parent d92cb67 commit ff0d3f6

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
235235
return;
236236
}
237237

238-
if (!new_mma_available(cc)) {
238+
if (!fp16_mma_available(cc)) {
239239
if (prec == GGML_PREC_DEFAULT) {
240240
if (Q->ne[1] <= 8) {
241241
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);

0 commit comments

Comments (0)