Commit 76881ac

CUDA: Improve flash decoding kernel occupancy for BS=1 case
Addresses issue #12182. This PR adds the following optimizations to the CUDA flash decoding code:

- Find the number of active blocks per SM using the cudaOccupancyMaxActiveBlocksPerMultiprocessor API and use this value to determine the optimal parallel_blocks value.
- Prefer the vector flash attention kernels over the MMA kernel for BS=1.

This results in up to 15% performance improvement in generation-phase throughput for large sequence lengths.
1 parent 20a9b8f commit 76881ac
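
For context on the occupancy API this change relies on, here is a minimal, standalone CUDA sketch of the same selection idea. It is not code from this commit: dummy_fattn_kernel, the head size D = 128, the head count, and the KV length are placeholder assumptions standing in for flash_attn_vec_ext_f32 and the real tensor shapes.

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>

// Placeholder kernel standing in for flash_attn_vec_ext_f32; only its launch
// configuration (D threads per block, no dynamic shared memory) matters for the query.
template <int D>
__global__ void dummy_fattn_kernel() {}

int main() {
    constexpr int D   = 128;    // assumed head size == threads per block
    const int n_heads = 32;     // assumed Q->ne[2]*Q->ne[3] for a single-token batch
    const int kv_len  = 8192;   // assumed K->ne[1]

    const int total_blocks = n_heads;                // one block per head when cols_per_block == 1
    const int seqlen_tiles = (kv_len + D - 1) / D;   // number of D-wide KV tiles available to split over

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int nsm = prop.multiProcessorCount;

    // How many blocks of this kernel can be resident on one SM at the same time.
    int active_per_sm = 1;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active_per_sm, dummy_fattn_kernel<D>, D, 0);

    // Split the KV sequence just enough to fill the GPU (nsm * active_per_sm blocks),
    // but never into more pieces than there are KV tiles.
    const int parallel_blocks =
        std::max(1, std::min((nsm * active_per_sm) / total_blocks, seqlen_tiles));

    std::printf("nsm=%d, active blocks/SM=%d -> parallel_blocks=%d\n", nsm, active_per_sm, parallel_blocks);
    return 0;
}

The committed code then rounds this value down to one of the compiled template instantiations (24, 16, 12, 8, or 4), since parallel_blocks is a template parameter of the kernel.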

File tree

2 files changed: +64 -7 lines changed

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 63 additions & 4 deletions
@@ -308,13 +308,72 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
 
     if (Q->ne[1] == 1) {
         constexpr int cols_per_block = 1;
-        constexpr int parallel_blocks = 4;
+        const int total_blocks = (((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]);
+        const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+        const int seqlen_tiles = (K->ne[1] + D - 1) / D;
+
         if (logit_softcap == 0.0f) {
             constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        } else {
+
+            // Determine the number of active blocks per SM.
+            // The parallel_blocks template parameter has no effect on the number of active blocks, so a constant 4 is used for this query.
+            int numActiveBlocks = 1;
+            CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks, flash_attn_vec_ext_f32<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>, D, 0));
+
+            // We want to keep at least `numActiveBlocks` blocks per SM to improve occupancy.
+            // This kernel operates on `D`-wide tiles of the sequence length, so consider how many `D` tiles can be processed in parallel.
+            // If there are not enough tiles to process, the number of blocks can be reduced.
+            const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
+
+            if (parallel_blocks >= 24)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 24, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 16)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 16, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 12)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 12, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 8)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+        }
+        else
+        {
             constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            int numActiveBlocks = 1;
+            CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks, flash_attn_vec_ext_f32<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>, D, 0));
+
+            const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
+
+            if (parallel_blocks >= 24)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 24, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 16)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 16, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 12)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 12, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else if (parallel_blocks >= 8)
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
+            else
+            {
+                ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
+            }
         }
         return;
     }
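
As a worked example with purely hypothetical numbers: on a GPU with nsm = 132 where the occupancy query reports numActiveBlocks = 3, a single-token decode over 32 heads gives total_blocks = 32, and a KV length of K->ne[1] = 4096 with D = 128 gives seqlen_tiles = 32; then parallel_blocks = min((132*3)/32, 32) = min(12, 32) = 12, so the `parallel_blocks >= 12` branch is taken and the kernel is instantiated with 12 parallel blocks per attention column. Previously this case would always have used the fixed value of 4.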

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 3 deletions
@@ -297,9 +297,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 
     const int gqa_ratio = Q->ne[2] / K->ne[2];
-    const bool mma_fast_for_bs1 = fp16_mma_available(cc) && gqa_ratio % 2 == 0 &&
-        K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16 && mask;
-    if (Q->ne[1] == 1 && Q->ne[0] % (2*warp_size) == 0 && !mma_fast_for_bs1) {
+    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
         if (prec == GGML_PREC_DEFAULT) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
             return;
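
With the ggml-CUDA WARP_SIZE of 32, the simplified condition Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0 matches single-token batches whose head size is a multiple of 64 (e.g. 64, 128, 256); removing the mma_fast_for_bs1 shortcut means these cases now always dispatch to the vector flash attention kernels, implementing the "prefer vector kernels over the MMA kernel for BS=1" point from the commit message.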
