@@ -308,16 +308,13 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
 
     if (Q->ne[1] == 1) {
         constexpr int cols_per_block = 1;
-        const int total_blocks = (((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]);
+        const int num_blocks_base = (((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]);
         const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
         const int seqlen_tiles = (K->ne[1] + D - 1) / D;
 
         if (logit_softcap == 0.0f) {
             constexpr bool use_logit_softcap = false;
 
-            // cudaOccupancyMaxActiveBlocksPerMultiprocessor is not supported on HIP platform
-            // so, skipping the occupancy check for HIP platform
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
             // Determine the number of active blocks per SM
             // parallel_blocks template parameter has no effect on the number of active blocks, so keeping a constant 4 to determine active blocks
             int numActiveBlocks = 1;
@@ -327,7 +324,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
             // we want to keep at least `numActiveBlocks` blocks per SM to improve occupancy.
             // this kernel operates on `D` tile of seq length. We need to consider how many `D` tiles can be processed in parallel.
             // If there are not enough tiles to process, we can reduce the number of blocks
-            const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
+            const int parallel_blocks = std::min((nsm * numActiveBlocks) / num_blocks_base, seqlen_tiles);
 
             if (parallel_blocks >= 24) {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 24, type_K, type_V, use_logit_softcap>(ctx, dst);
@@ -341,22 +338,19 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
             else if (parallel_blocks >= 8) {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
             }
-            else
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-            {
+            else {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
             }
         }
         else
         {
             constexpr bool use_logit_softcap = true;
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
             int numActiveBlocks = 1;
             CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks,
                 flash_attn_vec_ext_f32<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>, D, 0));
 
-            const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
+            const int parallel_blocks = std::min((nsm * numActiveBlocks) / num_blocks_base, seqlen_tiles);
 
             if (parallel_blocks >= 24) {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 24, type_K, type_V, use_logit_softcap>(ctx, dst);
@@ -370,9 +364,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
             else if (parallel_blocks >= 8) {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
             }
-            else
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-            {
+            else {
                 ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
             }
         }
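
For context on what the patch computes: the heuristic sizes `parallel_blocks` from device occupancy. It asks the runtime how many blocks of the kernel can be resident per SM, multiplies by the SM count to get the device-wide block budget, divides by the number of blocks the grid already needs (`num_blocks_base`), and caps the result at `seqlen_tiles`, the number of `D`-wide tiles the KV sequence can actually be split into. Below is a minimal standalone sketch of that calculation; the kernel `dummy_kernel`, its launch parameters, and the example shapes are illustrative placeholders, not part of the patch.

```cpp
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for flash_attn_vec_ext_f32: only its register/shared-memory
// footprint matters for the occupancy query.
__global__ void dummy_kernel(float * dst) {
    if (dst != nullptr) {
        dst[blockIdx.x] = 0.0f;
    }
}

int main() {
    int device = 0;
    cudaGetDevice(&device);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    const int nsm = prop.multiProcessorCount; // number of SMs on the device

    // How many blocks of this kernel fit on one SM at the given block size
    // and dynamic shared memory usage.
    int numActiveBlocks = 1;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numActiveBlocks, dummy_kernel, /*blockSize=*/128, /*dynamicSMemSize=*/0);

    // Illustrative shapes (placeholders): one block per Q column, per head,
    // per batch entry; KV sequence split into tiles of width D.
    const int num_blocks_base = 32;   // columns * heads * batch
    const int D               = 128;  // tile width along the KV sequence
    const int kv_len          = 4096; // KV sequence length
    const int seqlen_tiles    = (kv_len + D - 1) / D;

    // Split each column's KV work across up to `parallel_blocks` blocks, but
    // never more than keeps every SM busy, and never more than there are tiles.
    const int parallel_blocks = std::min((nsm * numActiveBlocks) / num_blocks_base, seqlen_tiles);

    std::printf("nsm=%d activeBlocks/SM=%d parallel_blocks=%d\n",
                nsm, numActiveBlocks, parallel_blocks);
    return 0;
}
```

Since `parallel_blocks` is a compile-time template parameter of the real kernel, the runtime value is then bucketed onto a few precompiled instantiations (24, 8, 4 in the hunks above) rather than used directly.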