@@ -321,11 +321,11 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
     // Determine the number of active blocks per SM
     // parallel_blocks template parameter has no effect on the number of active blocks, so keeping a constant 4 to determine active blocks
     int numActiveBlocks = 1;
-    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks,
+    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks,
        flash_attn_vec_ext_f32<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>, D, 0));
 
     // we want to keep at least `numActiveBlocks` blocks per SM to improve occupancy.
-    // this kernel operates on `D` tile of seq length. We need to consider how many `D` tiles can be processed in parallel.
+    // this kernel operates on `D` tile of seq length. We need to consider how many `D` tiles can be processed in parallel.
     // If there are not enough tiles to process, we can reduce the number of blocks
     const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
 
@@ -341,7 +341,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
         else if (parallel_blocks >= 8) {
             ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
         }
-        else
+        else
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         {
             ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
@@ -353,7 +353,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
 
 #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     int numActiveBlocks = 1;
-    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks,
+    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks,
        flash_attn_vec_ext_f32<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>, D, 0));
 
     const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);
@@ -370,7 +370,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
         else if (parallel_blocks >= 8) {
             ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 8, type_K, type_V, use_logit_softcap>(ctx, dst);
         }
-        else
+        else
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         {
             ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, 4, type_K, type_V, use_logit_softcap>(ctx, dst);
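For reference, here is a minimal, self-contained sketch (not part of this PR) of how `cudaOccupancyMaxActiveBlocksPerMultiprocessor` can be combined with the SM count to derive a `parallel_blocks` value, mirroring the heuristic in the hunks above. The kernel `dummy_kernel`, the block size of 128, and the `total_blocks`/`seqlen_tiles` values are hypothetical placeholders standing in for the real kernel and problem shape.

```cpp
// Standalone sketch (not the llama.cpp code): query occupancy for a kernel and
// derive a parallel-blocks count the way the diff above does.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for flash_attn_vec_ext_f32; only its resource usage matters here.
__global__ void dummy_kernel(float * dst) {
    dst[blockIdx.x * blockDim.x + threadIdx.x] = 0.0f;
}

int main() {
    int device = 0;
    cudaGetDevice(&device);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    const int nsm = prop.multiProcessorCount; // number of SMs on the device

    // How many blocks of this kernel fit on one SM for a block size of 128
    // threads and no dynamic shared memory (the diff passes D and 0 here).
    int numActiveBlocks = 1;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocks, dummy_kernel,
                                                  /*blockSize=*/128, /*dynamicSMemSize=*/0);

    // Hypothetical workload shape: total_blocks blocks already come from the
    // problem dimensions, and seqlen_tiles caps how far the sequence can be split.
    const int total_blocks = 32;
    const int seqlen_tiles = 16;

    // Same heuristic as the diff: keep roughly nsm * numActiveBlocks blocks in
    // flight, but never use more parallel blocks than there are sequence tiles.
    const int parallel_blocks = std::min((nsm * numActiveBlocks) / total_blocks, seqlen_tiles);

    std::printf("SMs: %d, active blocks/SM: %d, parallel_blocks: %d\n",
                nsm, numActiveBlocks, parallel_blocks);
    return 0;
}
```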