Commit c80f76a

Reapply "CUDA: fix race conditions FlashAttention kernels (ggml-org#13438)"

This reverts commit 6a199dd.

Parent: 877447d

File tree

2 files changed, +3 −0 lines changed


ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 2 additions & 0 deletions
@@ -715,6 +715,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE);
         }
 
+        __syncthreads();
+
         // Write back combined meta data:
 #pragma unroll
         for (int imeta = 0; imeta < nmeta; ++imeta) {
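Both hunks insert a __syncthreads() barrier to separate two phases of shared-memory use within a thread block. In this first hunk the barrier lands just before the combined meta data is written back, which suggests a write-after-read hazard: without it, warps that have finished the reduction above could start overwriting shared memory that slower warps are still reading. Below is a minimal standalone sketch of that hazard class, assuming nothing from the fattn code itself; the kernel and buffer names (combine_kernel, s_buf) are hypothetical.

// Sketch of the write-after-read hazard the added barrier guards against.
// All names are hypothetical; this is not the fattn code.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void combine_kernel(const float * in, float * out) {
    __shared__ float s_buf[256];

    // Phase 1: each thread writes its own slot, then reads a neighbor's.
    s_buf[threadIdx.x] = in[blockIdx.x * blockDim.x + threadIdx.x];
    __syncthreads();
    const float neighbor = s_buf[(threadIdx.x + 1) % blockDim.x];

    // Without this barrier, phase 2 below may overwrite slots that slower
    // threads are still reading in phase 1 (write-after-read hazard).
    __syncthreads();

    // Phase 2: reuse the same shared buffer for a different purpose.
    s_buf[threadIdx.x] = neighbor * 2.0f;
    __syncthreads();
    out[blockIdx.x * blockDim.x + threadIdx.x] = s_buf[threadIdx.x];
}

int main() {
    const int n = 256;
    float *in, *out;
    cudaMallocManaged(&in,  n * sizeof(float));
    cudaMallocManaged(&out, n * sizeof(float));
    for (int i = 0; i < n; ++i) in[i] = (float) i;
    combine_kernel<<<1, n>>>(in, out);
    cudaDeviceSynchronize();
    printf("out[0] = %g\n", out[0]); // expect 2 (= in[1] * 2)
    cudaFree(in);
    cudaFree(out);
    return 0;
}

Deleting the middle barrier in this sketch is exactly the kind of defect that compute-sanitizer's racecheck tool is designed to flag.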

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 1 addition & 0 deletions
@@ -181,6 +181,7 @@ static __global__ void flash_attn_vec_ext_f16(
     for (int j = 0; j < ncols; ++j) {
         KQ[j*D + tid] = -HALF_MAX_HALF;
     }
+    __syncthreads();
 
     half2 VKQ[ncols] = {{0.0f, 0.0f}};
 
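In this second hunk the barrier follows the initialization of KQ in shared memory, pointing at a read-after-write hazard: each thread writes its own slots, but later steps presumably read slots owned by other threads, so no thread may proceed past initialization until every write has landed. A minimal sketch of that pattern, again with hypothetical names (init_reduce_kernel, s_vals):

// Sketch of a read-after-write hazard on freshly initialized shared memory.
// Names are hypothetical; this is not the fattn-vec code.
#include <cuda_runtime.h>
#include <cstdio>
#include <cfloat>

__global__ void init_reduce_kernel(float * out) {
    __shared__ float s_vals[256];

    // Each thread initializes one slot, like KQ[j*D + tid] = -HALF_MAX_HALF.
    s_vals[threadIdx.x] = -FLT_MAX;

    // The added barrier: without it, the read below may observe a slot
    // whose owning thread has not written it yet.
    __syncthreads();

    // A later step reads a slot owned by a different thread.
    out[threadIdx.x] = s_vals[255 - threadIdx.x];
}

int main() {
    float * out;
    cudaMallocManaged(&out, 256 * sizeof(float));
    init_reduce_kernel<<<1, 256>>>(out);
    cudaDeviceSynchronize();
    printf("out[0] = %g\n", out[0]); // expect -FLT_MAX
    cudaFree(out);
    return 0;
}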
