cuda : defer warp_reduce_max for smax until just before the -INF block check

ggerganov · ggerganov · commit b150abe83e6f · 2024-02-03T13:17:47.000+02:00
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -6621,7 +6621,6 @@ static __global__ void flash_attn_ext_f16(
                         M[j] = __hmax(M[j], s);
                     }
 
-                    smax = warp_reduce_max(smax);
                     M[j] = warp_reduce_max(M[j]);
 
                     const half ms = __hisinf(m) == -1 ? __float2half(0.0f) : hexp(m - M[j]);
@@ -6649,6 +6648,8 @@ static __global__ void flash_attn_ext_f16(
                 }
             }
 
+            smax = warp_reduce_max(smax);
+
             // skip -INF blocks
             if (__hisinf(smax) == -1) {
                 continue;

Original file line number	Diff line number	Diff line change
`@@ -6621,7 +6621,6 @@ static __global__ void flash_attn_ext_f16(`
`6621`	`6621`	`M[j] = __hmax(M[j], s);`
`6622`	`6622`	`}`
`6623`	`6623`
`6624`		`- smax = warp_reduce_max(smax);`
`6625`	`6624`	`M[j] = warp_reduce_max(M[j]);`
`6626`	`6625`
`6627`	`6626`	`const half ms = __hisinf(m) == -1 ? __float2half(0.0f) : hexp(m - M[j]);`
`@@ -6649,6 +6648,8 @@ static __global__ void flash_attn_ext_f16(`
`6649`	`6648`	`}`
`6650`	`6649`	`}`
`6651`	`6650`
	`6651`	`+ smax = warp_reduce_max(smax);`
	`6652`	`+`
`6652`	`6653`	`// skip -INF blocks`
`6653`	`6654`	`if (__hisinf(smax) == -1) {`
`6654`	`6655`	`continue;`