We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent b68a112, commit b150abe
ggml-cuda.cu
@@ -6621,7 +6621,6 @@ static __global__ void flash_attn_ext_f16(
6621
M[j] = __hmax(M[j], s);
6622
}
6623
6624
- smax = warp_reduce_max(smax);
6625
M[j] = warp_reduce_max(M[j]);
6626
6627
const half ms = __hisinf(m) == -1 ? __float2half(0.0f) : hexp(m - M[j]);
@@ -6649,6 +6648,8 @@ static __global__ void flash_attn_ext_f16(
6649
6648
6650
6651
+ smax = warp_reduce_max(smax);
6652
+
6653
// skip -INF blocks
6654
if (__hisinf(smax) == -1) {
6655
continue;
0 commit comments