Skip to content

Commit f6711ce

Browse files
CUDA: determine FA parallel blocks at runtime
1 parent 3d652bf commit f6711ce

File tree

10 files changed

+166
-257
lines changed

10 files changed

+166
-257
lines changed

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -612,48 +612,47 @@ static __global__ void flash_attn_stream_k_fixup(
612612
*dst = dst_val / rowsum;
613613
}
614614

615-
template<int D, int parallel_blocks> // D == head size
615+
template<int D> // D == head size
616616
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
617617
__launch_bounds__(D, 1)
618618
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
619619
static __global__ void flash_attn_combine_results(
620620
const float * __restrict__ VKQ_parts,
621621
const float2 * __restrict__ VKQ_meta,
622-
float * __restrict__ dst) {
623-
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
624-
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
625-
dst += D * gridDim.y*blockIdx.x;
622+
float * __restrict__ dst,
623+
const int parallel_blocks) {
624+
VKQ_parts += parallel_blocks*D * gridDim.z*blockIdx.x;
625+
VKQ_meta += parallel_blocks * gridDim.z*blockIdx.x;
626+
dst += D * gridDim.z*blockIdx.x;
626627

627628
const int tid = threadIdx.x;
628629
__builtin_assume(tid < D);
629630

630-
__shared__ float2 meta[parallel_blocks];
631+
extern __shared__ float2 meta[];
631632
if (tid < 2*parallel_blocks) {
632-
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
633+
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
633634
}
634635

635636
__syncthreads();
636637

637638
float kqmax = meta[0].x;
638-
#pragma unroll
639639
for (int l = 1; l < parallel_blocks; ++l) {
640640
kqmax = max(kqmax, meta[l].x);
641641
}
642642

643643
float VKQ_numerator = 0.0f;
644644
float VKQ_denominator = 0.0f;
645-
#pragma unroll
646645
for (int l = 0; l < parallel_blocks; ++l) {
647646
const float diff = meta[l].x - kqmax;
648647
const float KQ_max_scale = expf(diff);
649648
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
650649
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
651650

652-
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
651+
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.z*D + blockIdx.z*D + tid];
653652
VKQ_denominator += KQ_max_scale * meta[l].y;
654653
}
655654

656-
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
655+
dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
657656
}
658657

659658
static void on_no_fattn_vec_case(const int D) {
@@ -677,11 +676,10 @@ static void on_no_fattn_vec_case(const int D) {
677676
}
678677
}
679678

680-
// parallel_blocks == 0 is stream-k decomposition
681-
template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
679+
template <int D, int ncols1, int ncols2, int KQ_stride>
682680
void launch_fattn(
683681
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
684-
const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V
682+
const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V, const bool stream_k
685683
) {
686684
constexpr int ncols = ncols1 * ncols2;
687685

@@ -704,6 +702,9 @@ void launch_fattn(
704702

705703
GGML_ASSERT(Q->ne[3] == 1);
706704

705+
GGML_ASSERT(stream_k || ncols2 == 1);
706+
const int parallel_blocks = Q->ne[1] <= ncols1 ? 4 : 1;
707+
707708
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
708709

709710
ggml_cuda_pool & pool = ctx.pool();
@@ -760,7 +761,7 @@ void launch_fattn(
760761

761762
const dim3 block_dim(warp_size, nwarps, 1);
762763
dim3 blocks_num;
763-
if (parallel_blocks == 0) {
764+
if (stream_k) {
764765
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
765766
const int max_blocks = 2*nsm;
766767
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
@@ -776,9 +777,9 @@ void launch_fattn(
776777

777778
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
778779
} else {
779-
blocks_num.x = parallel_blocks*ntiles_x;
780-
blocks_num.y = Q->ne[2];
781-
blocks_num.z = Q->ne[3];
780+
blocks_num.x = ntiles_x;
781+
blocks_num.y = parallel_blocks;
782+
blocks_num.z = Q->ne[2]*Q->ne[3];
782783

783784
if (parallel_blocks > 1) {
784785
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
@@ -811,7 +812,7 @@ void launch_fattn(
811812
K_data,
812813
V_data,
813814
mask ? ((const char *) mask->data) : nullptr,
814-
(parallel_blocks) > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
815+
!stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
815816
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
816817
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
817818
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
@@ -823,7 +824,7 @@ void launch_fattn(
823824
);
824825
CUDA_CHECK(cudaGetLastError());
825826

826-
if constexpr (parallel_blocks == 0) {
827+
if (stream_k) {
827828
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
828829
const dim3 block_dim_combine(D, 1, 1);
829830
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
@@ -832,13 +833,14 @@ void launch_fattn(
832833
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
833834
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
834835
}
835-
} else if constexpr (parallel_blocks > 1) {
836+
} else if (parallel_blocks > 1) {
836837
const dim3 block_dim_combine(D, 1, 1);
837-
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
838+
const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
839+
const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
838840

839-
flash_attn_combine_results<D, parallel_blocks>
840-
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
841-
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
841+
flash_attn_combine_results<D>
842+
<<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
843+
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
842844
}
843845
CUDA_CHECK(cudaGetLastError());
844846
}

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -970,7 +970,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
970970
fattn_kernel = flash_attn_ext_f16<D, ncols1, ncols2, nwarps, KQ_per_iter, ntiles, use_logit_softcap>;
971971
}
972972

973-
launch_fattn<D, ncols1, ncols2, 0, KQ_per_iter>(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, true, true);
973+
launch_fattn<D, ncols1, ncols2, KQ_per_iter>(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, true, true, true);
974974
}
975975

976976

ggml/src/ggml-cuda/fattn-tile-f16.cu

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
#define FATTN_KQ_STRIDE_TILE_F16 64
66

7-
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
7+
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
88
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
99
__launch_bounds__(nwarps*WARP_SIZE, 1)
1010
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@@ -58,18 +58,17 @@ static __global__ void flash_attn_tile_ext_f16(
5858

5959
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
6060

61-
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
62-
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
61+
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
6362

6463
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
65-
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
66-
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
67-
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
64+
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
65+
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.z / gqa_ratio));
66+
const half2 * V_h2 = (const half2 *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
6867
const half * maskh = (const half *) mask + ne11*ic0;
6968

7069
const int stride_KV2 = nb11 / sizeof(half2);
7170

72-
const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
71+
const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
7372
const half slopeh = __float2half(slopef);
7473

7574
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@@ -105,8 +104,7 @@ static __global__ void flash_attn_tile_ext_f16(
105104

106105
__syncthreads();
107106

108-
const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
109-
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
107+
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
110108
// Calculate KQ tile and keep track of new maximum KQ values:
111109

112110
half kqmax_new[ncols/nwarps];
@@ -271,40 +269,40 @@ static __global__ void flash_attn_tile_ext_f16(
271269
const int i0 = i00 + 2*threadIdx.x;
272270

273271
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
274-
if (parallel_blocks == 1) {
272+
if (gridDim.y == 1) {
275273
dst_val /= __half2half2(kqsum_j);
276274
}
277-
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
278-
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = __low2float(dst_val);
279-
dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
275+
const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
276+
dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = __low2float(dst_val);
277+
dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = __high2float(dst_val);
280278
}
281279

282-
if (parallel_blocks != 1 && threadIdx.x == 0) {
283-
dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
280+
if (gridDim.y != 1 && threadIdx.x == 0) {
281+
dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
284282
}
285283
}
286284
#else
287285
NO_DEVICE_CODE;
288286
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
289287
}
290288

291-
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
289+
template <int cols_per_block, bool use_logit_softcap>
292290
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
293291
const ggml_tensor * Q = dst->src[0];
294292
switch (Q->ne[0]) {
295293
case 64: {
296294
constexpr int D = 64;
297295
constexpr int nwarps = 8;
298296
constexpr size_t nbytes_shared = 0;
299-
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
300-
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
297+
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
298+
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true, false);
301299
} break;
302300
case 128: {
303301
constexpr int D = 128;
304302
constexpr int nwarps = 8;
305303
constexpr size_t nbytes_shared = 0;
306-
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
307-
launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
304+
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
305+
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true, false);
308306
} break;
309307
default: {
310308
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
@@ -324,37 +322,22 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
324322

325323
if (Q->ne[1] <= 16) {
326324
constexpr int cols_per_block = 16;
327-
constexpr int parallel_blocks = 4;
328325
if (logit_softcap == 0.0f) {
329326
constexpr bool use_logit_softcap = false;
330-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
327+
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
331328
} else {
332329
constexpr bool use_logit_softcap = true;
333-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
334-
}
335-
return;
336-
}
337-
338-
if (Q->ne[1] <= 32) {
339-
constexpr int cols_per_block = 32;
340-
constexpr int parallel_blocks = 4;
341-
if (logit_softcap == 0.0f) {
342-
constexpr bool use_logit_softcap = false;
343-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
344-
} else {
345-
constexpr bool use_logit_softcap = true;
346-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
330+
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
347331
}
348332
return;
349333
}
350334

351335
constexpr int cols_per_block = 32;
352-
constexpr int parallel_blocks = 1;
353336
if (logit_softcap == 0.0f) {
354337
constexpr bool use_logit_softcap = false;
355-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
338+
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
356339
} else {
357340
constexpr bool use_logit_softcap = true;
358-
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
341+
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
359342
}
360343
}

0 commit comments

Comments
 (0)