@@ -353,6 +353,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
     vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
     vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_split_k_reduce;
 
     std::unordered_map<std::string, vk_pipeline_ref> pipelines;
     std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
@@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants {
     float m1;
 
     uint32_t gqa_ratio;
+    uint32_t split_kv;
+    uint32_t k_num;
 };
 
 struct vk_op_push_constants {
@@ -1476,7 +1479,7 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ
 
     // small rows, large cols
     if (small_rows) {
-        return {flash_attention_num_small_rows, 128};
+        return {flash_attention_num_small_rows, 64};
     }
     // small cols to reduce register count
     if (ggml_is_quantized(type) || D == 256) {
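For reference, the `{rows, cols}` pair chosen here is the per-workgroup tile shape: how many Q rows a workgroup handles, and how wide a slice of KV it walks per iteration. This change halves the column tile on the small-rows path, trading more KV iterations for lower register pressure. A minimal standalone sketch of the selection logic, where the constant and the branch values not visible in this hunk are assumptions rather than verbatim source:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

static constexpr uint32_t flash_attention_num_small_rows = 32; // assumed value

// Sketch of the tile-shape choice: {Q rows per workgroup, KV columns per pass}.
static std::array<uint32_t, 2> fa_rows_cols_sketch(uint32_t D, bool quantized, bool small_rows) {
    // small rows, large cols
    if (small_rows) {
        return {flash_attention_num_small_rows, 64}; // was 128 before this change
    }
    // small cols to reduce register count (assumed values below)
    if (quantized || D == 256) {
        return {64, 32};
    }
    return {64, 64};
}

int main() {
    const auto rc = fa_rows_cols_sketch(128, /*quantized=*/false, /*small_rows=*/true);
    std::printf("tile: %u rows x %u cols\n", rc[0], rc[1]); // tile: 32 rows x 64 cols
}
```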
@@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
 
     for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         workgroups_y /= N;
     }
 
+    uint32_t split_kv = KV;
+    uint32_t split_k = 1;
+
+    if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
+        GGML_ASSERT(workgroups_x == 1);
+        // Try to run two workgroups per SM.
+        split_k = ctx->device->shader_core_count * 2 / workgroups_y;
+        if (split_k > 1) {
+            // Try to evenly split KV into split_k chunks, but it needs to be a multiple
+            // of "align", so recompute split_k based on that.
+            split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
+            split_k = CEIL_DIV(KV, split_kv);
+            workgroups_x = split_k;
+        }
+    }
+
+    // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
+    // and the per-row m and L values (ne1 rows).
+    const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
+    if (split_k_size > ctx->device->max_memory_allocation_size) {
+        GGML_ABORT("Requested preallocation size is too large");
+    }
+    if (ctx->prealloc_size_split_k < split_k_size) {
+        ctx->prealloc_size_split_k = split_k_size;
+    }
+
     if (dryrun) {
         // Request descriptor sets
         ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        if (split_k > 1) {
+            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
+        }
         return;
     }
 
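To make the chunking arithmetic concrete, here is a host-side sketch with made-up device numbers. The two macros are written out with their usual ggml semantics (round up to a multiple of a power-of-two alignment; ceiling division); the device and tensor sizes are purely illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Usual ggml helper semantics: round M up to a multiple of power-of-two N,
// and divide M by N rounding up.
#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
#define CEIL_DIV(M, N)     (((M) + (N) - 1) / (N))

int main() {
    // Hypothetical device/workload, for illustration only.
    const uint32_t shader_core_count = 16;   // SMs on the GPU
    const uint32_t workgroups_y      = 4;    // e.g. heads / gqa_ratio
    const uint32_t KV                = 4096; // KV sequence length
    const uint32_t align             = 256;  // pipeline's KV alignment

    // Aim for two workgroups per SM, then re-derive an aligned chunk size.
    uint32_t split_k  = shader_core_count * 2 / workgroups_y; // 8
    uint32_t split_kv = ROUNDUP_POW2(KV / split_k, align);    // 512
    split_k           = CEIL_DIV(KV, split_kv);               // 8

    // Per-split temporaries: O (D x ne1 floats) plus m and L (ne1 floats each).
    const uint32_t D = 128, ne1 = 4;
    const uint64_t split_k_size =
        (uint64_t)(D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k;

    std::printf("split_k=%u split_kv=%u temp bytes=%llu\n",
                split_k, split_kv, (unsigned long long)split_k_size);
}
```

Note that `split_k` is recomputed from the aligned `split_kv`: rounding the chunk size up can reduce the number of chunks actually needed, and the final chunk simply covers whatever remains of KV.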
@@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
-    ggml_vk_sync_buffers(subctx);
-
     vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
     size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
 
@@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                         v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
                         nbm1,
                         scale, max_bias, logit_softcap,
-                        mask != nullptr, n_head_log2, m0, m1, gqa_ratio };
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
-        {
-            vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-            vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-            vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-            vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-            vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
-        },
-        sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
+                        mask != nullptr, n_head_log2, m0, m1,
+                        gqa_ratio, split_kv, split_k };
+
+    ggml_vk_sync_buffers(subctx);
+
+    if (split_k > 1) {
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+            {
+                vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+            },
+            // We only use split_k when group query attention is enabled, which means
+            // there's no more than one tile of rows (i.e. workgroups_x would have been
+            // one). We reuse workgroups_x to mean the number of splits, so we need to
+            // cancel out the divide by wg_denoms[0].
+            sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
+
+        ggml_vk_sync_buffers(subctx);
+        const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
+        ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
+            {
+                vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+                vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+            },
+            pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
+    } else {
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
+            {
+                vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
+                vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+            },
+            sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
+    }
 }
 
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
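The split path dispatches the attention pipeline once per KV chunk (`workgroups_x = split_k`), with each split writing its partial O tile plus per-row `m` (running max) and `L` (softmax denominator) into `ctx->prealloc_split_k`; `fa_split_k_reduce` then launches one workgroup per row (`{ne1, 1, 1}`) with push constants `{D, ne1, split_k}` to merge the partials. A scalar reference for the merge that reduction has to perform, assuming each split stores an unnormalized accumulator (the actual storage layout lives in the shader and isn't shown in this diff):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar sketch of a split-k flash-attention merge for one output row.
// Assumed per-split inputs: O[k] (unnormalized accumulator of length D),
// plus the row maximum m[k] and softmax denominator L[k]. The real work is
// done by the fa_split_k_reduce shader; this only illustrates the math.
void merge_splits(const std::vector<std::vector<float>>& O, // [k_num][D]
                  const std::vector<float>& m,              // [k_num]
                  const std::vector<float>& L,              // [k_num]
                  std::vector<float>& out) {                // [D], sized by caller
    const size_t k_num = O.size();
    const size_t D = out.size();

    // Global row maximum across all splits, for numerical stability.
    float m_max = -INFINITY;
    for (size_t k = 0; k < k_num; ++k) {
        m_max = std::fmax(m_max, m[k]);
    }

    // Rescale each split's denominator into the global frame and total them.
    float L_total = 0.0f;
    for (size_t k = 0; k < k_num; ++k) {
        L_total += std::exp(m[k] - m_max) * L[k];
    }

    // Rescale and sum the accumulators, then normalize once at the end.
    for (size_t d = 0; d < D; ++d) {
        float acc = 0.0f;
        for (size_t k = 0; k < k_num; ++k) {
            acc += std::exp(m[k] - m_max) * O[k][d];
        }
        out[d] = acc / L_total;
    }
}
```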