vulkan: Implement split_k for coopmat2 flash attention. #12627

Merged 1 commit on Apr 2, 2025
86 changes: 73 additions & 13 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -353,6 +353,7 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
vk_pipeline pipeline_flash_attn_split_k_reduce;

std::unordered_map<std::string, vk_pipeline_ref> pipelines;
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
@@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants {
float m1;

uint32_t gqa_ratio;
uint32_t split_kv;
uint32_t k_num;
};

struct vk_op_push_constants {
@@ -1476,7 +1479,7 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type

// small rows, large cols
if (small_rows) {
return {flash_attention_num_small_rows, 128};
return {flash_attention_num_small_rows, 64};
}
// small cols to reduce register count
if (ggml_is_quantized(type) || D == 256) {
@@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);

for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
@@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
workgroups_y /= N;
}

uint32_t split_kv = KV;
uint32_t split_k = 1;

if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
GGML_ASSERT(workgroups_x == 1);
// Try to run two workgroups per SM.
split_k = ctx->device->shader_core_count * 2 / workgroups_y;
if (split_k > 1) {
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
// of "align", so recompute split_k based on that.
split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align);
split_k = CEIL_DIV(KV, split_kv);
workgroups_x = split_k;
}
}

// Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
// and the per-row m and L values (ne1 rows).
const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
if (split_k_size > ctx->device->max_memory_allocation_size) {
GGML_ABORT("Requested preallocation size is too large");
}
if (ctx->prealloc_size_split_k < split_k_size) {
ctx->prealloc_size_split_k = split_k_size;
}

if (dryrun) {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
}
return;
}

@@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

ggml_vk_sync_buffers(subctx);

vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;

@@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
nbm1,
scale, max_bias, logit_softcap,
mask != nullptr, n_head_log2, m0, m1, gqa_ratio };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
},
sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
mask != nullptr, n_head_log2, m0, m1,
gqa_ratio, split_kv, split_k };

ggml_vk_sync_buffers(subctx);

if (split_k > 1) {
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
},
// We only use split_k when group query attention is enabled, which means
// there's no more than one tile of rows (i.e. workgroups_x would have been
// one). We reuse workgroups_x to mean the number of splits, so we need to
// cancel out the divide by wg_denoms[0].
sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });

ggml_vk_sync_buffers(subctx);
const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
{
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
},
pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
} else {
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
},
sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
}
}

static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
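A note on the sizing logic in `ggml_vk_flash_attn` above: when `gqa_ratio > 1` there is only one tile of rows, so the KV dimension is split across shader cores instead, targeting two workgroups per core. The per-split KV length is rounded up to the pipeline's alignment and the split count is then recomputed from it. A minimal standalone sketch of that arithmetic, with hypothetical values for the core count, head count, KV length, and alignment (the helpers mirror, but are not, the ggml `ROUNDUP_POW2`/`CEIL_DIV` macros):

```cpp
#include <cstdint>
#include <cstdio>

// Round x up to a multiple of p (p must be a power of two).
static uint32_t roundup_pow2(uint32_t x, uint32_t p) { return (x + p - 1) & ~(p - 1); }
// Integer ceiling division.
static uint32_t ceil_div(uint32_t x, uint32_t y) { return (x + y - 1) / y; }

int main() {
    const uint32_t KV                = 16384; // KV sequence length
    const uint32_t shader_core_count = 46;    // hypothetical SM count
    const uint32_t workgroups_y      = 8;     // heads after gqa folding
    const uint32_t align             = 64;    // pipeline tile alignment

    // Aim for two workgroups per shader core.
    uint32_t split_k  = shader_core_count * 2 / workgroups_y; // 11
    // Split KV as evenly as alignment allows, then recompute split_k.
    uint32_t split_kv = roundup_pow2(KV / split_k, align);    // 1489 -> 1536
    split_k           = ceil_div(KV, split_kv);               // 11 splits cover 16896 >= 16384
    printf("split_k=%u split_kv=%u\n", split_k, split_kv);
    return 0;
}
```

The scratch buffer then needs `(D * ne1 + 2 * ne1) * split_k` floats — one D × ne1 O tile plus one L row and one m row per split — which is exactly the `split_k_size` checked against `max_memory_allocation_size` above.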
40 changes: 37 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -63,6 +63,8 @@ layout (push_constant) uniform parameter {
float m1;

uint32_t gqa_ratio;
uint32_t split_kv;
uint32_t k_num;
} p;

layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
@@ -116,6 +118,16 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
return elem;
}

// Store column zero. This is used to save per-row m and L values for split_k.
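// Every column of the L and M accumulators holds the same per-row value
// (they are smeared across columns), so column zero carries the full row.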
ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
{
if (r < N && c == 0) {
uint32_t offset = iq2 + r;
data_o[o_offset + offset] = D_TYPE(elem);
}
return elem;
}

// Load the slope matrix, indexed by Q's dimension 2.
ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
{
@@ -135,10 +147,18 @@ void main() {
const uint32_t N = p.N;
const uint32_t KV = p.KV;

uint32_t i = gl_WorkGroupID.x;
uint32_t split_k_index = 0;

if (p.k_num > 1) {
i = 0;
split_k_index = gl_WorkGroupID.x;
}

const uint32_t Tr = CEIL_DIV(N, Br);
const uint32_t Tc = CEIL_DIV(KV, Bc);

const uint32_t i = gl_WorkGroupID.x;
const uint32_t start_j = split_k_index * p.split_kv / Bc;
const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);

// When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
// When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
@@ -218,7 +238,7 @@ void main() {
}

[[dont_unroll]]
for (uint32_t j = 0; j < Tc; ++j) {
for (uint32_t j = start_j; j < end_j; ++j) {

coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);

@@ -312,6 +332,20 @@ void main() {
O = coopMatMulAdd(P_A, V, O);
}

// If there is split_k, then the split_k resolve shader does the final
// division by L. Store the intermediate O value and per-row m and L values.
if (p.k_num > 1) {
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);

uint32_t o_offset = D * p.ne1 * split_k_index;
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);

o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
return;
}

coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;

// resize L by using smear/reduce
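The stores at the end of `main()` define the scratch-buffer layout that the reduce shader consumes: `k_num` O tiles of D × ne1 floats first, then per-split pairs of L and m rows (ne1 floats each). A small sketch of that layout math under the offsets used above (the struct and helpers are illustrative, not ggml API):

```cpp
#include <cstdint>

// Offsets, in floats, into the split_k scratch buffer.
struct SplitKLayout {
    uint32_t D, ne1, k_num;
    // O tile for split k: written by perElemOpGqaStore with o_offset = D*ne1*k.
    uint32_t o_offset(uint32_t k) const { return D * ne1 * k; }
    // L row for split k: after all O tiles, L/m pairs are laid out per split.
    uint32_t l_offset(uint32_t k) const { return D * ne1 * k_num + 2 * ne1 * k; }
    // m row for split k: immediately after that split's L row.
    uint32_t m_offset(uint32_t k) const { return l_offset(k) + ne1; }
    // Total size in bytes; matches the host-side split_k_size computation.
    uint64_t bytes() const { return uint64_t(D * ne1 + 2 * ne1) * k_num * sizeof(float); }
};
```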
59 changes: 59 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp
@@ -0,0 +1,59 @@
#version 450

#extension GL_EXT_control_flow_attributes : enable

#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {float data_a[];};
layout (binding = 1) writeonly buffer D {float data_d[];};

layout (push_constant) uniform parameter {
uint D;
uint N;
uint k_num;
} p;

void main() {
// Each workgroup handles a row
const uint n = gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;

uint D = p.D;
uint N = p.N;
uint k_num = p.k_num;

uint l_offset = D * N * k_num + n;
uint m_offset = D * N * k_num + N + n;
uint lm_stride = N * 2;

// Compute the max m value for the row
float m_max = -1.0/0.0;
[[unroll]] for (uint k = 0; k < k_num; ++k) {
float m = data_a[m_offset + k * lm_stride];
m_max = max(m_max, m);
}

// Compute L based on m_max
float L = 0;
[[unroll]] for (uint k = 0; k < k_num; ++k) {
float l = data_a[l_offset + k * lm_stride];
float m = data_a[m_offset + k * lm_stride];
L += exp(m - m_max) * l;
}

L = 1.0 / L;

// Scale and sum the O contributions based on m_max and store the result to memory
for (uint d = tid; d < D; d += BLOCK_SIZE) {
float O = 0.0;
[[unroll]] for (uint k = 0; k < k_num; ++k) {
uint o_offset = D * N * k + D * n + d;
float m = data_a[m_offset + k * lm_stride];
O += exp(m - m_max) * data_a[o_offset];
}
O *= L;
data_d[D * n + d] = O;
}
}
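
The reduction is the usual log-sum-exp merge for flash attention splits: take the row-wise max of the per-split maxima, rescale each split's L and O by `exp(m_k - m_max)`, accumulate, and divide by the merged L once at the end. A scalar C++ reference of the same math, handy for checking the shader against a CPU result (names are illustrative):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// One split's partial result for a single row: running max m, running
// denominator L, and the unnormalized output vector O of length D.
struct Partial { float m; float L; std::vector<float> O; };

std::vector<float> merge_splits(const std::vector<Partial> &parts, size_t D) {
    // Row-wise max across all splits.
    float m_max = -INFINITY;
    for (const auto &p : parts) m_max = std::max(m_max, p.m);

    // Merge denominators, rescaled to the common max.
    float L = 0.0f;
    for (const auto &p : parts) L += std::exp(p.m - m_max) * p.L;

    // Merge outputs with the same rescaling, then normalize once.
    std::vector<float> out(D, 0.0f);
    for (const auto &p : parts) {
        const float scale = std::exp(p.m - m_max);
        for (size_t d = 0; d < D; ++d) out[d] += scale * p.O[d];
    }
    for (size_t d = 0; d < D; ++d) out[d] /= L;
    return out;
}
```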
1 change: 1 addition & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -465,6 +465,7 @@ void process_shaders() {
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});

string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {});
string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});

string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
6 changes: 6 additions & 0 deletions tests/test-backend-ops.cpp
@@ -4516,6 +4516,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}

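// Large-KV grouped-query cases intended to exercise the new split_k path.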
for (int kv : { 4096, 8192, 16384, }) {
for (int hs : { 64, 128, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
}
}

return test_cases;
}
