Commit 638b092

Removed the 3D views of wk_b and wv_b, and just save as 3D in GGUF
1 parent 5d037ae commit 638b092

File tree

2 files changed: +4, -20 lines changed

convert_hf_to_gguf.py

Lines changed: 0 additions & 2 deletions
@@ -4523,8 +4523,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
             k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
             k_b = k_b.transpose(1, 2)
-            k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim)
-            v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1])
 
             return [
                 (self.map_tensor_name(name_kb), k_b),
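
With the two reshapes removed, the converter now writes k_b and v_b as 3D tensors (one slice per head) straight into the GGUF. A minimal sketch of the resulting shapes, using made-up DeepSeek-V2-style placeholder dimensions (not values read from a real checkpoint):

import torch

# Placeholder MLA dimensions, chosen only to illustrate the shapes.
n_head_kv = 16          # number of KV heads
qk_nope_head_dim = 128  # non-RoPE part of each query/key head
v_head_dim = 128        # value head dimension
kv_lora_rank = 512      # rank of the compressed KV projection

# kv_b_proj.weight as found in the HF checkpoint:
# (n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)
data_torch = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

# Same steps as modify_tensors above: split per head, then into k/v parts.
kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)

# With the reshapes removed, both tensors stay 3D, one slice per head:
print(k_b.shape)  # torch.Size([16, 512, 128]) = (n_head_kv, kv_lora_rank, qk_nope_head_dim)
print(v_b.shape)  # torch.Size([16, 128, 512]) = (n_head_kv, v_head_dim, kv_lora_rank)

Since ggml/GGUF list dimensions fastest-varying first (the reverse of torch), these shapes line up with the {n_embd_head_qk_nope, kv_lora_rank, n_head} and {kv_lora_rank, n_embd_head_v_mla, n_head} shapes now expected by create_tensor in src/llama-model.cpp below.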

src/llama-model.cpp

Lines changed: 4 additions & 18 deletions
@@ -3249,8 +3249,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
                     if (is_mla) {
-                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0);
-                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v_mla}, 0);
+                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
                     } else {
                         layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
                     }
@@ -10143,18 +10143,11 @@ struct llm_build_deepseek2 : public llm_graph_context {
             cb(kv_cmpr, "kv_cmpr", il);
 
             if (is_mla) {
-                ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b,
-                        n_embd_head_qk_nope, kv_lora_rank, n_head,
-                        ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope),
-                        ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope) * kv_lora_rank,
-                        0);
-                cb(wk_b, "wk_b", il);
-
                 // {n_embd_head_qk_nope, n_tokens, n_head}
                 q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
                 cb(q_nope, "q_nope_perm", il);
 
-                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope);
+                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
                 cb(q_nope_absorbed, "q_nope_absorbed", il);
 
                 // {n_embd_head_qk_rope, n_tokens, n_head}
@@ -10178,17 +10171,10 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 ggml_tensor * v_states = kv_cmpr;
                 cb(v_states, "v_states", il);
 
-                ggml_tensor * v_mla = ggml_view_3d(ctx0, model.layers[il].wv_b,
-                        kv_lora_rank, n_embd_head_v, n_head,
-                        ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
-                        ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank) * n_embd_head_v,
-                        0);
-                cb(v_mla, "v_mla", il);
-
                 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                 cur = build_attn_mla(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, v_mla, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, model.layers[il].wv_b, kq_scale, il);
             } else {
                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                 cb(kv, "kv", il);
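
ggml_mul_mat treats a 3D weight as a batch of per-head matrices, broadcasting the multiplication over the third dimension. Since wk_b and wv_b now come out of the GGUF already in that 3D layout, the ggml_view_3d calls that used to rebuild it from a flat 2D tensor can be dropped. A rough torch sketch of the q_nope absorption step, with placeholder dimensions invented purely for illustration:

import torch

# Placeholder dimensions, for illustration only (not read from a real model).
n_head = 16
n_tokens = 4
qk_nope_head_dim = 128
kv_lora_rank = 512

# wk_b as loaded from GGUF: one (kv_lora_rank x qk_nope_head_dim) matrix per head.
wk_b = torch.randn(n_head, kv_lora_rank, qk_nope_head_dim)

# q_nope after the ggml_permute above: per head, qk_nope_head_dim values per token,
# written here as torch shape (n_head, n_tokens, qk_nope_head_dim).
q_nope = torch.randn(n_head, n_tokens, qk_nope_head_dim)

# Per-head matmul over the head dimension, contracting over qk_nope_head_dim --
# a rough analogue of ggml_mul_mat(wk_b, q_nope) on 3D operands.
q_nope_absorbed = torch.einsum('hrd,htd->htr', wk_b, q_nope)
print(q_nope_absorbed.shape)  # torch.Size([16, 4, 512]) = (n_head, n_tokens, kv_lora_rank)

The same reasoning applies to wv_b: build_attn_mla already expects a per-head v_mla matrix, so the stored 3D tensor can be passed in directly.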
