@@ -3249,8 +3249,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                 // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
                 if (is_mla) {
-                    layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0);
-                    layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v_mla}, 0);
+                    layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+                    layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
                 } else {
                     layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
                 }
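
A minimal sketch (not from the PR) of why the shape change is safe, assuming the GGUF data keeps the same contiguous per-head ordering: with ggml's ne[0]-fastest layout, a contiguous 2D tensor of shape {n_embd_head_qk_nope, n_head * kv_lora_rank} occupies the same bytes as a 3D tensor of shape {n_embd_head_qk_nope, kv_lora_rank, n_head}, which is exactly what the old runtime ggml_view_3d reconstructed. The sizes below are hypothetical stand-ins for n_embd_head_qk_nope, kv_lora_rank and n_head.

```cpp
// Sketch only: check that "2D tensor + runtime ggml_view_3d" and the
// "3D tensor created by the loader" describe the same shape and strides.
#include <cassert>
#include "ggml.h"

int main() {
    const int64_t d_nope = 128, d_lora = 512, n_head = 16; // hypothetical sizes

    struct ggml_init_params params = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(params);

    // old layout: wk_b stored as {d_nope, n_head * d_lora}, viewed as 3D at graph-build time
    struct ggml_tensor * wk_b_2d   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_nope, n_head * d_lora);
    struct ggml_tensor * wk_b_view = ggml_view_3d(ctx, wk_b_2d,
            d_nope, d_lora, n_head,
            ggml_row_size(wk_b_2d->type, d_nope),
            ggml_row_size(wk_b_2d->type, d_nope) * d_lora,
            0);

    // new layout: the loader creates the tensor as {d_nope, d_lora, n_head} directly
    struct ggml_tensor * wk_b_3d = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_nope, d_lora, n_head);

    // identical shape and strides -> the runtime view and its stride bookkeeping are redundant
    for (int i = 0; i < 3; ++i) {
        assert(wk_b_view->ne[i] == wk_b_3d->ne[i]);
        assert(wk_b_view->nb[i] == wk_b_3d->nb[i]);
    }

    ggml_free(ctx);
    return 0;
}
```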
@@ -10143,18 +10143,11 @@ struct llm_build_deepseek2 : public llm_graph_context {
         cb(kv_cmpr, "kv_cmpr", il);
 
         if (is_mla) {
-            ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b,
-                    n_embd_head_qk_nope, kv_lora_rank, n_head,
-                    ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope),
-                    ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope) * kv_lora_rank,
-                    0);
-            cb(wk_b, "wk_b", il);
-
             // {n_embd_head_qk_nope, n_tokens, n_head}
             q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
             cb(q_nope, "q_nope_perm", il);
 
-            ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope);
+            ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
             cb(q_nope_absorbed, "q_nope_absorbed", il);
 
             // {n_embd_head_qk_rope, n_tokens, n_head}
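
A minimal sketch (not from the PR) of why the 3D wk_b can be passed straight in: ggml_mul_mat contracts over ne[0] and batches over the higher dimensions, so {n_embd_head_qk_nope, kv_lora_rank, n_head} times the permuted q_nope of shape {n_embd_head_qk_nope, n_tokens, n_head} yields {kv_lora_rank, n_tokens, n_head}, the same result the removed view produced. Sizes are hypothetical.

```cpp
// Sketch only: shape check for the per-head "absorbed" projection without an explicit view.
#include <cassert>
#include "ggml.h"

int main() {
    const int64_t d_nope = 128, d_lora = 512, n_head = 16, n_tokens = 7; // hypothetical sizes

    struct ggml_init_params params = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * wk_b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_nope, d_lora,   n_head); // as loaded
    struct ggml_tensor * q_nope = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_nope, n_tokens, n_head); // shape after ggml_permute

    // ggml_mul_mat contracts over ne[0] and runs one matmul per head (ne[2])
    struct ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx, wk_b, q_nope);

    assert(q_nope_absorbed->ne[0] == d_lora);
    assert(q_nope_absorbed->ne[1] == n_tokens);
    assert(q_nope_absorbed->ne[2] == n_head);

    ggml_free(ctx);
    return 0;
}
```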
@@ -10178,17 +10171,10 @@ struct llm_build_deepseek2 : public llm_graph_context {
             ggml_tensor * v_states = kv_cmpr;
             cb(v_states, "v_states", il);
 
-            ggml_tensor * v_mla = ggml_view_3d(ctx0, model.layers[il].wv_b,
-                    kv_lora_rank, n_embd_head_v, n_head,
-                    ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
-                    ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank) * n_embd_head_v,
-                    0);
-            cb(v_mla, "v_mla", il);
-
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn_mla(inp_attn, gf,
                     model.layers[il].wo, NULL,
-                    q_states, k_states, v_states, nullptr, v_mla, kq_scale, il);
+                    q_states, k_states, v_states, nullptr, model.layers[il].wv_b, kq_scale, il);
         } else {
             ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
             cb(kv, "kv", il);
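
The V side follows the same reasoning: build_attn_mla now receives the stored {kv_lora_rank, n_embd_head_v_mla, n_head} wv_b tensor instead of a runtime 3D view over a flattened 2D tensor. Assuming v_mla is applied as a batched per-head matmul against the attention output of shape {kv_lora_rank, n_tokens, n_head}, the product comes out as {n_embd_head_v_mla, n_tokens, n_head}, exactly as with the removed view.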