@@ -11187,46 +11187,69 @@ struct llm_build_context {
11187
11187
}
11188
11188
11189
11189
// split into {n_head * n_embd_head_qk_nope, n_tokens}
11190
- struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, 0);
11190
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11191
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11192
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11193
+ 0);
11191
11194
cb(q_nope, "q_nope", il);
11195
+
11192
11196
// and {n_head * n_embd_head_qk_rope, n_tokens}
11193
- struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, ggml_element_size(q) * n_embd_head_qk_nope);
11197
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11198
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11199
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11200
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11194
11201
cb(q_pe, "q_pe", il);
11195
11202
11196
11203
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11197
- struct ggml_tensor * compressed_kv_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11198
- cb(compressed_kv_pe , "compressed_kv_pe ", il);
11204
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11205
+ cb(kv_pe_compresseed , "kv_pe_compresseed ", il);
11199
11206
11200
11207
// split into {kv_lora_rank, n_tokens}
11201
- struct ggml_tensor * compressed_kv = ggml_view_2d(ctx0, compressed_kv_pe, kv_lora_rank, n_tokens, compressed_kv_pe->nb[1], 0);
11202
- cb(compressed_kv, "compressed_kv", il);
11208
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11209
+ kv_pe_compresseed->nb[1],
11210
+ 0);
11211
+ cb(kv_compressed, "kv_compressed", il);
11212
+
11203
11213
// and {n_embd_head_qk_rope, n_tokens}
11204
- struct ggml_tensor * k_pe = ggml_view_2d(ctx0, compressed_kv_pe, n_embd_head_qk_rope, n_tokens, compressed_kv_pe->nb[1], ggml_element_size(compressed_kv_pe)*kv_lora_rank);
11214
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11215
+ kv_pe_compresseed->nb[1],
11216
+ kv_pe_compresseed->nb[1],
11217
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11205
11218
cb(k_pe, "k_pe", il);
11206
11219
11207
- compressed_kv = llm_build_norm(ctx0, compressed_kv, hparams,
11220
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11221
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11208
11222
model.layers[il].attn_kv_a_norm, NULL,
11209
11223
LLM_NORM_RMS, cb, il);
11210
- cb(compressed_kv , "compressed_kv ", il);
11224
+ cb(kv_compressed , "kv_compressed ", il);
11211
11225
11212
11226
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11213
- struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, compressed_kv );
11227
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed );
11214
11228
cb(kv, "kv", il);
11215
11229
11216
11230
// split into {n_head * n_embd_head_qk_nope, n_tokens}
11217
- struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), 0);
11231
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11232
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11233
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11234
+ 0);
11218
11235
cb(k_nope, "k_nope", il);
11219
11236
11220
11237
// and {n_head * n_embd_head_v, n_tokens}
11221
- struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_embd_head_qk_nope);
11238
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11239
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11240
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11241
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11222
11242
cb(v_states, "v_states", il);
11223
11243
11224
11244
v_states = ggml_cont(ctx0, v_states);
11225
11245
cb(v_states, "v_states", il);
11226
11246
11227
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, ggml_element_size(kv) * hparams.n_embd_head_v * n_head, 0);
11247
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11248
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11249
+ 0);
11228
11250
cb(v_states, "v_states", il);
11229
11251
11252
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11230
11253
q_pe = ggml_rope_ext(
11231
11254
ctx0, q_pe, inp_pos, nullptr,
11232
11255
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -11235,8 +11258,9 @@ struct llm_build_context {
11235
11258
cb(q_pe, "q_pe", il);
11236
11259
11237
11260
// shared RoPE key
11261
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11238
11262
k_pe = ggml_rope_ext(
11239
- ctx0, ggml_view_3d(ctx0, k_pe, n_embd_head_qk_rope, 1, n_tokens, k_pe->nb[0], k_pe->nb[1], 0) , inp_pos, nullptr,
11263
+ ctx0, k_pe, inp_pos, nullptr,
11240
11264
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11241
11265
ext_factor, attn_factor_scaled, beta_fast, beta_slow
11242
11266
);
0 commit comments