@@ -360,8 +360,8 @@ struct ggml_tensor * forward(
360
360
// wk shape [n_embd, n_embd, 1, 1]
361
361
// Qcur shape [n_embd/n_head, n_head, N, 1]
362
362
// Kcur shape [n_embd/n_head, n_head, N, 1]
363
- struct ggml_tensor * Qcur = ggml_rope_inplace (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model->layers [il].wq , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
364
- struct ggml_tensor * Kcur = ggml_rope_inplace (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model->layers [il].wk , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
363
+ struct ggml_tensor * Qcur = ggml_rope (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model->layers [il].wq , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
364
+ struct ggml_tensor * Kcur = ggml_rope (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model->layers [il].wk , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
365
365
366
366
// store key and value to memory
367
367
{
@@ -414,17 +414,17 @@ struct ggml_tensor * forward(
414
414
// KQ_scaled = KQ / sqrt(n_embd/n_head)
415
415
// KQ_scaled shape [n_past + N, N, n_head, 1]
416
416
struct ggml_tensor * KQ_scaled =
417
- ggml_scale_inplace (ctx0,
417
+ ggml_scale (ctx0,
418
418
KQ,
419
419
ggml_new_f32 (ctx0, 1 .0f /sqrtf (float (n_embd)/n_head)));
420
420
421
421
// KQ_masked = mask_past(KQ_scaled)
422
422
// KQ_masked shape [n_past + N, N, n_head, 1]
423
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
423
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
424
424
425
425
// KQ_soft_max = soft_max(KQ_masked)
426
426
// KQ_soft_max shape [n_past + N, N, n_head, 1]
427
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
427
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
428
428
429
429
// split cached V into n_head heads
430
430
// // V shape [n_past + N, n_embd/n_head, n_head, 1]
@@ -446,9 +446,10 @@ struct ggml_tensor * forward(
446
446
447
447
// cur = KQV_merged.contiguous().view(n_embd, N)
448
448
// cur shape [n_embd,N,1,1]
449
- cur = ggml_cpy (ctx0,
450
- KQV_merged,
451
- ggml_new_tensor_2d (ctx0, GGML_TYPE_F32, n_embd, N));
449
+ cur = ggml_reshape_2d (ctx0, ggml_cont (ctx0, KQV_merged), n_embd, N);
450
+ // cur = ggml_cpy(ctx0,
451
+ // KQV_merged,
452
+ // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
452
453
453
454
// projection (no bias)
454
455
cur = ggml_mul_mat (ctx0,
0 commit comments