@@ -3138,8 +3138,6 @@ struct llm_build_context {
     const float freq_scale;
     const float norm_eps;
     const float norm_rms_eps;
-    const float clamp_kqv;
-    const float max_alibi_bias;
 
     const int32_t n_tokens;
     const int32_t n_kv;
@@ -3176,8 +3174,6 @@ struct llm_build_context {
         freq_scale       (cparams.rope_freq_scale),
         norm_eps         (hparams.f_norm_eps),
         norm_rms_eps     (hparams.f_norm_rms_eps),
-        clamp_kqv        (hparams.f_clamp_kqv),
-        max_alibi_bias   (hparams.f_max_alibi_bias),
         n_tokens         (batch.n_tokens),
         n_kv             (worst_case ? n_ctx : kv_self.n),
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
@@ -3297,11 +3293,10 @@ struct llm_build_context {
             struct ggml_tensor * mw,
             struct ggml_tensor * mb,
             llm_norm_type type,
-            float eps,
             int il) {
         switch (type) {
-            case LLM_NORM:     cur = ggml_norm    (ctx, cur, eps); break;
-            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break;
+            case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
+            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
         }
 
         if (mw || mb) {
@@ -3418,9 +3413,7 @@ struct llm_build_context {
             struct ggml_tensor * q_cur,
             struct ggml_tensor * kq_scale,
             struct ggml_tensor * kq_mask,
-            int32_t n_tokens,
-            int32_t n_kv,
-            float alibi_bias_max,
+            float max_alibi_bias,
             int il) {
         struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
         cb(q, "q", il);
@@ -3439,11 +3432,11 @@ struct llm_build_context {
         kq = ggml_scale(ctx, kq, kq_scale);
         cb(kq, "kq_scaled", il);
 
-        if (alibi_bias_max > 0.0f) {
+        if (max_alibi_bias > 0.0f) {
             // TODO: n_head or n_head_kv
             // TODO: K-shift is likely not working
             // TODO: change to ggml_add
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max);
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
             cb(kq, "kq_scaled_alibi", il);
         }
 
@@ -3516,7 +3509,7 @@ struct llm_build_context {
             // norm
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -3541,7 +3534,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, cur,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                    Qcur, KQ_scale, KQ_mask, -1.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -3552,7 +3545,7 @@ struct llm_build_context {
         {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -3574,7 +3567,7 @@ struct llm_build_context {
 
         cur = build_norm(ctx0, cur,
                 model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
         cb(cur, "result_norm", -1);
 
         // lm_head
@@ -3616,7 +3609,7 @@ struct llm_build_context {
 
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -3648,11 +3641,11 @@ struct llm_build_context {
             build_kv_store(ctx0, Kcur, Vcur, il);
 
             // apply ALiBi for 13B model
-            const float alibi_bias_max = model.type == MODEL_13B ? 8.0f : -1.0f;
+            const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
             cur = build_kqv(ctx0, cur,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, il);
+                    Qcur, KQ_scale, KQ_mask, max_alibi_bias, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -3663,7 +3656,7 @@ struct llm_build_context {
         {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -3685,7 +3678,7 @@ struct llm_build_context {
 
         cur = build_norm(ctx0, cur,
                 model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
         cb(cur, "result_norm", -1);
 
         // lm_head
@@ -3728,7 +3721,7 @@ struct llm_build_context {
             attn_norm = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm,
                     model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(attn_norm, "attn_norm", il);
 
             // self-attention
@@ -3738,7 +3731,7 @@ struct llm_build_context {
                 cur = build_norm(ctx0, attn_norm,
                         model.layers[il].attn_norm_2,
                         model.layers[il].attn_norm_2_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                 cb(cur, "attn_norm_2", il);
             } else {
                 cur = attn_norm;
@@ -3769,7 +3762,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, attn_norm,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                    Qcur, KQ_scale, KQ_mask, -1.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -3801,7 +3794,7 @@ struct llm_build_context {
         cur = build_norm(ctx0, cur,
                 model.output_norm,
                 model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(cur, "result_norm", -1);
 
         cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -3843,7 +3836,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm,
                     model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -3868,7 +3861,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, cur,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                    Qcur, KQ_scale, KQ_mask, -1.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -3881,7 +3874,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm,
                     model.layers[il].ffn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -3899,7 +3892,7 @@ struct llm_build_context {
         cur = build_norm(ctx0, inpL,
                 model.output_norm,
                 model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(cur, "result_norm", -1);
 
         cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -3940,7 +3933,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm,
                     model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "attn_norm", il);
 
             // self attention
@@ -3980,13 +3973,13 @@ struct llm_build_context {
                 tmpq = build_norm(ctx0, tmpq,
                         model.layers[il].attn_q_norm,
                         model.layers[il].attn_q_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                 cb(tmpq, "tmpq", il);
 
                 tmpk = build_norm(ctx0, tmpk,
                         model.layers[il].attn_k_norm,
                         model.layers[il].attn_k_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                 cb(tmpk, "tmpk", il);
 
                 // RoPE the first n_rot of q/k, pass the other half, and concat.
@@ -4072,7 +4065,7 @@ struct llm_build_context {
             // TODO: not tested, could be broken
             cur = build_kqv(ctx0, Q,
                     model.layers[il].wo, model.layers[il].bo,
-                    Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                    Q, KQ_scale, KQ_mask, -1.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4084,7 +4077,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm,
                     model.layers[il].ffn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -4106,7 +4099,7 @@ struct llm_build_context {
         cur = build_norm(ctx0, cur,
                 model.output_norm,
                 model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(cur, "result_norm", -1);
 
         cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -4138,7 +4131,7 @@ struct llm_build_context {
 
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -4162,7 +4155,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, Qcur,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, il);
+                    Qcur, KQ_scale, KQ_mask, 8.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4173,7 +4166,7 @@ struct llm_build_context {
         {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -4195,7 +4188,7 @@ struct llm_build_context {
 
         cur = build_norm(ctx0, cur,
                 model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
         cb(cur, "result_norm", -1);
 
         // lm_head
@@ -4226,14 +4219,14 @@ struct llm_build_context {
         inpL = build_norm(ctx0, inpL,
                 model.tok_norm,
                 model.tok_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(inpL, "inp_norm", -1);
 
         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm,
                     model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "attn_norm", il);
 
             // self-attention
@@ -4258,7 +4251,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, Qcur,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, il);
+                    Qcur, KQ_scale, KQ_mask, 8.0f, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4271,7 +4264,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm,
                     model.layers[il].ffn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -4289,7 +4282,7 @@ struct llm_build_context {
         cur = build_norm(ctx0, inpL,
                 model.output_norm,
                 model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(cur, "result_norm", -1);
 
         cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -4322,7 +4315,7 @@ struct llm_build_context {
             attn_norm = build_norm(ctx0, inpL,
                     model.layers[il].attn_norm,
                     NULL,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(attn_norm, "attn_norm", il);
 
             // self-attention
@@ -4332,8 +4325,8 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                if (clamp_kqv > 0.0f) {
-                    cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                     cb(cur, "wqkv_clamped", il);
                 }
 
@@ -4351,7 +4344,7 @@ struct llm_build_context {
 
             cur = build_kqv(ctx0, Qcur,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, il);
+                    Qcur, KQ_scale, KQ_mask, hparams.f_max_alibi_bias, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4364,7 +4357,7 @@ struct llm_build_context {
             cur = build_norm(ctx0, ffn_inp,
                     model.layers[il].ffn_norm,
                     NULL,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
             cb(cur, "ffn_norm", il);
 
             cur = build_ffn(ctx0, cur,
@@ -4387,7 +4380,7 @@ struct llm_build_context {
         cur = build_norm(ctx0, cur,
                 model.output_norm,
                 NULL,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
         cb(cur, "result_norm", -1);
 
         cur = ggml_mul_mat(ctx0, model.output, cur);
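For orientation, below is a minimal sketch of the call pattern after this change, taken from the call sites in the diff above. It is illustrative only: anything not visible in the patch (for example the leading build_kqv arguments and the surrounding loop) is assumed, not confirmed by this commit.

    // build_norm no longer takes eps; it reads hparams.f_norm_eps / hparams.f_norm_rms_eps itself
    cur = build_norm(ctx0, inpL,
            model.layers[il].attn_norm, NULL,
            LLM_NORM_RMS, il);

    // build_kqv no longer takes n_tokens/n_kv; only max_alibi_bias remains
    // (values <= 0.0f, such as -1.0f, skip the ggml_alibi bias as shown in the build_kqv hunk)
    cur = build_kqv(ctx0, cur,
            model.layers[il].wo, NULL,
            Qcur, KQ_scale, KQ_mask, -1.0f, il);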