Commit 995ee09

llm : deduce norm eps based on type + explicit max_alibi_bias, clamp_kqv
1 parent 9284aa6 commit 995ee09
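
In short: build_norm() no longer takes an eps argument; the epsilon is deduced from the norm type by reading hparams.f_norm_eps or hparams.f_norm_rms_eps directly. Likewise, build_kqv() drops the redundant n_tokens/n_kv parameters and takes an explicit max_alibi_bias, and the MPT-style KQV clamp now reads hparams.f_clamp_kqv at the call site. Below is a minimal, self-contained sketch of the eps-deduction idea only; the hparams_t struct, the norm_eps_for() helper, and the default values are hypothetical illustrations, not code from this commit.

// Sketch (hypothetical, standalone): instead of threading an `eps` argument
// through every norm call, deduce it from the norm type via the model
// hyperparameters, so the type and its epsilon can never drift apart.
#include <cstdio>

enum llm_norm_type { LLM_NORM, LLM_NORM_RMS };

struct hparams_t {
    float f_norm_eps     = 1e-5f;  // used by LLM_NORM (LayerNorm-style models)
    float f_norm_rms_eps = 1e-6f;  // used by LLM_NORM_RMS (RMSNorm-style models)
};

// callers name only the norm type; the matching eps is looked up here
static float norm_eps_for(const hparams_t & hparams, llm_norm_type type) {
    return type == LLM_NORM ? hparams.f_norm_eps : hparams.f_norm_rms_eps;
}

int main() {
    hparams_t hparams;
    printf("LLM_NORM eps     = %g\n", (double) norm_eps_for(hparams, LLM_NORM));
    printf("LLM_NORM_RMS eps = %g\n", (double) norm_eps_for(hparams, LLM_NORM_RMS));
    return 0;
}

The practical effect, visible in the diff below, is that every build_norm() call site shrinks to "norm type + layer index", and build_kqv() call sites pass only the tensors plus the ALiBi bias.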

File tree

1 file changed (+43, -50 lines changed)

llama.cpp

Lines changed: 43 additions & 50 deletions
@@ -3138,8 +3138,6 @@ struct llm_build_context {
     const float freq_scale;
     const float norm_eps;
     const float norm_rms_eps;
-    const float clamp_kqv;
-    const float max_alibi_bias;
 
     const int32_t n_tokens;
     const int32_t n_kv;
@@ -3176,8 +3174,6 @@ struct llm_build_context {
         freq_scale    (cparams.rope_freq_scale),
         norm_eps      (hparams.f_norm_eps),
         norm_rms_eps  (hparams.f_norm_rms_eps),
-        clamp_kqv     (hparams.f_clamp_kqv),
-        max_alibi_bias(hparams.f_max_alibi_bias),
         n_tokens      (batch.n_tokens),
         n_kv          (worst_case ? n_ctx : kv_self.n),
         kv_head       (worst_case ? n_ctx - n_tokens : kv_self.head),
@@ -3297,11 +3293,10 @@ struct llm_build_context {
             struct ggml_tensor * mw,
             struct ggml_tensor * mb,
             llm_norm_type type,
-            float eps,
             int il) {
         switch (type) {
-            case LLM_NORM:     cur = ggml_norm    (ctx, cur, eps); break;
-            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break;
+            case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
+            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
         }
 
         if (mw || mb) {
@@ -3418,9 +3413,7 @@ struct llm_build_context {
             struct ggml_tensor * q_cur,
             struct ggml_tensor * kq_scale,
             struct ggml_tensor * kq_mask,
-            int32_t n_tokens,
-            int32_t n_kv,
-            float alibi_bias_max,
+            float max_alibi_bias,
             int il) {
         struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
         cb(q, "q", il);
@@ -3439,11 +3432,11 @@ struct llm_build_context {
         kq = ggml_scale(ctx, kq, kq_scale);
         cb(kq, "kq_scaled", il);
 
-        if (alibi_bias_max > 0.0f) {
+        if (max_alibi_bias > 0.0f) {
             // TODO: n_head or n_head_kv
             // TODO: K-shift is likely not working
             // TODO: change to ggml_add
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max);
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
             cb(kq, "kq_scaled_alibi", il);
         }
 
@@ -3516,7 +3509,7 @@ struct llm_build_context {
             // norm
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);
 
             // self-attention
@@ -3541,7 +3534,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, cur,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                        Qcur, KQ_scale, KQ_mask, -1.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -3552,7 +3545,7 @@ struct llm_build_context {
            {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, norm_rms_eps, il);
+                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -3574,7 +3567,7 @@ struct llm_build_context {
 
        cur = build_norm(ctx0, cur,
                model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
 
        // lm_head
@@ -3616,7 +3609,7 @@ struct llm_build_context {
 
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);
 
            // self-attention
@@ -3648,11 +3641,11 @@ struct llm_build_context {
                build_kv_store(ctx0, Kcur, Vcur, il);
 
                // apply ALiBi for 13B model
-                const float alibi_bias_max = model.type == MODEL_13B ? 8.0f : -1.0f;
+                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
                cur = build_kqv(ctx0, cur,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, il);
+                        Qcur, KQ_scale, KQ_mask, max_alibi_bias, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -3663,7 +3656,7 @@ struct llm_build_context {
            {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, norm_rms_eps, il);
+                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -3685,7 +3678,7 @@ struct llm_build_context {
 
        cur = build_norm(ctx0, cur,
                model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
 
        // lm_head
@@ -3728,7 +3721,7 @@ struct llm_build_context {
            attn_norm = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm,
                    model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
            cb(attn_norm, "attn_norm", il);
 
            // self-attention
@@ -3738,7 +3731,7 @@ struct llm_build_context {
                cur = build_norm(ctx0, attn_norm,
                        model.layers[il].attn_norm_2,
                        model.layers[il].attn_norm_2_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(cur, "attn_norm_2", il);
            } else {
                cur = attn_norm;
@@ -3769,7 +3762,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, attn_norm,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                        Qcur, KQ_scale, KQ_mask, -1.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -3801,7 +3794,7 @@ struct llm_build_context {
        cur = build_norm(ctx0, cur,
                model.output_norm,
                model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(cur, "result_norm", -1);
 
        cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -3843,7 +3836,7 @@ struct llm_build_context {
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm,
                    model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
            cb(cur, "attn_norm", il);
 
            // self-attention
@@ -3868,7 +3861,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, cur,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                        Qcur, KQ_scale, KQ_mask, -1.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -3881,7 +3874,7 @@ struct llm_build_context {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm,
                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -3899,7 +3892,7 @@ struct llm_build_context {
        cur = build_norm(ctx0, inpL,
                model.output_norm,
                model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(cur, "result_norm", -1);
 
        cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -3940,7 +3933,7 @@ struct llm_build_context {
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm,
                    model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
            cb(cur, "attn_norm", il);
 
            // self attention
@@ -3980,13 +3973,13 @@ struct llm_build_context {
                tmpq = build_norm(ctx0, tmpq,
                        model.layers[il].attn_q_norm,
                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(tmpq, "tmpq", il);
 
                tmpk = build_norm(ctx0, tmpk,
                        model.layers[il].attn_k_norm,
                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(tmpk, "tmpk", il);
 
                // RoPE the first n_rot of q/k, pass the other half, and concat.
@@ -4072,7 +4065,7 @@ struct llm_build_context {
                // TODO: not tested, could be broken
                cur = build_kqv(ctx0, Q,
                        model.layers[il].wo, model.layers[il].bo,
-                        Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, il);
+                        Q, KQ_scale, KQ_mask, -1.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -4084,7 +4077,7 @@ struct llm_build_context {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm,
                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -4106,7 +4099,7 @@ struct llm_build_context {
        cur = build_norm(ctx0, cur,
                model.output_norm,
                model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(cur, "result_norm", -1);
 
        cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -4138,7 +4131,7 @@ struct llm_build_context {
 
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, norm_rms_eps, il);
+                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);
 
            // self-attention
@@ -4162,7 +4155,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, Qcur,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, il);
+                        Qcur, KQ_scale, KQ_mask, 8.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -4173,7 +4166,7 @@ struct llm_build_context {
            {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, norm_rms_eps, il);
+                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -4195,7 +4188,7 @@ struct llm_build_context {
 
        cur = build_norm(ctx0, cur,
                model.output_norm, NULL,
-                LLM_NORM_RMS, norm_rms_eps, -1);
+                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
 
        // lm_head
@@ -4226,14 +4219,14 @@ struct llm_build_context {
        inpL = build_norm(ctx0, inpL,
                model.tok_norm,
                model.tok_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(inpL, "inp_norm", -1);
 
        for (int il = 0; il < n_layer; ++il) {
            cur = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm,
                    model.layers[il].attn_norm_b,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
            cb(cur, "attn_norm", il);
 
            // self-attention
@@ -4258,7 +4251,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, Qcur,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, il);
+                        Qcur, KQ_scale, KQ_mask, 8.0f, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -4271,7 +4264,7 @@ struct llm_build_context {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm,
                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -4289,7 +4282,7 @@ struct llm_build_context {
        cur = build_norm(ctx0, inpL,
                model.output_norm,
                model.output_norm_b,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(cur, "result_norm", -1);
 
        cur = ggml_mul_mat(ctx0, model.output, cur);
@@ -4322,7 +4315,7 @@ struct llm_build_context {
            attn_norm = build_norm(ctx0, inpL,
                    model.layers[il].attn_norm,
                    NULL,
-                    LLM_NORM, norm_eps, il);
+                    LLM_NORM, il);
            cb(attn_norm, "attn_norm", il);
 
            // self-attention
@@ -4332,8 +4325,8 @@ struct llm_build_context {
                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);
 
-                if (clamp_kqv > 0.0f) {
-                    cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                if (hparams.f_clamp_kqv > 0.0f) {
+                    cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                    cb(cur, "wqkv_clamped", il);
                }
 
@@ -4351,7 +4344,7 @@ struct llm_build_context {
 
                cur = build_kqv(ctx0, Qcur,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, il);
+                        Qcur, KQ_scale, KQ_mask, hparams.f_max_alibi_bias, il);
                cb(cur, "kqv_out", il);
            }
 
@@ -4364,7 +4357,7 @@ struct llm_build_context {
                cur = build_norm(ctx0, ffn_inp,
                        model.layers[il].ffn_norm,
                        NULL,
-                        LLM_NORM, norm_eps, il);
+                        LLM_NORM, il);
                cb(cur, "ffn_norm", il);
 
                cur = build_ffn(ctx0, cur,
@@ -4387,7 +4380,7 @@ struct llm_build_context {
        cur = build_norm(ctx0, cur,
                model.output_norm,
                NULL,
-                LLM_NORM, norm_eps, -1);
+                LLM_NORM, -1);
        cb(cur, "result_norm", -1);
 
        cur = ggml_mul_mat(ctx0, model.output, cur);

0 commit comments
