
Commit 0bdf04e

fix(llama.cpp): Switch Granite param names to use _scale for consistency

Other scalar multipliers are called *_scale, so this provides a more consistent naming convention.

Branch: GraniteLM
Signed-off-by: Gabe Goodhart <[email protected]>

1 parent 8086380

1 file changed: 33 additions, 33 deletions

src/llama.cpp

@@ -305,8 +305,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
-    LLM_KV_RESIDUAL_MULTIPLIER,
-    LLM_KV_EMBEDDING_MULTIPLIER,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -321,7 +321,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
-    LLM_KV_ATTENTION_MULTIPLIER,
+    LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -412,8 +412,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
-    { LLM_KV_RESIDUAL_MULTIPLIER, "%s.residual_multiplier" },
-    { LLM_KV_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -428,7 +428,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
-    { LLM_KV_ATTENTION_MULTIPLIER, "%s.attention.multiplier" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
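
The "%s" in each LLM_KV_NAMES entry is a printf-style placeholder that gets filled with the architecture name, so after this change a Granite GGUF carries metadata keys such as "granite.residual_scale". A minimal sketch of that expansion (kv_name is a hypothetical helper for illustration, not the library's own):

#include <cstdio>
#include <string>

// Hypothetical helper mirroring how the "%s" arch prefix is expanded.
static std::string kv_name(const char * fmt, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arch);
    return buf;
}

int main() {
    // After this commit, the Granite-specific keys resolve to:
    std::printf("%s\n", kv_name("%s.residual_scale",  "granite").c_str()); // granite.residual_scale
    std::printf("%s\n", kv_name("%s.embedding_scale", "granite").c_str()); // granite.embedding_scale
    std::printf("%s\n", kv_name("%s.attention.scale", "granite").c_str()); // granite.attention.scale
    return 0;
}
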
@@ -2396,10 +2396,10 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;
 
-    // For Granite architecture
-    float f_residual_multiplier = 0.0f;
-    float f_embedding_multiplier = 0.0f;
-    float f_attention_multiplier = 0.0f;
+    // Additional scale factors (Granite)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
 
     bool causal_attn = true;
     bool use_alibi = false;
@@ -2456,16 +2456,16 @@ struct llama_hparams {
 
         const float EPSILON = 1e-9f;
 
-        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
-        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale,      other.f_residual_scale,      EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale,     other.f_embedding_scale,     EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale,     other.f_attention_scale,     EPSILON)) return true;
 
         return false;
     }
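
This comparator returns true as soon as any hyperparameter differs by more than EPSILON, so the renamed scale fields keep participating in the check. A hedged sketch of an absolute-tolerance float comparison in the spirit of is_float_close (the exact llama.cpp implementation may differ):

#include <cmath>

// Sketch only: absolute-tolerance comparison of two floats.
static bool is_float_close_sketch(float a, float b, float abs_tol) {
    if (a == b) {
        return true;                    // fast path; also matches equal infinities
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;                   // one infinite, one finite (or opposite-sign infinities)
    }
    return std::fabs(a - b) <= abs_tol; // plain absolute tolerance, no relative term
}
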
@@ -5465,9 +5465,9 @@ static void llm_load_hparams(
                 // Extra multipliers for Granite architecture
                 if (model.arch == LLM_ARCH_GRANITE) {
                     ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
-                    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier);
-                    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
-                    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+                    ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                    ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                    ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
                 }
             } break;
         case LLM_ARCH_MINICPM:
@@ -6759,9 +6759,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 
     if (model.arch == LLM_ARCH_GRANITE) {
-        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
-        LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier);
-        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
     }
 }

@@ -8916,8 +8916,8 @@ static struct ggml_tensor * llm_build_inp_embd(
     }
 
     // For Granite architecture
-    if (hparams.f_embedding_multiplier != 0.0f) {
-        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
     }
 
     cb(inpL, "inp_embd", -1);
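
Note the sentinel convention: f_embedding_scale defaults to 0.0f in llama_hparams, which downstream code reads as "unset", so non-Granite models skip the extra ggml_scale node entirely. A standalone scalar analogue of the guarded scaling (illustrative only; the real code operates on ggml tensors):

#include <vector>

// Scalar analogue: scale token embeddings only when the factor is set.
static void apply_embedding_scale(std::vector<float> & embd, float f_embedding_scale) {
    if (f_embedding_scale == 0.0f) {
        return;                    // default-initialized => leave embeddings untouched
    }
    for (float & v : embd) {
        v *= f_embedding_scale;
    }
}
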
@@ -10198,7 +10198,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;
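
The ternary keeps the conventional softmax scaling as the fallback: when f_attention_scale is unset (0.0f), kq_scale is 1/sqrt(n_embd_head); a Granite model's explicit value overrides it. A small worked example (n_embd_head = 128 is an assumed head dimension for illustration):

#include <cmath>
#include <cstdio>

int main() {
    const int   n_embd_head       = 128;  // assumed head dimension for illustration
    const float f_attention_scale = 0.0f; // 0.0f => "unset", use the default

    const float kq_scale = f_attention_scale == 0.0f
        ? 1.0f/sqrtf(float(n_embd_head))  // default: ~0.0884 for 128-dim heads
        : f_attention_scale;              // Granite override

    std::printf("kq_scale = %f\n", kq_scale);
    return 0;
}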

@@ -10263,8 +10263,8 @@ struct llm_build_context {
             }
 
             // For Granite architecture
-            if (hparams.f_residual_multiplier) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
             }
 
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -10304,8 +10304,8 @@ struct llm_build_context {
             }
 
             // For Granite architecture
-            if (hparams.f_residual_multiplier) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
            }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
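
Both hunks above apply the same pattern: the branch output (attention in the first, FFN in the second) is multiplied by f_residual_scale before being added back to the residual stream. A scalar analogue of that pattern (illustrative; the real code operates on ggml tensors):

// Scale a branch output before the residual add; 0.0f again means "unset".
static float add_scaled_branch(float residual, float branch_out, float f_residual_scale) {
    if (f_residual_scale != 0.0f) {    // truthiness check, as in the diff
        branch_out *= f_residual_scale;
    }
    return residual + branch_out;
}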
