@@ -305,8 +305,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
-    LLM_KV_RESIDUAL_MULTIPLIER,
-    LLM_KV_EMBEDDING_MULTIPLIER,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -321,7 +321,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
-    LLM_KV_ATTENTION_MULTIPLIER,
+    LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -412,8 +412,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
-    { LLM_KV_RESIDUAL_MULTIPLIER, "%s.residual_multiplier" },
-    { LLM_KV_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -428,7 +428,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
-    { LLM_KV_ATTENTION_MULTIPLIER, "%s.attention.multiplier" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
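For context (not part of this diff): the `%s` placeholder in `LLM_KV_NAMES` is filled in with the architecture name, so after the rename a Granite GGUF is expected to carry metadata keys such as `granite.residual_scale`, `granite.embedding_scale`, and `granite.attention.scale` instead of the `*_multiplier` variants. A minimal sketch of that expansion, with `make_kv_key` as a hypothetical stand-in for llama.cpp's internal key formatting:

```cpp
#include <cstdio>
#include <string>

// Hypothetical helper mirroring how a key template is combined with an
// architecture prefix; the real code path in llama.cpp may differ.
static std::string make_kv_key(const char * tmpl, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, arch);
    return buf;
}

int main() {
    // "%s.residual_scale" + "granite" -> "granite.residual_scale"
    std::printf("%s\n", make_kv_key("%s.residual_scale",  "granite").c_str());
    std::printf("%s\n", make_kv_key("%s.attention.scale", "granite").c_str());
    return 0;
}
```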
@@ -2396,10 +2396,10 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

-    // For Granite architecture
-    float f_residual_multiplier = 0.0f;
-    float f_embedding_multiplier = 0.0f;
-    float f_attention_multiplier = 0.0f;
+    // Additional scale factors (Granite)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;

     bool causal_attn = true;
     bool use_alibi = false;
@@ -2456,16 +2456,16 @@ struct llama_hparams {

         const float EPSILON = 1e-9f;

-        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
-        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale,      other.f_residual_scale,      EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale,     other.f_embedding_scale,     EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale,     other.f_attention_scale,     EPSILON)) return true;

         return false;
     }
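For illustration (not part of this diff): the comparisons above rely on `is_float_close`, which is defined elsewhere in llama.cpp. A minimal sketch of an absolute-tolerance comparison in that spirit, assuming the real helper may add further checks:

```cpp
#include <cmath>

// Illustrative only: an absolute-tolerance float comparison in the spirit of
// is_float_close(); the helper llama.cpp actually ships may differ in detail.
static bool is_float_close_sketch(float a, float b, float abs_tol) {
    if (a == b) {
        return true;            // exact match, including +/-inf vs itself
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;           // one infinite, the other finite
    }
    return std::fabs(a - b) <= abs_tol;
}
```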
@@ -5465,9 +5465,9 @@ static void llm_load_hparams(
                 // Extra multipliers for Granite architecture
                 if (model.arch == LLM_ARCH_GRANITE) {
                     ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
-                    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier);
-                    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
-                    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+                    ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                    ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                    ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
                 }
             } break;
         case LLM_ARCH_MINICPM:
@@ -6759,9 +6759,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }

     if (model.arch == LLM_ARCH_GRANITE) {
-        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
-        LLAMA_LOG_INFO("%s: f_residual_multiplier  = %f\n", __func__, hparams.f_residual_multiplier);
-        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
     }
 }

@@ -8916,8 +8916,8 @@ static struct ggml_tensor * llm_build_inp_embd(
     }

     // For Granite architecture
-    if (hparams.f_embedding_multiplier != 0.0f) {
-        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
     }

     cb(inpL, "inp_embd", -1);
@@ -10198,7 +10198,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

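As the hunk above shows, a zero `f_attention_scale` is treated as "not set" and attention falls back to the usual 1/sqrt(head_dim) softmax scaling; a Granite GGUF that provides `%s.attention.scale` overrides it. A small standalone sketch of that fallback, with the head dimension of 128 chosen purely as an example:

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the kq_scale fallback: zero means "key absent", so use the
// standard 1/sqrt(head_dim) scaling; otherwise use the stored value.
int main() {
    const int   n_embd_head       = 128;  // example head dimension (assumption)
    const float f_attention_scale = 0.0f; // value when the GGUF key is absent/zero

    const float kq_scale = f_attention_scale == 0.0f
        ? 1.0f / sqrtf((float) n_embd_head)
        : f_attention_scale;

    std::printf("kq_scale = %f\n", kq_scale); // ~0.088388 for 128-dim heads
    return 0;
}
```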
@@ -10263,8 +10263,8 @@ struct llm_build_context {
             }

             // For Granite architecture
-            if (hparams.f_residual_multiplier) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
             }

             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -10304,8 +10304,8 @@ struct llm_build_context {
             }

             // For Granite architecture
-            if (hparams.f_residual_multiplier) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
             }

             cur = ggml_add(ctx0, cur, ffn_inp);
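Taken together, the hunks above place the three renamed factors at three points of the Granite forward pass: `f_embedding_scale` on the token embeddings, `f_attention_scale` as the softmax scale, and `f_residual_scale` on both the attention and FFN branches just before each residual add. A schematic sketch of that flow, with a single float standing in for a whole tensor and `attn_sketch`/`ffn_sketch` as hypothetical placeholders rather than llama.cpp functions:

```cpp
#include <cstdio>

// Dummy branch outputs so the sketch runs; they are not real model math.
static float attn_sketch(float x) { return 0.5f  * x; }
static float ffn_sketch (float x) { return 0.25f * x; }

int main() {
    // Example values only (assumptions, not taken from any real Granite GGUF).
    const float f_embedding_scale = 12.0f;
    const float f_residual_scale  = 0.22f;

    float x = 1.0f;                 // token embedding
    x *= f_embedding_scale;         // applied once in llm_build_inp_embd

    float h = x + f_residual_scale * attn_sketch(x); // scaled attention branch
    float y = h + f_residual_scale * ffn_sketch(h);  // scaled FFN branch

    std::printf("output = %f\n", y);
    return 0;
}
```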