Commit 383065a

feat(llama.cpp): Add config parsing for Granite multiplier params
Branch: GraniteLM
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 406833d commit 383065a

1 file changed (+38, -8)


src/llama.cpp

Lines changed: 38 additions & 8 deletions
@@ -214,6 +214,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };

@@ -264,6 +265,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON,  "nemotron"  },
     { LLM_ARCH_EXAONE,    "exaone"    },
     { LLM_ARCH_RWKV6,     "rwkv6"     },
+    { LLM_ARCH_GRANITE,   "granite"   },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };

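For context (not part of this diff): llama.cpp resolves the llm_arch value from the general.architecture string stored in the GGUF file by consulting this table. A minimal, self-contained sketch of that kind of reverse lookup, using stand-in names rather than the repository's actual helper, might look like:

    #include <cstdio>
    #include <cstring>
    #include <map>

    enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_GRANITE, LLM_ARCH_UNKNOWN };

    // Stand-in for the LLM_ARCH_NAMES table above (trimmed to a few entries).
    static const std::map<llm_arch, const char *> ARCH_NAMES = {
        { LLM_ARCH_LLAMA,   "llama"     },
        { LLM_ARCH_GRANITE, "granite"   },
        { LLM_ARCH_UNKNOWN, "(unknown)" },
    };

    // Reverse lookup: architecture string from the GGUF metadata -> enum value.
    static llm_arch arch_from_string(const char * name) {
        for (const auto & kv : ARCH_NAMES) {
            if (std::strcmp(kv.second, name) == 0) {
                return kv.first;
            }
        }
        return LLM_ARCH_UNKNOWN;
    }

    int main() {
        // With the entry above, "granite" now resolves to its own enum value
        // instead of falling through to LLM_ARCH_UNKNOWN.
        std::printf("%d\n", arch_from_string("granite") == LLM_ARCH_GRANITE); // 1
        return 0;
    }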

@@ -303,6 +305,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_MULTIPLIER,
+    LLM_KV_EMBEDDING_MULTIPLIER,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -317,6 +321,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_MULTIPLIER,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,

@@ -407,6 +412,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM,     "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM,   "%s.time_decay_extra_dim" },
+    { LLM_KV_RESIDUAL_MULTIPLIER,    "%s.residual_multiplier" },
+    { LLM_KV_EMBEDDING_MULTIPLIER,   "%s.embedding_multiplier" },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@@ -421,6 +428,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_MULTIPLIER,             "%s.attention.multiplier" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE,       "%s.rope.freq_base" },

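Each "%s" in these templates is substituted with the architecture name, so with the "granite" entry registered above the new hyperparameters live under granite.residual_multiplier, granite.embedding_multiplier, and granite.attention.multiplier in the GGUF metadata. A small standalone sketch of that substitution (the actual key-formatting helper in llama.cpp is not part of this diff):

    #include <cstdio>
    #include <string>

    // Sketch (assumption): expand a "%s."-prefixed key template with the
    // architecture name, the way the LLM_KV_NAMES entries above are intended to be used.
    static std::string expand_kv(const char * tmpl, const char * arch) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, arch);
        return buf;
    }

    int main() {
        const char * arch = "granite"; // from LLM_ARCH_NAMES
        std::printf("%s\n", expand_kv("%s.residual_multiplier",  arch).c_str()); // granite.residual_multiplier
        std::printf("%s\n", expand_kv("%s.embedding_multiplier", arch).c_str()); // granite.embedding_multiplier
        std::printf("%s\n", expand_kv("%s.attention.multiplier", arch).c_str()); // granite.attention.multiplier
        return 0;
    }
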
@@ -2372,6 +2380,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

+    // For Granite architecture
+    float f_residual_multiplier  = 0.0f;
+    float f_embedding_multiplier = 0.0f;
+    float f_attention_multiplier = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;

@@ -2427,13 +2440,16 @@ struct llama_hparams {

         const float EPSILON = 1e-9f;

-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
+        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;

         return false;
     }

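is_float_close is an existing helper defined elsewhere in llama.cpp and does not appear in this diff; a minimal absolute-tolerance comparison consistent with how it is called above could look like the following sketch (not the repository's exact implementation):

    #include <cmath>
    #include <cstdio>

    // Sketch of an absolute-tolerance float comparison matching the call sites above:
    // true when the values are exactly equal or differ by at most abs_tol.
    static bool is_float_close(float a, float b, float abs_tol) {
        if (a == b) {
            return true;            // exact match, including equal infinities
        }
        if (std::isinf(a) || std::isinf(b)) {
            return false;           // one side infinite, the other not
        }
        return std::fabs(a - b) <= abs_tol;
    }

    int main() {
        const float EPSILON = 1e-9f;
        // A changed multiplier makes the hparams comparison above report a mismatch.
        std::printf("%d\n", is_float_close(0.22f, 0.22f, EPSILON)); // 1
        std::printf("%d\n", is_float_close(0.22f, 0.25f, EPSILON)); // 0
        return 0;
    }
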
@@ -5406,6 +5422,7 @@ static void llm_load_hparams(
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -5422,13 +5439,20 @@ static void llm_load_hparams(
                    // granite uses a vocab with len 49152
                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                    case 36: model.type = e_model::MODEL_8B; break; // granite
-                   case 40: model.type = e_model::MODEL_13B; break;
+                   case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break;
                    case 48: model.type = e_model::MODEL_34B; break;
                    case 60: model.type = e_model::MODEL_30B; break;
                    case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            }
+           // Extra multipliers for Granite architecture
+           if (model.arch == LLM_ARCH_GRANITE) {
+               ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+               ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier);
+               ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
+               ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+           }
        } break;
    case LLM_ARCH_MINICPM:
        {

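Taken together with the key templates registered earlier, a converted Granite GGUF is expected to carry float metadata entries along these lines (the three multiplier key names follow directly from this diff; granite.logit_scale assumes the existing LLM_KV_LOGIT_SCALE template, which is not shown here, and the values are placeholders rather than real model constants):

    granite.logit_scale          = <float>
    granite.residual_multiplier  = <float>
    granite.embedding_multiplier = <float>
    granite.attention.multiplier = <float>

Whether the loader treats these keys as required or optional depends on ml.get_key's default behavior, which is outside this diff.
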
@@ -6717,6 +6741,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE) {
+        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
+        LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier);
+        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+    }
 }

 // Returns false if cancelled by progress_callback

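When a Granite model is loaded, the new block should add lines of this shape to the llm_load_print_meta output (the format comes from the LLAMA_LOG_INFO calls above; the values shown are placeholders, not real Granite constants):

    llm_load_print_meta: f_embedding_multiplier = <value>
    llm_load_print_meta: f_residual_multiplier = <value>
    llm_load_print_meta: f_attention_multiplier = <value>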
