Commit 009a2b4

feat(llama.cpp): Add config parsing for Granite multiplier params
Branch: GraniteLM
Signed-off-by: Gabe Goodhart <[email protected]>

1 parent: 7152e2c

1 file changed (+38, -8)

src/llama.cpp

Lines changed: 38 additions & 8 deletions
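
Note on the key names: each LLM_KV_NAMES entry added below is a printf-style template in which %s is filled with the architecture name, so for LLM_ARCH_GRANITE ("granite") the new hparams are read from GGUF metadata keys such as granite.residual_multiplier, granite.embedding_multiplier, and granite.attention.multiplier (plus the pre-existing granite.logit_scale). A minimal standalone sketch of that expansion, using a hypothetical granite_key() helper rather than llama.cpp's internals:

    #include <cstdio>
    #include <string>

    // Hypothetical helper (illustration only): expands a "%s."-style key
    // template with the architecture name, the way LLM_KV_NAMES entries
    // are resolved for a given llm_arch.
    static std::string granite_key(const char * tmpl) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, "granite");
        return std::string(buf);
    }

    int main() {
        std::printf("%s\n", granite_key("%s.residual_multiplier").c_str());  // granite.residual_multiplier
        std::printf("%s\n", granite_key("%s.embedding_multiplier").c_str()); // granite.embedding_multiplier
        std::printf("%s\n", granite_key("%s.attention.multiplier").c_str()); // granite.attention.multiplier
        std::printf("%s\n", granite_key("%s.logit_scale").c_str());          // granite.logit_scale (existing key)
        return 0;
    }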
@@ -212,6 +212,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };

@@ -260,6 +261,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -299,6 +301,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_MULTIPLIER,
+    LLM_KV_EMBEDDING_MULTIPLIER,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -313,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_MULTIPLIER,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -403,6 +408,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
+    { LLM_KV_RESIDUAL_MULTIPLIER, "%s.residual_multiplier" },
+    { LLM_KV_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -417,6 +424,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_MULTIPLIER, "%s.attention.multiplier" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -2320,6 +2328,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

+    // For Granite architecture
+    float f_residual_multiplier = 0.0f;
+    float f_embedding_multiplier = 0.0f;
+    float f_attention_multiplier = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2375,13 +2388,16 @@ struct llama_hparams {

         const float EPSILON = 1e-9f;

-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
+        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;

         return false;
     }
@@ -5352,6 +5368,7 @@ static void llm_load_hparams(
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -5368,13 +5385,20 @@ static void llm_load_hparams(
                         // granite uses a vocab with len 49152
                         case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                         case 36: model.type = e_model::MODEL_8B; break; // granite
-                        case 40: model.type = e_model::MODEL_13B; break;
+                        case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break;
                         case 48: model.type = e_model::MODEL_34B; break;
                         case 60: model.type = e_model::MODEL_30B; break;
                         case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                         default: model.type = e_model::MODEL_UNKNOWN;
                     }
                 }
+                // Extra multipliers for Granite architecture
+                if (model.arch == LLM_ARCH_GRANITE) {
+                    ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER, hparams.f_residual_multiplier);
+                    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
+                    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+                }
             } break;
         case LLM_ARCH_MINICPM:
             {
@@ -6644,6 +6668,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE) {
+        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
+        LLAMA_LOG_INFO("%s: f_residual_multiplier = %f\n", __func__, hparams.f_residual_multiplier);
+        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+    }
 }

 // Returns false if cancelled by progress_callback
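
For context (not part of this change, which only parses and logs the values): in Granite-style models these multipliers are typically applied as constant scales at fixed points in the forward pass. The sketch below is an assumption about that downstream use, with arbitrary example values and plain floats standing in for tensors; the names mirror the new hparams fields.

    #include <cstdio>

    int main() {
        // Arbitrary example values, not taken from any released model.
        const float f_embedding_multiplier = 10.0f;
        const float f_residual_multiplier  = 0.25f;
        const float f_attention_multiplier = 0.01f;
        const float f_logit_scale          = 8.0f;

        float x = 1.0f;                           // token embedding (stand-in scalar)
        x *= f_embedding_multiplier;              // embeddings scaled once at the input

        float kq = 3.0f;                          // raw q.k attention score (stand-in)
        kq *= f_attention_multiplier;             // assumed to replace the usual 1/sqrt(head_dim) scale

        float branch = 0.5f;                      // attention/FFN branch output (stand-in)
        x += f_residual_multiplier * branch;      // residual contributions damped by the multiplier

        float logit = x / f_logit_scale;          // assumption: final logits divided by the logit scale
        std::printf("kq = %f, logit = %f\n", kq, logit);
        return 0;
    }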
