@@ -214,6 +214,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };

@@ -264,6 +265,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON,        "nemotron"     },
     { LLM_ARCH_EXAONE,          "exaone"       },
     { LLM_ARCH_RWKV6,           "rwkv6"        },
+    { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };

@@ -303,6 +305,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_MULTIPLIER,
+    LLM_KV_EMBEDDING_MULTIPLIER,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -317,6 +321,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_MULTIPLIER,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -407,6 +412,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS,             "%s.rescale_every_n_layers"           },
     { LLM_KV_TIME_MIX_EXTRA_DIM,                 "%s.time_mix_extra_dim"               },
     { LLM_KV_TIME_DECAY_EXTRA_DIM,               "%s.time_decay_extra_dim"             },
+    { LLM_KV_RESIDUAL_MULTIPLIER,                "%s.residual_multiplier"              },
+    { LLM_KV_EMBEDDING_MULTIPLIER,               "%s.embedding_multiplier"             },

     { LLM_KV_ATTENTION_HEAD_COUNT,               "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,            "%s.attention.head_count_kv"          },
@@ -421,6 +428,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK,             "%s.attention.kv_lora_rank"           },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,   "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,           "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_MULTIPLIER,               "%s.attention.multiplier"             },

     { LLM_KV_ROPE_DIMENSION_COUNT,               "%s.rope.dimension_count"             },
     { LLM_KV_ROPE_FREQ_BASE,                     "%s.rope.freq_base"                   },
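For context: these LLM_KV_NAMES entries are printf-style templates, and the existing LLM_KV helper substitutes the architecture name from LLM_ARCH_NAMES for "%s". For a Granite model the new entries therefore resolve to GGUF keys such as granite.residual_multiplier, granite.embedding_multiplier and granite.attention.multiplier. A minimal sketch of that resolution (the helper is pre-existing; only this usage example is illustrative):

    // sketch: how one of the new key names resolves for the granite architecture
    const LLM_KV kv(LLM_ARCH_GRANITE);
    const std::string key = kv(LLM_KV_ATTENTION_MULTIPLIER); // "granite.attention.multiplier"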
@@ -2372,6 +2380,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;

+    // For Granite architecture
+    float f_residual_multiplier  = 0.0f;
+    float f_embedding_multiplier = 0.0f;
+    float f_attention_multiplier = 0.0f;
+
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
@@ -2427,13 +2440,16 @@ struct llama_hparams {

         const float EPSILON = 1e-9f;

-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
+        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;

         return false;
     }
@@ -5406,6 +5422,7 @@ static void llm_load_hparams(
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -5422,13 +5439,20 @@ static void llm_load_hparams(
                         // granite uses a vocab with len 49152
                         case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                         case 36: model.type = e_model::MODEL_8B; break; // granite
-                        case 40: model.type = e_model::MODEL_13B; break;
+                        case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break;
                         case 48: model.type = e_model::MODEL_34B; break;
                         case 60: model.type = e_model::MODEL_30B; break;
                         case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                         default: model.type = e_model::MODEL_UNKNOWN;
                     }
                 }
+                // Extra multipliers for Granite architecture
+                if (model.arch == LLM_ARCH_GRANITE) {
+                    ml.get_key(LLM_KV_LOGIT_SCALE,          hparams.f_logit_scale);
+                    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER,  hparams.f_residual_multiplier);
+                    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
+                    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+                }
             } break;
         case LLM_ARCH_MINICPM:
             {
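A note on the loader calls above: llama_model_loader::get_key defaults to required = true, so a Granite GGUF missing any of these four keys will fail to load rather than silently keep the 0.0f defaults declared in llama_hparams. If the multipliers were instead meant to be optional, a lenient variant (illustrative only, not what this change does) would pass the flag explicitly:

    // illustrative alternative: treat the Granite multipliers as optional metadata
    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER,  hparams.f_residual_multiplier,  /*required =*/ false);
    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier, /*required =*/ false);
    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier, /*required =*/ false);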
@@ -6717,6 +6741,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp   = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE) {
+        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
+        LLAMA_LOG_INFO("%s: f_residual_multiplier  = %f\n", __func__, hparams.f_residual_multiplier);
+        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+    }
 }

 // Returns false if cancelled by progress_callback
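These hunks only declare, load and print the new hyperparameters; the graph-build side is not part of this section. As a rough sketch of how such multipliers are typically consumed in a llama-style build function (variable names ctx0, inpL, cur and ffn_inp follow llama.cpp conventions; the exact call sites are assumptions, not shown by this diff):

    // sketch only: where the Granite multipliers would enter the forward graph

    // embeddings: scale the token embeddings once, right after the lookup
    inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_multiplier);

    // attention: use the multiplier as the KQ scale in place of the usual
    // 1.0f/sqrtf(float(n_embd_head))
    const float kq_scale = hparams.f_attention_multiplier;

    // residual: scale each block's attention/FFN output before adding it back
    // to the residual stream
    cur = ggml_add(ctx0, ggml_scale(ctx0, cur, hparams.f_residual_multiplier), ffn_inp);

    // logits: f_logit_scale is assumed to act as a divisor on the final projection
    cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);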