@@ -213,6 +213,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };
@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON,  "nemotron"  },
     { LLM_ARCH_EXAONE,    "exaone"    },
     { LLM_ARCH_RWKV6,     "rwkv6"     },
+    { LLM_ARCH_GRANITE,   "granite"   },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };
@@ -300,6 +302,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_MULTIPLIER,
+    LLM_KV_EMBEDDING_MULTIPLIER,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -314,6 +318,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_MULTIPLIER,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -404,6 +409,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM,     "%s.time_mix_extra_dim"     },
     { LLM_KV_TIME_DECAY_EXTRA_DIM,   "%s.time_decay_extra_dim"   },
+    { LLM_KV_RESIDUAL_MULTIPLIER,    "%s.residual_multiplier"    },
+    { LLM_KV_EMBEDDING_MULTIPLIER,   "%s.embedding_multiplier"   },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -418,6 +425,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_MULTIPLIER,             "%s.attention.multiplier"             },

     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"             },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                   },
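Note: these LLM_KV_NAMES entries are printf-style format strings in which %s is replaced by the architecture name from LLM_ARCH_NAMES, so for Granite they should resolve to GGUF metadata keys like granite.residual_multiplier. A minimal standalone sketch of that expansion (not the actual llama.cpp key-formatting helper, which is outside this diff):

// Sketch only: shows what the new format strings expand to for "granite".
#include <cstdio>

int main() {
    const char * arch = "granite"; // LLM_ARCH_NAMES entry for LLM_ARCH_GRANITE
    const char * fmts[] = {
        "%s.residual_multiplier",
        "%s.embedding_multiplier",
        "%s.attention.multiplier",
    };
    char key[128];
    for (const char * fmt : fmts) {
        std::snprintf(key, sizeof(key), fmt, arch);
        std::printf("%s\n", key); // granite.residual_multiplier, ...
    }
    return 0;
}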
@@ -2321,6 +2329,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;

+    // For Granite architecture
+    float f_residual_multiplier  = 0.0f;
+    float f_embedding_multiplier = 0.0f;
+    float f_attention_multiplier = 0.0f;
+
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
@@ -2376,13 +2389,16 @@ struct llama_hparams {

         const float EPSILON = 1e-9f;

-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps,             other.f_norm_eps,             EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,         other.f_norm_rms_eps,         EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor,       other.rope_attn_factor,       EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,   other.rope_freq_base_train,   EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train,  other.rope_freq_scale_train,  EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale,   other.expert_weights_scale,   EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,      other.rope_yarn_log_mul,      EPSILON)) return true;
+        if (!is_float_close(this->f_residual_multiplier,  other.f_residual_multiplier,  EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_multiplier, other.f_embedding_multiplier, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_multiplier, other.f_attention_multiplier, EPSILON)) return true;

         return false;
     }
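The comparisons above rely on is_float_close, which is defined elsewhere in llama.cpp and not shown in this excerpt. A minimal standalone sketch of an absolute-tolerance comparison in that spirit (the name and the exact handling of infinities here are assumptions, not the project's implementation):

// Sketch only: absolute-tolerance float comparison for hparams checks.
#include <cassert>
#include <cmath>

static bool is_float_close_sketch(float a, float b, float abs_tol) {
    if (a == b) {
        return true;                    // exact match (covers +/-0)
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;                   // unequal, and at least one is infinite
    }
    return std::fabs(b - a) <= abs_tol; // within absolute tolerance
}

int main() {
    assert( is_float_close_sketch(1e-10f, 2e-10f, 1e-9f)); // diff 1e-10 <= 1e-9
    assert(!is_float_close_sketch(1.0f,   1.0001f, 1e-9f)); // diff ~1e-4 > 1e-9
    return 0;
}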
@@ -5351,6 +5367,7 @@ static void llm_load_hparams(
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -5367,13 +5384,20 @@ static void llm_load_hparams(
                         // granite uses a vocab with len 49152
                         case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                         case 36: model.type = e_model::MODEL_8B; break; // granite
-                        case 40: model.type = e_model::MODEL_13B; break;
+                        case 40: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : e_model::MODEL_13B; break;
                         case 48: model.type = e_model::MODEL_34B; break;
                         case 60: model.type = e_model::MODEL_30B; break;
                         case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                         default: model.type = e_model::MODEL_UNKNOWN;
                     }
                 }
+                // Extra multipliers for Granite architecture
+                if (model.arch == LLM_ARCH_GRANITE) {
+                    ml.get_key(LLM_KV_LOGIT_SCALE,          hparams.f_logit_scale);
+                    ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER,  hparams.f_residual_multiplier);
+                    ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
+                    ml.get_key(LLM_KV_ATTENTION_MULTIPLIER, hparams.f_attention_multiplier);
+                }
             } break;
         case LLM_ARCH_MINICPM:
             {
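The hunk above only reads the Granite multipliers into llama_hparams; the graph-building code that consumes them is not part of this excerpt. A standalone scalar sketch of how such multipliers are typically applied in this kind of architecture (scaled input embeddings, an attention score scale used in place of 1/sqrt(n_embd_head), and a scaled residual add); all numeric values below are made up for illustration:

// Sketch only (not the llama.cpp graph code): one plausible use of the
// three Granite multipliers inside a decoder layer, on scalar stand-ins.
#include <cmath>
#include <cstdio>

int main() {
    const float f_embedding_multiplier = 12.0f;      // scales token embeddings
    const float f_attention_multiplier = 0.0078125f; // replaces 1/sqrt(n_embd_head)
    const float f_residual_multiplier  = 0.22f;      // scales each sub-layer output

    const int n_embd_head = 128;

    float tok_embd = 0.01f;                             // raw embedding value
    float inp      = tok_embd * f_embedding_multiplier; // scaled model input

    float default_kq_scale = 1.0f / std::sqrt((float) n_embd_head);
    float kq_scale         = f_attention_multiplier;    // scale applied to Q*K scores

    float attn_out = 0.5f;                                    // attention block output
    float hidden   = inp + f_residual_multiplier * attn_out;  // scaled residual add

    std::printf("inp=%g kq_scale=%g (default %g) hidden=%g\n",
                inp, kq_scale, default_kq_scale, hidden);
    return 0;
}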
@@ -6632,6 +6656,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp   = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE) {
+        LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
+        LLAMA_LOG_INFO("%s: f_residual_multiplier  = %f\n", __func__, hparams.f_residual_multiplier);
+        LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
+    }
 }

 // Returns false if cancelled by progress_callback