@@ -214,6 +214,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_GRANITE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -264,6 +265,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -303,6 +305,8 @@ enum llm_kv {
     LLM_KV_RESCALE_EVERY_N_LAYERS,
     LLM_KV_TIME_MIX_EXTRA_DIM,
     LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -317,6 +321,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -407,6 +412,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
     { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -421,6 +428,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1454,6 +1462,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2372,6 +2396,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;
 
+    // Additional scale factors (Granite)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2434,6 +2463,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
 
         return false;
     }
@@ -6019,6 +6051,20 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -6717,6 +6763,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -6885,6 +6937,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
            {
                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -8868,6 +8921,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
@@ -10146,6 +10204,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -10198,7 +10257,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10209,6 +10268,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -10245,6 +10309,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -10264,6 +10333,12 @@ struct llm_build_context {
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -15789,6 +15864,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -19089,6 +19165,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
            return LLAMA_ROPE_TYPE_NORM;
 
        // the pairs of head values are offset by n_rot/2
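Taken together, the hunks above read four scalar hyperparameters from the GGUF metadata and apply them at fixed points of the Llama-style forward pass: f_embedding_scale multiplies the token embeddings, f_attention_scale replaces the usual 1/sqrt(head_dim) factor on QK^T, f_residual_scale multiplies each residual branch before it is added back, and the final logits are divided by f_logit_scale. The following standalone sketch summarizes those scaling points under the same zero-means-disabled convention as the diff. It is illustration only: Tensor, attention_block(), ffn_block() and lm_head() are hypothetical placeholders, not llama.cpp or ggml APIs.

// Sketch: where the four Granite scale factors enter the forward pass.
// attention_block(), ffn_block() and lm_head() are assumed to exist
// elsewhere; they stand in for the usual Llama-style sub-modules.
#include <cmath>
#include <cstddef>
#include <vector>

using Tensor = std::vector<float>;

Tensor attention_block(const Tensor & x, float kq_scale); // softmax(Q*K^T * kq_scale) * V, projected
Tensor ffn_block(const Tensor & x);                       // SwiGLU feed-forward
Tensor lm_head(const Tensor & x);                         // final norm + output projection

struct GraniteScales {
    float embedding = 0.0f; // "%s.embedding_scale"
    float residual  = 0.0f; // "%s.residual_scale"
    float attention = 0.0f; // "%s.attention.scale"
    float logit     = 0.0f; // "%s.logit_scale"
};

static void scale_inplace(Tensor & t, float s) {
    for (float & v : t) v *= s;
}

Tensor granite_layer_and_head(Tensor x, const GraniteScales & hp, int n_embd_head) {
    // 1) token embeddings are multiplied by f_embedding_scale (when non-zero)
    if (hp.embedding != 0.0f) scale_inplace(x, hp.embedding);

    // 2) attention uses f_attention_scale in place of 1/sqrt(head_dim) when set
    const float kq_scale = hp.attention == 0.0f
        ? 1.0f / std::sqrt((float) n_embd_head)
        : hp.attention;

    // 3) each residual branch is scaled by f_residual_scale before the add
    Tensor attn_out = attention_block(x, kq_scale);
    if (hp.residual != 0.0f) scale_inplace(attn_out, hp.residual);
    for (std::size_t i = 0; i < x.size(); ++i) x[i] += attn_out[i];

    Tensor ffn_out = ffn_block(x);
    if (hp.residual != 0.0f) scale_inplace(ffn_out, hp.residual);
    for (std::size_t i = 0; i < x.size(); ++i) x[i] += ffn_out[i];

    // 4) final logits are divided by f_logit_scale (when non-zero)
    Tensor logits = lm_head(x);
    if (hp.logit != 0.0f) scale_inplace(logits, 1.0f / hp.logit);
    return logits;
}

Because every factor defaults to 0.0f and each scaling point is skipped when its factor is zero, non-Granite architectures that share build_llama() keep their existing behavior unchanged.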