@@ -1415,6 +1415,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -6844,6 +6860,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -8738,6 +8755,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
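
For readers following along: this hunk scales the token embeddings by a constant before they enter the first transformer block, and skips the scale when f_embedding_multiplier is left at 0.0f. Below is a standalone C++ sketch of the same element-wise operation outside of ggml; the struct, the example value, and the toy vector are hypothetical, only the field name and the zero-check mirror the diff.

    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for the hyperparameter read above; only the field
    // name mirrors the diff, the default value here is an arbitrary example.
    struct hparams_sketch {
        float f_embedding_multiplier = 12.0f;
    };

    int main() {
        hparams_sketch hparams;
        std::vector<float> inp_embd = {0.01f, -0.02f, 0.03f}; // one toy embedding row

        // Same effect as: inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
        if (hparams.f_embedding_multiplier != 0.0f) {
            for (float & x : inp_embd) {
                x *= hparams.f_embedding_multiplier;
            }
        }

        for (float x : inp_embd) {
            std::printf("%g ", x); // prints: 0.12 -0.24 0.36
        }
        std::printf("\n");
    }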
@@ -10016,6 +10038,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -10068,7 +10091,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
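
The two hunks above change how attention scores are scaled: a per-model kq_scale is computed once and handed to llm_build_kv, so Granite's f_attention_multiplier replaces the usual 1.0f/sqrtf(n_embd_head) whenever it is set (0.0f again meaning "not set"). A small hedged sketch of just that selection logic, with made-up example values:

    #include <cmath>
    #include <cstdio>

    // Pick the KQ scale the way the diff does: fall back to 1/sqrt(head dim)
    // unless an explicit attention multiplier is provided.
    static float pick_kq_scale(float f_attention_multiplier, int n_embd_head) {
        return f_attention_multiplier == 0.0f
            ? 1.0f / std::sqrt(float(n_embd_head))
            : f_attention_multiplier;
    }

    int main() {
        std::printf("default: %f\n", pick_kq_scale(0.0f,      128)); // ~0.088388, i.e. 1/sqrt(128)
        std::printf("granite: %f\n", pick_kq_scale(0.015625f, 128)); // uses the override as-is
    }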
@@ -10079,6 +10102,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -10115,6 +10143,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
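
The two residual-stream hunks above apply the same pattern in two places: the attention output and, later, the FFN output are each scaled by f_residual_multiplier (when set) before being added back to the residual. A toy sketch of one such update; the helper, shapes, and numbers are invented, only the scale-then-add order follows the diff.

    #include <cstdio>
    #include <vector>

    // One residual update, Granite-style: out = residual + multiplier * branch.
    static std::vector<float> residual_add(const std::vector<float> & residual,
                                           std::vector<float> branch,
                                           float f_residual_multiplier) {
        if (f_residual_multiplier) { // 0.0f means "no extra scaling"
            for (float & x : branch) {
                x *= f_residual_multiplier;
            }
        }
        std::vector<float> out(residual.size());
        for (size_t i = 0; i < residual.size(); ++i) {
            out[i] = residual[i] + branch[i];
        }
        return out;
    }

    int main() {
        const std::vector<float> inpSA   = {1.0f, 2.0f};  // residual stream
        const std::vector<float> cur     = {0.5f, -0.5f}; // attention (or FFN) branch output
        const std::vector<float> ffn_inp = residual_add(inpSA, cur, 0.22f); // example multiplier

        std::printf("%g %g\n", ffn_inp[0], ffn_inp[1]); // prints: 1.11 1.89
    }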
@@ -10134,6 +10167,12 @@ struct llm_build_context {
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
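
One detail worth noting in the lm_head hunk: unlike the other Granite multipliers, f_logit_scale is applied as a divisor, so the final logits become cur / f_logit_scale. A minimal sketch with hypothetical values:

    #include <cstdio>
    #include <vector>

    int main() {
        // Only the divide-by-f_logit_scale step mirrors the diff
        // (cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale));
        // the scale and the logits below are made-up examples.
        const float f_logit_scale = 8.0f;
        std::vector<float> logits = {4.0f, -2.0f, 16.0f};

        if (f_logit_scale) {
            for (float & x : logits) {
                x *= 1.0f / f_logit_scale;
            }
        }

        for (float x : logits) {
            std::printf("%g ", x); // prints: 0.5 -0.25 2
        }
        std::printf("\n");
    }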
@@ -15322,6 +15361,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -18608,6 +18648,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2