@@ -1416,6 +1416,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -6832,6 +6848,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -8726,6 +8743,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
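The hunk above rescales the token embeddings by f_embedding_multiplier, and the != 0.0f guard keeps every non-Granite model (where the field stays at its 0.0f default) on the old code path. A minimal sketch of the same guard-and-scale pattern on plain floats, using a hypothetical granite_hparams stand-in rather than the real llama_hparams:

#include <cstdio>
#include <vector>

// Hypothetical stand-in for the hparams field used above; 0.0f means "not set".
struct granite_hparams {
    float f_embedding_multiplier = 0.0f;
};

// Mirror of the guard-and-scale pattern in llm_build_inp_embd.
static void scale_embeddings(std::vector<float> & embd, const granite_hparams & hp) {
    if (hp.f_embedding_multiplier != 0.0f) {
        for (float & v : embd) {
            v *= hp.f_embedding_multiplier;
        }
    }
}

int main() {
    std::vector<float> embd = {0.25f, -0.5f, 1.0f};
    granite_hparams hp;
    hp.f_embedding_multiplier = 12.0f; // example value, not taken from a real model
    scale_embeddings(embd, hp);
    printf("%.2f %.2f %.2f\n", embd[0], embd[1], embd[2]); // 3.00 -6.00 12.00
    return 0;
}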
@@ -10004,6 +10026,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
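The added kq_scale line keeps the usual 1/sqrt(n_embd_head) attention scaling whenever f_attention_multiplier is left at 0.0f, and only substitutes the Granite multiplier when the model actually provides one. A small self-contained sketch of that selection logic (pick_kq_scale and the example values below are hypothetical):

#include <cmath>
#include <cstdio>

// Choose the KQ scale the same way the hunk above does:
// 0.0f means "no multiplier configured", so fall back to 1/sqrt(d_head).
static float pick_kq_scale(float f_attention_multiplier, int n_embd_head) {
    return f_attention_multiplier == 0.0f
        ? 1.0f / sqrtf(float(n_embd_head))
        : f_attention_multiplier;
}

int main() {
    printf("%f\n", pick_kq_scale(0.0f,       128)); // default scaling: ~0.088388
    printf("%f\n", pick_kq_scale(0.0078125f, 128)); // explicit multiplier wins (example value)
    return 0;
}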
@@ -10056,7 +10079,7 @@
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -10067,6 +10090,11 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -10103,6 +10131,11 @@
                 cb(cur, "ffn_moe_out", il);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
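The two residual hunks apply the same f_residual_multiplier, once to the attention output and once to the FFN output, before each result is added back into the residual stream; the truthiness check (if (hparams.f_residual_multiplier)) again treats 0.0f as "disabled". A toy sketch of one layer's worth of that scaling on scalars (add_residual and the constants are illustrative only):

#include <cstdio>

// Scale a branch output by the residual multiplier (if set) before the residual add,
// mirroring the two hunks above.
static float add_residual(float residual, float branch_out, float f_residual_multiplier) {
    if (f_residual_multiplier != 0.0f) {
        branch_out *= f_residual_multiplier;
    }
    return residual + branch_out;
}

int main() {
    const float f_residual_multiplier = 0.22f; // example value, not from a real model
    float x = 1.0f;                                     // residual stream
    x = add_residual(x, 0.5f,  f_residual_multiplier);  // after attention
    x = add_residual(x, -0.3f, f_residual_multiplier);  // after the FFN
    printf("%.3f\n", x); // 1.0 + 0.11 - 0.066 = 1.044
    return 0;
}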
@@ -10122,6 +10155,12 @@
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
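Note that the logit scaling goes the other way: the logits are multiplied by 1.0f / hparams.f_logit_scale, i.e. divided by the configured scale, and the check again skips the work when the field is 0.0f. A short sketch of that reciprocal scaling (scale_logits and the values are hypothetical):

#include <cstdio>
#include <vector>

// Divide the logits by the configured scale, as in the lm_head hunk above.
static void scale_logits(std::vector<float> & logits, float f_logit_scale) {
    if (f_logit_scale != 0.0f) {
        const float s = 1.0f / f_logit_scale;
        for (float & l : logits) {
            l *= s;
        }
    }
}

int main() {
    std::vector<float> logits = {8.0f, -4.0f, 2.0f};
    scale_logits(logits, 16.0f); // example value, not from a real model
    printf("%.4f %.4f %.4f\n", logits[0], logits[1], logits[2]); // 0.5000 -0.2500 0.1250
    return 0;
}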
@@ -15300,6 +15339,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -18540,6 +18580,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2