@@ -1462,6 +1462,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -6915,6 +6931,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8898,6 +8915,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
@@ -10176,6 +10198,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -10228,7 +10251,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }

             if (il == n_layer - 1) {
@@ -10239,6 +10262,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -10275,6 +10303,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -10294,6 +10327,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
@@ -15819,6 +15858,7 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -19115,6 +19155,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
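
Taken together, the hunks above thread four Granite-specific hyperparameters through the stock LLaMA graph: `f_embedding_multiplier` scales the token embeddings at the input, `f_attention_multiplier` replaces the usual `1/sqrt(n_embd_head)` KQ scale, `f_residual_multiplier` scales each block's output before the residual add (after both attention and the FFN), and `f_logit_scale` divides the final logits. The sketch below restates that data flow with plain floats instead of ggml tensors so the scaling order is easy to follow; the `granite_scales` struct, the helper functions, and the numeric values are illustrative assumptions, not llama.cpp API or values from any released checkpoint.

```cpp
// Illustrative sketch of the Granite scaling scheme implied by the diff above.
// Plain floats stand in for ggml tensors; all names here are hypothetical.
#include <cmath>
#include <cstdio>

struct granite_scales {
    float f_embedding_multiplier; // applied to token embeddings at the input
    float f_attention_multiplier; // used as the KQ scale instead of 1/sqrt(head_dim)
    float f_residual_multiplier;  // applied to block output before each residual add
    float f_logit_scale;          // final logits are divided by this value
};

// Mirrors the kq_scale line above: fall back to 1/sqrt(n_embd_head) when unset (0.0f).
static float kq_scale(const granite_scales & s, int n_embd_head) {
    return s.f_attention_multiplier == 0.0f
        ? 1.0f / std::sqrt(float(n_embd_head))
        : s.f_attention_multiplier;
}

// One residual connection: scale the block output, then add it to the stream.
static float residual_add(const granite_scales & s, float stream, float block_out) {
    if (s.f_residual_multiplier != 0.0f) {
        block_out *= s.f_residual_multiplier;
    }
    return stream + block_out;
}

int main() {
    // Example values only, chosen for illustration.
    const granite_scales s = { 12.0f, 0.0078125f, 0.22f, 16.0f };

    float x = 0.05f;                            // one "embedding" value
    x *= s.f_embedding_multiplier;              // input scaling
    x = residual_add(s, x, /*attn_out=*/0.3f);  // attention block + residual
    x = residual_add(s, x, /*ffn_out=*/0.1f);   // FFN block + residual
    float logit = x / s.f_logit_scale;          // output scaling

    printf("kq_scale = %f, logit = %f\n", kq_scale(s, 128), logit);
    return 0;
}
```

Note that the diff tests `f_residual_multiplier` and `f_logit_scale` for truthiness, so a value of 0.0f leaves the corresponding scaling disabled and the graph behaves like plain LLaMA.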