@@ -213,6 +213,7 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -262,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE,      "exaone"     },
     { LLM_ARCH_RWKV6,       "rwkv6"      },
     { LLM_ARCH_GRANITE,     "granite"    },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN,     "(unknown)"  },
 };

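
For context: the string in the right-hand column is what the conversion scripts write into the GGUF `general.architecture` key, and the loader resolves it back to the `llm_arch` enum by a reverse lookup over this table. Below is a minimal standalone sketch of that lookup; the table is abridged and the helper name `arch_from_string` is illustrative, not the exact function in llama.cpp.

    #include <map>
    #include <string>

    enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_UNKNOWN };

    static const std::map<llm_arch, const char *> ARCH_NAMES = {
        { LLM_ARCH_GRANITE,     "granite"    },
        { LLM_ARCH_GRANITE_MOE, "granitemoe" },
        { LLM_ARCH_UNKNOWN,     "(unknown)"  },
    };

    // illustrative reverse lookup: GGUF "general.architecture" value -> enum
    static llm_arch arch_from_string(const std::string & name) {
        for (const auto & kv : ARCH_NAMES) {
            if (name == kv.second) {
                return kv.first;
            }
        }
        return LLM_ARCH_UNKNOWN;
    }
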
@@ -1431,6 +1433,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
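
The `%d` placeholders in these base names are filled with the block index when tensors are looked up, and a suffix such as "weight" is appended (see the `tn(LLM_TENSOR_TOKEN_EMBD, "weight")` call further down in this diff). A rough standalone sketch of that expansion, using an ad-hoc helper name rather than llama.cpp's own `tn` helper:

    #include <cstdio>
    #include <string>

    // illustrative only: expand a base name like "blk.%d.ffn_gate_exps"
    // with the block index and append the ".weight"/".bias" suffix
    static std::string expand_tensor_name(const char * base, int bid, const char * suffix) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), base, bid);
        return std::string(buf) + "." + suffix;
    }

    // expand_tensor_name("blk.%d.ffn_gate_exps", 3, "weight") -> "blk.3.ffn_gate_exps.weight"
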
@@ -2344,7 +2363,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;

-    // For Granite architecture
+    // For Granite architectures
     float f_residual_multiplier  = 0.0f;
     float f_embedding_multiplier = 0.0f;
     float f_attention_multiplier = 0.0f;
@@ -5385,6 +5404,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -5408,8 +5428,8 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             }
-            // Extra multipliers for Granite architecture
-            if (model.arch == LLM_ARCH_GRANITE) {
+            // Extra multipliers for Granite architectures
+            if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
                 ml.get_key(LLM_KV_LOGIT_SCALE,          hparams.f_logit_scale);
                 ml.get_key(LLM_KV_RESIDUAL_MULTIPLIER,  hparams.f_residual_multiplier);
                 ml.get_key(LLM_KV_EMBEDDING_MULTIPLIER, hparams.f_embedding_multiplier);
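
These multipliers arrive as scalar float metadata in the GGUF header; `ml.get_key` is essentially a typed KV lookup into that header. A minimal sketch of the same idea against the raw gguf API; the key string shown is an assumption for illustration, not taken from this diff:

    #include "ggml.h"   // the gguf_* API was declared in ggml.h at the time of this change

    // illustrative helper: copy a float KV entry into dst if the key is present,
    // otherwise keep the 0.0f default from llama_hparams
    static void read_f32_if_present(const struct gguf_context * gguf, const char * key, float & dst) {
        const int id = gguf_find_key(gguf, key);
        if (id >= 0) {
            dst = gguf_get_val_f32(gguf, id);
        }
    }

    // usage sketch (assumed key name, prefixed with the architecture string):
    //   read_f32_if_present(gguf, "granitemoe.residual_multiplier", hparams.f_residual_multiplier);
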
@@ -6685,7 +6705,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }

-    if (model.arch == LLM_ARCH_GRANITE) {
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_multiplier = %f\n", __func__, hparams.f_embedding_multiplier);
         LLAMA_LOG_INFO("%s: f_residual_multiplier  = %f\n", __func__, hparams.f_residual_multiplier);
         LLAMA_LOG_INFO("%s: f_attention_multiplier = %f\n", __func__, hparams.f_attention_multiplier);
@@ -6861,6 +6881,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

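
The new `*_exps` tensor names above follow llama.cpp's stacked-expert convention: all experts of a layer are stored in one 3D tensor whose last dimension is the expert count, rather than one tensor per expert. A standalone ggml sketch of that layout; the sizes are made-up toy values, not from this diff:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // assumed toy sizes: hidden size, FFN inner size, number of experts
        const int64_t n_embd = 64, n_ff = 256, n_expert = 8;

        // e.g. "blk.0.ffn_up_exps.weight": experts stacked along the last dimension
        struct ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
        (void) ffn_up_exps;

        ggml_free(ctx);
        return 0;
    }
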
@@ -15362,6 +15383,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
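
Because granitemoe routes through the shared `build_llama()` graph, the Granite-specific scalars have to be applied inside that common path rather than in a dedicated builder. The toy sketch below shows the scheme as I understand it for Granite (residual branches scaled by their multiplier, logits divided by `f_logit_scale`, with 0.0f meaning "unset"); it is plain C++ for illustration, not the actual ggml graph code:

    #include <cstddef>
    #include <vector>

    // toy stand-ins for the hparams fields added in this change; 0.0f means "unset"
    struct granite_scales {
        float f_residual_multiplier = 0.0f;
        float f_logit_scale         = 0.0f;
    };

    // x = x + residual_multiplier * branch(x)  (falls back to plain addition when unset)
    static void add_residual(std::vector<float> & x, const std::vector<float> & branch, const granite_scales & s) {
        const float m = s.f_residual_multiplier != 0.0f ? s.f_residual_multiplier : 1.0f;
        for (size_t i = 0; i < x.size(); ++i) {
            x[i] += m * branch[i];
        }
    }

    // logits /= f_logit_scale before sampling (no-op when unset)
    static void scale_logits(std::vector<float> & logits, const granite_scales & s) {
        if (s.f_logit_scale == 0.0f) {
            return;
        }
        for (float & l : logits) {
            l /= s.f_logit_scale;
        }
    }
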
@@ -18649,6 +18671,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
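
`llama_rope_type()` is part of the public C API, so client code can confirm the RoPE mode without touching the internal enum. A small usage sketch; the model path is a placeholder and error handling is minimal:

    #include "llama.h"
    #include <cstdio>

    int main(void) {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        // placeholder path to a converted granitemoe GGUF file
        llama_model * model = llama_load_model_from_file("granite-moe.gguf", mparams);
        if (model == NULL) {
            llama_backend_free();
            return 1;
        }

        // granitemoe now reports LLAMA_ROPE_TYPE_NORM, same as granite and llama
        printf("rope type: %d\n", (int) llama_rope_type(model));

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
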