@@ -215,6 +215,7 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };
@@ -266,6 +267,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE,          "exaone"     },
     { LLM_ARCH_RWKV6,           "rwkv6"      },
     { LLM_ARCH_GRANITE,         "granite"    },
+    { LLM_ARCH_GRANITE_MOE,     "granitemoe" },
     { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
@@ -1478,6 +1480,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2396,7 +2415,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;

-    // Additional scale factors (Granite)
+    // Additional scale factors (Granite/Granite MoE)
     float f_residual_scale  = 0.0f;
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
@@ -6048,6 +6067,7 @@ static void llm_load_hparams(
                 }
             } break;
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -6056,6 +6076,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

                 switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = e_model::MODEL_3B; break;
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -6767,7 +6788,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp        = %d\n", __func__, hparams.n_ff_shexp);
     }

-    if (model.arch == LLM_ARCH_GRANITE) {
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -6941,6 +6962,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -15868,6 +15890,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -19169,6 +19192,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
0 commit comments