
Commit 3d6bf69

llama : add IBM Granite MoE architecture (ggml-org#9438)
* feat(gguf-py): Add granitemoe architecture

This includes the addition of new tensor names for the new moe layers. These may not be correct at this point due to the need for the hack in gguf_writer.py to double-check the length of the shape for these layers.

Branch: GraniteMoE
Signed-off-by: Gabe Goodhart <[email protected]>

* feat(convert_hf_to_gguf): Add GraniteMoeModel

GraniteMoe has the same configuration deltas as Granite

Branch: GraniteMoE
Signed-off-by: Gabe Goodhart <[email protected]>

* fix(granitemoe convert): Split the double-sized input layer into gate and up

After a lot of staring and squinting, it's clear that the standard mixtral expert implementation is equivalent to the vectorized parallel experts in granite. The difference is that in granite, the w1 and w3 are concatenated into a single tensor "input_linear." Rather than reimplementing all of the math on the llama.cpp side, the much simpler route is to just split this tensor during conversion and follow the standard mixtral route.

Branch: GraniteMoE
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>

* feat(granitemoe): Implement granitemoe

GraniteMoE follows the mixtral architecture (once the input_linear layers are split into gate_exps/up_exps). The main delta is the addition of the same four multipliers used in Granite.

Branch: GraniteMoE
Signed-off-by: Gabe Goodhart <[email protected]>

* Typo fix in docstring

Co-Authored-By: [email protected]
Co-authored-by: Georgi Gerganov <[email protected]>
Signed-off-by: Gabe Goodhart <[email protected]>

* fix(conversion): Simplify tensor name mapping in conversion

Branch: GraniteMoE
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>

* fix(convert): Remove unused tensor name mappings

Branch: GraniteMoE
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>

* fix(convert): Sanity check on merged FFN tensor sizes

Branch: GraniteMoE
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>

* fix: Allow "output" layer in granite moe architecture (convert and cpp)

Branch: GraniteMoE
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>

* fix(granite): Add missing 'output' tensor for Granite

This is a fix for the previous `granite` architecture PR. Recent snapshots have included this (`lm_head.weights`) as part of the architecture

Branch: GraniteMoE
Signed-off-by: Gabe Goodhart <[email protected]>

---------

Signed-off-by: Gabe Goodhart <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 904837e · commit 3d6bf69

File tree

4 files changed: +88 -13 lines

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py
src/llama.cpp

convert_hf_to_gguf.py

Lines changed: 31 additions & 2 deletions
@@ -4102,16 +4102,45 @@ def set_gguf_parameters(self):
         # consistency
         if attention_scale := self.hparams.get("attention_multiplier"):
             self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
         if embedding_scale := self.hparams.get("embedding_multiplier"):
             self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
         if residual_scale := self.hparams.get("residual_multiplier"):
             self.gguf_writer.add_residual_scale(residual_scale)
-        if logits_scaling := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scaling)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)


 ###### CONVERSION LOGIC ######

+
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
     _tensor_type = torch.Tensor
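The `modify_tensors` override above is the whole MoE-specific conversion step: GraniteMoE's merged `input_linear` is just the Mixtral-style gate (`w1`) and up (`w3`) expert weights stacked along the feature dimension, so slicing it in half recovers them. A minimal sketch of that equivalence (the expert count, dimensions, and the SiLU-gated expert form are illustrative assumptions, not values from a real checkpoint):

```python
# Illustrative only: shapes and the SiLU-gated expert form are assumptions
# for the demo, not values taken from a GraniteMoE checkpoint.
import torch
import torch.nn.functional as F

n_expert, ffn_dim, hidden = 8, 1024, 2048

# Pretend these are the per-expert gate (w1) and up (w3) projection stacks.
w1 = torch.randn(n_expert, ffn_dim, hidden)
w3 = torch.randn(n_expert, ffn_dim, hidden)

# GraniteMoE ships them concatenated into a single "input_linear" tensor.
input_linear = torch.cat([w1, w3], dim=-2)      # (n_expert, 2*ffn_dim, hidden)

# The conversion-time split from modify_tensors: first half -> gate, rest -> up.
gate, up = input_linear[..., :ffn_dim, :], input_linear[..., ffn_dim:, :]
assert torch.equal(gate, w1) and torch.equal(up, w3)

# One expert's forward pass is the same whether the projection is merged and
# split after the matmul (GraniteMoE) or kept as two matmuls (Mixtral path).
x = torch.randn(hidden)
merged = input_linear[0] @ x                    # (2*ffn_dim,)
h_merged = F.silu(merged[:ffn_dim]) * merged[ffn_dim:]
h_split  = F.silu(w1[0] @ x) * (w3[0] @ x)
assert torch.allclose(h_merged, h_split, atol=1e-5)
```

Because the halves are exact slices, no weights are recomputed during conversion; only the tensor names change.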

gguf-py/gguf/constants.py

Lines changed: 18 additions & 0 deletions
@@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum):
     NEMOTRON = auto()
     EXAONE = auto()
     GRANITE = auto()
+    GRANITE_MOE = auto()


 class MODEL_TENSOR(IntEnum):

@@ -392,6 +393,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.NEMOTRON: "nemotron",
     MODEL_ARCH.EXAONE: "exaone",
     MODEL_ARCH.GRANITE: "granite",
+    MODEL_ARCH.GRANITE_MOE: "granitemoe",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -1232,6 +1234,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GRANITE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,

@@ -1242,6 +1245,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
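These constants are pure registries: the architecture-name dictionary supplies the string written into the GGUF metadata, the per-architecture tensor list declares which tensor types a granitemoe file may contain, and `TENSOR_NAMES` holds the serialized name templates. A small sketch of how they fit together (assumes the `gguf-py` package from this repo is importable as `gguf`; the loop itself is illustrative, not part of the commit):

```python
# Assumes gguf-py from this repository; dictionary names follow constants.py.
import gguf

arch = gguf.MODEL_ARCH.GRANITE_MOE
print(gguf.MODEL_ARCH_NAMES[arch])       # -> "granitemoe"

for tensor in gguf.MODEL_TENSORS[arch]:
    # e.g. MODEL_TENSOR.FFN_GATE_EXP -> "blk.{bid}.ffn_gate_exps"
    print(tensor.name, "->", gguf.TENSOR_NAMES[tensor])
```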

gguf-py/gguf/tensor_mapping.py

Lines changed: 11 additions & 9 deletions
@@ -251,11 +251,12 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",             # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
-            "transformer.decoder_layer.{bid}.router",     # Grok
-            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+            "layers.{bid}.feed_forward.gate",                   # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral
+            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router",           # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),

         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (

@@ -364,10 +365,11 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
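The two added entries are what let the converter recognize GraniteMoE checkpoint names: the router maps to `FFN_GATE_INP` and `output_linear` maps to `FFN_DOWN_EXP` (the merged `input_linear` never reaches this table because `modify_tensors` splits and renames it first). A hedged sketch of the lookup the converter relies on; `get_tensor_name_map` and `TensorNameMap.get_type` are gguf-py helpers, with call signatures assumed from tensor_mapping.py at this commit:

```python
# Signatures of get_tensor_name_map / get_type are assumed from gguf-py's
# tensor_mapping.py at this commit; treat this as a sketch, not a reference.
import gguf

# Build the HF-name -> GGUF-tensor map for granitemoe with, say, 40 blocks.
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.GRANITE_MOE, 40)

hf_name = "model.layers.0.block_sparse_moe.router.layer.weight"
tensor_type = name_map.get_type(hf_name, try_suffixes=(".weight", ".bias"))
print(tensor_type)                                   # MODEL_TENSOR.FFN_GATE_INP
print(gguf.TENSOR_NAMES[tensor_type].format(bid=0))  # "blk.0.ffn_gate_inp"
```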

src/llama.cpp

Lines changed: 28 additions & 2 deletions
@@ -215,6 +215,7 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_GRANITE,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };


@@ -266,6 +267,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
     { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };


@@ -1467,6 +1469,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },

@@ -1478,6 +1481,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -2396,7 +2417,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

-    // Additional scale factors (Granite)
+    // Additional scale factors (Granite/Granite MoE)
     float f_residual_scale = 0.0f;
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;

@@ -6048,6 +6069,7 @@ static void llm_load_hparams(
                 }
             } break;
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

@@ -6056,6 +6078,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

                 switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = e_model::MODEL_3B; break;
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: model.type = e_model::MODEL_UNKNOWN;

@@ -6810,7 +6833,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }

-    if (model.arch == LLM_ARCH_GRANITE) {
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);

@@ -6984,6 +7007,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -15930,6 +15954,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;

@@ -19231,6 +19256,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
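On the C++ side the MoE variant simply reuses `build_llama`, so beyond the tensor registrations the only Granite-specific behavior is the four multipliers loaded in `llm_load_hparams`. The schematic below shows where they act in the forward pass, based on the existing Granite support this commit extends; it is plain Python pseudocode, and `attn_fn`/`moe_ffn_fn`/`embed_fn` are stand-ins, not llama.cpp functions:

```python
# Schematic only: hp keys mirror the HF config fields read by the converter
# ("attention_multiplier", "embedding_multiplier", "residual_multiplier",
# "logits_scaling"); the callables are placeholders for the real layers.
def granite_layer(x, attn_fn, moe_ffn_fn, hp):
    # attention_multiplier replaces the usual 1/sqrt(head_dim) attention scale
    h = attn_fn(x, scale=hp["attention_multiplier"])
    x = x + hp["residual_multiplier"] * h        # scaled residual after attention
    h = moe_ffn_fn(x)                            # router + gate/up/down experts
    x = x + hp["residual_multiplier"] * h        # scaled residual after the MoE FFN
    return x

def granite_forward(tokens, embed_fn, layers, norm_fn, lm_head_fn, hp):
    x = hp["embedding_multiplier"] * embed_fn(tokens)   # scaled token embeddings
    for attn_fn, moe_ffn_fn in layers:
        x = granite_layer(x, attn_fn, moe_ffn_fn, hp)
    logits = lm_head_fn(norm_fn(x))
    return logits / hp["logits_scaling"]                # logit scale divides the output
```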
