Commit df08e22

fix(granitemoe convert): Split the double-sized input layer into gate and up
After a lot of staring and squinting, it's clear that the standard Mixtral expert implementation is equivalent to the vectorized parallel experts in Granite. The difference is that in Granite, w1 and w3 are concatenated into a single tensor, "input_linear". Rather than reimplementing all of the math on the llama.cpp side, the much simpler route is to split this tensor during conversion and follow the standard Mixtral path.

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent ed4851c commit df08e22
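
The equivalence the commit message relies on can be checked with a small standalone sketch (not part of the commit): chunking the merged input_linear weight along its output dimension gives the same result as applying separate Mixtral-style gate and up projections. The shapes, the SiLU activation, and the variable names below are illustrative assumptions.

# Standalone sketch (illustrative only, not part of this commit). It checks
# that chunking the merged JetMoe-style input_linear weight along its output
# dimension is equivalent to separate Mixtral-style gate (w1) / up (w3)
# projections. Shapes, the SiLU activation, and names are assumptions here.
import torch
import torch.nn.functional as F

n_expert, n_embd, n_ff = 4, 8, 16

# Merged per-expert weight, assumed shape (n_expert, 2 * n_ff, n_embd)
input_linear = torch.randn(n_expert, 2 * n_ff, n_embd)
x = torch.randn(n_expert, 3, n_embd)  # a few tokens routed to each expert

# GraniteMoE / JetMoe style: one matmul, then split the activations in half
# (the first half is gated by the activation, the second half is "up")
merged = x @ input_linear.transpose(-1, -2)   # (n_expert, 3, 2 * n_ff)
gate_act, up_act = merged.chunk(2, dim=-1)
y_merged = F.silu(gate_act) * up_act

# Mixtral style: split the weights first (what the conversion now does)
gate_w, up_w = input_linear.chunk(2, dim=-2)  # each (n_expert, n_ff, n_embd)
y_split = F.silu(x @ gate_w.transpose(-1, -2)) * (x @ up_w.transpose(-1, -2))

print(torch.allclose(y_merged, y_split))  # True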

3 files changed: 29 additions & 9 deletions


convert_hf_to_gguf.py

Lines changed: 18 additions & 0 deletions
@@ -4119,8 +4119,26 @@ class GraniteMoeModel(GraniteModel):
     """Conversion for IBM's GraniteMoeForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE_MOE
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            gate, up = data_torch.chunk(2, dim=-2)
+            return [
+                (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.gate.weight"), gate),
+                (self.map_tensor_name(f"model.layers.{bid}.block_sparse_moe.input_linear.up.weight"), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 ###### CONVERSION LOGIC ######
 
+
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
     _tensor_type = torch.Tensor

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -1254,6 +1254,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],

gguf-py/gguf/tensor_mapping.py

Lines changed: 10 additions & 9 deletions
@@ -293,11 +293,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.w3",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.input_linear", # granitemoe
+            "layers.{bid}.feed_forward.experts.w3",            # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",     # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",      # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",           # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.input_linear.up", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -326,10 +326,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",    # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",   # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w1",            # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",       # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",      # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",         # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.input_linear.gate", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
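
With the mapping entries above, the two halves written during conversion should resolve to the per-expert GGUF tensor names the rest of llama.cpp already expects. Below is a hedged sketch of how that lookup could be exercised with gguf-py; the helper and method names reflect the existing gguf-py API as best understood, and the expected output names are assumptions, not something this commit produces.

# Illustrative sketch only: resolve the synthetic granitemoe names produced by
# modify_tensors() through gguf-py's tensor-name mapping. Assumes the existing
# gguf.get_tensor_name_map() helper and TensorNameMap.get_name() signature.
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.GRANITE_MOE, 32)  # 32 = assumed block count

for hf_name in (
    "model.layers.0.block_sparse_moe.input_linear.gate.weight",
    "model.layers.0.block_sparse_moe.input_linear.up.weight",
):
    print(hf_name, "->", tmap.get_name(hf_name, try_suffixes=(".weight",)))

# Expected (assuming the usual merged-expert tensor names):
#   ...input_linear.gate.weight -> blk.0.ffn_gate_exps.weight
#   ...input_linear.up.weight   -> blk.0.ffn_up_exps.weight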
