
Commit b9c7052

feat(gguf-py): Add granitemoe architecture
This includes the addition of new tensor names for the new MoE layers. These may not be correct at this point, due to the need for the hack in gguf_writer.py to double-check the length of the shape for these layers.

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <[email protected]>
Parent: 5d054a4

2 files changed, 30 insertions(+), 12 deletions(-)

gguf-py/gguf/constants.py (15 additions, 0 deletions)
@@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum):
     NEMOTRON    = auto()
     EXAONE      = auto()
     GRANITE     = auto()
+    GRANITE_MOE = auto()


 class MODEL_TENSOR(IntEnum):
@@ -392,6 +393,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.NEMOTRON:    "nemotron",
     MODEL_ARCH.EXAONE:      "exaone",
     MODEL_ARCH.GRANITE:     "granite",
+    MODEL_ARCH.GRANITE_MOE: "granitemoe",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1242,6 +1244,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
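
Taken together, the constants.py change registers the new architecture in three places: the MODEL_ARCH enum, the architecture-name table, and the per-architecture tensor list. A minimal sketch of how those pieces fit together, assuming the module-level MODEL_ARCH_NAMES, MODEL_TENSORS, and TENSOR_NAMES dicts that gguf-py keeps in constants.py (the check itself is illustrative, not part of this commit):

```python
# Illustrative consistency check for the new GRANITE_MOE registration.
# Assumes gguf-py is importable; the names below are the existing
# module-level tables in gguf/constants.py.
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES

# The enum value maps to the architecture string written into GGUF metadata.
assert MODEL_ARCH_NAMES[MODEL_ARCH.GRANITE_MOE] == "granitemoe"

# Every tensor listed for granitemoe should already have a base GGUF name,
# e.g. MODEL_TENSOR.FFN_UP_EXP is expected to map to "blk.{bid}.ffn_up_exps".
for tensor in MODEL_TENSORS[MODEL_ARCH.GRANITE_MOE]:
    assert tensor in TENSOR_NAMES, f"no GGUF base name registered for {tensor!r}"
```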

gguf-py/gguf/tensor_mapping.py (15 additions, 12 deletions)
@@ -292,10 +292,11 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.w3",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w3",             # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",     # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",      # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",           # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.input_linear", # granitemoe
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -324,10 +325,11 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",    # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",   # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w1",             # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",       # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",      # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",         # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -364,10 +366,11 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
