@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron
+            "model.embed_tokens",                        # llama-hf nemotron olmoe
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",  # gptneox
             "transformer.ln_f",           # gpt2 gpt-j falcon jais exaone
-            "model.norm",                 # llama-hf baichuan internlm2
+            "model.norm",                 # llama-hf baichuan internlm2 olmoe
             "norm",                       # llama-pth
             "transformer.norm_f",         # mpt dbrx
             "ln_f",                       # refact bloom qwen gpt2
@@ -98,7 +98,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm",                  # falcon7b
             "h.{bid}.input_layernorm",                              # bloom
             "transformer.h.{bid}.ln_mlp",                           # falcon40b
-            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron
+            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe
             "layers.{bid}.attention_norm",                          # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",                               # yi
@@ -142,7 +142,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.q_proj",       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wq",                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",  # bert
             "transformer.h.{bid}.attn.q_proj",           # gpt-j
@@ -154,7 +154,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",     # llama-hf nemotron
+            "model.layers.{bid}.self_attn.k_proj",     # llama-hf nemotron olmoe
             "layers.{bid}.attention.wk",               # llama-pth
             "encoder.layer.{bid}.attention.self.key",  # bert
             "transformer.h.{bid}.attn.k_proj",         # gpt-j
@@ -167,7 +167,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wv",                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
             "transformer.h.{bid}.attn.v_proj",           # gpt-j
@@ -185,7 +185,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",      # mpt
             "transformer.h.{bid}.self_attention.dense",    # falcon
             "h.{bid}.self_attention.dense",                # bloom
-            "model.layers.{bid}.self_attn.o_proj",         # llama-hf nemotron
+            "model.layers.{bid}.self_attn.o_proj",         # llama-hf nemotron olmoe
             "layers.{bid}.attention.wo",                   # llama-pth
             "encoder.layer.{bid}.attention.output.dense",  # bert
             "transformer.h.{bid}.attn.out_proj",           # gpt-j
@@ -229,7 +229,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen jais exaone
             "h.{bid}.post_attention_layernorm",                              # bloom
             "transformer.blocks.{bid}.norm_2",                               # mpt
-            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron
+            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron olmoe
             "layers.{bid}.ffn_norm",                                         # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",                                        # yi
@@ -253,7 +253,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe
+            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
             "transformer.decoder_layer.{bid}.router",     # Grok
             "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),
@@ -295,7 +295,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -327,7 +327,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -367,7 +367,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -378,7 +378,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",         # persimmon
-            "model.layers.{bid}.self_attn.q_norm",              # cohere
+            "model.layers.{bid}.self_attn.q_norm",              # cohere olmoe
             "transformer.blocks.{bid}.attn.q_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",  # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",             # openelm
@@ -387,7 +387,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",         # persimmon
-            "model.layers.{bid}.self_attn.k_norm",              # cohere
+            "model.layers.{bid}.self_attn.k_norm",              # cohere olmoe
             "transformer.blocks.{bid}.attn.k_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",  # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",             # openelm