@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo_1124
+            "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",  # gptneox
             "transformer.ln_f",           # gpt2 gpt-j falcon jais exaone
-            "model.norm",                 # llama-hf baichuan internlm2 olmoe olmo_1124
+            "model.norm",                 # llama-hf baichuan internlm2 olmoe olmo2
             "norm",                       # llama-pth
             "transformer.norm_f",         # mpt dbrx
             "ln_f",                       # refact bloom qwen gpt2
@@ -145,7 +145,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wq",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",                  # bert
             "transformer.h.{bid}.attn.q_proj",                           # gpt-j
@@ -157,7 +157,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wk",                               # llama-pth
             "encoder.layer.{bid}.attention.self.key",                  # bert
             "transformer.h.{bid}.attn.k_proj",                         # gpt-j
@@ -170,7 +170,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wv",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",                  # bert
             "transformer.h.{bid}.attn.v_proj",                           # gpt-j
@@ -188,7 +188,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",                    # mpt
             "transformer.h.{bid}.self_attention.dense",                  # falcon
             "h.{bid}.self_attention.dense",                              # bloom
-            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wo",                                 # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                # bert
             "transformer.h.{bid}.attn.out_proj",                         # gpt-j
@@ -215,7 +215,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",                # gemma2 olmo_1124
+            "model.layers.{bid}.post_attention_layernorm",                # gemma2 olmo2
         ),
 
         # Rotary embeddings
@@ -250,7 +250,7 @@ class TensorNameMap:
 
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm",              # gemma2 olmo_1124
+            "model.layers.{bid}.post_feedforward_layernorm",              # gemma2 olmo2
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,7 +273,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj",                   # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",                  # falcon
             "h.{bid}.mlp.dense_h_to_4h",                              # bloom
-            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo_1124
+            "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
@@ -314,7 +314,7 @@ class TensorNameMap:
 
         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo_1124
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact olmo2
             "layers.{bid}.feed_forward.w1",               # llama-pth
             "transformer.h.{bid}.mlp.w2",                 # qwen
             "transformer.h.{bid}.mlp.c_fc2",              # jais
@@ -346,7 +346,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj",                 # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",                  # falcon
             "h.{bid}.mlp.dense_4h_to_h",                              # bloom
-            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo_1124
+            "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
@@ -383,7 +383,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -392,7 +392,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm