@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf
+            "model.embed_tokens",                        # llama-hf nemotron
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -52,7 +52,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -75,6 +75,7 @@ class TensorNameMap:
             "transformer.rms_norm",     # Grok
             "encoder.final_layernorm",  # chatglm
             "transformer.norm",         # openelm
+            "model.norm",               # nemotron
         ),
 
         # Rope frequencies
@@ -93,7 +94,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm",                  # falcon7b
             "h.{bid}.input_layernorm",                              # bloom
             "transformer.h.{bid}.ln_mlp",                           # falcon40b
-            "model.layers.{bid}.input_layernorm",                   # llama-hf
+            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron
             "layers.{bid}.attention_norm",                          # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",                               # yi
@@ -135,7 +136,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",       # llama-hf
+            "model.layers.{bid}.self_attn.q_proj",       # llama-hf nemotron
             "layers.{bid}.attention.wq",                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",  # bert
             "transformer.h.{bid}.attn.q_proj",           # gpt-j
@@ -146,7 +147,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",     # llama-hf
+            "model.layers.{bid}.self_attn.k_proj",     # llama-hf nemotron
             "layers.{bid}.attention.wk",               # llama-pth
             "encoder.layer.{bid}.attention.self.key",  # bert
             "transformer.h.{bid}.attn.k_proj",         # gpt-j
@@ -158,7 +159,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",       # llama-hf
+            "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron
             "layers.{bid}.attention.wv",                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
             "transformer.h.{bid}.attn.v_proj",           # gpt-j
@@ -175,7 +176,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",      # mpt
             "transformer.h.{bid}.self_attention.dense",    # falcon
             "h.{bid}.self_attention.dense",                # bloom
-            "model.layers.{bid}.self_attn.o_proj",         # llama-hf
+            "model.layers.{bid}.self_attn.o_proj",         # llama-hf nemotron
             "layers.{bid}.attention.wo",                   # llama-pth
             "encoder.layer.{bid}.attention.output.dense",  # bert
             "transformer.h.{bid}.attn.out_proj",           # gpt-j
@@ -218,7 +219,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2",                                      # gpt2 refact qwen jais
             "h.{bid}.post_attention_layernorm",                              # bloom
             "transformer.blocks.{bid}.norm_2",                               # mpt
-            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf
+            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf nemotron
             "layers.{bid}.ffn_norm",                                         # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",                                        # yi
@@ -258,7 +259,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj",   # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
             "h.{bid}.mlp.dense_h_to_4h",              # bloom
-            "model.layers.{bid}.mlp.up_proj",         # llama-hf refact
+            "model.layers.{bid}.mlp.up_proj",         # llama-hf refact nemotron
             "layers.{bid}.feed_forward.w3",           # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in",          # gpt-j
@@ -329,7 +330,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
             "h.{bid}.mlp.dense_4h_to_h",              # bloom
-            "model.layers.{bid}.mlp.down_proj",       # llama-hf
+            "model.layers.{bid}.mlp.down_proj",       # llama-hf nemotron
             "layers.{bid}.feed_forward.w2",           # llama-pth
             "encoder.layer.{bid}.output.dense",       # bert
             "transformer.h.{bid}.mlp.fc_out",         # gpt-j
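For context, here is a minimal standalone sketch of how mapping tables like the ones touched above could be applied during conversion: each checkpoint-side pattern (for example the `model.layers.{bid}.*` names that nemotron shares with llama-hf) is expanded with the block index `{bid}` and matched against a tensor name from the checkpoint. The `BLOCK_MAPPINGS` subset, the target base names, and the `map_tensor_name` helper below are illustrative assumptions, not the actual gguf-py `TensorNameMap` API.

```python
# Sketch only (assumed helper, not the gguf-py API): resolve a checkpoint tensor
# name to a per-block target name by expanding the {bid} block-index placeholder.

# Illustrative subset of the per-block mappings, keyed by an assumed target base name.
BLOCK_MAPPINGS: dict[str, tuple[str, ...]] = {
    "blk.{bid}.attn_q":   ("model.layers.{bid}.self_attn.q_proj",),  # llama-hf nemotron
    "blk.{bid}.attn_k":   ("model.layers.{bid}.self_attn.k_proj",),  # llama-hf nemotron
    "blk.{bid}.attn_v":   ("model.layers.{bid}.self_attn.v_proj",),  # llama-hf nemotron
    "blk.{bid}.ffn_down": ("model.layers.{bid}.mlp.down_proj",),     # llama-hf nemotron
}

def map_tensor_name(hf_name: str, n_blocks: int) -> str | None:
    """Return the mapped name for hf_name, or None if no pattern matches."""
    base, _, suffix = hf_name.rpartition(".")  # split off ".weight" / ".bias"
    for target_fmt, source_fmts in BLOCK_MAPPINGS.items():
        for source_fmt in source_fmts:
            for bid in range(n_blocks):
                if base == source_fmt.format(bid=bid):
                    return f"{target_fmt.format(bid=bid)}.{suffix}"
    return None

# The nemotron/llama-hf layout name resolves via the shared pattern:
print(map_tensor_name("model.layers.3.mlp.down_proj.weight", n_blocks=32))
# -> blk.3.ffn_down.weight
```

Because nemotron reuses the llama-hf naming layout, this change only appends "nemotron" to the existing comments and adds the two names that differ (`model.norm`, plus the shared `lm_head`/`model.embed_tokens` entries), rather than introducing new patterns.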