@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding",           # mamba
             "backbone.embeddings",          # mamba-hf
             "transformer.in_out_embed",     # Grok
+            "transformer.token_embeddings", # openelm
         ),

         # Token type embeddings
@@ -36,6 +37,7 @@ class TensorNameMap:
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
             "emb_ln",                     # nomic-bert
+            "transformer.norm",           # openelm
         ),

         # Position embeddings
@@ -68,6 +70,7 @@ class TensorNameMap:
             "model.norm_f",         # mamba-qbert
             "backbone.norm_f",      # mamba
             "transformer.rms_norm", # Grok
+            "transformer.norm",     # openelm
         ),

         # Rope frequencies
@@ -97,6 +100,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.norm",                     # mamba
             "transformer.decoder_layer.{bid}.rms_norm",       # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
+            "transformer.layers.{bid}.attn_norm",             # openelm
         ),

         # Attention norm 2
@@ -117,7 +121,8 @@ class TensorNameMap:
             "h.{bid}.attn.c_attn",                    # gpt2
             "transformer.h.{bid}.mixer.Wqkv",         # phi2
             "encoder.layers.{bid}.attn.Wqkv",         # nomic-bert
-            "model.layers.{bid}.self_attn.qkv_proj"   # phi3
+            "model.layers.{bid}.self_attn.qkv_proj",  # phi3
+            "transformer.layers.{bid}.attn.qkv_proj", # openelm
         ),

         # Attention query
@@ -175,6 +180,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.attn.out_proj",                          # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",       # dbrx
+            "transformer.layers.{bid}.attn.out_proj",                      # openelm
         ),

         # Attention output norm
@@ -206,6 +212,7 @@ class TensorNameMap:
             "h.{bid}.ln_2",                               # gpt2
             "model.layers.{bid}.ffn_norm",                # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
+            "transformer.layers.{bid}.ffn_norm",          # openelm
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
@@ -244,6 +251,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",          # nomic-bert
             "model.layers.{bid}.mlp.c_fc",            # starcoder2
             "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
+            "transformer.layers.{bid}.ffn.proj_1",    # openelm
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -306,6 +314,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc2",        # nomic-bert
             "model.layers.{bid}.mlp.c_proj",       # starcoder2
             "encoder.layer.{bid}.mlp.wo",          # jina-bert-v2
+            "transformer.layers.{bid}.ffn.proj_2", # openelm
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -324,15 +333,17 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm",        # persimmon
             "model.layers.{bid}.self_attn.q_norm",             # cohere
             "transformer.blocks.{bid}.attn.q_ln",              # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_q"  # jina-bert-v2
+            "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
+            "transformer.layers.{bid}.attn.q_norm",            # openelm
         ),

         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",        # persimmon
             "model.layers.{bid}.self_attn.k_norm",             # cohere
             "transformer.blocks.{bid}.attn.k_ln",              # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_k"  # jina-bert-v2
+            "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
+            "transformer.layers.{bid}.attn.k_norm",            # openelm
         ),

         MODEL_TENSOR.ROPE_FREQS: (
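For context, the sketch below shows how these mappings are consumed on the conversion side: gguf.get_tensor_name_map() builds a TensorNameMap for a given architecture and block count, and get_name() resolves a checkpoint tensor name to its canonical GGUF name. It assumes MODEL_ARCH.OPENELM is registered in gguf-py/gguf/constants.py as part of the same change; the block count is a made-up value for illustration.

# Minimal sketch, assuming MODEL_ARCH.OPENELM exists in gguf.constants
# and that the openelm entries from this diff are present in TensorNameMap.
import gguf

n_blocks = 28  # hypothetical layer count for an OpenELM variant
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OPENELM, n_blocks)

# Resolve HF checkpoint names to canonical GGUF names via the new mappings.
print(tensor_map.get_name("transformer.token_embeddings.weight",
                          try_suffixes=(".weight", ".bias")))  # token_embd.weight
print(tensor_map.get_name("transformer.layers.0.attn.qkv_proj.weight",
                          try_suffixes=(".weight", ".bias")))  # blk.0.attn_qkv.weight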