@@ -169,6 +169,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wq",                    # llama-pth
             "encoder.layer.{bid}.attention.self.query",     # bert
+            "transformer.layer.{bid}.attention.q_lin",      # distillbert
             "transformer.h.{bid}.attn.q_proj",              # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj",   # plamo
             "model.layers.{bid}.attention.wq",              # internlm2
@@ -183,6 +184,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wk",                    # llama-pth
             "encoder.layer.{bid}.attention.self.key",       # bert
+            "transformer.layer.{bid}.attention.k_lin",      # distillbert
             "transformer.h.{bid}.attn.k_proj",              # gpt-j
             "transformer.h.{bid}.attn.k",                   # refact
             "model.layers.layers.{bid}.self_attn.k_proj",   # plamo
@@ -197,6 +199,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.v_proj",          # llama-hf nemotron olmoe olmo2 phimoe
             "layers.{bid}.attention.wv",                    # llama-pth
             "encoder.layer.{bid}.attention.self.value",     # bert
+            "transformer.layer.{bid}.attention.v_lin",      # distillbert
             "transformer.h.{bid}.attn.v_proj",              # gpt-j
             "transformer.h.{bid}.attn.v",                   # refact
             "model.layers.layers.{bid}.self_attn.v_proj",   # plamo
@@ -217,6 +220,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.linear_attn",                  # deci
             "layers.{bid}.attention.wo",                                 # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                # bert
+            "transformer.layer.{bid}.attention.out_lin",                 # distillbert
             "transformer.h.{bid}.attn.out_proj",                         # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
             "model.layers.{bid}.self_attn.dense",                        # persimmon
@@ -237,6 +241,7 @@ class TensorNameMap:
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
+            "transformer.layer.{bid}.sa_layer_norm",           # distillbert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
@@ -313,6 +318,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                         # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                           # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                 # bert
+            "transformer.layer.{bid}.ffn.lin1",                       # distillbert
             "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
             "transformer.h.{bid}.mlp.linear_3",                       # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
@@ -396,6 +402,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",                       # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                           # llama-pth
             "encoder.layer.{bid}.output.dense",                       # bert
+            "transformer.layer.{bid}.ffn.lin2",                       # distillbert
             "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
             "model.layers.{bid}.mlp.dense_4h_to_h",                   # persimmon
@@ -457,6 +464,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",         # bert
+            "transformer.layer.{bid}.output_layer_norm",    # distillbert
             "encoder.layers.{bid}.norm2",                   # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
             "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
@@ -827,6 +835,7 @@ class TensorNameMap:
         MODEL_TENSOR.CLS: (
             "classifier",        # jina
             "classifier.dense",  # roberta
+            "pre_classifier",    # distillbert
         ),
 
         MODEL_TENSOR.CLS_OUT: (
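Every hunk follows the same pattern: the DistilBERT checkpoint name for one per-block tensor (the Q/K/V and output attention projections, the two layer norms, the two FFN linears) is added next to the existing bert entry, plus the `pre_classifier` head under `MODEL_TENSOR.CLS`. A minimal sketch of how the converter consumes these entries: `TensorNameMap` expands `{bid}` to every block index when it is built, so a DistilBERT tensor name then resolves to its canonical GGUF name. The use of `MODEL_ARCH.BERT` and the block count of 6 below are assumptions (DistilBERT is a bert-family model and distilbert-base has 6 layers); only `get_tensor_name_map` and `get_name` come from gguf-py itself.

import gguf

# Build the name map for the BERT architecture; "{bid}" templates are
# expanded for block ids 0..n_blocks-1 at construction time.
# MODEL_ARCH.BERT and n_blocks=6 are assumptions for distilbert-base.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.BERT, 6)

# The new "transformer.layer.{bid}.attention.q_lin" entry matches after
# the ".weight" suffix is stripped; the suffix is re-appended on return.
name = tmap.get_name("transformer.layer.0.attention.q_lin.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # blk.0.attn_q.weight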