def get_tensor_namemap(n_blocks: int):
    """Map tensor names from several checkpoint layouts (gptneox, gpt2, mpt,
    falcon, llama-hf, llama-pth) to canonical GGUF base names, for a model
    with n_blocks transformer blocks."""
    tensor_map = {}

    # Token embeddings
    mapped_to = "token_embd"
    tensor_map["gpt_neox.embed_in"] = mapped_to  # gptneox
    tensor_map["transformer.wte"] = mapped_to  # gpt2 mpt
    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
    tensor_map["model.embed_tokens"] = mapped_to  # llama-hf
    tensor_map["tok_embeddings"] = mapped_to  # llama-pth

    # Position embeddings
    mapped_to = "pos_embd"
    tensor_map["transformer.wpe"] = mapped_to  # gpt2

    # Output norm
    mapped_to = "output_norm"
    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
    tensor_map["transformer.ln_f"] = mapped_to  # gpt2 falcon
    tensor_map["transformer.norm_f"] = mapped_to  # mpt
    tensor_map["model.norm"] = mapped_to  # llama-hf
    tensor_map["norm"] = mapped_to  # llama-pth

    # Output
    mapped_to = "output"
    tensor_map["embed_out"] = mapped_to  # gptneox
    tensor_map["lm_head"] = mapped_to  # gpt2 mpt falcon llama-hf
    tensor_map["output"] = mapped_to  # llama-pth

    # Attention and feed-forward layer blocks
    for i in range(n_blocks):
        # Attention norm
        mapped_to = "blk." + str(i) + ".attn_norm"
        tensor_map["gpt_neox.layers." + str(i) + ".input_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".ln_1"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".norm_1"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".input_layernorm"] = mapped_to  # falcon7b
        tensor_map["transformer.h." + str(i) + ".ln_attn"] = mapped_to  # falcon40b
        tensor_map["model.layers." + str(i) + ".input_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention_norm"] = mapped_to  # llama-pth

        # Attention norm 2
        mapped_to = "blk." + str(i) + ".attn_norm_2"
        tensor_map["transformer.h." + str(i) + ".ln_mlp"] = mapped_to  # falcon40b

        # Attention query-key-value
        mapped_to = "blk." + str(i) + ".attn_qkv"
        tensor_map["gpt_neox.layers." + str(i) + ".attention.query_key_value"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".attn.c_attn"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".attn.Wqkv"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".self_attention.query_key_value"] = mapped_to  # falcon

        # Attention query
        mapped_to = "blk." + str(i) + ".attn_q"
        tensor_map["model.layers." + str(i) + ".self_attn.q_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wq"] = mapped_to  # llama-pth

        # Attention key
        mapped_to = "blk." + str(i) + ".attn_k"
        tensor_map["model.layers." + str(i) + ".self_attn.k_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wk"] = mapped_to  # llama-pth

        # Attention value
        mapped_to = "blk." + str(i) + ".attn_v"
        tensor_map["model.layers." + str(i) + ".self_attn.v_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wv"] = mapped_to  # llama-pth

        # Attention output
        mapped_to = "blk." + str(i) + ".attn_output"
        tensor_map["gpt_neox.layers." + str(i) + ".attention.dense"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".attn.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".attn.out_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".self_attention.dense"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".self_attn.o_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".attention.wo"] = mapped_to  # llama-pth

        # Feed-forward norm
        mapped_to = "blk." + str(i) + ".ffn_norm"
        tensor_map["gpt_neox.layers." + str(i) + ".post_attention_layernorm"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".ln_2"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".norm_2"] = mapped_to  # mpt
        tensor_map["model.layers." + str(i) + ".post_attention_layernorm"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".ffn_norm"] = mapped_to  # llama-pth

        # Feed-forward up
        mapped_to = "blk." + str(i) + ".ffn_up"
        tensor_map["gpt_neox.layers." + str(i) + ".mlp.dense_h_to_4h"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".mlp.c_fc"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".ffn.up_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".mlp.dense_h_to_4h"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".mlp.up_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w3"] = mapped_to  # llama-pth

        # Feed-forward gate
        mapped_to = "blk." + str(i) + ".ffn_gate"
        tensor_map["model.layers." + str(i) + ".mlp.gate_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w1"] = mapped_to  # llama-pth

        # Feed-forward down
        mapped_to = "blk." + str(i) + ".ffn_down"
        tensor_map["gpt_neox.layers." + str(i) + ".mlp.dense_4h_to_h"] = mapped_to  # gptneox
        tensor_map["transformer.h." + str(i) + ".mlp.c_proj"] = mapped_to  # gpt2
        tensor_map["transformer.blocks." + str(i) + ".ffn.down_proj"] = mapped_to  # mpt
        tensor_map["transformer.h." + str(i) + ".mlp.dense_4h_to_h"] = mapped_to  # falcon
        tensor_map["model.layers." + str(i) + ".mlp.down_proj"] = mapped_to  # llama-hf
        tensor_map["layers." + str(i) + ".feed_forward.w2"] = mapped_to  # llama-pth

    return tensor_map
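
A minimal usage sketch, not part of this commit: the example tensor name, the block count, and the ".weight"/".bias" suffix handling below are illustrative assumptions about how a converter might consume the map.

# Hypothetical usage: rename one checkpoint tensor via the map.
tensor_map = get_tensor_namemap(n_blocks=32)  # e.g. a 32-block llama-hf model

name = "model.layers.0.self_attn.q_proj.weight"  # example HF tensor name (assumed)
base, _, suffix = name.rpartition(".")  # split off the ".weight"/".bias" suffix
mapped = tensor_map.get(base)
if mapped is not None:
    print(mapped + "." + suffix)  # -> blk.0.attn_q.weight
else:
    print("unmapped tensor:", name)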