
 model = torch.load(fname_model, map_location="cpu")

-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
+n_vocab, n_embd = model["model.embed_tokens.weight"].shape
+n_layer = 1 + max(
+    int(m.group(1))
+    for name in model
+    if (m := re.match(r"model\.layers\.([0-9]+)", name))
+)

 # hardcoded:
 n_mult = 256
@@ -36,14 +39,14 @@

 fout = open(fname_out, "wb")

-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
+fout.write(struct.pack("i", 0x67676D66))  # magic: ggmf in hex
+fout.write(struct.pack("i", 1))  # file version
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
 fout.write(struct.pack("i", n_head))
 fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
+fout.write(struct.pack("i", n_embd // n_head))  # rot (obsolete)
 fout.write(struct.pack("i", 4))

@@ -66,16 +69,23 @@
     fout.write(text)
     fout.write(struct.pack("f", tokenizer.get_score(i)))

+
 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode("utf-8")
     fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
     fout.write(struct.pack("i" * len(shape), *shape[::-1]))
     fout.write(sname)

+
 def convert_non_q4(src_name, dst_name):
     v = model[src_name]
     shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(
+        "Processing non-Q4 variable: " + src_name + " with shape: ",
+        shape,
+        " and type: ",
+        v.dtype,
+    )
     if len(shape) == 1:
         print("  Converting to float32")
         v = v.to(torch.float32)
@@ -88,11 +98,12 @@ def convert_non_q4(src_name, dst_name):
     # data
     v.numpy().tofile(fout)

+
 def convert_q4(src_name, dst_name, permute=False):
     zeros = model[f"{src_name}.zeros"].numpy()
     scales = model[f"{src_name}.scales"].numpy()
     bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
+    qweight = model[f"{src_name}.qweight"].numpy().T  # transpose

     # Q4_1 does not support bias; good thing the bias is always all zeros.
     assert not np.any(bias)
@@ -113,7 +124,7 @@ def convert_q4(src_name, dst_name, permute=False):
     # the columns in a row, so we end up wasting quite a bit of memory with
     # repeated scales and addends.

-    addends = -zeros # flip sign
+    addends = -zeros  # flip sign

     # Since the output format is mixed between integers and floats, we have
     # to hackily view the floats as int32s just so numpy will let us
@@ -128,37 +139,53 @@ def convert_q4(src_name, dst_name, permute=False):
     addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
     scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)

-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
+    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting="no")

     if permute:
         # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
         # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(blob.shape))
+        blob = (
+            blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(blob.shape)
+        )

     # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
+    write_header(shape, dst_name, 3)  # ftype = Q4_1

     # data
     blob.tofile(fout)

+
 convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
 convert_non_q4("model.norm.weight", "norm.weight")
 convert_non_q4("lm_head.weight", "output.weight")

 for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
+    convert_q4(
+        f"model.layers.{i}.self_attn.q_proj",
+        f"layers.{i}.attention.wq.weight",
+        permute=True,
+    )
+    convert_q4(
+        f"model.layers.{i}.self_attn.k_proj",
+        f"layers.{i}.attention.wk.weight",
+        permute=True,
+    )
     convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
     convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")

     convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
     convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
+    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
+
+    convert_non_q4(
+        f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight"
+    )
+    convert_non_q4(
+        f"model.layers.{i}.post_attention_layernorm.weight",
+        f"layers.{i}.ffn_norm.weight",
+    )


 fout.close()
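
Note (not part of the diff): a minimal sketch of the float-as-int32 view trick that the comments in convert_q4 describe. The group size, dtype choices, and values below are illustrative assumptions; the point is only that reinterpreting the float32 scale/addend as integers lets numpy concatenate them with the packed quant data into one homogeneous array before writing it with tofile.

import numpy as np

# One illustrative Q4_1-style group: float32 scale, float32 addend,
# and 32 4-bit quants packed two-per-byte into 16 bytes.
scale = np.float32(0.0123)
addend = np.float32(-0.5)
quants = np.arange(16, dtype=np.uint8)

header = np.array([scale, addend], dtype=np.float32).view(np.int32)  # floats viewed as int32
quants_i32 = quants.view(np.int32)                                   # 16 bytes -> 4 int32s
block = np.concatenate([header, quants_i32])                         # one homogeneous int32 array

# Round trip: the first two int32s view back as the original floats.
scale_back, addend_back = block[:2].view(np.float32)
assert scale_back == scale and addend_back == addend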
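A second sketch, also outside the diff, of what the permute=True branch undoes. The shapes here are toy values and the "forward" interleave is my assumption about what convert_llama_weights_to_hf.py does to the q/k projection rows; the reshape/swapaxes pair in convert_q4 is its inverse, applied per head.

import numpy as np

n_head, head_dim = 2, 4           # toy sizes, purely illustrative
rows = n_head * head_dim
w = np.arange(rows)[:, None] * np.ones((1, 3), dtype=int)  # one distinct value per row

# Assumed HF-style interleave: view rows per head as (head_dim // 2, 2),
# swap those axes, and flatten.
permuted = w.reshape(n_head, head_dim // 2, 2, -1).swapaxes(1, 2).reshape(rows, -1)

# The inverse used in convert_q4: view as (n_head, 2, head_dim // 2, ...),
# swap back, and flatten. Column order/layout is untouched.
restored = permuted.reshape(n_head, 2, head_dim // 2, -1).swapaxes(1, 2).reshape(rows, -1)
assert (restored == w).all()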