@@ -114,13 +114,15 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:

 @dataclass
 class Params:
-    n_vocab:   int
-    n_embd:    int
-    n_mult:    int
-    n_head:    int
-    n_layer:   int
-    n_ctx:     int
-    n_kv_head: Optional[int]  # This parameter is only used for Llama 2
+    n_vocab:    int
+    n_embd:     int
+    n_mult:     int
+    n_layer:    int
+    n_ctx:      int
+    n_ff:       int
+    n_head:     int
+    n_head_kv:  int
+    f_norm_eps: float

     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
@@ -139,28 +141,36 @@ def guessed(model: 'LazyModel') -> 'Params':
             raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
                             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_head = n_embd // 128  # guessed
+        n_head = n_embd // 128  # guessed
+        n_mult = 255            # guessed
+
+        # TODO: verify this
+        n_ff = int(2 * (4 * n_embd) / 3)
+        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)

         return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = 256,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_ctx     = -1,
-            n_kv_head = None,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = 256,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
         )

     @staticmethod
     def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))

-        n_vocab = config["vocab_size"];
-        n_embd  = config["hidden_size"];
-        n_head  = config["num_attention_heads"];
-        n_layer = config["num_hidden_layers"];
-        n_ff    = config["intermediate_size"];
-        n_kv_head = config.get("num_key_value_heads")
+        n_vocab    = config["vocab_size"];
+        n_embd     = config["hidden_size"];
+        n_layer    = config["num_hidden_layers"];
+        n_ff       = config["intermediate_size"];
+        n_head     = config["num_attention_heads"];
+        n_head_kv  = config["num_key_value_heads"];
+        f_norm_eps = config["rms_norm_eps"];

         n_mult = find_n_mult(n_ff, n_embd);

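
Note on the `guessed()` fallback above: when no `config.json` is available, the converter now also guesses the feed-forward size using the SwiGLU sizing rule (two thirds of the usual 4x expansion, rounded up to a multiple of `n_mult`). A minimal sketch of that arithmetic, with illustrative sizes that are not taken from this diff:

```python
# Sketch of the feed-forward size guess used in Params.guessed() above.
# The 4096/256 combination is illustrative only (it happens to reproduce the
# familiar 11008 of LLaMA-7B); the diff itself guesses n_mult = 255.
def guess_n_ff(n_embd: int, n_mult: int) -> int:
    n_ff = int(2 * (4 * n_embd) / 3)                 # SwiGLU: ~2/3 of the 4x MLP expansion
    return n_mult * ((n_ff + n_mult - 1) // n_mult)  # round up to a multiple of n_mult

print(guess_n_ff(4096, 256))  # -> 11008
print(guess_n_ff(4096, 255))  # -> 10965
```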
@@ -173,13 +183,15 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
                             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

         return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = n_mult,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_ctx     = n_ctx,
-            n_kv_head = n_kv_head,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = n_mult,
+            n_layer    = n_layer,
+            n_ctx      = n_ctx,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head_kv,
+            f_norm_eps = f_norm_eps,
         )

     # LLaMA v2 70B params.json
@@ -188,23 +200,32 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
     def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))

-        n_vocab = config["vocab_size"];
-        n_embd  = config["dim"];
-        n_head  = config["n_heads"];
-        n_layer = config["n_layers"];
-        n_mult  = config["multiple_of"];
+        n_vocab    = config["vocab_size"];
+        n_embd     = config["dim"];
+        n_layer    = config["n_layers"];
+        n_mult     = config["multiple_of"];
+        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096  # hack to determine LLaMA v1 vs v2
+        n_ff       = -1;
+        n_head     = config["n_heads"];
+        n_head_kv  = config["n_kv_head"] if "n_kv_head" in config else n_head;
+        f_norm_eps = config["norm_eps"];

         if n_vocab == -1:
             n_vocab = model["tok_embeddings.weight"].shape[0]

+        if n_ff == -1:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
         return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = n_mult,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_ctx     = -1,
-            n_kv_head = None,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = n_mult,
+            n_layer    = n_layer,
+            n_ctx      = n_ctx,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head_kv,
+            f_norm_eps = f_norm_eps,
         )

     @staticmethod
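
The `n_ctx` line in `loadOriginalParamsJson` uses `norm_eps` as a proxy for the model generation, since the original `params.json` does not record a context length: LLaMA v1 checkpoints ship with `norm_eps = 1e-6` (2048-token context), LLaMA v2 with `1e-5` (4096). A minimal sketch of that heuristic, using made-up `params.json` contents:

```python
# Minimal sketch of the n_ctx heuristic above; the dicts are made-up stand-ins
# for params.json, not real checkpoint metadata.
v1_like = {"dim": 4096, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-06, "vocab_size": -1}
v2_like = {"dim": 4096, "n_heads": 32, "n_layers": 32, "norm_eps": 1e-05, "vocab_size": 32000}

for config in (v1_like, v2_like):
    n_ctx = 2048 if config["norm_eps"] == 1e-06 else 4096
    print(config["norm_eps"], "->", n_ctx)  # 1e-06 -> 2048, 1e-05 -> 4096
```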
@@ -310,9 +331,9 @@ def __repr__(self) -> str:
 Vocab = Union[BpeVocab, SentencePieceVocab]


-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
-    if n_kv_head is not None and n_head != n_kv_head:
-        n_head //= n_kv_head
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    if n_head_kv is not None and n_head != n_head_kv:
+        n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                    .swapaxes(1, 2)
                    .reshape(weights.shape))
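
The renamed `permute()` keeps the old behaviour: it reorders the rows of a Q/K projection so the rotary-embedding halves land where ggml expects them, and when `n_head_kv` differs from `n_head` (grouped-query attention) it first divides the head count by `n_head_kv`. A toy shape check, with the function copied from above and made-up sizes:

```python
import numpy as np

# permute() copied from the diff above so the snippet is self-contained.
def permute(weights, n_head: int, n_head_kv: int):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Made-up sizes: 4 query heads, 2 KV heads, head_dim 4, n_embd 16.
n_head, n_head_kv, head_dim, n_embd = 4, 2, 4, 16
w = np.arange(n_head_kv * head_dim * n_embd, dtype=np.float32).reshape(n_head_kv * head_dim, n_embd)

out = permute(w, n_head, n_head_kv)
print(out.shape)   # (8, 16): the shape is unchanged, only rows move
print(out[:4, 0])  # [ 0. 32. 16. 48.]: rows of the first block come out interleaved
```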
@@ -324,7 +345,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
+    def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod
@@ -362,8 +383,8 @@ def part(self, n_part: int) -> 'UnquantizedTensor':
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
+    def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))


 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -386,18 +407,18 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv


 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
         self.base = base
         self.n_head = n_head
         self.data_type = self.base.data_type

     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)

     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_head_kv)

-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+    def permute(self, n_head: int, n_head_kv: int) -> Tensor:
         raise Exception("shouldn't permute twice")


@@ -493,10 +514,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)


-def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head, n_kv_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_head_kv)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:
@@ -521,7 +542,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@@ -732,9 +753,15 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         llm_arch = "llama"

-        self.gguf.add_architecture(llm_arch)
-        self.gguf.add_context_length(llm_arch, params.n_ctx)
-        self.gguf.add_embedding_length(llm_arch, params.n_embd)
+        self.gguf.add_architecture(llm_arch)
+        self.gguf.add_context_length(llm_arch, params.n_ctx)
+        self.gguf.add_embedding_length(llm_arch, params.n_embd)
+        self.gguf.add_block_count(llm_arch, params.n_layer)
+        self.gguf.add_feed_forward_length(llm_arch, params.n_ff)
+        self.gguf.add_rope_dimension_count(llm_arch, params.n_embd // params.n_head)
+        self.gguf.add_head_count(llm_arch, params.n_head)
+        self.gguf.add_head_count_kv(llm_arch, params.n_head_kv)
+        self.gguf.add_layer_norm_rms_eps(llm_arch, params.f_norm_eps)

     def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
         sname = name.encode('utf-8')
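
`write_file_header` now hands all hyperparameters to the GGUF writer as key/value metadata instead of packing a fixed binary header. As a rough illustration, the key names below are assumed from the GGUF convention of prefixing with the architecture, and the values are LLaMA-7B-like placeholders; neither is taken from this diff:

```python
# Hypothetical key/value pairs corresponding to the add_* calls above.
# Key names and values are assumptions for illustration only.
expected_kv = {
    "general.architecture":                   "llama",
    "llama.context_length":                   2048,
    "llama.embedding_length":                 4096,
    "llama.block_count":                      32,
    "llama.feed_forward_length":              11008,
    "llama.rope.dimension_count":             4096 // 32,  # n_embd // n_head = 128
    "llama.attention.head_count":             32,
    "llama.attention.head_count_kv":          32,          # equals head_count when there is no GQA
    "llama.attention.layer_norm_rms_epsilon": 1e-6,
}

for key, value in expected_kv.items():
    print(f"{key} = {value}")
```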
@@ -744,15 +771,22 @@ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataTy
         self.fout.seek((self.fout.tell() + 31) & -32)

     def write_vocab(self, vocab: Vocab) -> None:
+        tokens = []
+        scores = []
         for text, score in vocab.all_tokens():
-            self.fout.write(struct.pack("i", len(text)))
-            self.fout.write(text)
-            self.fout.write(struct.pack("f", score))
+            tokens.append(text)
+            scores.append(score)
+
+        self.gguf.add_tokenizer_model("llama")
+        self.gguf.add_token_list(tokens)
+        self.gguf.add_token_scores(scores)
+        #self.gguf.add_token_types(toktypes) # TODO: add this
+
+        # TODO: added / special tokens

     @staticmethod
-    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
@@ -941,12 +975,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     args = parser.parse_args(args_in)

-    vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)

     model_plus = load_some_model(args.model)
+
     params = Params.load(model_plus)
     if params.n_ctx == -1:
         if args.ctx is None:
@@ -958,6 +992,7 @@ def main(args_in: Optional[List[str]] = None) -> None:

     print(f"params = {params}")

+    vocab: Vocab
     if args.vocab_only:
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
@@ -968,6 +1003,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.dump:
         do_dump_model(model_plus)
         return
+
     if model_plus.vocab is not None and args.vocab_dir is None:
         vocab = model_plus.vocab
     else: