@@ -142,6 +142,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab: int
+    n_vocab_sp: int
     n_embd: int
     n_mult: int
     n_head: int
@@ -169,6 +170,7 @@ def guessed(model: 'LazyModel') -> 'Params':

         return Params(
             n_vocab=n_vocab,
+            n_vocab_sp=n_vocab,
             n_embd=n_embd,
             n_mult=256,
             n_head=n_head,
@@ -191,6 +193,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':

         return Params(
             n_vocab=n_vocab,
+            n_vocab_sp=n_vocab,
             n_embd=n_embd,
             n_mult=n_mult,
             n_head=n_head,
@@ -215,6 +218,7 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':

         return Params(
             n_vocab=n_vocab,
+            n_vocab_sp=n_vocab,
             n_embd=n_embd,
             n_mult=n_mult,
             n_head=n_head,
@@ -239,7 +243,7 @@ def load(model_plus: 'ModelPlus') -> 'Params':


 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
         self.vocabtype = vocabtype
         if self.vocabtype == "bpe":
             self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
@@ -264,35 +268,46 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
+        special_tokens: Dict[str, Dict[str, Any]]
+        if fname_special_tokens is not None:
+            special_tokens = json.load(open(fname_special_tokens))
+        else:
+            special_tokens = {}
+        token_name_to_id = {"unk_token": self.sentencepiece_tokenizer.unk_id(), "bos_token": self.sentencepiece_tokenizer.bos_id(), "eos_token": self.sentencepiece_tokenizer.eos_id(), "pad_token": self.sentencepiece_tokenizer.pad_id()}
+        self.special_tokens_map = {token_name_to_id[token_name]: info["content"] if isinstance(info, dict) else info for token_name, info in special_tokens.items() if token_name in token_name_to_id and token_name_to_id[token_name] != -1}
+        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         if self.vocabtype == "bpe":
-          from transformers.models.gpt2 import tokenization_gpt2
-          byte_encoder = tokenization_gpt2.bytes_to_unicode()
-          byte_decoder = {v: k for k, v in byte_encoder.items()}
-          for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-            score: float = -i
-            yield text, score
+            from transformers.models.gpt2 import tokenization_gpt2
+            byte_encoder = tokenization_gpt2.bytes_to_unicode()
+            byte_decoder = {v: k for k, v in byte_encoder.items()}
+            for i, item in enumerate(tokenizer):
+                text: bytes
+                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+                score: float = -i
+                yield text, score
         else:
-          for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
-            yield text, score
+            special_tokens = [tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id()]
+            for i in range(tokenizer.vocab_size()):
+                text: bytes
+                if tokenizer.is_unknown(i):
+                    text = self.special_tokens_map.get(i, " \u2047 ").encode("utf-8")
+                elif i in special_tokens:
+                    text = self.special_tokens_map.get(i, "").encode("utf-8")
+                elif tokenizer.is_control(i):
+                    text = b""
+                elif tokenizer.is_byte(i):
+                    piece = tokenizer.id_to_piece(i)
+                    if len(piece) != 6:
+                        raise Exception(f"Invalid token: {piece}")
+                    byte_value = int(piece[3:-1], 16)
+                    text = struct.pack("B", byte_value)
+                else:
+                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+                score: float = tokenizer.get_score(i)
+                yield text, score

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
@@ -303,18 +318,29 @@ def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()

+    def all_special_tokens(self) -> Iterable[int]:
+        for token_id in self.special_tokens_map.keys():
+            yield token_id
+        for i in range(len(self.added_tokens_list)):
+            yield self.vocab_size_base + i
+
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 class GGMLVocab:
     def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
+        self.special_tokens = []
         self.vocab_size = len(tokens)
+        self.vocab_special_size = 0

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens

+    def all_special_tokens(self) -> Iterable[int]:
+        return self.special_tokens
+
     def __repr__(self) -> str:
         return f"<GGMLVocab with {self.vocab_size} tokens>"

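For reference, here is a minimal standalone sketch of what the new SentencePieceVocab.__init__ logic above does with a Hugging Face special_tokens_map.json (or tokenizer_config.json): token-name keys are resolved to SentencePiece ids, names the tokenizer does not define (id -1) are skipped, and entries may be either plain strings or AddedToken-style dicts with a "content" field. The function name and structure are illustrative only, not part of the patch.

import json
from pathlib import Path
from typing import Dict

from sentencepiece import SentencePieceProcessor  # same dependency convert.py already uses


def resolve_special_tokens(tokenizer: SentencePieceProcessor, config_path: Path) -> Dict[int, str]:
    """Map special-token ids to their text, mirroring the comprehension in SentencePieceVocab.__init__."""
    special_tokens = json.load(open(config_path)) if config_path.exists() else {}
    token_name_to_id = {
        "unk_token": tokenizer.unk_id(),
        "bos_token": tokenizer.bos_id(),
        "eos_token": tokenizer.eos_id(),
        "pad_token": tokenizer.pad_id(),
    }
    result: Dict[int, str] = {}
    for name, info in special_tokens.items():
        token_id = token_name_to_id.get(name, -1)
        if token_id == -1:
            # name unknown to SentencePiece (e.g. pad_id() is often -1): skip it
            continue
        # entries are either a plain string or a dict such as {"content": "<s>", ...}
        result[token_id] = info["content"] if isinstance(info, dict) else info
    return result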
@@ -1066,8 +1092,9 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
-            1,  # file version
+            4,  # file version
            params.n_vocab,
+            params.n_vocab_sp,
            params.n_embd,
            params.n_mult,
            params.n_head,
@@ -1089,11 +1116,14 @@ def write_vocab(self, vocab: Vocab) -> None:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
+        for token_id in vocab.all_special_tokens():
+            self.fout.write(struct.pack("i", token_id))

     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
+                        n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
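As far as this patch goes, the on-disk effect is: the ggjt header gains an int32 n_vocab_sp immediately after n_vocab and the file version becomes 4, and the special/added token ids follow the usual (length, bytes, score) vocab entries as plain int32 values. Below is a rough reader sketch, under the assumption that the remaining header fields (n_layer, rot, ftype) keep their existing order after n_head; it is not code from the repository.

import struct


def read_header_and_special_tokens(path: str):
    """Parse the fields this patch writes: version 4 header with n_vocab_sp, then vocab, then special ids."""
    with open(path, "rb") as f:
        magic = f.read(4)  # written as b"ggjt"[::-1], so b"tjgg" on disk
        version, n_vocab, n_vocab_sp = struct.unpack("3i", f.read(12))
        n_embd, n_mult, n_head, n_layer, n_rot, ftype = struct.unpack("6i", f.read(24))
        # vocab entries: int32 length, raw bytes, float32 score
        vocab = []
        for _ in range(n_vocab):
            (length,) = struct.unpack("i", f.read(4))
            text = f.read(length)
            (score,) = struct.unpack("f", f.read(4))
            vocab.append((text, score))
        # new trailer: one int32 id per special/added token
        special_ids = list(struct.unpack(f"{n_vocab_sp}i", f.read(4 * n_vocab_sp)))
    return magic, version, vocab, special_ids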
@@ -1249,8 +1279,10 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
                 f"Could not find tokenizer.model in {path} or its parent; "
                 "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
+    special_tokens_path = path.parent / "special_tokens_map.json"
+    tokenizer_config_path = path.parent / "tokenizer_config.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else tokenizer_config_path if tokenizer_config_path.exists() else None,
                               vocabtype)


@@ -1313,6 +1345,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
+    params.n_vocab_sp = vocab.vocab_special_size
     model = model_plus.model
     model = do_necessary_conversions(model, params)
     output_type = pick_output_type(model, args.outtype)