@@ -243,7 +243,7 @@ def load(model_plus: 'ModelPlus') -> 'Params':


 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path], fname_tokenizer_config: Optional[Path], vocabtype: Optional[str]) -> None:
         self.vocabtype = vocabtype
         if self.vocabtype == "bpe":
             self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
@@ -268,13 +268,40 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
-        special_tokens: Dict[str, Dict[str, Any]]
+        self.special_tokens_map: Dict[int, str] = {}
+
+        TOKEN_NAME_TO_ID: Dict[str, int] = {
+            "unk_token": self.sentencepiece_tokenizer.unk_id(),
+            "bos_token": self.sentencepiece_tokenizer.bos_id(),
+            "eos_token": self.sentencepiece_tokenizer.eos_id(),
+            "pad_token": self.sentencepiece_tokenizer.pad_id()
+        }
+
+        tokenizer_config: Dict[str, Any]
+        if fname_tokenizer_config is not None:
+            tokenizer_config = json.load(open(fname_tokenizer_config))
+        else:
+            tokenizer_config = {}
+        for key, value in tokenizer_config.items():
+            assert isinstance(value, dict) or isinstance(value, str)
+            if key not in TOKEN_NAME_TO_ID or TOKEN_NAME_TO_ID[key] == -1:
+                continue
+            self.special_tokens_map[TOKEN_NAME_TO_ID[key]] = value["content"] if isinstance(value, dict) else value
+
+        special_tokens: Dict[str, Any]
         if fname_special_tokens is not None:
             special_tokens = json.load(open(fname_special_tokens))
         else:
             special_tokens = {}
-        token_name_to_id = {"unk_token": self.sentencepiece_tokenizer.unk_id(), "bos_token": self.sentencepiece_tokenizer.bos_id(), "eos_token": self.sentencepiece_tokenizer.eos_id(), "pad_token": self.sentencepiece_tokenizer.pad_id()}
-        self.special_tokens_map = {token_name_to_id[token_name]: info["content"] if isinstance(info, dict) else info for token_name, info in special_tokens.items() if token_name in token_name_to_id and token_name_to_id[token_name] != -1}
+        for key, value in special_tokens.items():
+            assert isinstance(value, dict) or isinstance(value, str)
+            if key not in TOKEN_NAME_TO_ID:
+                continue
+            token_id = TOKEN_NAME_TO_ID[key]
+            if token_id == -1 or token_id in self.special_tokens_map:
+                continue
+            self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
+
         self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
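In the new `__init__`, entries from tokenizer_config.json are applied first, and special_tokens_map.json only fills token ids that are still unmapped. A minimal standalone sketch of that precedence, with hypothetical token ids and file contents standing in for the SentencePiece lookups and the real JSON files:

```python
from typing import Any, Dict

# Hypothetical stand-ins for sentencepiece_tokenizer.unk_id()/bos_id()/eos_id()/pad_id()
TOKEN_NAME_TO_ID: Dict[str, int] = {"unk_token": 0, "bos_token": 1, "eos_token": 2, "pad_token": -1}

# Hypothetical contents of tokenizer_config.json and special_tokens_map.json
tokenizer_config: Dict[str, Any] = {"bos_token": {"content": "<s>"}, "eos_token": "</s>"}
special_tokens: Dict[str, Any] = {"eos_token": "<|end|>", "pad_token": "<pad>"}

special_tokens_map: Dict[int, str] = {}

# Pass 1: tokenizer_config.json wins for any id it defines.
for key, value in tokenizer_config.items():
    if key not in TOKEN_NAME_TO_ID or TOKEN_NAME_TO_ID[key] == -1:
        continue
    special_tokens_map[TOKEN_NAME_TO_ID[key]] = value["content"] if isinstance(value, dict) else value

# Pass 2: special_tokens_map.json only fills ids that are still missing.
for key, value in special_tokens.items():
    token_id = TOKEN_NAME_TO_ID.get(key, -1)
    if token_id == -1 or token_id in special_tokens_map:
        continue
    special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value

print(special_tokens_map)  # {1: '<s>', 2: '</s>'}
```

With these inputs the eos text comes from tokenizer_config.json ("</s>") rather than from special_tokens_map.json ("<|end|>"), and pad_token is dropped because its id is -1.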
@@ -1282,7 +1309,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
     special_tokens_path = path.parent / "special_tokens_map.json"
     tokenizer_config_path = path.parent / "tokenizer_config.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else tokenizer_config_path if tokenizer_config_path.exists() else None,
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else None, tokenizer_config_path if tokenizer_config_path.exists() else None,
                               vocabtype)
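For reference, a hedged sketch of how the reworked loader might be driven from a conversion script; importing load_vocab from convert.py and the model directory layout (tokenizer.model with optional added_tokens.json, special_tokens_map.json and tokenizer_config.json beside it) are assumptions based on the paths probed above:

```python
# Hypothetical usage sketch; module name and paths are placeholders.
from pathlib import Path

from convert import load_vocab  # assumes this script is importable as convert.py

model_dir = Path("models/my-llama")  # placeholder directory containing tokenizer.model
vocab = load_vocab(model_dir / "tokenizer.model", vocabtype=None)

print("vocab size:", vocab.vocab_size)
print("special vocab size:", vocab.vocab_special_size)
for token_id, text in sorted(vocab.special_tokens_map.items()):
    print(f"special token {token_id}: {text}")
```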