@@ -234,14 +234,21 @@ def load(model_plus: 'ModelPlus') -> 'Params':
 
 
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
         else:
             added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -255,22 +262,32 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
             text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
             yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
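Note on the new "bpe" branch above: a GPT-2 style vocab.json stores each token as a string of printable unicode characters, and bytes_to_unicode() from transformers provides the byte-to-character table that the branch inverts to recover the raw bytes of each token. Below is a minimal sketch of that round trip, outside the patch itself; it assumes the transformers package is installed, and the token string "Ġhello" is only an illustrative example.

from transformers.models.gpt2 import tokenization_gpt2

byte_encoder = tokenization_gpt2.bytes_to_unicode()      # {byte value: unicode char}
byte_decoder = {v: k for k, v in byte_encoder.items()}   # {unicode char: byte value}

token = "\u0120hello"  # "Ġhello", how a GPT-2 vocab spells " hello"
raw = b''.join(byte_decoder[ch].to_bytes(1, byteorder='big') for ch in token)
print(raw)             # b' hello'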
@@ -1196,14 +1213,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
         if path2.exists():
             path = path2
         elif path3.exists():
@@ -1214,7 +1235,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
                 "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@@ -1252,14 +1274,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path,
                         help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
     elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, vocab)
@@ -1273,7 +1296,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir)
+        vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
     model = model_plus.model
     model = do_necessary_conversions(model, params)
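With these changes, a model whose tokenizer ships as a GPT-2 style vocab.json (rather than a SentencePiece tokenizer.model) can be converted by passing the new flag, while the default --vocabtype spm leaves existing SentencePiece conversions unchanged. A hypothetical invocation, assuming the script is invoked as convert.py and the model directory path is illustrative:

python convert.py models/my-bpe-model/ --vocabtype bpe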