@@ -205,25 +205,58 @@ def load(model_plus: 'ModelPlus') -> 'Params':
         return params
 
 
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
-        self.vocabtype = vocabtype
-        if self.vocabtype == "bpe":
-            self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
-        else:
-            self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
 
-        if self.vocabtype == "bpe":
-            vocab_size: int = len(self.sentencepiece_tokenizer)
-        else:
-            vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            score: float = -i
+            yield text, score
 
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -238,32 +271,11 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vo
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-            from transformers.models.gpt2 import tokenization_gpt2
-            byte_encoder = tokenization_gpt2.bytes_to_unicode()
-            byte_decoder = {v: k for k, v in byte_encoder.items()}
-            for i, item in enumerate(tokenizer):
-                text: bytes
-                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-                score: float = -i
-                yield text, score
-        else:
-            for i in range(tokenizer.vocab_size()):
-                text: bytes
-                if tokenizer.is_unknown(i):
-                    text = " \u2047 ".encode("utf-8")
-                elif tokenizer.is_control(i):
-                    text = b""
-                elif tokenizer.is_byte(i):
-                    piece = tokenizer.id_to_piece(i)
-                    if len(piece) != 6:
-                        raise Exception(f"Invalid token: {piece}")
-                    byte_value = int(piece[3:-1], 16)
-                    text = struct.pack("B", byte_value)
-                else:
-                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-                score: float = tokenizer.get_score(i)
-                yield text, score
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
@@ -278,7 +290,7 @@ def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab = Union[SentencePieceVocab]
+Vocab = Union[BpeVocab, SentencePieceVocab]
 
 
 def permute(weights: NDArray, n_head: int) -> NDArray:
@@ -679,7 +691,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, SentencePieceVocab)
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
@@ -853,7 +865,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
     print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
@@ -875,8 +887,12 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
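For reference, a minimal usage sketch of the split vocab classes introduced above. Assumptions are hedged in the comments: the models/7B path is a placeholder, convert.py is assumed to be importable as a module, and SentencePieceVocab is assumed to keep its existing all_tokens() helper (untouched by this diff, so not shown in it).

# Sketch, not part of the diff: exercising the reworked load_vocab() dispatch.
# "spm" selects SentencePieceVocab, "bpe" selects BpeVocab; anything else raises ValueError.
from pathlib import Path
from convert import load_vocab  # assumes convert.py is on sys.path

vocab = load_vocab(Path("models/7B"), "spm")  # placeholder model directory
# Both vocab classes yield (token_bytes, score) pairs from all_tokens(),
# which is what lets downstream code accept them through the Vocab union.
for token_bytes, score in vocab.all_tokens():
    print(len(token_bytes), score)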