@@ -573,6 +573,10 @@ def _set_vocab_sentencepiece(self):

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
@@ -588,21 +592,23 @@ def _set_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE

-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype

         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
-
                 for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
@@ -612,8 +618,6 @@ def _set_vocab_sentencepiece(self):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

-        assert len(tokens) == vocab_size
-
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
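For context, a minimal self-contained sketch of the indexing scheme the diff switches to (a toy reproduction, not the converter itself: `vocab_size`, the `base_pieces` dict, the `added_tokens_json` values, and the plain integer token-type stand-ins below are made up for illustration). Pre-sizing `tokens`/`scores`/`toktypes` to `vocab_size` and assigning by id means an added token lands in its declared slot instead of being appended past the end, out-of-range ids are skipped with a warning, and `len(tokens) == vocab_size` holds by construction, which is why the trailing assert can be dropped.

```python
# Toy reproduction of the pre-sized vocab layout (illustration only; the real
# converter fills these from SentencePiece and added_tokens.json).
vocab_size = 8

# Pre-fill every slot so ids never shift and unclaimed slots stay as [PADn] placeholders.
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [2] * vocab_size   # stand-in for SentencePieceTokenTypes.UNKNOWN

# Stand-in for tokenizer.IdToPiece()/GetScore() over the base vocab.
base_pieces = {0: "<unk>", 1: "<s>", 2: "</s>", 3: "▁hello", 4: "▁world"}
for token_id, piece in base_pieces.items():
    tokens[token_id] = piece.encode("utf-8")
    scores[token_id] = 0.0
    toktypes[token_id] = 1               # stand-in for SentencePieceTokenTypes.NORMAL

# Same shape as added_tokens.json: token text -> token id.
added_tokens_json = {"<|user|>": 5, "<|assistant|>": 6, "<|bogus|>": 99}
for key in added_tokens_json:
    token_id = added_tokens_json[key]
    if token_id >= vocab_size:
        # The converter uses logger.warning(); print() keeps the sketch dependency-free.
        print(f"ignore token {token_id}: id is out of range, max={vocab_size - 1}")
        continue
    tokens[token_id] = key.encode("utf-8")
    scores[token_id] = -1000.0
    toktypes[token_id] = 4               # stand-in for SentencePieceTokenTypes.USER_DEFINED

print([t.decode("utf-8") for t in tokens])
# ['<unk>', '<s>', '</s>', '▁hello', '▁world', '<|user|>', '<|assistant|>', '[PAD7]']
assert len(tokens) == vocab_size         # holds by construction, hence the removed assert
```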