@@ -634,7 +634,7 @@ def _create_vocab_sentencepiece(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
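For context, a minimal self-contained sketch of the padding pattern this hunk touches (enum values as in gguf-py's TokenType; the toy vocab sizes are made up for illustration). The three lists are pre-filled for the full padded vocab_size, then each piece the SentencePiece model actually defines overwrites its slot; any slot never written keeps its [PAD{i}] filler, which this change now tags UNUSED (inert padding) rather than UNKNOWN (a real token class):

    from enum import IntEnum

    class SentencePieceTokenTypes(IntEnum):
        # mirrors the token-type values used by gguf-py
        NORMAL = 1
        UNKNOWN = 2
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5
        BYTE = 6

    # hypothetical sizes: the model config promises 8 ids, but the
    # tokenizer only defines 6 pieces, so ids 6 and 7 are padding
    vocab_size = 8
    real_pieces = ["<unk>", "<s>", "</s>", "hello", "world", "!"]

    # pre-fill every slot as inert padding ...
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores = [-10000.0] * vocab_size
    toktypes = [SentencePieceTokenTypes.UNUSED] * vocab_size

    # ... then overwrite the slots the tokenizer actually defines
    for token_id, piece in enumerate(real_pieces):
        tokens[token_id] = piece.encode("utf-8")
        scores[token_id] = 0.0
        toktypes[token_id] = SentencePieceTokenTypes.NORMAL

    print(tokens[6], toktypes[6])  # b'[PAD6]' SentencePieceTokenTypes.UNUSED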
@@ -677,7 +677,7 @@ def _create_vocab_sentencepiece(self):
         for token_id, token_data in added_tokens_decoder.items():
             token_id = int(token_id)
             token: str = token_data["content"]
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token.encode("utf-8")
             if token_data.get("special") or self.does_token_look_special(token):
                 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
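The guard in this hunk leans on that new default: a slot still typed UNUSED can only be an untouched [PAD{i}] filler, so an added token may claim it without checks; any other type means the SentencePiece model already defined that id, and the assert verifies both sources agree on the text. A hedged sketch, continuing the toy vocab above (the decoder dict and looks_special helper are hypothetical stand-ins for added_tokens_decoder and does_token_look_special):

    # HF tokenizer_config.json keys ids as strings, hence int(token_id)
    added_tokens_decoder = {
        "6": {"content": "<|im_start|>", "special": True},
    }

    def looks_special(token):
        # stand-in for the converter's does_token_look_special() heuristic
        return token.startswith("<|") and token.endswith("|>")

    for token_id, token_data in added_tokens_decoder.items():
        token_id = int(token_id)
        token = token_data["content"]
        if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
            # id already defined by the sp model: texts must match
            assert tokens[token_id] == token.encode("utf-8")
        tokens[token_id] = token.encode("utf-8")
        if token_data.get("special") or looks_special(token):
            toktypes[token_id] = SentencePieceTokenTypes.CONTROL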
@@ -1916,7 +1916,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -1961,7 +1961,7 @@ def set_vocab(self):
         for token_id, foken_data in added_tokens_decoder.items():
             token_id = int(token_id)
             token = foken_data["content"].encode("utf-8")
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token
             tokens[token_id] = token
             scores[token_id] = -1000.0
@@ -1977,7 +1977,7 @@ def set_vocab(self):
         for foken_data in added_tokens:
             token_id = int(foken_data["id"])
             token = foken_data["content"].encode("utf-8")
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token
             tokens[token_id] = token
             scores[token_id] = -1000.0
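Both loops above now treat UNUSED, not UNKNOWN, as the "slot is still free" marker, applied to the two places added tokens can come from (tokenizer_config.json's added_tokens_decoder and tokenizer.json's added_tokens list). The practical difference, as a toy check reusing the enum from the first sketch: a genuine unknown token typed UNKNOWN by the SentencePiece model is no longer mistaken for a free padding slot, so a colliding added token must now match its text.

    # toy vocab: id 0 is a real UNKNOWN token, id 3 is an untouched [PAD] slot
    toktypes = [SentencePieceTokenTypes.UNKNOWN, SentencePieceTokenTypes.NORMAL,
                SentencePieceTokenTypes.NORMAL, SentencePieceTokenTypes.UNUSED]

    def slot_is_free_old(tid):
        return toktypes[tid] == SentencePieceTokenTypes.UNKNOWN

    def slot_is_free_new(tid):
        return toktypes[tid] == SentencePieceTokenTypes.UNUSED

    assert slot_is_free_old(0) and not slot_is_free_old(3)  # old marker: backwards
    assert not slot_is_free_new(0) and slot_is_free_new(3)  # new marker: correct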
@@ -2766,7 +2766,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3021,7 +3021,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3239,15 +3239,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)
 
-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
 
             tokens.append(text)
             scores.append(score)
             toktypes.append(toktype)
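The ChatGLM3 hunk folds the stray len(piece) == 0 branch into the out-of-range classification, so ids past the SentencePiece model's vocab are now typed CONTROL (named special tokens), UNUSED (empty padding slots, which also get their [PAD{id}] filler text here), or USER_DEFINED (everything else), instead of the old USER_DEFINED/UNKNOWN split. A toy restatement of the new branch, reusing the enum from the first sketch (sp_vocab_size and special_tokens are hypothetical stand-ins for the real tokenizer state):

    def classify(token_id, piece, sp_vocab_size, special_tokens):
        text = piece.encode("utf-8")
        toktype = SentencePieceTokenTypes.NORMAL
        if token_id >= sp_vocab_size:
            if piece in special_tokens:
                toktype = SentencePieceTokenTypes.CONTROL
            elif len(piece) == 0:
                # padding slot past the sp model: filler text, inert type
                text = f"[PAD{token_id}]".encode("utf-8")
                toktype = SentencePieceTokenTypes.UNUSED
            else:
                toktype = SentencePieceTokenTypes.USER_DEFINED
        return text, toktype

    print(classify(100, "<sop>", 100, {"<sop>"}))  # (b'<sop>', CONTROL)
    print(classify(101, "", 100, {"<sop>"}))       # (b'[PAD101]', UNUSED)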