@@ -373,6 +373,29 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
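Note: the heuristic above treats ChatML-style `<|...|>` markers, deepseek-coder's fullwidth `<｜...｜>` markers, gemma's `<unused*>` slots, and a short list of known mislabeled tokens as control tokens. A minimal standalone restatement of the same check, with made-up sample tokens, in case you want to try it outside the converter:

def looks_special(token: str | bytes) -> bool:
    # Standalone re-statement of does_token_look_special() above; the sample
    # tokens in the demo loop are invented, not taken from a real vocab.
    text = token.decode("utf-8") if isinstance(token, (bytes, bytearray)) else token
    if text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]"):
        return True  # added tokens some models fail to mark as special
    if text.startswith("<|") and text.endswith("|>"):
        return True  # ChatML-style markers
    if text.startswith("<｜") and text.endswith("｜>"):
        return True  # deepseek-coder's fullwidth markers
    return text.startswith("<unused") and text.endswith(">")  # gemma unused slots

for tok in ("<|im_start|>", "<unused42>", "hello", "<mask>"):
    print(f"{tok!r}: {looks_special(tok)}")  # all True except 'hello'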
@@ -391,16 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
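For reference, b"\xe2\x96\x81" is the UTF-8 encoding of U+2581 ("▁"), the SentencePiece word-boundary marker that user-defined added tokens carry literally; the replace() call above turns it back into a plain space before the token list is written. A tiny illustration with an invented token string:

# "\xe2\x96\x81" decodes to "▁" (U+2581), SentencePiece's word-boundary mark.
# The token below is made up purely to show the pre-normalization step.
token = "▁My▁token"
print(repr(token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")))  # ' My token'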
@@ -559,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -609,7 +634,7 @@ def _create_vocab_sentencepiece(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -644,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        assert tokens[token_id] == token.encode("utf-8")
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
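The added_tokens_decoder section parsed above is the map Hugging Face writes into tokenizer_config.json, keyed by token id as a string, with at least "content" and "special" fields. A made-up example of the shape the new loop expects (real files carry model-specific ids and extra fields such as "normalized"):

import json

example = json.loads('''
{
  "added_tokens_decoder": {
    "32000": {"content": "<pad>", "special": false},
    "32001": {"content": "<|user|>", "special": true}
  }
}
''')
for token_id, token_data in example.get("added_tokens_decoder", {}).items():
    # Same fields the converter reads; ids arrive as strings and need int().
    print(int(token_id), repr(token_data["content"]), token_data.get("special"))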
@@ -1267,7 +1311,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
@@ -1579,7 +1623,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1873,7 +1916,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
@@ -1918,7 +1961,7 @@ def set_vocab(self):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -1934,7 +1977,7 @@ def set_vocab(self):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2146,7 +2189,7 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.BYTE
             # take care of unused raw token
             if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED
 
             tokens.append(text)
             scores.append(score)
@@ -2176,7 +2219,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2195,7 +2238,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2435,19 +2478,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()
 
         self.gguf_writer.add_add_space_prefix(False)
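The removed block is essentially the body of the shared `_set_vocab_sentencepiece` helper plus a manual CONTROL override for the first 108 ids; with the new classification in `_create_vocab_sentencepiece` that override should no longer be needed, so Gemma-2 can call the helper directly. For orientation, a sketch of what such a helper wraps, reconstructed from the removed lines rather than quoted from the base class:

    def _set_vocab_sentencepiece(self):
        # Sketch only, mirroring the writer calls removed above.
        tokens, scores, toktypes = self._create_vocab_sentencepiece()

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)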
@@ -2771,7 +2802,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
@@ -3026,7 +3057,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3244,15 +3275,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)
 
-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
                 tokens.append(text)
                 scores.append(score)
                 toktypes.append(toktype)
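The reordered branch gives ids past sp_model.vocab_size() a more precise type: pieces listed in special_tokens become CONTROL, empty pieces become UNUSED [PAD...] entries, and everything else becomes USER_DEFINED instead of UNKNOWN. A small standalone restatement of that decision order (the enum values and sample pieces below are illustrative only):

from enum import IntEnum

class TokType(IntEnum):  # stand-in for SentencePieceTokenTypes; values illustrative
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5

def classify_extra_piece(piece: str, token_id: int, special_tokens: set[str]) -> tuple[bytes, TokType]:
    # Mirrors the new branch order for out-of-range sentencepiece ids.
    if piece in special_tokens:
        return piece.encode("utf-8"), TokType.CONTROL
    if len(piece) == 0:
        return f"[PAD{token_id}]".encode("utf-8"), TokType.UNUSED
    return piece.encode("utf-8"), TokType.USER_DEFINED

specials = {"<|user|>", "<|assistant|>"}
for piece, tid in (("<|user|>", 64795), ("", 64800), ("extra_piece", 64801)):
    print(classify_extra_piece(piece, tid, specials))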
@@ -3341,7 +3371,7 @@ def set_vocab(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special: