@@ -373,17 +373,28 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
-    def does_token_look_special(self, token: str) -> bool:
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
         # Some models mark some added tokens which ought to be control tokens as not special.
         # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
-        is_known_special = token in (
+        seems_special = token_text in (
             "<pad>",  # deepseek-coder
             "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
         )
-        # TODO: should these be marked as UNUSED instead?
-        is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">"))  # gemma{,-2}
 
-        return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
 
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
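A minimal standalone sketch (not part of the diff) of what the reworked `does_token_look_special` heuristic treats as control-like; the token strings below are illustrative examples only:

```python
# Hypothetical mirror of the heuristic above, for illustration only.
def looks_special(token: str) -> bool:
    seems_special = token in ("<pad>", "<mask>", "<2mass>", "[@BOS@]")
    seems_special = seems_special or (token.startswith("<|") and token.endswith("|>"))
    seems_special = seems_special or (token.startswith("<｜") and token.endswith("｜>"))  # fullwidth bars, deepseek-coder style
    seems_special = seems_special or (token.startswith("<unused") and token.endswith(">"))
    return seems_special

assert looks_special("<|im_start|>")   # ChatML-style marker
assert looks_special("<unused42>")     # gemma-style placeholder
assert not looks_special("hello")      # ordinary vocab entry stays non-special
```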
@@ -403,17 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
                 token: str = reverse_vocab[i]
-                tokens.append(token)
-                if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
-                    toktypes.append(gguf.TokenType.CONTROL)
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
 
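For context, `b"\xe2\x96\x81".decode("utf-8")` is the SentencePiece word-boundary marker "▁" (U+2581), which the new branch rewrites to a plain space in user-defined added tokens. A small sketch of the effect (the token value is made up):

```python
# "\xe2\x96\x81" is the UTF-8 encoding of U+2581 ("▁"), SentencePiece's space marker.
marker = b"\xe2\x96\x81".decode("utf-8")
token = "▁hello▁world"                 # hypothetical user-defined added token
normalized = token.replace(marker, " ")
print(repr(normalized))                # ' hello world'
```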
@@ -572,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -657,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+            added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+            for token_id, token_data in added_tokens_decoder.items():
+                token_id = int(token_id)
+                token: str = token_data["content"]
+                if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    assert tokens[token_id] == token.encode("utf-8")
+                if token_data.get("special") or self.does_token_look_special(token):
+                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                else:
+                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                scores[token_id] = -1000.0
+                tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
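The new block reads `added_tokens_decoder` out of `tokenizer_config.json`. For reference, Hugging Face tokenizers typically store it as a mapping from string token IDs to entries with a `content` string and a `special` flag; a rough sketch with made-up IDs and tokens:

```python
# Approximate shape of "added_tokens_decoder" in a Hugging Face tokenizer_config.json
# (token IDs and contents here are invented for illustration):
tokenizer_config_json = {
    "added_tokens_decoder": {
        "0":     {"content": "<pad>", "special": True},
        "32000": {"content": "<|user|>", "special": False},
    },
}

for token_id, token_data in tokenizer_config_json.get("added_tokens_decoder", {}).items():
    token_id = int(token_id)   # keys are stored as strings, hence the int() in the diff
    print(token_id, token_data["content"], bool(token_data.get("special")))
```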
@@ -1280,7 +1311,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
@@ -1592,7 +1623,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -2412,19 +2442,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()
 
         self.gguf_writer.add_add_space_prefix(False)
 
@@ -3318,7 +3336,7 @@ def set_vocab(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special: