@@ -373,6 +373,18 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def does_token_look_special(self, token: str) -> bool:
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        is_known_special = token in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+        # TODO: should these be marked as UNUSED instead?
+        is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">"))  # gemma{,-2}
+
+        return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -393,8 +405,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
                 tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
+                token: str = reverse_vocab[i]
+                tokens.append(token)
+                if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
                     toktypes.append(gguf.TokenType.USER_DEFINED)
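For context, a minimal standalone sketch of the heuristic this diff adds, so the effect on token classification is easy to see. The function body mirrors the method above (pulled out of the Model class for illustration); the sample tokens below are illustrative assumptions, not drawn from any particular model's vocab.

def does_token_look_special(token: str) -> bool:
    # Known control-like tokens that some models leave unmarked as special.
    is_known_special = token in (
        "<pad>",                         # deepseek-coder
        "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
    )
    # Gemma's reserved "<unused...>" slots.
    is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">"))
    # ChatML-style "<|...|>" delimiters, plus deepseek's full-width "<｜...｜>" variant.
    return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))

for tok in ("<|im_start|>", "<unused42>", "<｜fim▁begin｜>", "hello", "<pad>"):
    print(f"{tok!r}: {does_token_look_special(tok)}")
# '<|im_start|>': True
# '<unused42>': True
# '<｜fim▁begin｜>': True
# 'hello': False
# '<pad>': True

In get_vocab_base, any added-vocab token for which this returns True is written to the GGUF as TokenType.CONTROL even when the tokenizer config marks it as non-special; everything else in the added vocab stays USER_DEFINED.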