
Commit fb46a15

Author: jaime-m-p
Merge remote-tracking branch 'upstream/compilade/fix-mpt-pretok' into tokenizer-fixes

2 parents 9b8e05b + 1caa20f

File tree: 3 files changed, +87 −58 lines

convert_hf_to_gguf.py

Lines changed: 71 additions & 41 deletions
@@ -373,6 +373,29 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
+        # Some models mark some added tokens which ought to be control tokens as not special.
+        # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
+        seems_special = token_text in (
+            "<pad>",  # deepseek-coder
+            "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
+        )
+
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
+
+        return seems_special
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -391,16 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)

         return tokens, toktypes, tokpre

@@ -559,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -609,7 +634,7 @@ def _create_vocab_sentencepiece(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -644,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token: str = token_data["content"]
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
+                        assert tokens[token_id] == token.encode("utf-8")
+                    if token_data.get("special") or self.does_token_look_special(token):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                    scores[token_id] = -1000.0
+                    tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -1267,7 +1311,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()

     def set_gguf_parameters(self):
@@ -1579,7 +1623,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])

         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)

         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -1873,7 +1916,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -1918,7 +1961,7 @@ def set_vocab(self):
                 for token_id, foken_data in added_tokens_decoder.items():
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -1934,7 +1977,7 @@ def set_vocab(self):
                 for foken_data in added_tokens:
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert tokens[token_id] == token
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2146,7 +2189,7 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.BYTE
             # take care of ununsed raw token
             if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED

             tokens.append(text)
             scores.append(score)
@@ -2176,7 +2219,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2195,7 +2238,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2435,19 +2478,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2

     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()

         self.gguf_writer.add_add_space_prefix(False)

@@ -2771,7 +2802,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -3026,7 +3057,7 @@ def set_vocab(self):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3244,15 +3275,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)

-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
                 tokens.append(text)
                 scores.append(score)
                 toktypes.append(toktype)
@@ -3341,7 +3371,7 @@ def set_vocab(self):
         for i in range(vocab_size):
            if i not in reverse_vocab:
                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                if tokenizer.added_tokens_decoder[i].special:
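
Note (not part of the commit): a minimal standalone Python sketch of the classification rule that does_token_look_special() and the UNUSED/CONTROL/USER_DEFINED changes above introduce. The helper names and the sample tokens are illustrative assumptions, not code from convert_hf_to_gguf.py.

# Standalone sketch, assuming every sample token below is an added token.
def looks_special(token_text: str) -> bool:
    # added tokens that some models leave unmarked but which behave like control tokens
    if token_text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]"):
        return True
    if token_text.startswith("<|") and token_text.endswith("|>"):
        return True
    if token_text.startswith("<unused") and token_text.endswith(">"):
        return True
    return False

def classify_added_token(token_text: str, marked_special: bool) -> tuple[str, str]:
    # CONTROL if the tokenizer marks it special OR it merely looks special;
    # otherwise USER_DEFINED, with U+2581 pre-normalized to a plain space
    if marked_special or looks_special(token_text):
        return "CONTROL", token_text
    return "USER_DEFINED", token_text.replace("\u2581", " ")

if __name__ == "__main__":
    for tok, special in (("<|im_start|>", False), ("<unused12>", False), ("\u2581hello", False)):
        print(tok, "->", classify_added_token(tok, special))

Run as written, this prints CONTROL for the first two tokens and USER_DEFINED with a leading plain space for the third.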

src/llama.cpp

Lines changed: 14 additions & 15 deletions
@@ -5419,6 +5419,7 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "command-r") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                vocab.tokenizer_clean_spaces = false;
             } else if (
                 tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -5652,7 +5653,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
                 vocab.cache_special_tokens.push_back(id);
             }
         }
@@ -15418,17 +15419,6 @@ struct llm_tokenizer_bpe {
                     "[0-9][0-9][0-9]",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_MPT:
-                // TODO: MPT pre-tokenization regexes are unknown
-                //       the following are close, but not exact. run the following:
-                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
-                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
-                regex_exprs = {
-                    "\\s?\\p{L}+",
-                    "\\s?\\p{P}+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                };
-                break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
@@ -15438,6 +15428,7 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
                 regex_exprs = {
@@ -15464,8 +15455,8 @@ struct llm_tokenizer_bpe {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_VIKING:
                 regex_exprs = {
-                    "\\p{N}",
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
+                    "\\p{N}",
                 };
                 break;
             default:
@@ -16185,12 +16176,20 @@ struct fragment_buffer_variant {

 // #define PRETOKENIZERDEBUG

-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
     // for each special token
     for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
         const auto & data = vocab.id_to_token[special_id];
         const auto & special_token = data.text;

+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
+
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
         while (it != buffer.end()) {
@@ -16303,7 +16302,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

     if (!raw_text.empty()) {
         fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
-        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+        tokenizer_st_partition(vocab, fragment_buffer, parse_special);
     }

     switch (vocab.type) {
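
For the tokenizer_st_partition() change above, a small standalone Python sketch (not the llama.cpp API, not from this commit) of the new parse_special rule: control and unknown special tokens are skipped when parse_special is false, while user-defined tokens are always partitioned first. The token list and attribute names are made-up stand-ins for LLAMA_TOKEN_ATTR_* and a real vocabulary.

# Standalone sketch of the skip rule added to tokenizer_st_partition().
CONTROL, USER_DEFINED, UNKNOWN = "control", "user_defined", "unknown"

CACHE_SPECIAL_TOKENS = [
    ("<|im_start|>", CONTROL),          # control token
    ("<unk>", UNKNOWN),                 # unknown token
    ("\u2581MadeUpTok", USER_DEFINED),  # hypothetical user-defined token
]

def special_tokens_to_partition(parse_special: bool) -> list[str]:
    kept = []
    for text, attr in CACHE_SPECIAL_TOKENS:
        # when parse_special == False, ignore control and unknown tokens;
        # user-defined tokens are still split out before pre-tokenization
        if not parse_special and attr in (CONTROL, UNKNOWN):
            continue
        kept.append(text)
    return kept

print(special_tokens_to_partition(parse_special=False))  # only the user-defined token
print(special_tokens_to_partition(parse_special=True))   # all three tokens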

tests/test-tokenizer-random.py

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@
 from typing_extensions import Buffer

 import cffi
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizer


 logger = logging.getLogger("test-tokenizer-random")
@@ -145,7 +145,7 @@ def decode(self, ids: list[int]) -> str:
 class TokenizerGroundtruth (Tokenizer):

     def __init__(self, dir_tokenizer: str):
-        self.model = AutoTokenizer.from_pretrained(dir_tokenizer, trust_remote_code=False)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer, trust_remote_code=False)
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3

0 commit comments