Commit f9d42c5
convert_hf : identify more added control tokens for SPM tokenizers

This makes Gemma and Gemma-2 tokenize pretty much EVERYTHING correctly, including HTML tags and consecutive spaces, but it unfortunately requires model re-conversion.

There seems to be a weird behavior of the HF tokenizer for Gemma, which prefers to use the 16-space token over more lengthy space tokens, while the SentencePiece tokenizer does not do this. (The implementation in llama.cpp has the same behavior as SentencePiece.)

* llama : fix wrong pre-tokenization of byte tokens
1 parent 6e351e0 commit f9d42c5
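A minimal way to observe the discrepancy described in the commit message, assuming a local Gemma checkpoint (the paths below are hypothetical) and `transformers` plus `sentencepiece` installed:

```python
from transformers import AutoTokenizer
import sentencepiece as spm

text = "<b>" + " " * 18 + "</b>"  # HTML tags plus a long run of spaces

hf = AutoTokenizer.from_pretrained("./gemma-2-9b-it")
print(hf.tokenize(text))  # HF reportedly segments the run with the 16-space piece

sp = spm.SentencePieceProcessor(model_file="./gemma-2-9b-it/tokenizer.model")
print(sp.encode(text, out_type=str))  # SentencePiece (and llama.cpp) segments it differently
```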

File tree: 3 files changed (+55, -37 lines)

convert_hf_to_gguf.py (49 additions, 31 deletions)
@@ -373,17 +373,28 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
-    def does_token_look_special(self, token: str) -> bool:
+    def does_token_look_special(self, token: str | bytes) -> bool:
+        if isinstance(token, (bytes, bytearray)):
+            token_text = token.decode(encoding="utf-8")
+        elif isinstance(token, memoryview):
+            token_text = token.tobytes().decode(encoding="utf-8")
+        else:
+            token_text = token
+
         # Some models mark some added tokens which ought to be control tokens as not special.
         # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
-        is_known_special = token in (
+        seems_special = token_text in (
             "<pad>", # deepseek-coder
             "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
         )
-        # TODO: should these be marked as UNUSED instead?
-        is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">")) # gemma{,-2}
 
-        return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
+        seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
+        seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
+
+        # TODO: should these be marked as UNUSED instead? (maybe not)
+        seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2}
+
+        return seems_special
 
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
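For reference, the heuristic above can be restated as a standalone function (a condensed sketch of the diff's logic, not code from the file):

```python
def does_token_look_special(token: str | bytes) -> bool:
    # Decode byte tokens so the string checks below apply uniformly.
    token_text = token.decode("utf-8") if isinstance(token, (bytes, bytearray)) else token
    seems_special = token_text in ("<pad>", "<mask>", "<2mass>", "[@BOS@]")
    seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
    seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder
    seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}
    return seems_special

assert does_token_look_special("<|im_start|>")   # ChatML-style control token
assert does_token_look_special("<unused42>")     # Gemma filler token
assert not does_token_look_special("Hello")      # ordinary vocab entry
```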
@@ -403,17 +414,18 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
                 token: str = reverse_vocab[i]
-                tokens.append(token)
-                if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
-                    toktypes.append(gguf.TokenType.CONTROL)
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
                 else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
 
         return tokens, toktypes, tokpre
 
@@ -572,7 +584,7 @@ def _set_vocab_qwen(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.CONTROL)
@@ -657,6 +669,25 @@ def _create_vocab_sentencepiece(self):
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+            added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+            for token_id, token_data in added_tokens_decoder.items():
+                token_id = int(token_id)
+                token: str = token_data["content"]
+                if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    assert tokens[token_id] == token.encode("utf-8")
+                if token_data.get("special") or self.does_token_look_special(token):
+                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                else:
+                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+                scores[token_id] = -1000.0
+                tokens[token_id] = token.encode("utf-8")
+
         if vocab_size > len(tokens):
             pad_count = vocab_size - len(tokens)
             logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
@@ -1280,7 +1311,7 @@ def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
             self._set_vocab_gpt2()
         else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab
             self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
@@ -1592,7 +1623,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
 
         self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
-        self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
         self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
@@ -2412,19 +2442,7 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_sentencepiece()
 
         self.gguf_writer.add_add_space_prefix(False)
 
@@ -3318,7 +3336,7 @@ def set_vocab(self):
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
+                toktypes.append(gguf.TokenType.UNUSED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special:

src/llama.cpp (1 addition, 1 deletion)
@@ -5640,7 +5640,7 @@ static void llm_load_vocab(
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                 vocab.cache_special_tokens.push_back(id);
            }
        }
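The practical effect: previously every token whose attributes lacked NORMAL (so UNUSED and BYTE tokens too) landed in the special-token cache; now only CONTROL and USER_DEFINED tokens do. A Python sketch of the new filter (the flag values here are arbitrary stand-ins, not the ones in llama.h):

```python
from enum import IntFlag, auto

class TokenAttr(IntFlag):  # illustrative stand-in for llama_token_attr
    UNKNOWN      = auto()
    UNUSED       = auto()
    NORMAL       = auto()
    CONTROL      = auto()
    USER_DEFINED = auto()
    BYTE         = auto()

def cache_special_tokens(attrs: list[TokenAttr]) -> list[int]:
    # New behavior: only CONTROL and USER_DEFINED tokens are cached as special.
    return [i for i, attr in enumerate(attrs)
            if attr & (TokenAttr.CONTROL | TokenAttr.USER_DEFINED)]

attrs = [TokenAttr.CONTROL, TokenAttr.NORMAL, TokenAttr.BYTE, TokenAttr.USER_DEFINED]
print(cache_special_tokens(attrs))  # [0, 3]; the BYTE token no longer qualifies
```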

tests/test-tokenizer-random.py (5 additions, 5 deletions)
@@ -20,7 +20,7 @@
 from typing_extensions import Buffer
 
 import cffi
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizer
 
 
 logger = logging.getLogger("test-tokenizer-random")
@@ -129,7 +129,7 @@ def decode(self, ids: list[int]) -> str:
 class TokenizerGroundtruth (Tokenizer):
 
     def __init__(self, dir_tokenizer: str):
-        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
         # guess BOS and EOS
         ids = self.encode("a")
         assert 1 <= len(ids) <= 3
@@ -143,7 +143,7 @@ def __init__(self, dir_tokenizer: str):
         self.vocab = list(sorted(self.vocab))
         # tokens and lists
         self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens = list(self.model.added_tokens_encoder)
+        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
         self.bos_token = self.model.bos_token
         self.eos_token = self.model.eos_token
 
@@ -458,8 +458,8 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
             i = find_first_mismatch(ids1, ids2)
             ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
             ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error(" Expected: " + str(ids1))
-            logger.error("   Result: " + str(ids2))
+            logger.error(" Expected: " + str(ids1) + f" {[tokenizer1.decode([id]) for id in ids1]}")
+            logger.error("   Result: " + str(ids2) + f" {[tokenizer2.decode([id]) for id in ids2]}")
             encode_errors += 1
             logger.error(f" {encode_errors=}")
         if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
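On the `added_tokens` change above: decoding the token IDs (rather than listing the raw `added_tokens_encoder` keys) yields the post-normalization text of each added token, which is what the random tests should feed back into both tokenizers. A sketch, assuming a local tokenizer directory (the path is hypothetical):

```python
from transformers import AutoTokenizer

model = AutoTokenizer.from_pretrained("./models/tokenizers/gemma-2")  # hypothetical path
ids = list(model.added_tokens_encoder.values())
raw = list(model.added_tokens_encoder)                    # keys as stored
dec = model.batch_decode(ids, skip_special_tokens=False)  # what decoding actually yields
for r, d in zip(raw, dec):
    if r != d:
        print(f"{r!r} decodes to {d!r}")
```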
