Skip to content

Commit 6e351e0

Browse files
committed
convert_hf : identify which user-defined tokens are control tokens
Only used in _set_vocab_gpt2() for now.
1 parent 56df1fc commit 6e351e0

File tree

2 files changed

+15
-10
lines changed

2 files changed

+15
-10
lines changed

convert_hf_to_gguf.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,18 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
373373
except KeyError:
374374
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
375375

376+
def does_token_look_special(self, token: str) -> bool:
377+
# Some models mark some added tokens which ought to be control tokens as not special.
378+
# (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
379+
is_known_special = token in (
380+
"<pad>", # deepseek-coder
381+
"<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
382+
)
383+
# TODO: should these be marked as UNUSED instead?
384+
is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">")) # gemma{,-2}
385+
386+
return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
387+
376388
# used for GPT-2 BPE and WordPiece vocabs
377389
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
378390
tokens: list[str] = []
@@ -393,8 +405,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
393405
tokens.append(f"[PAD{i}]")
394406
toktypes.append(gguf.TokenType.USER_DEFINED)
395407
elif reverse_vocab[i] in added_vocab:
396-
tokens.append(reverse_vocab[i])
397-
if tokenizer.added_tokens_decoder[i].special:
408+
token: str = reverse_vocab[i]
409+
tokens.append(token)
410+
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
398411
toktypes.append(gguf.TokenType.CONTROL)
399412
else:
400413
toktypes.append(gguf.TokenType.USER_DEFINED)

src/llama.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5512,14 +5512,6 @@ static void llm_load_vocab(
55125512
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
55135513
}
55145514
}
5515-
5516-
if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && !token_data.text.empty() &&
5517-
token_data.text.front() == '<' && token_data.text.back() == '>') {
5518-
// Some models mark some added tokens which ought to be control tokens as not special.
5519-
// (e.g. command-r, command-r-plus, deepseek-coder)
5520-
// TODO: should this be fixed in the convert script instead?
5521-
token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;
5522-
}
55235515
}
55245516
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
55255517

0 commit comments

Comments
 (0)