Skip to content

Commit 98edea6

Browse files
committed
llama : add UNKNOWN tokens in the special tokens cache
1 parent d4df785 commit 98edea6

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

src/llama.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5641,7 +5641,7 @@ static void llm_load_vocab(
5641 5641
// build special tokens cache
5642 5642
{
5643 5643
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
5644-
if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
5644+
if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
5645 5645
vocab.cache_special_tokens.push_back(id);
5646 5646
}
5647 5647
}
@@ -16168,8 +16168,8 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
16168 16168
const auto & data = vocab.id_to_token[special_id];
16169 16169
const auto & special_token = data.text;
16170 16170

16171-
if (!parse_special && (data.attr & LLAMA_TOKEN_ATTR_CONTROL)) {
16172-
// Only ignore control tokens when parse_special == false
16171+
if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
16172+
// Ignore control and unknown tokens when parse_special == false
16173 16173
continue;
16174 16174
// User-defined tokens are still pre-tokenized before everything else
16175 16175
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726

0 commit comments

Comments
 (0)