Commit 44c8648

jaime-m-p committed:
Fix detokenizer():
UNKNOWN and CONTROL are 'special pieces'. Remove space after UNKNOWN and CONTROL. Refactor llama_token_to_piece().
1 parent 38d54b3 commit 44c8648
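
The core of the refactor: instead of calling per-type predicates (llama_is_control_token, llama_is_unknown_token, ...), both functions now fetch the token's attribute bitmask once via llama_token_get_attr() and test it against a combined "special pieces" mask of UNKNOWN | CONTROL. A minimal self-contained sketch of that classification follows; the enum bit values are assumed for illustration only (the real definitions live in llama.h), so the names are deliberately renamed ATTR_*.

#include <cstdint>
#include <cstdio>

// Illustrative bit layout only -- the actual enum values come from llama.h.
enum llama_token_attr_sketch : uint32_t {
    ATTR_UNKNOWN      = 1 << 0,
    ATTR_NORMAL       = 1 << 1,
    ATTR_CONTROL      = 1 << 2,
    ATTR_USER_DEFINED = 1 << 3,
    ATTR_BYTE         = 1 << 4,
};

// UNKNOWN and CONTROL together form the 'special pieces' mask from the
// commit message: suppressed unless the caller passes special == true,
// and they keep the leading-space removal active in the detokenizer loop.
static const uint32_t attr_special = ATTR_UNKNOWN | ATTR_CONTROL;

int main() {
    const uint32_t samples[] = {
        ATTR_CONTROL,      // e.g. <s>, </s> -> special piece
        ATTR_UNKNOWN,      // e.g. <unk>     -> special piece
        ATTR_NORMAL,       // ordinary text  -> rendered normally
        ATTR_USER_DEFINED, // added token    -> rendered verbatim
    };
    for (uint32_t attr : samples) {
        std::printf("attr=0x%02x special=%s\n", attr, (attr & attr_special) ? "yes" : "no");
    }
    return 0;
}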

File tree: 1 file changed (+17 −20 lines)

llama.cpp

Lines changed: 17 additions & 20 deletions
@@ -18430,7 +18430,9 @@ static std::string llama_decode_text(const std::string & text) {
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
     // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
-    if (!special && llama_is_control_token(model->vocab, token)) {
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+    const llama_token_attr attr = llama_token_get_attr(model, token);
+    if (!special && (attr & attr_special)) {
         return 0;
     }

@@ -18459,38 +18461,31 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     }

     if (0 <= token && token < llama_n_vocab(model)) {
+        const std::string & token_text = model->vocab.id_to_token[token].text;
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:
             case LLAMA_VOCAB_TYPE_SPM: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
-                if (llama_is_normal_token(model->vocab, token)) {
-                    std::string result = model->vocab.id_to_token[token].text;
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = token_text;
                     llama_unescape_whitespace(result);
                     return _try_copy(result.data(), result.size());
-                } else if (
-                        (llama_is_user_defined_token(model->vocab, token)) ||
-                        (llama_is_control_token     (model->vocab, token) && special)) {
-                    const std::string & result = model->vocab.id_to_token[token].text;
-                    return _try_copy(result.data(), result.size());
-        /**/    } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-        /**/        return _try_copy("\xe2\x96\x85", 3);
-                } else if (llama_is_byte_token(model->vocab, token)) {
+                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                     char byte = (char) llama_token_to_byte(model->vocab, token);
-                    return _try_copy((char*)&byte, 1);
+                    return _try_copy((char*) &byte, 1);
                 }
                 break;
             }
             case LLAMA_VOCAB_TYPE_BPE: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
-                if (llama_is_normal_token(model->vocab, token)) {
-                    std::string result = llama_decode_text(model->vocab.id_to_token[token].text);
-                    return _try_copy(result.data(), result.size());
-                } else if (
-                        (llama_is_user_defined_token(model->vocab, token)) ||
-                        (llama_is_control_token     (model->vocab, token) && special)) {
-                    const std::string & result = model->vocab.id_to_token[token].text;
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
                 break;
@@ -18510,14 +18505,16 @@ int32_t llama_detokenize(
         int32_t text_len_max,
         bool special) {
     // remove the leading space of the first non-control token
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
     bool remove_space = model->vocab.tokenizer_add_space_prefix;
     int32_t avail = text_len_max;
     int32_t total = 0;

     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
         int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, special);
-        remove_space = remove_space && llama_is_control_token(model->vocab, tokens[i]); // until non-control token
+        const llama_token_attr attr = llama_token_get_attr(model, tokens[i]);
+        remove_space = remove_space && (attr & attr_special); // until non-control token
         if (n_chars < 0) {
             avail = 0;
             total -= n_chars;
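
For reference, a hypothetical caller-side helper (not part of this commit) built on the llama_token_to_piece() signature shown in the first hunk. It relies on the negative-return convention visible in llama_detokenize() above: n_chars < 0 means the buffer was too small and -n_chars is the required size.

#include <string>

// Hypothetical convenience wrapper; with special == false, UNKNOWN and
// CONTROL tokens now come back as empty pieces (return value 0).
static std::string piece_of(const struct llama_model * model, llama_token token, bool special) {
    std::string buf(8, '\0');
    int32_t n = llama_token_to_piece(model, token, &buf[0], (int32_t) buf.size(), /*lstrip=*/0, special);
    if (n < 0) {
        buf.resize((size_t) -n); // grow to the size the function reported
        n = llama_token_to_piece(model, token, &buf[0], (int32_t) buf.size(), 0, special);
    }
    buf.resize((size_t) n);
    return buf;
}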

0 commit comments