Skip to content

Commit a8dabfc

Browse files
committed
Tokenizer fixes ggml-org#8379
by jaime-m-p
1 parent a6bd5c2 commit a8dabfc

File tree

1 file changed

+100
-8
lines changed

1 file changed

+100
-8
lines changed

src/llama.cpp

Lines changed: 100 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -21102,14 +21102,106 @@ int32_t llama_tokenize(
2110221102
return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
2110321103
}
2110421104

21105-
int32_t llama_token_to_piece(
21106-
const struct llama_model * model,
21107-
llama_token token,
21108-
char * buf,
21109-
int32_t length,
21110-
int32_t lstrip,
21111-
bool special) {
21112-
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
21105+
// errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
21106+
static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
21107+
std::string decoded_text;
21108+
21109+
const auto cpts = unicode_cpts_from_utf8(text);
21110+
for (const auto cpt : cpts) {
21111+
const auto utf8 = unicode_cpt_to_utf8(cpt);
21112+
try {
21113+
decoded_text += unicode_utf8_to_byte(utf8);
21114+
} catch (const std::out_of_range & /*e*/) {
21115+
switch (errors) {
21116+
case 'c':
21117+
decoded_text += utf8; // copy original
21118+
break;
21119+
case 'r':
21120+
decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
21121+
break;
21122+
case 'v':
21123+
decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
21124+
break;
21125+
case 'i':
21126+
default:
21127+
// ignore
21128+
break;
21129+
}
21130+
}
21131+
}
21132+
21133+
return decoded_text;
21134+
}
21135+
21136+
// does not write null-terminator to buf
21137+
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
21138+
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
21139+
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
21140+
const llama_token_attr attr = llama_token_get_attr(model, token);
21141+
if (!special && (attr & attr_special)) {
21142+
return 0;
21143+
}
21144+
21145+
// copy piece chars to output text buffer
21146+
// skip up to 'lstrip' leading spaces before copying
21147+
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
21148+
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
21149+
token++;
21150+
size--;
21151+
}
21152+
if (length < (int32_t)size) {
21153+
return (int32_t) -size;
21154+
}
21155+
memcpy(buf, token, size);
21156+
return (int32_t) size;
21157+
};
21158+
21159+
// if we have a cache - use it
21160+
{
21161+
const auto & cache = model->vocab.cache_token_to_piece;
21162+
21163+
if (!cache.empty()) {
21164+
const auto & result = cache.at(token);
21165+
return _try_copy(result.data(), result.size());
21166+
}
21167+
}
21168+
21169+
if (0 <= token && token < llama_n_vocab(model)) {
21170+
const std::string & token_text = model->vocab.id_to_token[token].text;
21171+
switch (llama_vocab_get_type(model->vocab)) {
21172+
case LLAMA_VOCAB_TYPE_WPM:
21173+
case LLAMA_VOCAB_TYPE_SPM:
21174+
case LLAMA_VOCAB_TYPE_UGM: {
21175+
// NOTE: we accept all unsupported token types,
21176+
// suppressing them like CONTROL tokens.
21177+
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
21178+
return _try_copy(token_text.data(), token_text.size());
21179+
} else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
21180+
std::string result = token_text;
21181+
llama_unescape_whitespace(result);
21182+
return _try_copy(result.data(), result.size());
21183+
} else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
21184+
char byte = (char) llama_token_to_byte(model->vocab, token);
21185+
return _try_copy((char*) &byte, 1);
21186+
}
21187+
break;
21188+
}
21189+
case LLAMA_VOCAB_TYPE_BPE: {
21190+
// NOTE: we accept all unsupported token types,
21191+
// suppressing them like CONTROL tokens.
21192+
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
21193+
return _try_copy(token_text.data(), token_text.size());
21194+
} else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
21195+
std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
21196+
return _try_copy(result.data(), result.size());
21197+
}
21198+
break;
21199+
}
21200+
default:
21201+
GGML_ASSERT(false);
21202+
}
21203+
}
21204+
return 0;
2111321205
}
2111421206

2111521207
int32_t llama_detokenize(

0 commit comments

Comments
 (0)