Skip to content

Commit 48229e6

Browse files
committed
Tokenizer fixes ggml-org#8379
by jaime-m-p
1 parent 42d7092 commit 48229e6

File tree

1 file changed

+100
-8
lines changed

1 file changed

+100
-8
lines changed

src/llama.cpp

Lines changed: 100 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21099,14 +21099,106 @@ int32_t llama_tokenize(
2109921099
return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
2110021100
}
2110121101

21102-
int32_t llama_token_to_piece(
21103-
const struct llama_model * model,
21104-
llama_token token,
21105-
char * buf,
21106-
int32_t length,
21107-
int32_t lstrip,
21108-
bool special) {
21109-
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
21102+
// errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
21103+
static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
21104+
std::string decoded_text;
21105+
21106+
const auto cpts = unicode_cpts_from_utf8(text);
21107+
for (const auto cpt : cpts) {
21108+
const auto utf8 = unicode_cpt_to_utf8(cpt);
21109+
try {
21110+
decoded_text += unicode_utf8_to_byte(utf8);
21111+
} catch (const std::out_of_range & /*e*/) {
21112+
switch (errors) {
21113+
case 'c':
21114+
decoded_text += utf8; // copy original
21115+
break;
21116+
case 'r':
21117+
decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
21118+
break;
21119+
case 'v':
21120+
decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
21121+
break;
21122+
case 'i':
21123+
default:
21124+
// ignore
21125+
break;
21126+
}
21127+
}
21128+
}
21129+
21130+
return decoded_text;
21131+
}
21132+
21133+
// Render `token` as text into `buf` (capacity `length`); skips up to `lstrip`
// leading spaces. Returns the number of bytes written, 0 when a special
// (UNKNOWN/CONTROL) token is suppressed or the token id is out of range, or a
// negative value (-needed_size) when `buf` is too small.
// does not write null-terminator to buf
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
    const llama_token_attr attr = llama_token_get_attr(model, token);
    // special tokens are only rendered when the caller opts in via `special`
    if (!special && (attr & attr_special)) {
        return 0;
    }

    // copy piece chars to output text buffer
    // skip up to 'lstrip' leading spaces before copying
    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
        }
        // buffer too small: report the required size as a negative count
        if (length < (int32_t)size) {
            return (int32_t) -size;
        }
        memcpy(buf, token, size);
        return (int32_t) size;
    };

    // if we have a cache - use it
    {
        const auto & cache = model->vocab.cache_token_to_piece;

        if (!cache.empty()) {
            // NOTE(review): cache.at() throws std::out_of_range for an invalid
            // token id - presumably callers validate ids first; confirm.
            const auto & result = cache.at(token);
            return _try_copy(result.data(), result.size());
        }
    }

    // no cache: decode directly from the vocabulary entry
    if (0 <= token && token < llama_n_vocab(model)) {
        const std::string & token_text = model->vocab.id_to_token[token].text;
        switch (llama_vocab_get_type(model->vocab)) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    // special/user-defined tokens are copied verbatim
                    return _try_copy(token_text.data(), token_text.size());
                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // SPM-family vocabs escape spaces (e.g. U+2581); undo that
                    std::string result = token_text;
                    llama_unescape_whitespace(result);
                    return _try_copy(result.data(), result.size());
                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                    // raw byte token (e.g. "<0x0A>") -> single byte
                    char byte = (char) llama_token_to_byte(model->vocab, token);
                    return _try_copy((char*) &byte, 1);
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_BPE: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // byte-level BPE: map stored codepoints back to raw bytes
                    std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
                    return _try_copy(result.data(), result.size());
                }
                break;
            }
            default:
                GGML_ASSERT(false);
        }
    }
    return 0;
}
2111121203

2111221204
int32_t llama_detokenize(

0 commit comments

Comments
 (0)