@@ -21099,14 +21099,106 @@ int32_t llama_tokenize(
     return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
 }
 
-int32_t llama_token_to_piece(
-    const struct llama_model * model,
-                  llama_token   token,
-                         char * buf,
-                      int32_t   length,
-                      int32_t   lstrip,
-                         bool   special) {
-    return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
+// errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
+static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
+    std::string decoded_text;
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & /*e*/) {
+            switch (errors) {
+                case 'c':
+                    decoded_text += utf8; // copy original
+                    break;
+                case 'r':
+                    decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
+                    break;
+                case 'v':
+                    decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
+                    break;
+                case 'i':
+                default:
+                    // ignore
+                    break;
+            }
+        }
+    }
+
+    return decoded_text;
+}
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+    const llama_token_attr attr = llama_token_get_attr(model, token);
+    if (!special && (attr & attr_special)) {
+        return 0;
+    }
+
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return (int32_t) -size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
+    // if we have a cache - use it
+    {
+        const auto & cache = model->vocab.cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
+        }
+    }
+
+    if (0 <= token && token < llama_n_vocab(model)) {
+        const std::string & token_text = model->vocab.id_to_token[token].text;
+        switch (llama_vocab_get_type(model->vocab)) {
+            case LLAMA_VOCAB_TYPE_WPM:
+            case LLAMA_VOCAB_TYPE_SPM:
+            case LLAMA_VOCAB_TYPE_UGM: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = token_text;
+                    llama_unescape_whitespace(result);
+                    return _try_copy(result.data(), result.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                    char byte = (char) llama_token_to_byte(model->vocab, token);
+                    return _try_copy((char*) &byte, 1);
+                }
+                break;
+            }
+            case LLAMA_VOCAB_TYPE_BPE: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
+                    return _try_copy(result.data(), result.size());
+                }
+                break;
+            }
+            default:
+                GGML_ASSERT(false);
+        }
+    }
+    return 0;
 }
 
 
 int32_t llama_detokenize(
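
Note for callers: the _try_copy convention above means that when the destination buffer is too small, llama_token_to_piece returns the negative of the size the piece requires instead of writing anything, and on success it never appends a null terminator. A minimal caller-side sketch of the resulting resize-and-retry pattern (the wrapper name piece_to_string is hypothetical, not part of this patch):

#include <string>
#include <vector>
#include "llama.h"

// Hypothetical helper: convert one token to its text piece, growing the
// buffer when llama_token_to_piece reports (as a negative value) that it
// needs more space. The result string is built from an explicit length,
// since the function does not null-terminate.
static std::string piece_to_string(const struct llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8); // deliberately small; most pieces still fit
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), 0, special);
    if (n < 0) {
        buf.resize((size_t) -n); // -n is exactly the size the piece needs
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), 0, special);
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}

A return of 0 is also possible when special is false and the token is a CONTROL or UNKNOWN token, which the function deliberately suppresses.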
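
The errors parameter of llama_decode_text selects what happens when a code point in a BPE token's text has no entry in the byte map, i.e. when unicode_utf8_to_byte throws std::out_of_range. A self-contained toy analogue of the four policies (toy_utf8_to_byte and toy_decode are illustrative stand-ins, not llama.cpp API):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// Toy stand-in for unicode_utf8_to_byte(): only a few mappings, throws otherwise.
static char toy_utf8_to_byte(const std::string & utf8) {
    static const std::map<std::string, char> mapping = { {"A", 'A'}, {"B", 'B'} };
    return mapping.at(utf8); // std::map::at throws std::out_of_range on a miss
}

static std::string toy_decode(const std::string & piece, char errors) {
    std::string out;
    for (char c : piece) {
        const std::string utf8(1, c);
        try {
            out += toy_utf8_to_byte(utf8);
        } catch (const std::out_of_range &) {
            switch (errors) {
                case 'c': out += utf8;           break; // copy original through
                case 'r': out += "\xEF\xBF\xBD"; break; // U+FFFD REPLACEMENT CHARACTER
                case 'v': out += "[UNK]";        break; // verbose marker
                default:                         break; // 'i': ignore (drop it)
            }
        }
    }
    return out;
}

int main() {
    printf("%s\n", toy_decode("AxB", 'c').c_str()); // "AxB"     ('x' copied through)
    printf("%s\n", toy_decode("AxB", 'i').c_str()); // "AB"      ('x' dropped)
    printf("%s\n", toy_decode("AxB", 'v').c_str()); // "A[UNK]B" ('x' marked)
}

In the patch itself, llama_token_to_piece calls llama_decode_text(token_text, 'c') so unmapped code points pass through unchanged rather than being dropped; the //TODO notes that the policy should eventually come from a tokenizer variable instead of being hard-coded.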