@@ -21102,14 +21102,106 @@ int32_t llama_tokenize(
    return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
}

- int32_t llama_token_to_piece(
-     const struct llama_model * model,
-     llama_token token,
-     char * buf,
-     int32_t length,
-     int32_t lstrip,
-     bool special) {
-     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
+ // errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
+ static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
+     std::string decoded_text;
+
+     const auto cpts = unicode_cpts_from_utf8(text);
+     for (const auto cpt : cpts) {
+         const auto utf8 = unicode_cpt_to_utf8(cpt);
+         try {
+             decoded_text += unicode_utf8_to_byte(utf8);
+         } catch (const std::out_of_range & /*e*/) {
+             switch (errors) {
+                 case 'c':
+                     decoded_text += utf8; // copy original
+                     break;
+                 case 'r':
+                     decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
+                     break;
+                 case 'v':
+                     decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
+                     break;
+                 case 'i':
+                 default:
+                     // ignore
+                     break;
+             }
+         }
+     }
+
+     return decoded_text;
+ }
+
+ // does not write null-terminator to buf
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+     // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+     const llama_token_attr attr = llama_token_get_attr(model, token);
+     if (!special && (attr & attr_special)) {
+         return 0;
+     }
+
+     // copy piece chars to output text buffer
+     // skip up to 'lstrip' leading spaces before copying
+     auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+         for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+             token++;
+             size--;
+         }
+         if (length < (int32_t) size) {
+             return (int32_t) -size;
+         }
+         memcpy(buf, token, size);
+         return (int32_t) size;
+     };
+
+     // if we have a cache - use it
+     {
+         const auto & cache = model->vocab.cache_token_to_piece;
+
+         if (!cache.empty()) {
+             const auto & result = cache.at(token);
+             return _try_copy(result.data(), result.size());
+         }
+     }
+
+     if (0 <= token && token < llama_n_vocab(model)) {
+         const std::string & token_text = model->vocab.id_to_token[token].text;
+         switch (llama_vocab_get_type(model->vocab)) {
+             case LLAMA_VOCAB_TYPE_WPM:
+             case LLAMA_VOCAB_TYPE_SPM:
+             case LLAMA_VOCAB_TYPE_UGM: {
+                 // NOTE: we accept all unsupported token types,
+                 // suppressing them like CONTROL tokens.
+                 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                     return _try_copy(token_text.data(), token_text.size());
+                 } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                     std::string result = token_text;
+                     llama_unescape_whitespace(result);
+                     return _try_copy(result.data(), result.size());
+                 } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                     char byte = (char) llama_token_to_byte(model->vocab, token);
+                     return _try_copy((char *) &byte, 1);
+                 }
+                 break;
+             }
+             case LLAMA_VOCAB_TYPE_BPE: {
+                 // NOTE: we accept all unsupported token types,
+                 // suppressing them like CONTROL tokens.
+                 if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                     return _try_copy(token_text.data(), token_text.size());
+                 } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                     std::string result = llama_decode_text(token_text, 'c'); // copy on error // TODO: use a tokenizer variable
+                     return _try_copy(result.data(), result.size());
+                 }
+                 break;
+             }
+             default:
+                 GGML_ASSERT(false);
+         }
+     }
+     return 0;
}

int32_t llama_detokenize(
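
A note on the new 'errors' parameter of llama_decode_text: it mirrors Python's bytes.decode(errors=...) convention for UTF-8 sequences that have no entry in the byte-decoder map. The standalone sketch below is illustrative only: stub_utf8_to_byte is a hypothetical stand-in for the internal unicode_utf8_to_byte (seeded here with the GPT-2 byte-level BPE convention that byte 0x20 is encoded as U+0120), and the verbose marker is simplified.

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// hypothetical stand-in for unicode_utf8_to_byte(); GPT-2 byte-level BPE
// encodes the space byte 0x20 as U+0120 ("\xC4\xA0" in UTF-8)
static char stub_utf8_to_byte(const std::string & utf8) {
    static const std::map<std::string, char> byte_map = { { "\xC4\xA0", ' ' } };
    return byte_map.at(utf8); // throws std::out_of_range when unmapped
}

// same dispatch as the patched llama_decode_text, one codepoint at a time
static std::string decode_one(const std::string & utf8, char errors) {
    try {
        return std::string(1, stub_utf8_to_byte(utf8));
    } catch (const std::out_of_range & /*e*/) {
        switch (errors) {
            case 'c': return utf8;            // copy the original UTF-8 bytes
            case 'r': return "\xEF\xBF\xBD";  // U+FFFD REPLACEMENT CHARACTER
            case 'v': return "[UNK_BYTE]";    // simplified; the real code embeds the codepoint
            case 'i':
            default : return "";              // ignore
        }
    }
}

int main() {
    std::printf("[%s]\n", decode_one("\xC4\xA0", 'v').c_str());     // mapped: prints "[ ]"
    std::printf("[%s]\n", decode_one("\xE2\x82\xAC", 'c').c_str()); // unmapped: prints "[€]"
    std::printf("[%s]\n", decode_one("\xE2\x82\xAC", 'i').c_str()); // unmapped: prints "[]"
    return 0;
}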
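
For callers, note the buffer contract established by this patch: the output is not null-terminated, suppressed special tokens yield 0, and when 'length' is too small the return value is the negated required size. A minimal retry wrapper, assuming a loaded model (the helper name and initial capacity are illustrative, not part of the API):

#include <algorithm>
#include <string>
#include "llama.h"

// grows the buffer once if the first call reports a too-small 'length'
static std::string piece_of(const struct llama_model * model, llama_token token, bool special) {
    std::string piece(8, '\0'); // deliberately small initial capacity
    int32_t n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), /*lstrip=*/0, special);
    if (n < 0) {
        piece.resize((size_t) -n); // |n| is the size the failed call needed
        n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), 0, special);
    }
    piece.resize((size_t) std::max(n, 0)); // n == 0 for suppressed special tokens
    return piece;
}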