@@ -1835,18 +1835,19 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 // NOTE: avoid ever using this except for building the token_to_piece caches
 static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache
+    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
     }
     else {
-        result.resize(n_tokens);
+        piece.resize(n_chars);
     }
 
-    return std::string(result.data(), result.size());
+    return piece;
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
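The internal helper above relies on the contract of the public llama_token_to_piece call: a negative return value is the negated size the piece needs, so the caller grows the buffer and retries. A minimal caller-side sketch of that pattern, assuming llama.h is included, a loaded `model` and a valid `token` (both placeholders), and the post-change signature that takes an `lstrip` argument:

std::string token_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece(8, '\0');  // start with a small guess
    int32_t n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), /*lstrip=*/0, special);
    if (n < 0) {
        piece.resize(-n);        // negative return = required size
        n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size(), /*lstrip=*/0, special);
    }
    piece.resize(n);             // trim to the actual piece length
    return piece;
}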
@@ -18418,23 +18419,33 @@ static std::string llama_decode_text(const std::string & text) {
 }
 
 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
     // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
     if (!special && llama_is_control_token(model->vocab, token)) {
         return 0;
     }
 
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return (int32_t) -size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
     // if we have a cache - use it
     {
         const auto & cache = model->vocab.cache_token_to_piece;
 
         if (!cache.empty()) {
-            const auto & res = cache.at(token);
-            if (length < (int) res.size()) {
-                return -(int) res.size();
-            }
-            memcpy(buf, res.c_str(), res.size());
-            return res.size();
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
         }
     }
 
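A hedged usage sketch of the new lstrip parameter: with lstrip = 1, _try_copy skips at most one leading space of the piece before writing it into buf (the `model` pointer and `token` id below are placeholders):

char buf[64];
const int32_t n = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf), /*lstrip=*/1, /*special=*/false);
if (n < 0) {
    // buf was too small: -n is the piece size after lstrip has been applied
}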
@@ -18447,55 +18458,31 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                return _try_copy(result.data(), result.size());
             } else if (
                 (llama_is_user_defined_token(model->vocab, token)) ||
                 (llama_is_control_token     (model->vocab, token) && special)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
-            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-                if (length < 3) {
-                    return -3;
-                }
-                memcpy(buf, "\xe2\x96\x85", 3);
-                return 3;
+                const std::string & result = model->vocab.id_to_token[token].text;
+                return _try_copy(result.data(), result.size());
+            /**/ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+            /**/     return _try_copy("\xe2\x96\x85", 3);
             } else if (llama_is_byte_token(model->vocab, token)) {
-                if (length < 1) {
-                    return -1;
-                }
-                buf[0] = llama_token_to_byte(model->vocab, token);
-                return 1;
+                char byte = (char) llama_token_to_byte(model->vocab, token);
+                return _try_copy((char*)&byte, 1);
             }
             break;
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             // NOTE: we accept all unsupported token types,
             // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                result = llama_decode_text(result);
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                std::string result = llama_decode_text(model->vocab.id_to_token[token].text);
+                return _try_copy(result.data(), result.size());
             } else if (
                 (llama_is_user_defined_token(model->vocab, token)) ||
                 (llama_is_control_token     (model->vocab, token) && special)) {
-                std::string result = model->vocab.id_to_token[token].text;
-                if (length < (int) result.length()) {
-                    return -(int) result.length();
-                }
-                memcpy(buf, result.c_str(), result.length());
-                return result.length();
+                const std::string & result = model->vocab.id_to_token[token].text;
+                return _try_copy(result.data(), result.size());
             }
             break;
         }
@@ -18513,12 +18500,15 @@ int32_t llama_detokenize(
              char * text,
             int32_t text_len_max,
                bool special) {
+    // remove the leading space of the first non-control token
+    bool remove_space = model->vocab.tokenizer_add_space_prefix;
     int32_t avail = text_len_max;
     int32_t total = 0;
 
     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, special);
+        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, special);
+        remove_space = remove_space && llama_is_control_token(model->vocab, tokens[i]); // until non-control token
         if (n_chars < 0) {
             avail = 0;
             total -= n_chars;
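A hedged end-to-end sketch of what remove_space achieves (assumes an SPM-style model with tokenizer_add_space_prefix enabled; `model`, `tokens`, and the buffer size are placeholders):

std::vector<llama_token> tokens;  // filled by llama_tokenize(model, "Hello world", ...)
char text[256];
const int32_t n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(), text, (int32_t) sizeof(text), /*special=*/false);
// With tokenizer_add_space_prefix the first piece is " Hello"; remove_space strips that
// leading space on the first non-control token, so the round-trip yields "Hello world".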