Skip to content

Commit 107923c

Browse files
author
jaime-m-p
committed
Better leading space removal
1 parent 9854a9c commit 107923c

File tree

1 file changed

+5 -5 lines changed

1 file changed

+5 -5 lines changed

llama.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -18505,12 +18505,12 @@ int32_t llama_detokenize(
1850518505
int32_t text_len_max,
1850618506
bool remove_special,
1850718507
bool unparse_special) {
18508-
// remove the leading space of the first non-control token
18509-
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
18510-
bool remove_space = !unparse_special && model->vocab.tokenizer_add_space_prefix;
1851118508
int32_t avail = text_len_max;
1851218509
int32_t total = 0;
1851318510

18511+
// remove the leading space
18512+
bool remove_space = model->vocab.tokenizer_add_space_prefix;
18513+
1851418514
if (remove_special && model->vocab.tokenizer_add_bos) {
1851518515
if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
1851618516
n_tokens--;
@@ -18527,15 +18527,15 @@ int32_t llama_detokenize(
1852718527
for (int32_t i = 0; i < n_tokens; ++i) {
1852818528
GGML_ASSERT(avail >= 0);
1852918529
int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
18530-
const llama_token_attr attr = llama_token_get_attr(model, tokens[i]);
18531-
remove_space = remove_space && (attr & attr_special); // until non-control token
1853218530
if (n_chars < 0) {
1853318531
avail = 0;
1853418532
total -= n_chars;
18533+
remove_space = false;
1853518534
} else if (n_chars > 0) {
1853618535
avail -= n_chars;
1853718536
text += n_chars;
1853818537
total += n_chars;
18538+
remove_space = false;
1853918539
}
1854018540
}
1854118541

0 commit comments

Comments (0)