Skip to content

Commit ada961c

Browse files
author
jaime-m-p
committed
Implement 'rstrip' properly
1 parent 33de247 commit ada961c

File tree

1 file changed

+14
-29
lines changed

1 file changed

+14
-29
lines changed

llama.cpp

Lines changed: 14 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -13377,16 +13377,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
1337713377

1337813378
// right
1337913379
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
13380-
const int64_t right_reminder_offset = match + special_token.length();
13381-
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13382-
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13380+
int64_t right_reminder_offset = match + special_token.length();
13381+
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13382+
13383+
if (data.attribs & LLAMA_TOKEN_ATTRIB_RSTRIP) {
13384+
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
13385+
right_reminder_offset++;
13386+
right_reminder_length--;
13387+
}
13388+
}
13389+
13390+
if (right_reminder_length > 0) {
13391+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13392+
it++;
13393+
}
1338313394

1338413395
#ifdef PRETOKENIZERDEBUG
1338513396
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
1338613397
#endif
1338713398

13388-
it++;
13389-
1339013399
if (source == 0) {
1339113400
buffer.erase_after(buffer.before_begin());
1339213401
} else {
@@ -13432,9 +13441,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1343213441
// tokenizer.encode('', add_special_tokens=True) returns [1]
1343313442
// tokenizer.encode('', add_special_tokens=False) returns []
1343413443

13435-
static const bool rtrim = true; //TODO: as param
1343613444
bool is_prev_special = false;
13437-
bool special_token_rtrim = false;
1343813445

1343913446
if (add_special && vocab.special_add_bos != 0) {
1344013447
GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13444,25 +13451,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1344413451

1344513452
for (const auto & fragment : fragment_buffer) {
1344613453
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13447-
// without adding this leading whitespace, we do not get the same results as the original tokenizer
13448-
13449-
// TODO: It's likely possible to get rid of this string copy entirely
13450-
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13451-
// and passing 'add space prefix' as bool argument
13452-
//
1345313454
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
1345413455

13455-
if (special_token_rtrim) {
13456-
size_t num_whitespaces = 0;
13457-
while (isspace(raw_text[num_whitespaces])) {
13458-
num_whitespaces++;
13459-
}
13460-
if (num_whitespaces == raw_text.size()) {
13461-
continue; // skip if all whitespaces
13462-
}
13463-
raw_text = raw_text.substr(num_whitespaces);
13464-
}
13465-
1346613456
if (vocab.add_space_prefix) {
1346713457
if (!output.size() || is_prev_special) { // prefix with space if first token
1346813458
raw_text = " " + raw_text;
@@ -13478,11 +13468,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
1347813468
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
1347913469
output.push_back(fragment.token);
1348013470
is_prev_special = true;
13481-
// phi-3 special tokens without rtrim, works fine for llama-spm too
13482-
special_token_rtrim = rtrim
13483-
&& fragment.token != vocab.special_bos_id
13484-
&& fragment.token != vocab.special_unk_id
13485-
&& fragment.token != vocab.special_eos_id;
1348613471
}
1348713472
}
1348813473

0 commit comments

Comments
 (0)