@@ -13377,16 +13377,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
13377
13377
13378
13378
// right: the remainder of the raw text that follows the matched special token
13379
13379
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
13380
- const int64_t right_reminder_offset = match + special_token.length();
13381
- const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13382
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13380
+ int64_t right_reminder_offset = match + special_token.length();
13381
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
13382
+
13383
+ if (data.attribs & LLAMA_TOKEN_ATTRIB_RSTRIP) {
13384
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
13385
+ right_reminder_offset++;
13386
+ right_reminder_length--;
13387
+ }
13388
+ }
13389
+
13390
+ if (right_reminder_length > 0) {
13391
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
13392
+ it++;
13393
+ }
13383
13394
13384
13395
#ifdef PRETOKENIZERDEBUG
13385
13396
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
13386
13397
#endif
13387
13398
13388
- it++;
13389
-
13390
13399
if (source == 0) {
13391
13400
buffer.erase_after(buffer.before_begin());
13392
13401
} else {
@@ -13432,9 +13441,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13432
13441
// tokenizer.encode('', add_special_tokens=True) returns [1]
13433
13442
// tokenizer.encode('', add_special_tokens=False) returns []
13434
13443
13435
- static const bool rtrim = true; //TODO: as param
13436
13444
bool is_prev_special = false;
13437
- bool special_token_rtrim = false;
13438
13445
13439
13446
if (add_special && vocab.special_add_bos != 0) {
13440
13447
GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13444,25 +13451,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13444
13451
13445
13452
for (const auto & fragment : fragment_buffer) {
13446
13453
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13447
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
13448
-
13449
- // TODO: It's likely possible to get rid of this string copy entirely
13450
- // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
13451
- // and passing 'add space prefix' as bool argument
13452
- //
13453
13454
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13454
13455
13455
- if (special_token_rtrim) {
13456
- size_t num_whitespaces = 0;
13457
- while (isspace(raw_text[num_whitespaces])) {
13458
- num_whitespaces++;
13459
- }
13460
- if (num_whitespaces == raw_text.size()) {
13461
- continue; // skip if all whitespaces
13462
- }
13463
- raw_text = raw_text.substr(num_whitespaces);
13464
- }
13465
-
13466
13456
if (vocab.add_space_prefix) {
13467
13457
if (!output.size() || is_prev_special) { // prefix with space if first token
13468
13458
raw_text = " " + raw_text;
@@ -13478,11 +13468,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13478
13468
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13479
13469
output.push_back(fragment.token);
13480
13470
is_prev_special = true;
13481
- // phi-3 special tokens without rtrim, works fine for llama-spm too
13482
- special_token_rtrim = rtrim
13483
- && fragment.token != vocab.special_bos_id
13484
- && fragment.token != vocab.special_unk_id
13485
- && fragment.token != vocab.special_eos_id;
13486
13471
}
13487
13472
}
13488
13473
0 commit comments