@@ -2421,7 +2421,7 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
 }
 
 std::vector<llama_token> llama_vocab::tokenize(
-        std::string raw_text,
+        const std::string & raw_text,
         bool add_special,
         bool parse_special) const {
     GGML_ASSERT(pimpl->tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
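
The only interface change is in this first hunk: tokenize() now takes its input by const std::string & instead of by value, so an lvalue argument is no longer copied at the call boundary. A minimal sketch of the idea, assuming a free-standing stub in place of the real member function:

    #include <cstdint>
    #include <string>
    #include <vector>

    using llama_token = int32_t;

    // Stub with the new signature; the real method lives on llama_vocab.
    std::vector<llama_token> tokenize(const std::string & raw_text, bool add_special, bool parse_special) {
        (void) add_special; (void) parse_special;
        return std::vector<llama_token>(raw_text.size(), 0); // dummy body, sketch only
    }

    void example(const std::string & prompt) {
        // the reference binds directly to the caller's string; no temporary copy is made
        auto toks = tokenize(prompt, /*add_special=*/true, /*parse_special=*/false);
        (void) toks;
    }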
@@ -2452,19 +2452,21 @@ std::vector<llama_token> llama_vocab::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text;
 
                         // prefix with space if previous is special
                         if (tokenizer_add_space_prefix && is_prev_special) {
-                            raw_text = " " + raw_text;
+                            text += ' ';
                         }
 
+                        text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        llama_escape_whitespace(raw_text);
+                        llama_escape_whitespace(text);
                         llm_tokenizer_spm_session session(*this);
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                         is_prev_special = false;
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
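
With the parameter now a const reference, the SPM path can no longer overwrite a local named raw_text; it builds a fresh text instead, appending the optional space prefix before the fragment bytes. This also replaces the " " + raw_text concatenation, which allocated a temporary string, with in-place appends. A rough standalone sketch of the pattern, with hypothetical arguments standing in for the fragment fields and vocab flags:

    #include <cstddef>
    #include <string>

    // Sketch only: offset, length and add_space_prefix are stand-ins for the
    // fragment fields and tokenizer flags used in the real loop.
    std::string build_fragment_text(const std::string & raw_text,
                                    std::size_t offset, std::size_t length,
                                    bool add_space_prefix) {
        std::string text;
        if (add_space_prefix) {
            text += ' ';                             // SPM space marker goes in first
        }
        text += raw_text.substr(offset, length);     // then the fragment itself
        return text;                                 // old code: raw_text = " " + raw_text
    }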
@@ -2494,12 +2496,12 @@ std::vector<llama_token> llama_vocab::tokenize(
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         session.append(fragment.token, output);
                     }
@@ -2521,12 +2523,12 @@ std::vector<llama_token> llama_vocab::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -2547,11 +2549,11 @@ std::vector<llama_token> llama_vocab::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -2574,13 +2576,13 @@ std::vector<llama_token> llama_vocab::tokenize(
                 llm_tokenizer_rwkv_session session(*this, *static_cast<const llm_tokenizer_rwkv *>(pimpl->tokenizer.get()));
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
 
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
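
The remaining hunks (BPE, WPM, UGM and RWKV paths) are the same mechanical rename: the loop-local that used to be declared as auto raw_text shadowed the function parameter of the same name and echoed the fragment.raw_text member it was sliced from; calling it text keeps the read-only parameter and the mutable working copy visually distinct. A tiny illustration of the shadowing the rename removes, using a hypothetical function rather than the patch itself:

    #include <string>

    void tokenize_sketch(const std::string & raw_text) {
        // Before: a nested `auto raw_text = ...` silently hid this parameter.
        // After: the working copy has its own name, so the parameter stays visible.
        std::string text = raw_text.substr(0, raw_text.find('\n'));
        (void) text;
    }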