Commit 4e8bf7c

vocab : minor tokenization optimizations
ggml-ci
1 parent aefcffa commit 4e8bf7c

2 files changed (+21, -19 lines)

src/llama-vocab.cpp

Lines changed: 20 additions & 18 deletions
@@ -2421,7 +2421,7 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
 }
 
 std::vector<llama_token> llama_vocab::tokenize(
-        std::string raw_text,
+        const std::string & raw_text,
         bool add_special,
         bool parse_special) const {
     GGML_ASSERT(pimpl->tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
@@ -2452,19 +2452,21 @@ std::vector<llama_token> llama_vocab::tokenize(
 
         for (const auto & fragment : fragment_buffer) {
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                std::string text;
 
                 // prefix with space if previous is special
                 if (tokenizer_add_space_prefix && is_prev_special) {
-                    raw_text = " " + raw_text;
+                    text += ' ';
                 }
 
+                text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
 #ifdef PRETOKENIZERDEBUG
-                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                llama_escape_whitespace(raw_text);
+                llama_escape_whitespace(text);
                 llm_tokenizer_spm_session session(*this);
-                session.tokenize(raw_text, output);
+                session.tokenize(text, output);
                 is_prev_special = false;
             } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                 output.push_back(fragment.token);
@@ -2494,12 +2496,12 @@ std::vector<llama_token> llama_vocab::tokenize(
         }
         for (const auto & fragment : fragment_buffer) {
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                session.tokenize(raw_text, output);
+                session.tokenize(text, output);
             } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                 session.append(fragment.token, output);
             }
@@ -2521,12 +2523,12 @@ std::vector<llama_token> llama_vocab::tokenize(
 
         for (const auto & fragment : fragment_buffer) {
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                session.tokenize(raw_text, output);
+                session.tokenize(text, output);
             } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                 output.push_back(fragment.token);
             }
@@ -2547,11 +2549,11 @@ std::vector<llama_token> llama_vocab::tokenize(
 
         for (const auto & fragment : fragment_buffer) {
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
-                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                session.tokenize(raw_text, output);
+                session.tokenize(text, output);
             } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                 output.push_back(fragment.token);
             }
@@ -2574,13 +2576,13 @@ std::vector<llama_token> llama_vocab::tokenize(
         llm_tokenizer_rwkv_session session(*this, *static_cast<const llm_tokenizer_rwkv *>(pimpl->tokenizer.get()));
         for (const auto & fragment : fragment_buffer) {
             if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
 
-                session.tokenize(raw_text, output);
+                session.tokenize(text, output);
             } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                 output.push_back(fragment.token);
             }
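
The hunks above are the whole optimization for this file: tokenize now takes its input by const reference instead of by value, and the raw-text fragment is built by appending into a local std::string rather than by concatenating a temporary with " " + raw_text. The substr call itself is kept; what goes away is the per-call parameter copy and the extra temporary. A minimal standalone sketch of the same two patterns; the helper names below are illustrative and not part of the llama.cpp API:

#include <string>

// Before: the argument is copied into the parameter on every call, and
// prepending a space builds an extra temporary string.
std::string build_fragment_copy(std::string raw_text, bool add_space_prefix) {
    if (add_space_prefix) {
        raw_text = " " + raw_text; // allocates a temporary, then assigns it back
    }
    return raw_text;
}

// After: take the input by const reference and append into one local string,
// so the space prefix costs no extra temporary.
std::string build_fragment_append(const std::string & raw_text, bool add_space_prefix) {
    std::string text;
    text.reserve(raw_text.size() + 1); // not in the patch, shown only for clarity
    if (add_space_prefix) {
        text += ' ';
    }
    text += raw_text;
    return text;
}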

src/llama-vocab.h

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ struct llama_vocab {
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
 
     std::vector<llama_token> tokenize(
-            std::string raw_text,
+            const std::string & raw_text,
             bool add_special,
             bool parse_special = false) const;
 
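
For reference, a call-site sketch under the new declaration; vocab stands for an already constructed llama_vocab, and how it is obtained is outside the scope of this commit:

#include <string>
#include <vector>

#include "llama-vocab.h" // internal header changed by this commit

// Sketch only: encode_prompt is an illustrative wrapper, not part of the patch.
std::vector<llama_token> encode_prompt(const llama_vocab & vocab, const std::string & prompt) {
    // The prompt binds directly to the const reference parameter;
    // no string copy is made at the call boundary.
    return vocab.tokenize(prompt, /*add_special=*/true, /*parse_special=*/false);
}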
