@@ -2293,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;
 
@@ -4940,6 +4942,7 @@ static void llm_load_vocab(
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
@@ -5250,6 +5253,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
 
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -13489,7 +13494,7 @@ struct llm_tokenizer_bpe {
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;
 
         // normalize and split by whitespace
@@ -13498,7 +13503,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already
 
         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13515,7 +13520,7 @@ struct llm_tokenizer_wpm {
             for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
-                for (int j = n ; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1) ; j > i; j--) {
                     auto it = token_map.find(word1.substr(i, j - i));
                     if (it != token_map.end()) {
                         output.push_back(it->second);
@@ -13538,7 +13543,8 @@ struct llm_tokenizer_wpm {
         }
     }
 
-    std::vector<std::string> preprocess(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
 
@@ -13833,14 +13839,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 output.push_back(vocab.special_cls_id);
             }
 
+            llm_tokenizer_wpm tokenizer(vocab);
+
             for (const auto & fragment : fragment_buffer) {
                 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                     auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
                     LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                    llm_tokenizer_wpm tokenizer(vocab);
                     tokenizer.tokenize(raw_text, output);
                 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                     output.push_back(fragment.token);
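
For context, here is a minimal, self-contained sketch of the bounded greedy longest-match idea this diff applies to the WPM tokenizer. The names (`greedy_longest_match`, the toy vocabulary) are hypothetical and this is not the llama.cpp implementation: the real tokenizer also prepends "\u2581" to each word and resolves unmatched words to the UNK token. The point is the loop bound: since no vocabulary entry is longer than `max_token_len` bytes, candidate substrings longer than that can never match and need not be looked up, cutting the per-word worst case from O(n^2) hash lookups to O(n * max_token_len).

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Greedy longest-match tokenization over a toy vocabulary. The inner
// loop tries candidate lengths from longest to shortest, but never
// longer than max_token_len, since the vocabulary contains no longer entry.
static std::vector<int> greedy_longest_match(
        const std::string & word,
        const std::unordered_map<std::string, int> & vocab,
        int max_token_len) {
    std::vector<int> out;
    const int n = (int) word.size();
    for (int i = 0; i < n; ++i) {
        bool match = false;
        for (int j = std::min(n, i + max_token_len); j > i; j--) {
            auto it = vocab.find(word.substr(i, j - i));
            if (it != vocab.end()) {
                out.push_back(it->second);
                i = j - 1; // resume right after the matched token
                match = true;
                break;
            }
        }
        if (!match) {
            return {}; // no token covers this position; a real tokenizer emits UNK
        }
    }
    return out;
}

int main() {
    const std::unordered_map<std::string, int> vocab = {
        {"un", 0}, {"break", 1}, {"able", 2},
    };

    // mirror of the bookkeeping the patch adds in llm_load_vocab
    int max_token_len = 0;
    for (const auto & kv : vocab) {
        max_token_len = std::max(max_token_len, (int) kv.first.size());
    }

    for (int id : greedy_longest_match("unbreakable", vocab, max_token_len)) {
        std::cout << id << ' '; // prints: 0 1 2
    }
    std::cout << '\n';
}
```

Before this change, a single very long word (e.g. a pathological run with no whitespace) made the inner loop scan from the full word length down at every position; with the bound, the scan window is fixed by the longest token actually present in the vocabulary. The patch uses `i + vocab.max_token_len + 1` as its upper bound, one more than strictly needed, which is harmless for correctness.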