
Commit f99e1e4

llama : lookup word in vocab before doing BPE merges (#7193)

* fix: llama-3 ignore_merges
* test: add test for llama-3 bpe ignore_merges
* fix: set ignore_merges only for llama-3
* fix: test-tokenizer-1-bpe --ignore-merges detection
* fix: copy to fix fallthrough
* fix: change ignore_merges to bool
* fix: add ignore merges tests to cmake
* llama : alternative merge ignore logic

Co-authored-by: Haoxiang Fei <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 5ae3426 commit f99e1e4
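The idea, as a minimal standalone sketch (hedged: tokenize_word, the token_to_id map, and the byte-per-token fallback are illustrative stand-ins, not the real llama.cpp API — the actual change below instead seeds the BPE symbol list with the whole word): if a pre-split word is already a single vocab entry, emit it directly and skip the merge loop.

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using token_id = int;

    // Tokenize one pre-split word. With ignore_merges, a word that is already
    // a vocab entry becomes a single token and BPE merging is skipped.
    static std::vector<token_id> tokenize_word(
            const std::string & word,
            const std::unordered_map<std::string, token_id> & token_to_id,
            bool ignore_merges) {
        if (ignore_merges) {
            const auto it = token_to_id.find(word);
            if (it != token_to_id.end()) {
                return { it->second };
            }
        }
        // Placeholder fallback: one token per byte, standing in for the real
        // character-split + merge-queue loop in llm_tokenizer_bpe::tokenize().
        std::vector<token_id> out;
        for (unsigned char c : word) {
            out.push_back((token_id) c);
        }
        return out;
    }

    int main() {
        const std::unordered_map<std::string, token_id> vocab = { { "Việt", 101798 } };
        for (token_id t : tokenize_word("Việt", vocab, /*ignore_merges=*/true)) {
            std::cout << t << '\n'; // prints the single id 101798
        }
    }

For out-of-vocab words, or when ignore_merges is off, the word still goes through the normal character-split-and-merge path, so behavior for other pre-tokenizers is unchanged.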

File tree: 5 files changed (+44, -5 lines)

llama.cpp

Lines changed: 13 additions & 1 deletion
@@ -12253,13 +12253,14 @@ struct llm_tokenizer_bpe {

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;

         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        ignore_merges = true;
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12268,6 +12269,12 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
@@ -12351,6 +12358,11 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
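Why the DBRX case was copied out rather than left sharing the LLAMA3 labels ("fix: copy to fix fallthrough" in the commit message): once ignore_merges = true; lives in the LLAMA3 case body, keeping case LLAMA_VOCAB_PRE_TYPE_DBRX: as a second label on that body would enable the flag for DBRX too, and restricting the flag by placing it between the labels would rely on fallthrough. A hedged toy example of that hazard (the enum and values are illustrative, not the real llama_vocab_pre_type):

    #include <cstdio>

    enum pre_type { PRE_LLAMA3, PRE_DBRX };

    static void setup(pre_type t) {
        bool ignore_merges = false;
        switch (t) {
            case PRE_LLAMA3:
                ignore_merges = true;
                // implicit fallthrough: LLAMA3 also runs the shared body below
            case PRE_DBRX:
                printf("regex setup, ignore_merges = %d\n", ignore_merges);
                break;
        }
    }

    int main() {
        setup(PRE_LLAMA3); // regex setup, ignore_merges = 1
        setup(PRE_DBRX);   // regex setup, ignore_merges = 0
    }

This layout happens to work, but it depends on label order and trips -Wimplicit-fallthrough; duplicating the (identical) regex into a self-contained DBRX case, as the hunk above does, avoids both problems.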

models/ggml-vocab-llama-bpe.gguf.inp

Lines changed: 2 additions & 0 deletions
@@ -104,3 +104,5 @@ __ggml_vocab_test__

 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
+Việt
+__ggml_vocab_test__

models/ggml-vocab-llama-bpe.gguf.out

Lines changed: 1 addition & 0 deletions
@@ -41,3 +41,4 @@
  8765 8765 1644
  8765 8765 8765
  198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
+ 101798
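The .inp/.out pair drives the tokenizer tests: each __ggml_vocab_test__-delimited input in the .inp file corresponds to one line of expected token ids in the .out file. The new case checks that the in-vocab word "Việt" tokenizes to the single id 101798, which the BPE merge path alone would otherwise split into multiple tokens.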

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)

 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)

tests/test-tokenizer-1-bpe.cpp

Lines changed: 27 additions & 3 deletions
@@ -13,15 +13,27 @@
 #include <vector>

 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }

     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }

     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;

@@ -65,7 +77,19 @@ int main(int argc, char **argv) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
