
Commit e1d4a93

fix: set ignore_merges only for llama-3
Parent: ca42f36

2 files changed: +19 −4 lines


llama.cpp

Lines changed: 3 additions & 1 deletion
@@ -12200,12 +12200,14 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        ignore_merges = true;
                     case LLAMA_VOCAB_PRE_TYPE_DBRX:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
@@ -12292,7 +12294,7 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 llm_symbol sym;
                 sym.text = word.c_str();
                 sym.n = word.size();
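The effect of the change: LLAMA_VOCAB_PRE_TYPE_LLAMA3 now sets ignore_merges = true and falls through to the DBRX case, so both pre-tokenizers keep sharing the same split regex, but only llama-3 skips BPE merges for words that already exist verbatim in the vocabulary. Restricting the short-circuit to llama-3 suggests other BPE vocabularies are not safe to bypass this way. A minimal standalone sketch of that fast path (simplified types; not the actual llm_tokenizer_bpe code, which works on llm_symbol spans):

#include <string>
#include <unordered_map>
#include <vector>

using token_id = int;

// Sketch: tokenize one pre-split word, with the ignore_merges fast path.
// token_to_id stands in for the vocab lookup table used in llama.cpp.
static std::vector<token_id> tokenize_word(
        const std::string & word,
        const std::unordered_map<std::string, token_id> & token_to_id,
        bool ignore_merges) {
    if (ignore_merges) {
        // llama-3 only: a word that is already a vocab entry becomes a
        // single token, and the merge loop below is skipped entirely.
        const auto it = token_to_id.find(word);
        if (it != token_to_id.end()) {
            return { it->second };
        }
    }
    std::vector<token_id> out;
    // ... regular BPE: split into byte-level symbols and repeatedly apply
    // the highest-priority merge (elided in this sketch) ...
    return out;
}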

tests/test-tokenizer-1-bpe.cpp

Lines changed: 16 additions & 3 deletions
@@ -13,15 +13,27 @@
 #include <vector>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;

@@ -66,7 +78,7 @@ int main(int argc, char **argv) {
         try {
             auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-            if (tokens.size() > 1) {
+            if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                     "%s : error: token %d detokenizes to '%s'(%zu) but "
                     "tokenization of this to multiple tokens: [",

@@ -76,6 +88,7 @@ int main(int argc, char **argv) {
                     fprintf(stderr, ", %d", tokens[i]);
                 }
                 fprintf(stderr, "]\n");
+                return 2;
             }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
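After these changes the strict round-trip check, which detokenizes a single token and requires that re-tokenizing the result yields exactly one token, runs only when the new flag is passed, so vocabularies that legitimately re-tokenize into multiple tokens keep passing by default; a failure now also exits with status 2 instead of only logging. A llama-3 style run would look something like this (the binary and vocab paths are illustrative):

./test-tokenizer-1-bpe ./models/ggml-vocab-llama-bpe.gguf --ignore-merges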
