Skip to content

Commit 3d6008e

Browse files
committed
test: add test for llama-3 bpe ignore_merges
1 parent fc937a4 commit 3d6008e

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

tests/test-tokenizer-1-bpe.cpp

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,18 @@ int main(int argc, char **argv) {
6565
std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
6666
try {
6767
auto cps = unicode_cpts_from_utf8(str);
68-
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
68+
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
69+
if (tokens.size() > 1) {
70+
fprintf(stderr,
71+
"%s : error: token %d detokenizes to '%s'(%zu) but "
72+
"tokenization of this to multiple tokens: [",
73+
__func__, i, str.c_str(), str.length());
74+
fprintf(stderr, "%d", tokens[0]);
75+
for (size_t i = 1; i < tokens.size(); i++) {
76+
fprintf(stderr, ", %d", tokens[i]);
77+
}
78+
fprintf(stderr, "]\n");
79+
}
6980
std::string check = llama_detokenize_bpe(ctx, tokens);
7081
if (check != str) {
7182
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",

0 commit comments

Comments (0)