Skip to content

Commit c68d259

Browse files
committed
tests : add more vocabs and tests
ggml-ci
1 parent 43708d2 commit c68d259

21 files changed

+954
-113
lines changed

.gitignore

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,8 @@ nppBackup
118118
/tests/test-quantize-fns
119119
/tests/test-quantize-perf
120120
/tests/test-sampling
121-
/tests/test-tokenizer-0-llama
122-
/tests/test-tokenizer-0-falcon
123-
/tests/test-tokenizer-0-deepseek-coder
124-
/tests/test-tokenizer-1-llama
121+
/tests/test-tokenizer-0
122+
/tests/test-tokenizer-1-spm
125123
/tests/test-tokenizer-1-bpe
126124
/tests/test-rope
127125
/tests/test-backend-ops

Makefile

Lines changed: 9 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,9 @@ TEST_TARGETS = \
2020
tests/test-quantize-perf \
2121
tests/test-rope \
2222
tests/test-sampling \
23-
tests/test-tokenizer-0-deepseek-coder \
24-
tests/test-tokenizer-0-deepseek-llm \
25-
tests/test-tokenizer-0-falcon \
26-
tests/test-tokenizer-0-llama \
27-
tests/test-tokenizer-0-llama-v3 \
23+
tests/test-tokenizer-0 \
2824
tests/test-tokenizer-1-bpe \
29-
tests/test-tokenizer-1-llama
25+
tests/test-tokenizer-1-spm
3026

3127
# Code coverage output files
3228
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -65,17 +61,14 @@ default: $(BUILD_TARGETS)
6561
test: $(TEST_TARGETS)
6662
@failures=0; \
6763
for test_target in $(TEST_TARGETS); do \
68-
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
69-
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
70-
elif [ "$$test_target" = "tests/test-tokenizer-0-llama-v3" ]; then \
71-
./$$test_target $(CURDIR)/models/ggml-vocab-llama-v3.gguf; \
72-
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
64+
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
65+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
66+
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
7367
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
74-
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
7568
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
76-
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-llm" ]; then \
7769
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
78-
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
70+
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
71+
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
7972
continue; \
8073
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
8174
continue; \
@@ -993,29 +986,15 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
993986
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
994987
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
995988

996-
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
989+
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
997990
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
998991
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
999992

1000-
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1001-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1002-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1003-
1004-
tests/test-tokenizer-0-llama-v3: tests/test-tokenizer-0-llama-v3.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1005-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1006-
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1007-
1008-
tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
1009-
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
1010-
1011-
tests/test-tokenizer-0-deepseek-llm: tests/test-tokenizer-0-deepseek-llm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
1012-
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
1013-
1014993
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1015994
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1016995
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1017996

1018-
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
997+
tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
1019998
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
1020999
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
10211000

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def download_file_with_auth(url, token, save_path):
221221
"3333333",
222222
"33333333",
223223
"333333333",
224+
chktxt,
224225
]
225226

226227
# write the tests to ./models/ggml-vocab-{name}.gguf.inp

models/ggml-vocab-bert-bge.gguf

613 KB
Binary file not shown.

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
__ggml_vocab_test__
3+
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
13+
__ggml_vocab_test__
14+
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
22+
__ggml_vocab_test__
23+
24+
25+
__ggml_vocab_test__
26+
Hello world
27+
__ggml_vocab_test__
28+
Hello world
29+
__ggml_vocab_test__
30+
Hello World
31+
__ggml_vocab_test__
32+
Hello World
33+
__ggml_vocab_test__
34+
Hello World!
35+
__ggml_vocab_test__
36+
Hello, world!
37+
__ggml_vocab_test__
38+
Hello, world!
39+
__ggml_vocab_test__
40+
this is 🦙.cpp
41+
__ggml_vocab_test__
42+
w048 7tuijk dsdfhu
43+
__ggml_vocab_test__
44+
нещо на Български
45+
__ggml_vocab_test__
46+
កាន់តែពិសេសអាចខលចេញ
47+
__ggml_vocab_test__
48+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
49+
__ggml_vocab_test__
50+
Hello
51+
__ggml_vocab_test__
52+
Hello
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
Hello
62+
__ggml_vocab_test__
63+
(
64+
__ggml_vocab_test__
65+
66+
=
67+
__ggml_vocab_test__
68+
' era
69+
__ggml_vocab_test__
70+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
71+
__ggml_vocab_test__
72+
3
73+
__ggml_vocab_test__
74+
33
75+
__ggml_vocab_test__
76+
333
77+
__ggml_vocab_test__
78+
3333
79+
__ggml_vocab_test__
80+
33333
81+
__ggml_vocab_test__
82+
333333
83+
__ggml_vocab_test__
84+
3333333
85+
__ggml_vocab_test__
86+
33333333
87+
__ggml_vocab_test__
88+
333333333
89+
__ggml_vocab_test__
90+
91+
92+
93+
94+
95+
96+
97+
98+
99+
100+
101+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
102+
__ggml_vocab_test__

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
3+
4+
5+
6+
7+
8+
9+
10+
7592 2088
11+
7592 2088
12+
7592 2088
13+
7592 2088
14+
7592 2088 999
15+
7592 1010 2088 999
16+
7592 1010 2088 999
17+
2023 2003 100 1012 18133 2361
18+
1059 2692 18139 1021 8525 28418 2243 16233 20952 6979
19+
1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325
20+
100
21+
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007
22+
7592
23+
7592
24+
7592
25+
7592
26+
7592
27+
7592 7592
28+
1006
29+
1027
30+
1005 3690
31+
7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
32+
1017
33+
3943
34+
21211
35+
21211 2509
36+
21211 22394
37+
21211 22394 2509
38+
21211 22394 22394
39+
21211 22394 22394 2509
40+
21211 22394 22394 22394
41+
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
__ggml_vocab_test__
3+
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
13+
__ggml_vocab_test__
14+
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
22+
__ggml_vocab_test__
23+
24+
25+
__ggml_vocab_test__
26+
Hello world
27+
__ggml_vocab_test__
28+
Hello world
29+
__ggml_vocab_test__
30+
Hello World
31+
__ggml_vocab_test__
32+
Hello World
33+
__ggml_vocab_test__
34+
Hello World!
35+
__ggml_vocab_test__
36+
Hello, world!
37+
__ggml_vocab_test__
38+
Hello, world!
39+
__ggml_vocab_test__
40+
this is 🦙.cpp
41+
__ggml_vocab_test__
42+
w048 7tuijk dsdfhu
43+
__ggml_vocab_test__
44+
нещо на Български
45+
__ggml_vocab_test__
46+
កាន់តែពិសេសអាចខលចេញ
47+
__ggml_vocab_test__
48+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
49+
__ggml_vocab_test__
50+
Hello
51+
__ggml_vocab_test__
52+
Hello
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
Hello
62+
__ggml_vocab_test__
63+
(
64+
__ggml_vocab_test__
65+
66+
=
67+
__ggml_vocab_test__
68+
' era
69+
__ggml_vocab_test__
70+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
71+
__ggml_vocab_test__
72+
3
73+
__ggml_vocab_test__
74+
33
75+
__ggml_vocab_test__
76+
333
77+
__ggml_vocab_test__
78+
3333
79+
__ggml_vocab_test__
80+
33333
81+
__ggml_vocab_test__
82+
333333
83+
__ggml_vocab_test__
84+
3333333
85+
__ggml_vocab_test__
86+
33333333
87+
__ggml_vocab_test__
88+
333333333
89+
__ggml_vocab_test__
90+
91+
92+
93+
94+
95+
96+
97+
98+
99+
100+
101+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
102+
__ggml_vocab_test__
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
207
3+
243
4+
315
5+
184
6+
185
7+
185 185
8+
185 185 185
9+
184 185
10+
17535 1835
11+
414 9489 1835
12+
17535 5414
13+
414 9489 5414
14+
414 9489 5414 0
15+
17535 11 1835 0
16+
414 9489 11 1835 0
17+
437 317 12394 99 234 13 14789
18+
86 15 19 23 207 22 83 3963 27659 26078 3934 14072
19+
1593 6478 616 2251 14994
20+
155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218
21+
10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8
22+
17535
23+
414 9489
24+
207 414 9489
25+
243 414 9489
26+
315 414 9489
27+
315 414 9489 185 315 414 9489
28+
334
29+
185 405
30+
6 2895
31+
17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
32+
18
33+
18 18
34+
18 18 18
35+
18 18 18 18
36+
18 18 18 18 18
37+
18 18 18 18 18 18
38+
18 18 18 18 18 18 18
39+
18 18 18 18 18 18 18 18
40+
18 18 18 18 18 18 18 18 18
41+
185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43

0 commit comments

Comments
 (0)