Skip to content

Commit ef4cca9

Browse files
committed
cmake : refactor test targets
1 parent 7b1210f commit ef4cca9

File tree

3 files changed

+161
-40
lines changed

3 files changed

+161
-40
lines changed

convert-hf-to-gguf-update.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def download_file_with_auth(url, token, save_path):
6868
else:
6969
print(f"Failed to download file. Status code: {response.status_code}")
7070

71+
# download the tokenizer models
7172
for model in models:
7273
name = model["name"]
7374
repo = model["repo"]
@@ -173,3 +174,84 @@ def download_file_with_auth(url, token, save_path):
173174
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
174175
print("\n")
175176

177+
# generate tests for each tokenizer model
178+
179+
tests = [
180+
"",
181+
" ",
182+
" ",
183+
" ",
184+
"\t",
185+
"\n",
186+
"\n\n",
187+
"\n\n\n",
188+
"\t\n",
189+
"Hello world",
190+
" Hello world",
191+
"Hello World",
192+
" Hello World",
193+
" Hello World!",
194+
"Hello, world!",
195+
" Hello, world!",
196+
" this is 🦙.cpp",
197+
"w048 7tuijk dsdfhu",
198+
"нещо на Български",
199+
"កាន់តែពិសេសអាចខលចេញ",
200+
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
201+
"Hello",
202+
" Hello",
203+
" Hello",
204+
" Hello",
205+
" Hello",
206+
" Hello\n Hello",
207+
" (",
208+
"\n =",
209+
"' era",
210+
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
211+
"3",
212+
"33",
213+
"333",
214+
"3333",
215+
"33333",
216+
"333333",
217+
"3333333",
218+
"33333333",
219+
"333333333",
220+
]
221+
222+
# Write every test string to ./models/test-vocab-inp.txt.
# Each test is followed by a separator line, so the file looks like:
#
#   test0
#   __ggml_vocab_test__
#   test1
#   __ggml_vocab_test__
#   ...
#
# The file is opened explicitly as UTF-8: the test strings contain non-ASCII
# text (Cyrillic, Khmer, CJK, emoji), which the platform default encoding is
# not guaranteed to be able to represent.

with open("models/test-vocab-inp.txt", "w", encoding="utf-8") as f:
    for text in tests:
        # the test text itself, then the separator on its own line
        f.write(f"{text}\n__ggml_vocab_test__\n")

print("Tests written in ./models/test-vocab-inp.txt")
238+
239+
# For each model, encode every test string with that model's tokenizer and
# write the resulting token ids to ./models/test-vocab-out-{name}.txt,
# one line per test. Every id is preceded by a single space, so a non-empty
# line starts with a space and a test that encodes to no tokens produces an
# empty line.

# imported once, ahead of the loop, instead of on every iteration
from transformers import AutoTokenizer

for model in models:
    name = model["name"]

    # load the tokenizer stored under models/tokenizers/{name}
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/test-vocab-out-{name}.txt", "w", encoding="utf-8") as f:
        for text in tests:
            token_ids = tokenizer.encode(text)
            f.write("".join(f" {tok}" for tok in token_ids))
            f.write("\n")

    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")

tests/CMakeLists.txt

Lines changed: 79 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,40 @@
1+
# Registers a CTest test that runs an executable target defined elsewhere.
# Positional args:
# - target: name of the executable target to run
# Optional args:
# - NAME: name of the test (defaults to the target name)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY: directory the test is run from (defaults to .)
function(llama_test target)
    include(CMakeParseArguments)
    # no flag options; NAME/LABEL/WORKING_DIRECTORY take one value; ARGS takes a list
    cmake_parse_arguments(LLAMA_TEST "" "NAME;LABEL;WORKING_DIRECTORY" "ARGS" ${ARGN})

    # the test is named after the target unless NAME overrides it
    if (DEFINED LLAMA_TEST_NAME)
        set(test_name ${LLAMA_TEST_NAME})
    else()
        set(test_name ${target})
    endif()

    # defaults for the remaining optional keywords
    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()
    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()

    # $<TARGET_FILE:...> resolves to the built executable of the target
    add_test(
        NAME ${test_name}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND $<TARGET_FILE:${target}> ${LLAMA_TEST_ARGS})

    set_property(TEST ${test_name} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
30+
131
# Builds and runs a test source file.
232
# Optional args:
333
# - NAME: name of the executable & test target (defaults to the source file name without extension)
434
# - LABEL: label for the test (defaults to main)
535
# - ARGS: arguments to pass to the test executable
636
# - WORKING_DIRECTORY
7-
function(llama_test source)
37+
function(llama_target_and_test source)
838
include(CMakeParseArguments)
939
set(options)
1040
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -35,45 +65,54 @@ function(llama_test source)
3565
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
3666
endfunction()
3767

38-
# llama_test(test-double-float.cpp) # SLOW
39-
llama_test(test-quantize-fns.cpp)
40-
llama_test(test-quantize-perf.cpp)
41-
llama_test(test-sampling.cpp)
42-
llama_test(test-chat-template.cpp)
43-
44-
llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
45-
llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
46-
llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
47-
48-
llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
49-
llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
50-
51-
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
52-
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
53-
54-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
55-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
56-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
57-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
58-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
59-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
60-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
61-
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
62-
#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
63-
64-
llama_test(test-grammar-parser.cpp)
65-
llama_test(test-llama-grammar.cpp)
66-
llama_test(test-grammar-integration.cpp)
67-
llama_test(test-grad0.cpp)
68-
# llama_test(test-opt.cpp) # SLOW
69-
llama_test(test-backend-ops.cpp)
70-
71-
llama_test(test-rope.cpp)
72-
73-
llama_test(test-model-load-cancel.cpp LABEL "model")
74-
llama_test(test-autorelease.cpp LABEL "model")
75-
76-
llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
68+
# llama_target_and_test(test-double-float.cpp) # SLOW
69+
llama_target_and_test(test-quantize-fns.cpp)
70+
llama_target_and_test(test-quantize-perf.cpp)
71+
llama_target_and_test(test-sampling.cpp)
72+
llama_target_and_test(test-chat-template.cpp)
73+
74+
llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
75+
llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
76+
llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
77+
78+
llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
79+
llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
80+
81+
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
82+
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
83+
84+
# build the test-tokenizer-1-bpe executable once and register one test per vocab
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)

# each <vocab> below becomes the test "test-tokenizer-1-<vocab>", which runs the
# shared executable on ../models/ggml-vocab-<vocab>.gguf
# bloom is intentionally not in the list (it was disabled and tagged "BIG")
foreach(llama_bpe_vocab falcon aquila mpt stablelm gpt-neox refact starcoder gpt2)
    llama_test(test-tokenizer-1-bpe
        NAME test-tokenizer-1-${llama_bpe_vocab}
        ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-${llama_bpe_vocab}.gguf)
endforeach()
98+
99+
100+
101+
102+
103+
llama_target_and_test(test-grammar-parser.cpp)
104+
llama_target_and_test(test-llama-grammar.cpp)
105+
llama_target_and_test(test-grammar-integration.cpp)
106+
llama_target_and_test(test-grad0.cpp)
107+
# llama_target_and_test(test-opt.cpp) # SLOW
108+
llama_target_and_test(test-backend-ops.cpp)
109+
110+
llama_target_and_test(test-rope.cpp)
111+
112+
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
113+
llama_target_and_test(test-autorelease.cpp LABEL "model")
114+
115+
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
77116
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
78117

79118
# dummy executable - not installed

0 commit comments

Comments
 (0)