
Commit 43708d2

tests : refactor vocab tests

ggml-ci

1 parent: ef4cca9

15 files changed: +316 additions, -1010 deletions

convert-hf-to-gguf-update.py

Lines changed: 28 additions & 15 deletions
@@ -46,8 +46,8 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-v2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
     { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
     { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
@@ -64,7 +64,7 @@ def download_file_with_auth(url, token, save_path):
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print("File downloaded successfully.")
+        print(f"File {save_path} downloaded successfully")
     else:
         print(f"Failed to download file. Status code: {response.status_code}")
 
@@ -82,6 +82,10 @@ def download_file_with_auth(url, token, save_path):
 
     print(f"Downloading {name} to models/tokenizers/{name}")
 
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
     url = f"{repo}/raw/main/tokenizer.json"
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
@@ -219,7 +223,7 @@ def download_file_with_auth(url, token, save_path):
     "333333333",
 ]
 
-# write the tests in ./models/test-vocab-inp.txt
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
 # the format is:
 #
 # test0
@@ -229,14 +233,7 @@ def download_file_with_auth(url, token, save_path):
 # ...
 #
 
-with open(f"models/test-vocab-inp.txt", "w") as f:
-    for text in tests:
-        f.write(f"{text}")
-        f.write("\n__ggml_vocab_test__\n")
-
-print("Tests written in ./models/test-vocab-inp.txt")
-
-# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
 # for each test, write the resulting tokens on a separate line
 
 for model in models:
@@ -247,11 +244,27 @@ def download_file_with_auth(url, token, save_path):
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
-    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text)
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
 
-    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")

convert-hf-to-gguf.py

Lines changed: 1 addition & 1 deletion
@@ -283,7 +283,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # don't do this manually - use the convert-hf-to-gguf-update.py script!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-v3"
+            res = "llama-bpe"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"

llama.cpp

Lines changed: 4 additions & 3 deletions
@@ -4339,8 +4339,9 @@ static void llm_load_vocab(
             tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
-            tokenizer_pre == "llama3" ||
-            tokenizer_pre == "llama-v3") {
+            tokenizer_pre == "llama3"   ||
+            tokenizer_pre == "llama-v3" ||
+            tokenizer_pre == "llama-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
         } else if (
             tokenizer_pre == "deepseek-llm") {
@@ -12583,7 +12584,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
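
The == 1 to != 0 change is the behavioral fix here: as far as the surrounding code shows, vocab.special_add_bos defaults to -1 when the GGUF metadata carries no add_bos_token flag, so the old comparison silently skipped BOS for such BPE models. A small truth-table sketch in Python (illustrative only, mirroring the C++ condition):

def add_bos_old(add_special: bool, flag: int) -> bool:
    return add_special and flag == 1   # BOS skipped when flag is unset (-1)

def add_bos_new(add_special: bool, flag: int) -> bool:
    return add_special and flag != 0   # BOS added unless explicitly disabled

for flag in (-1, 0, 1):
    print(flag, add_bos_old(True, flag), add_bos_new(True, flag))
# -1 False True   <- the case this commit changes
#  0 False False
#  1 True  True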

models/ggml-vocab-deepseek-coder.gguf (-1.07 KB, binary file not shown)
models/ggml-vocab-deepseek-llm.gguf (-510 Bytes, binary file not shown)
models/ggml-vocab-llama-v3.gguf (-7.46 MB, binary file not shown)
models/ggml-vocab-llama.gguf (-708 KB, binary file not shown)

tests/CMakeLists.txt

Lines changed: 20 additions & 15 deletions
@@ -65,21 +65,16 @@ function(llama_target_and_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-# llama_target_and_test(test-double-float.cpp) # SLOW
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
-llama_target_and_test(test-sampling.cpp)
-llama_target_and_test(test-chat-template.cpp)
-
-llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
-llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
 
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
@@ -96,9 +91,19 @@ llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CUR
 llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
 
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp get-model.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
 
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_target_and_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 
-
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
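
The refactor swaps the per-model test sources for one shared test-tokenizer-0 binary registered once per vocab file. Outside of CTest, the same sweep can be approximated with a short loop (illustrative only; the binary path assumes a ./build tree):

import glob
import subprocess

# run the shared tokenizer test against every vocab file in models/,
# roughly what the llama_test() registrations above do via CTest
for vocab in sorted(glob.glob("models/ggml-vocab-*.gguf")):
    subprocess.run(["./build/bin/test-tokenizer-0", vocab], check=True)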

tests/test-tokenizer-0-deepseek-coder.cpp

Lines changed: 0 additions & 188 deletions
This file was deleted.
