@@ -46,8 +46,8 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-v2",       "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-v3",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
     { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
     { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
@@ -64,7 +64,7 @@ def download_file_with_auth(url, token, save_path):
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print("File downloaded successfully.")
+        print(f"File {save_path} downloaded successfully")
     else:
         print(f"Failed to download file. Status code: {response.status_code}")
 
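(For context: the hunk above shows only the tail of `download_file_with_auth`. A minimal sketch of the full function follows, assuming the `requests` library and a standard Hugging Face Bearer-token header; neither the header name nor the use of `requests.get` is visible in this diff.)

    import requests

    def download_file_with_auth(url, token, save_path):
        # assumption: a plain HTTP GET with a Bearer token, as the Hugging Face Hub accepts
        headers = {"Authorization": f"Bearer {token}"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"File {save_path} downloaded successfully")
        else:
            print(f"Failed to download file. Status code: {response.status_code}")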
@@ -82,6 +82,10 @@ def download_file_with_auth(url, token, save_path):
 
     print(f"Downloading {name} to models/tokenizers/{name}")
 
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
     url = f"{repo}/raw/main/tokenizer.json"
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
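(The `name`, `repo`, and `token` variables used above come from the enclosing per-model loop, which this hunk does not show. A hedged sketch of that loop; the `os.makedirs` guard is an assumption about code outside the diff.)

    import os

    for model in models:
        name = model["name"]
        repo = model["repo"]

        # assumption: ensure the target directory exists before downloading into it
        os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

        print(f"Downloading {name} to models/tokenizers/{name}")

        url = f"{repo}/raw/main/config.json"
        save_path = f"models/tokenizers/{name}/config.json"
        download_file_with_auth(url, token, save_path)
        # ... tokenizer.json is fetched the same way, as the context lines above show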
@@ -219,7 +223,7 @@ def download_file_with_auth(url, token, save_path):
     "333333333",
 ]
 
-# write the tests in ./models/test-vocab-inp.txt
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
 # the format is:
 #
 # test0
@@ -229,14 +233,7 @@ def download_file_with_auth(url, token, save_path):
 # ...
 #
 
-with open(f"models/test-vocab-inp.txt", "w") as f:
-    for text in tests:
-        f.write(f"{text}")
-        f.write("\n__ggml_vocab_test__\n")
-
-print("Tests written in ./models/test-vocab-inp.txt")
-
-# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
 # for each test, write the resulting tokens on a separate line
 
 for model in models:
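(To make the file format concrete, here is what a generated pair of files would look like for the one test string visible in this diff, "333333333". The token IDs are made-up placeholders, not the output of any real tokenizer.)

    models/ggml-vocab-llama-bpe.gguf.inp:

        333333333
        __ggml_vocab_test__

    models/ggml-vocab-llama-bpe.gguf.out, one line of space-prefixed token IDs per test:

         8765 4321 99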
@@ -247,11 +244,27 @@ def download_file_with_auth(url, token, save_path):
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
-    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
         for text in tests:
-            res = tokenizer.encode(text)
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
             for r in res:
                 f.write(f" {r}")
             f.write("\n")
 
-    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
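(As a usage note, a generated .inp/.out pair can be sanity-checked by re-encoding the inputs and comparing against the recorded token lines. A minimal sketch, not part of this change; it relies only on the file layout established above.)

    from transformers import AutoTokenizer

    name = "llama-bpe"  # any entry from the `models` list

    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    # the .inp file is text0 + separator + text1 + separator + ...,
    # so splitting on the separator leaves a trailing empty element
    with open(f"models/ggml-vocab-{name}.gguf.inp") as f:
        tests = f.read().split("\n__ggml_vocab_test__\n")[:-1]

    with open(f"models/ggml-vocab-{name}.gguf.out") as f:
        expected = [line.rstrip("\n") for line in f]

    for text, line in zip(tests, expected):
        res = tokenizer.encode(text, add_special_tokens=False)
        assert line == "".join(f" {r}" for r in res), f"token mismatch for {text!r}"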