 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
 import logging
 import os
 import pathlib
 import re
 import requests
 import sys
 import json
 import shutil
+import argparse
 
 from hashlib import sha256
 from enum import IntEnum, auto
 from transformers import AutoTokenizer
 
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()
 
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
     UGM = auto()
 
 
+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+    By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
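For reference, the parser defined above admits the following invocations (the token value is a hypothetical placeholder; an explicit positional token takes precedence over the cached one, and ~/.cache/huggingface/token is where huggingface-cli login stores it):

    python3 convert_hf_to_gguf_update.py                          # token read from ~/.cache/huggingface/token
    python3 convert_hf_to_gguf_update.py hf_xxxxxxxxxxxxx         # token passed explicitly
    python3 convert_hf_to_gguf_update.py --full hf_xxxxxxxxxxxxx  # re-process the full model list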
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
     {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -114,11 +126,19 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
-    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
 
+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+    # chatglm-bpe has 2 hashes, why?
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+]
+
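These pre-computed entries are consumed by the generation loop further down: a model that carries a "chkhsh" key skips the download and tokenization steps, and its hash is emitted as-is. As a minimal sketch of how such a hash can be produced by hand, assuming the tokenizer files are already present locally (the path below is illustrative), mirroring the computation in the loop:

    from hashlib import sha256
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("models/tokenizers/some-model")  # hypothetical local path
    chkhsh = sha256(str(tok.encode(CHK_TXT)).encode()).hexdigest()       # 64-char hex digest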
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +189,29 @@ def download_model(model):
         if os.path.isfile(save_path):
             logger.info(f"{name}: File {save_path} already exists - skipping")
             continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
+
+
+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = {}
+    for chkhsh, res in matches:
+        output[res] = chkhsh
+    return output
+
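The regex above targets the very blocks this script generates into convert_hf_to_gguf.py, which have the following shape (hash and res values shown are illustrative):

    if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
        # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        res = "llama-bpe"

The \s*.*\s* in the middle of the pattern steps over the "# ref:" comment line between the hash test and the res assignment.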
 
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
 
+logging.info(f"Downloading {len(models)} models...")
 for model in models:
     try:
         download_model(model)
@@ -182,9 +222,10 @@ def download_model(model):
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
 
 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
     name = model["name"]
     tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")
 
     if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
         continue
@@ -195,35 +236,44 @@ def download_model(model):
         continue
 
     # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded
+
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")
+
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+
+        logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
@@ -271,8 +321,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """
 
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
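The substitution relies on two sentinel comments that delimit the generated function inside convert_hf_to_gguf.py:

    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        ...
    # Marker: End get_vocab_base_pre

Everything between the markers is replaced wholesale with the freshly built src_func. For (.+?) to span the multi-line function body, the re.sub call presumably passes flags=re.DOTALL; that argument falls outside the lines shown in this hunk.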
@@ -367,6 +415,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
             continue  # Skip this model and continue with the next one in the loop
 
+        if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+            logger.info(f"Skip vocab files for model {name}, no GGUF file found")
+            continue
+
         with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
             for text in tests:
                 f.write(f"{text}")
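The *.gguf.inp file written here pairs with a *.gguf.out file of reference token IDs; llama.cpp's tokenizer tests (e.g. tests/test-tokenizer-0) re-tokenize each input against the corresponding ggml-vocab-*.gguf model and compare the result to that reference.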