@@ -49,7 +49,7 @@ class TOKENIZER_TYPE(IntEnum):
49
49
50
50
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
51
51
# will be updated with time - contributions welcome
52
- chktxt = ' \n \n \n \n \n \n \t \t \t \t \n \n \n \n \n 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \' \' \' \' \' \ ' ```````\" \" \" \" ......!!!!!!?????? I\ ' ve been \ ' told he\ ' s there, \ ' RE you sure? \ ' M not sure I\ ' ll make it, \ ' D you like some tea? We\ ' Ve a\ ' lL'
52
+ chktxt = " \n \n \n \n \n \n \t \t \t \t \n \n \n \n \n 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''' '```````\" \" \" \" ......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL"
53
53
54
54
if len (sys .argv ) == 2 :
55
55
token = sys .argv [1 ]
@@ -63,29 +63,121 @@ class TOKENIZER_TYPE(IntEnum):
63
63
64
64
# TODO: add models here, base models preferred
65
65
models = [
66
- {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
67
- {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
68
- {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
69
- {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
70
- {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
71
- {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
72
- {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
73
- {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
74
- {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
75
- {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
76
- {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
77
- {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
78
- {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
79
- {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
80
- {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
81
- {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
82
- {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
83
- {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
84
- {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
85
- {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
86
- {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
87
- {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
88
- {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
66
+ {
67
+ "name" : "llama-spm" ,
68
+ "tokt" : TOKENIZER_TYPE .SPM ,
69
+ "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" ,
70
+ },
71
+ {
72
+ "name" : "llama-bpe" ,
73
+ "tokt" : TOKENIZER_TYPE .BPE ,
74
+ "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" ,
75
+ },
76
+ {
77
+ "name" : "phi-3" ,
78
+ "tokt" : TOKENIZER_TYPE .SPM ,
79
+ "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" ,
80
+ },
81
+ {
82
+ "name" : "deepseek-llm" ,
83
+ "tokt" : TOKENIZER_TYPE .BPE ,
84
+ "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" ,
85
+ },
86
+ {
87
+ "name" : "deepseek-coder" ,
88
+ "tokt" : TOKENIZER_TYPE .BPE ,
89
+ "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" ,
90
+ },
91
+ {
92
+ "name" : "falcon" ,
93
+ "tokt" : TOKENIZER_TYPE .BPE ,
94
+ "repo" : "https://huggingface.co/tiiuae/falcon-7b" ,
95
+ },
96
+ {
97
+ "name" : "bert-bge" ,
98
+ "tokt" : TOKENIZER_TYPE .WPM ,
99
+ "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" ,
100
+ },
101
+ {
102
+ "name" : "mpt" ,
103
+ "tokt" : TOKENIZER_TYPE .BPE ,
104
+ "repo" : "https://huggingface.co/mosaicml/mpt-7b" ,
105
+ },
106
+ {
107
+ "name" : "starcoder" ,
108
+ "tokt" : TOKENIZER_TYPE .BPE ,
109
+ "repo" : "https://huggingface.co/bigcode/starcoder2-3b" ,
110
+ },
111
+ {
112
+ "name" : "gpt-2" ,
113
+ "tokt" : TOKENIZER_TYPE .BPE ,
114
+ "repo" : "https://huggingface.co/openai-community/gpt2" ,
115
+ },
116
+ {
117
+ "name" : "stablelm2" ,
118
+ "tokt" : TOKENIZER_TYPE .BPE ,
119
+ "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" ,
120
+ },
121
+ {
122
+ "name" : "refact" ,
123
+ "tokt" : TOKENIZER_TYPE .BPE ,
124
+ "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" ,
125
+ },
126
+ {
127
+ "name" : "command-r" ,
128
+ "tokt" : TOKENIZER_TYPE .BPE ,
129
+ "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" ,
130
+ },
131
+ {
132
+ "name" : "qwen2" ,
133
+ "tokt" : TOKENIZER_TYPE .BPE ,
134
+ "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" ,
135
+ },
136
+ {
137
+ "name" : "olmo" ,
138
+ "tokt" : TOKENIZER_TYPE .BPE ,
139
+ "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" ,
140
+ },
141
+ {
142
+ "name" : "dbrx" ,
143
+ "tokt" : TOKENIZER_TYPE .BPE ,
144
+ "repo" : "https://huggingface.co/databricks/dbrx-base" ,
145
+ },
146
+ {
147
+ "name" : "jina-v2-en" ,
148
+ "tokt" : TOKENIZER_TYPE .WPM ,
149
+ "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" ,
150
+ }, # WPM!
151
+ {
152
+ "name" : "jina-v2-es" ,
153
+ "tokt" : TOKENIZER_TYPE .BPE ,
154
+ "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" ,
155
+ },
156
+ {
157
+ "name" : "jina-v2-de" ,
158
+ "tokt" : TOKENIZER_TYPE .BPE ,
159
+ "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" ,
160
+ },
161
+ {
162
+ "name" : "smaug-bpe" ,
163
+ "tokt" : TOKENIZER_TYPE .BPE ,
164
+ "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" ,
165
+ },
166
+ {
167
+ "name" : "poro-chat" ,
168
+ "tokt" : TOKENIZER_TYPE .BPE ,
169
+ "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" ,
170
+ },
171
+ {
172
+ "name" : "jina-v2-code" ,
173
+ "tokt" : TOKENIZER_TYPE .BPE ,
174
+ "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" ,
175
+ },
176
+ {
177
+ "name" : "viking" ,
178
+ "tokt" : TOKENIZER_TYPE .BPE ,
179
+ "repo" : "https://huggingface.co/LumiOpen/Viking-7B" ,
180
+ }, # Also used for Viking 13B and 33B
89
181
]
90
182
91
183
@@ -94,7 +186,7 @@ def download_file_with_auth(url, token, save_path):
94
186
response = sess .get (url , headers = headers )
95
187
response .raise_for_status ()
96
188
os .makedirs (os .path .dirname (save_path ), exist_ok = True )
97
- with open (save_path , 'wb' ) as f :
189
+ with open (save_path , "wb" ) as f :
98
190
f .write (response .content )
99
191
logger .info (f"File { save_path } downloaded successfully" )
100
192
@@ -144,7 +236,9 @@ def download_model(model):
144
236
try :
145
237
tokenizer = AutoTokenizer .from_pretrained (f"models/tokenizers/{ name } " )
146
238
except OSError as e :
147
- logger .error (f"Error loading tokenizer for model { name } . The model may not exist or is not accessible with the provided token. Error: { e } " )
239
+ logger .error (
240
+ f"Error loading tokenizer for model { name } . The model may not exist or is not accessible with the provided token. Error: { e } "
241
+ )
148
242
continue # Skip to the next model if the tokenizer can't be loaded
149
243
150
244
chktok = tokenizer .encode (chktxt )
@@ -164,13 +258,15 @@ def download_model(model):
164
258
pre_tokenizer = cfg ["pre_tokenizer" ]
165
259
logger .info ("pre_tokenizer: " + json .dumps (pre_tokenizer , indent = 4 ))
166
260
if "ignore_merges" in cfg ["model" ]:
167
- logger .info ("ignore_merges: " + json .dumps (cfg ["model" ]["ignore_merges" ], indent = 4 ))
261
+ logger .info (
262
+ "ignore_merges: " + json .dumps (cfg ["model" ]["ignore_merges" ], indent = 4 )
263
+ )
168
264
169
265
logger .info ("" )
170
266
171
- src_ifs += f" if chkhsh == \ "{ chkhsh } \ " :\n "
267
+ src_ifs += f' if chkhsh == "{ chkhsh } ":\n '
172
268
src_ifs += f" # ref: { model ['repo' ]} \n "
173
- src_ifs += f" res = \ "{ name } \ "\n "
269
+ src_ifs += f' res = "{ name } "\n '
174
270
175
271
src_func = f"""
176
272
def get_vocab_base_pre(self, tokenizer) -> str:
@@ -326,6 +422,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
326
422
for model in models :
327
423
name = model ["name" ]
328
424
329
- print (f"python3 convert-hf-to-gguf.py models/tokenizers/{ name } / --outfile models/ggml-vocab-{ name } .gguf --vocab-only" ) # noqa: NP100
425
+ print ( # noqa: NP100
426
+ f"python3 convert-hf-to-gguf.py models/tokenizers/{ name } / --outfile models/ggml-vocab-{ name } .gguf --vocab-only"
427
+ )
330
428
331
429
logger .info ("\n " )
0 commit comments