@@ -128,7 +128,7 @@ def download_file_with_auth(url, token, save_path):
128
128
print (f"chkhsh: { chkhsh } " )
129
129
130
130
# print the "pre_tokenizer" content from the tokenizer.json
131
- with open (f"models/tokenizers/{ name } /tokenizer.json" , "r" ) as f :
131
+ with open (f"models/tokenizers/{ name } /tokenizer.json" , "r" , encoding = "utf-8" ) as f :
132
132
cfg = json .load (f )
133
133
pre_tokenizer = cfg ["pre_tokenizer" ]
134
134
print ("pre_tokenizer: " + json .dumps (pre_tokenizer , indent = 4 ))
@@ -156,15 +156,19 @@ def download_file_with_auth(url, token, save_path):
156
156
src_func += "\n "
157
157
src_func += " res = None\n "
158
158
src_func += "\n "
159
- src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n "
160
- src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n "
159
+ src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n "
160
+ src_func += " # or pull the latest version of the model from Huggingface\n "
161
+ src_func += " # don't edit the hashes manually!\n "
161
162
src_func += f"{ src_ifs } \n "
162
163
src_func += " if res is None:\n "
163
164
src_func += " print(\" \\ n\" )\n "
164
165
src_func += " print(\" **************************************************************************************\" )\n "
165
166
src_func += " print(\" ** WARNING: The BPE pre-tokenizer was not recognized!\" )\n "
166
- src_func += " print(\" ** This means that it was not added yet or you are using an older version.\" )\n "
167
- src_func += " print(\" ** Check convert-hf-to-gguf-update.py and update it accordingly.\" )\n "
167
+ src_func += " print(\" ** There are 2 possible reasons for this:\" )\n "
168
+ src_func += " print(\" ** - the model has not been added to convert-hf-to-gguf-update.py yet\" )\n "
169
+ src_func += " print(\" ** - the pre-tokenization config has changed upstream\" )\n "
170
+ src_func += " print(\" ** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\" )\n "
171
+ src_func += " print(\" ** ref: https://github.com/ggerganov/llama.cpp/pull/6920\" )\n "
168
172
src_func += " print(\" **\" )\n "
169
173
src_func += " print(f\" ** chkhsh: {chkhsh}\" )\n "
170
174
src_func += " print(\" **************************************************************************************\" )\n "
@@ -249,7 +253,7 @@ def download_file_with_auth(url, token, save_path):
249
253
from transformers import AutoTokenizer
250
254
tokenizer = AutoTokenizer .from_pretrained (f"models/tokenizers/{ name } " )
251
255
252
- with open (f"models/ggml-vocab-{ name } .gguf.inp" , "w" ) as f :
256
+ with open (f"models/ggml-vocab-{ name } .gguf.inp" , "w" , encoding = "utf-8" ) as f :
253
257
for text in tests :
254
258
f .write (f"{ text } " )
255
259
f .write ("\n __ggml_vocab_test__\n " )
0 commit comments