Skip to content

Commit 567cf43

Browse files
nopperl authored and ggerganov committed
convert : use utf8 encoding (ggml-org#7000)
* convert : use utf8 encoding
* convert : update instructions and warning message
1 parent 2cc5044 commit 567cf43

File tree

2 files changed

+18
-10
lines changed

2 files changed

+18
-10
lines changed

convert-hf-to-gguf-update.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def download_file_with_auth(url, token, save_path):
128128
print(f"chkhsh: {chkhsh}")
129129

130130
# print the "pre_tokenizer" content from the tokenizer.json
131-
with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
131+
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
132132
cfg = json.load(f)
133133
pre_tokenizer = cfg["pre_tokenizer"]
134134
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
@@ -156,15 +156,19 @@ def download_file_with_auth(url, token, save_path):
156156
src_func += "\n"
157157
src_func += " res = None\n"
158158
src_func += "\n"
159-
src_func += " # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
160-
src_func += " # don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
159+
src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
160+
src_func += " # or pull the latest version of the model from Huggingface\n"
161+
src_func += " # don't edit the hashes manually!\n"
161162
src_func += f"{src_ifs}\n"
162163
src_func += " if res is None:\n"
163164
src_func += " print(\"\\n\")\n"
164165
src_func += " print(\"**************************************************************************************\")\n"
165166
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
166-
src_func += " print(\"** This means that it was not added yet or you are using an older version.\")\n"
167-
src_func += " print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
167+
src_func += " print(\"** There are 2 possible reasons for this:\")\n"
168+
src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
169+
src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n"
170+
src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
171+
src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
168172
src_func += " print(\"**\")\n"
169173
src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
170174
src_func += " print(\"**************************************************************************************\")\n"
@@ -249,7 +253,7 @@ def download_file_with_auth(url, token, save_path):
249253
from transformers import AutoTokenizer
250254
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
251255

252-
with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
256+
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
253257
for text in tests:
254258
f.write(f"{text}")
255259
f.write("\n__ggml_vocab_test__\n")

convert-hf-to-gguf.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
279279

280280
res = None
281281

282-
# NOTE: if you get an error here, you need to add the model to the if-elif chain below
283-
# don't do this manually - use the convert-hf-to-gguf-update.py script!
282+
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
283+
# or pull the latest version of the model from Huggingface
284+
# don't edit the hashes manually!
284285
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
285286
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
286287
res = "llama-bpe"
@@ -310,8 +311,11 @@ def get_vocab_base_pre(self, tokenizer) -> str:
310311
print("\n")
311312
print("**************************************************************************************")
312313
print("** WARNING: The BPE pre-tokenizer was not recognized!")
313-
print("** This means that it was not added yet or you are using an older version.")
314-
print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
314+
print("** There are 2 possible reasons for this:")
315+
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
316+
print("** - the pre-tokenization config has changed upstream")
317+
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
318+
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
315319
print("**")
316320
print(f"** chkhsh: {chkhsh}")
317321
print("**************************************************************************************")

0 commit comments

Comments (0)