Commit 07e4351

convert : allow partial update to the chkhsh pre-tokenizer list (#13847)
* convert : allow partial update to the chkhsh pre-tokenizer list
* code style
* update tokenizer out
* rm inp/out files for models not having gguf
* fixed hash for glm
* skip nomic-bert-moe test
* Update convert_hf_to_gguf_update.py
* fix minerva-7b hash
* rm redundant import
1 parent 291f2b6 commit 07e4351


43 files changed: +163 -1212 lines

convert_hf_to_gguf.py

Lines changed: 15 additions & 12 deletions
@@ -674,12 +674,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -731,9 +731,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -764,9 +761,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -797,15 +791,24 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
 
         if res is None:
             logger.warning("\n")
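
Each chkhsh above is produced by convert_hf_to_gguf_update.py: it tokenizes the CHK_TXT probe string and hashes the stringified token-id list with SHA-256. A minimal sketch of the derivation, using hypothetical token ids in place of a real tokenizer run:

from hashlib import sha256

chktok = [101, 28524, 1018, 1092, 2706, 102]  # hypothetical token ids for CHK_TXT
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # 64-char hex digest, compared against the values above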

convert_hf_to_gguf_update.py

Lines changed: 119 additions & 67 deletions
@@ -1,28 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
 import logging
 import os
 import pathlib
@@ -32,6 +10,7 @@
 import sys
 import json
 import shutil
+import argparse
 
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -41,6 +20,11 @@
 logger = logging.getLogger("convert_hf_to_gguf_update")
 sess = requests.Session()
 
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
+
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
     UGM = auto()
 
 
+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+
+Instructions:
+
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+    By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+    "--full", action="store_true",
+    help="download full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+    "hf_token",
+    help="optional HF token",
+    nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+
+if hf_token is None:
+    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+    sys.exit(1)
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-    sys.exit(1)
-
 # TODO: add models here, base models preferred
 models = [
     {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
     {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
     {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
     {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
     {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
     {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -114,11 +126,19 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
-    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]
 
+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+    # chatglm-bpe has 2 hashes, why?
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+]
+
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +189,29 @@ def download_model(model):
         if os.path.isfile(save_path):
             logger.info(f"{name}: File {save_path} already exists - skipping")
             continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
+
+
+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+    matches = re.findall(pattern, convert_py)
+    output = {}
+    for chkhsh, res in matches:
+        output[res] = chkhsh
+    return output
+
 
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+    # Filter out models that already exist in convert_hf_to_gguf.py
+    existing_models = get_existing_models(convert_py)
+    all_models = models.copy()
+    models = [model for model in all_models if model["name"] not in existing_models]
 
+logging.info(f"Downloading {len(models)} models...")
 for model in models:
     try:
         download_model(model)
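
The get_existing_models() pattern recovers the res -> chkhsh mapping straight from the source of get_vocab_base_pre(). A minimal sketch of the scan, run against one real entry from the diff above:

import re

sample = '''
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
'''

pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
# findall yields (chkhsh, res) pairs; the script inverts them into res -> chkhsh
print({res: chkhsh for chkhsh, res in re.findall(pattern, sample)})
# {'bert-bge': '0876d13b5074...'}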
@@ -182,9 +222,10 @@ def download_model(model):
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
 
 src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
     name = model["name"]
     tokt = model["tokt"]
+    chkhsh = model.get("chkhsh")
 
     if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
         continue
@@ -195,35 +236,44 @@ def download_model(model):
         continue
 
     # create the tokenizer
-    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(CHK_TXT)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
+    if chkhsh is not None:
+        # if the model has a pre-computed hash, use it
+        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+    elif name in existing_models:
+        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
+        chkhsh = existing_models[name]
+    else:
+        # otherwise, compute the hash of the tokenizer
+        try:
+            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+            if name == "t5":
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        except OSError as e:
+            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+            continue  # Skip to the next model if the tokenizer can't be loaded
+
+        chktok = tokenizer.encode(CHK_TXT)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        logger.info(f"model: {name}")
+        logger.info(f"tokt: {tokt}")
+        logger.info(f"repo: {model['repo']}")
+        logger.info(f"chktok: {chktok}")
+        logger.info(f"chkhsh: {chkhsh}")
+
+        # print the "pre_tokenizer" content from the tokenizer.json
+        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            normalizer = cfg["normalizer"]
+            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
+            pre_tokenizer = cfg["pre_tokenizer"]
+            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+            if "ignore_merges" in cfg["model"]:
+                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+
+        logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
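
The branch order above is what makes the partial update work: a pre-computed hash from pre_computed_hashes is used verbatim, a model already listed in convert_hf_to_gguf.py reuses its recorded hash, and only genuinely new models have their tokenizer loaded and hashed; every entry still ends up in the regenerated src_ifs body.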
@@ -271,8 +321,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     return res
 """
 
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
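
The regenerated function is spliced between two marker comments with re.sub. A toy sketch of that splice, with single-line stand-ins for the real file contents and the generated source:

import re

convert_py = "# Marker: Start get_vocab_base_pre OLD  # Marker: End get_vocab_base_pre"
src_func = " NEW "  # stands in for the generated get_vocab_base_pre() source

convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
)
print(convert_py)  # the text between the markers is now src_func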
@@ -367,6 +415,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
         continue  # Skip this model and continue with the next one in the loop
 
+    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
+        continue
+
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
             f.write(f"{text}")

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ied 4 ½ months
 __ggml_vocab_test__
-Führer
+Äpfel
 __ggml_vocab_test__
 
 __ggml_vocab_test__

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
  29464 2094 1018 1092 2706
- 11865 17875
+ 9706 7959 2140
 
 
 
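
For reference, each models/ggml-vocab-*.gguf.inp file holds test strings separated by __ggml_vocab_test__ lines, and each line of the matching .gguf.out file lists the token ids expected for one string; here the probe word changes from Führer to Äpfel and the bert-bge expectation changes with it. A hypothetical spot-check of the new expectation with the transformers tokenizer (requires network access; the ids are assumed to match the .out line above):

from transformers import AutoTokenizer

# bge-small-en-v1.5 is the repo the bert-bge hash refers to (see the refs above)
tok = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
print(tok.encode("Äpfel", add_special_tokens=False))  # expected: [9706, 7959, 2140]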