
Commit d273c14

py : convert-hf-to-gguf-update improvements (#7340)
* convert-hf-to-gguf-update: automate updating
* convert-hf-to-gguf-update: improve download
* share requests session for performance
* create directories only when needed, don't skip downloads when empty directory encountered
* be more graceful about errors
1 parent: 27b0406 · commit: d273c14
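Taken together, the download-related bullets boil down to a few standard requests patterns: reuse one requests.Session for every HTTP call, let Response.raise_for_status() surface failures instead of silently logging status codes, create target directories only at the moment a file is actually written, and isolate each download in its own try/except so a single failure does not abort the rest. A minimal, self-contained sketch of those patterns (the URLs, paths, and helper names below are illustrative placeholders, not taken from the commit; the real helper also sends an "Authorization: Bearer <token>" header):

import logging
import os

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("download-sketch")

# One shared session: TCP connections are pooled and reused across requests.
sess = requests.Session()


def download_file(url, save_path):
    response = sess.get(url)
    # Be loud about HTTP errors instead of writing an error page to disk.
    response.raise_for_status()
    # Create the target directory only when there is actually something to save.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "wb") as f:
        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


# Illustrative work list; a failure on one entry is logged and the loop moves on.
downloads = {
    "models/tokenizers/example/config.json": "https://example.com/config.json",
    "models/tokenizers/example/tokenizer.json": "https://example.com/tokenizer.json",
}

for save_path, url in downloads.items():
    try:
        download_file(url, save_path)
    except Exception as e:
        logger.error(f"Failed to download {save_path}. Error: {e}")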

File tree

2 files changed: +39 −46 lines changed

convert-hf-to-gguf-update.py

Lines changed: 37 additions & 46 deletions
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #
 
 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
@@ -35,6 +37,7 @@
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
+sess = requests.Session()
 
 
 class TOKENIZER_TYPE(IntEnum):
@@ -79,63 +82,44 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
-# make directory "models/tokenizers" if it doesn't exist
-if not os.path.exists("models/tokenizers"):
-    os.makedirs("models/tokenizers")
-
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        with open(save_path, 'wb') as f:
-            f.write(response.content)
-        logger.info(f"File {save_path} downloaded successfully")
-    else:
-        logger.info(f"Failed to download file. Status code: {response.status_code}")
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    logger.info(f"File {save_path} downloaded successfully")
 
 
-# download the tokenizer models
-for model in models:
+def download_model(model):
     name = model["name"]
     repo = model["repo"]
     tokt = model["tokt"]
 
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        os.makedirs(f"models/tokenizers/{name}")
-    else:
-        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
-        continue
-
-    logger.info(f"Downloading {name} to models/tokenizers/{name}")
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
-    url = f"{repo}/raw/main/config.json"
-    save_path = f"models/tokenizers/{name}/config.json"
-    download_file_with_auth(url, token, save_path)
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    if tokt == TOKENIZER_TYPE.SPM:
+        files.append("tokenizer.model")
 
-    url = f"{repo}/raw/main/tokenizer.json"
-    save_path = f"models/tokenizers/{name}/tokenizer.json"
-    download_file_with_auth(url, token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 
-    # if downloaded file is less than 1KB, we likely need to download an LFS instead
-    if os.path.getsize(save_path) < 1024:
-        # remove the file
-        os.remove(save_path)
-        url = f"{repo}/resolve/main/tokenizer.json"
-        save_path = f"models/tokenizers/{name}/tokenizer.json"
-        download_file_with_auth(url, token, save_path)
 
-    if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
-        download_file_with_auth(url, token, save_path)
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
-    url = f"{repo}/raw/main/tokenizer_config.json"
-    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
-    download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function
 
 src_ifs = ""
 for model in models:
@@ -224,11 +208,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """
 
-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
 
-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")
 
 # generate tests for each tokenizer model
 
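The "automate updating" part of the commit is a marker-delimited splice: convert-hf-to-gguf.py carries "# Marker: Start get_vocab_base_pre" and "# Marker: End get_vocab_base_pre" comments (added in the second file below), and re.sub with re.DOTALL replaces everything between them with the freshly generated src_func, keeping the marker lines themselves via groups 1 and 3. A small stand-alone sketch of that splice on an in-memory string (the regex and marker names match the diff; the file content and generated body here are placeholders):

import re

# Stand-in for the text of convert-hf-to-gguf.py (placeholder content).
convert_py = """\
class Model:
    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        return "old, previously generated body"
    # Marker: End get_vocab_base_pre

    def _set_vocab_gpt2(self) -> None:
        pass
"""

# Placeholder for the src_func text the update script generates. It must end
# with a newline so the End marker's captured indentation lines up correctly.
src_func = """
    def get_vocab_base_pre(self, tokenizer) -> str:
        res = "newly generated body"
        return res
"""

# Groups 1 and 3 keep the marker lines; group 2 (everything in between, with
# "." crossing newlines thanks to re.DOTALL and matched non-greedily) is
# replaced. A lambda replacement is used so backslashes in src_func are not
# interpreted as regex escape sequences.
updated = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)

print(updated)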

convert-hf-to-gguf.py

Lines changed: 2 additions & 0 deletions
@@ -402,6 +402,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
+    # Marker: End get_vocab_base_pre
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()

0 commit comments
