Skip to content

Commit 86016b7

Browse files
committed
convert-hf-to-gguf-update: automate updating
1 parent 9afdffe commit 86016b7

File tree

2 files changed

+16
-6
lines changed

2 files changed

+16
-6
lines changed

convert-hf-to-gguf-update.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #

 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
import json
@@ -135,7 +137,6 @@ def download_file_with_auth(url, token, save_path):
     download_file_with_auth(url, token, save_path)

 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function

 src_ifs = ""
 for model in models:
for model in models:
@@ -224,11 +225,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """

-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)

-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model

convert-hf-to-gguf.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -402,6 +402,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         logger.debug(f"chkhsh: {chkhsh}")

         return res
+    # Marker: End get_vocab_base_pre

     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()

0 commit comments

Comments (0)