Skip to content

Commit 86016b7

Browse files
committed
convert-hf-to-gguf-update: automate updating
1 parent 9afdffe commit 86016b7

File tree

2 files changed

+16
-6
lines changed

2 files changed

+16
-6
lines changed

convert-hf-to-gguf-update.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #

 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
import json
@@ -135,7 +137,6 @@ def download_file_with_auth(url, token, save_path):
     download_file_with_auth(url, token, save_path)

 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function

 src_ifs = ""
 for model in models:
for model in models:
@@ -224,11 +225,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         return res
 """

-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)

-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model

convert-hf-to-gguf.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -402,6 +402,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         logger.debug(f"chkhsh: {chkhsh}")

         return res
+    # Marker: End get_vocab_base_pre

     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()

0 commit comments

Comments (0)