 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #
 
 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
 
 from hashlib import sha256
 from enum import IntEnum, auto
 from transformers import AutoTokenizer
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
+sess = requests.Session()
 
 
 class TOKENIZER_TYPE(IntEnum):
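A note on the hunk above: the module-level `sess = requests.Session()` gives every download a shared connection pool, so repeated requests to huggingface.co reuse one TCP/TLS connection instead of paying a handshake per file. A minimal sketch of the pattern (the repo URL is a placeholder, not from the commit):

```python
import requests

sess = requests.Session()  # pools connections per host, reused by each sess.get()

for path in ("config.json", "tokenizer.json"):
    # placeholder repo URL, for illustration only
    r = sess.get(f"https://huggingface.co/some-org/some-model/resolve/main/{path}")
    r.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx response
```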
@@ -79,63 +82,44 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
-# make directory "models/tokenizers" if it doesn't exist
-if not os.path.exists("models/tokenizers"):
-    os.makedirs("models/tokenizers")
-
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        with open(save_path, 'wb') as f:
-            f.write(response.content)
-        logger.info(f"File {save_path} downloaded successfully")
-    else:
-        logger.info(f"Failed to download file. Status code: {response.status_code}")
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    logger.info(f"File {save_path} downloaded successfully")
 
 
-# download the tokenizer models
-for model in models:
+def download_model(model):
     name = model["name"]
     repo = model["repo"]
     tokt = model["tokt"]
 
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        os.makedirs(f"models/tokenizers/{name}")
-    else:
-        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
-        continue
-
-    logger.info(f"Downloading {name} to models/tokenizers/{name}")
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
-    url = f"{repo}/raw/main/config.json"
-    save_path = f"models/tokenizers/{name}/config.json"
-    download_file_with_auth(url, token, save_path)
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    if tokt == TOKENIZER_TYPE.SPM:
+        files.append("tokenizer.model")
 
-    url = f"{repo}/raw/main/tokenizer.json"
-    save_path = f"models/tokenizers/{name}/tokenizer.json"
-    download_file_with_auth(url, token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 
-    # if downloaded file is less than 1KB, we likely need to download an LFS instead
-    if os.path.getsize(save_path) < 1024:
-        # remove the file
-        os.remove(save_path)
-        url = f"{repo}/resolve/main/tokenizer.json"
-        save_path = f"models/tokenizers/{name}/tokenizer.json"
-        download_file_with_auth(url, token, save_path)
 
-    if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
-        download_file_with_auth(url, token, save_path)
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
-    url = f"{repo}/raw/main/tokenizer_config.json"
-    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
-    download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function
 
 src_ifs = ""
 for model in models:
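Two review notes on the hunk above. First, error handling: `download_file_with_auth` now raises on any non-2xx status instead of logging and carrying on, and the per-model `try`/`except` turns a failed repo into a logged error rather than an aborted run. Second, the URL scheme: the old code fetched from `{repo}/raw/main/...` and re-downloaded from `resolve/main` whenever the result was under 1 KB (i.e. looked like a Git LFS pointer), while the new loop goes straight to `resolve/main`, which serves the real content whether or not the file is LFS-tracked. A sketch of the two URL forms, using a repo from the `models` list:

```python
repo = "https://huggingface.co/jinaai/jina-embeddings-v2-base-de"

# raw/main returns the git blob as-is; for an LFS-tracked file that is just the
# small pointer text, hence the old "< 1024 bytes" retry heuristic.
raw_url = f"{repo}/raw/main/tokenizer.json"

# resolve/main follows the pointer to the actual content, LFS-tracked or not,
# so a single URL form covers every file in the list.
resolve_url = f"{repo}/resolve/main/tokenizer.json"
```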
@@ -224,11 +208,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     return res
 """
 
-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
 
-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")
 
 # generate tests for each tokenizer model
 
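This is the piece that retires the old copy-paste step: the generated `src_func` is spliced into `convert-hf-to-gguf.py` between two marker comments, which the commit assumes are present in the target file. A self-contained sketch of the same splice, with a toy string standing in for the real file:

```python
import re

# toy stand-in for the contents of convert-hf-to-gguf.py
convert_py = """\
# Marker: Start get_vocab_base_pre
def get_vocab_base_pre(self, tokenizer) -> str:
    return "stale"
    # Marker: End get_vocab_base_pre
"""

src_func = '\ndef get_vocab_base_pre(self, tokenizer) -> str:\n    return "fresh"\n'

# re.DOTALL lets the non-greedy (.+?) span newlines, so it swallows the whole
# stale body between the markers; both markers are captured and written back.
print(re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL,
))
```

Two details worth noting: the callable replacement means `src_func` is inserted literally, whereas a plain string replacement would have `re.sub` reinterpret any backslashes or group references inside the generated code; and the `re.MULTILINE` flag in the commit is harmless here, since the pattern uses no `^`/`$` anchors.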