Commit df5d809

Stylistic adjustments for python

* Superfluous parens in conditionals were removed.
* Unused args in a function were removed.
* Replaced the unused `idx` var with `_`.
* Declared (via class-level type annotations) the `file_format` and `format_version` attributes.
* Renamed the `chktxt` constant to capitals (`CHK_TXT`).
* Prevented redefinition of the `f` var.

Signed-off-by: Jiri Podivin <[email protected]>
1 parent d0a7145 commit df5d809
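Taken together, the bullets describe routine PEP 8 style cleanups. A minimal before/after sketch of the same patterns, using hypothetical names rather than code from this repository:

```python
# Before: extra parens, lowercase module "constant", unused loop index.
chktxt = "sample text"

def count_long_tokens(tokens, limit):
    total = 0
    for (idx, tok) in enumerate(tokens):
        if (len(tok) >= limit):
            total += 1
    return total

# After: the comparison needs no parens, the constant is uppercase,
# and `_` marks the index as intentionally unused.
CHK_TXT = "sample text"

def count_long_tokens(tokens, limit):
    total = 0
    for _, tok in enumerate(tokens):
        if len(tok) >= limit:
            total += 1
    return total
```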

File tree

* convert-hf-to-gguf-update.py
* convert-hf-to-gguf.py
* convert-llama-ggml-to-gguf.py

3 files changed: +17 −13 lines

convert-hf-to-gguf-update.py

Lines changed: 6 additions & 6 deletions

```diff
@@ -49,7 +49,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -94,8 +94,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
 
 
@@ -147,7 +147,7 @@ def download_model(model):
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded
 
-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     logger.info(f"model: {name}")
@@ -179,7 +179,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer
 
-        chktxt = {repr(chktxt)}
+        chktxt = {repr(CHK_TXT)}
 
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -273,7 +273,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     "33333333",
     "333333333",
     # "Cửa Việt", # llama-bpe fails on this
-    chktxt,
+    CHK_TXT,
 ]
 
 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
```
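The commit message cites preventing redefinition of the `f` var; presumably the concern is reusing the same short name for different handles in one scope (an assumption based on the message, since only this hunk is shown). A small sketch of that pattern, with hypothetical file names:

```python
# Hypothetical file names; the point is the variable naming, not the I/O.
# Reusing `f` for a nested handle shadows the outer one, which is easy to
# misread and is the kind of redefinition the rename avoids.
with open("notes.txt", "w", encoding="utf-8") as f:
    f.write("outer handle\n")
    with open("payload.bin", "wb") as f:        # shadows the outer `f`
        f.write(b"\x00")
    # from here on, `f` refers to the (already closed) inner handle

# Distinct names keep both handles unambiguous:
with open("notes.txt", "a", encoding="utf-8") as notes_file:
    notes_file.write("still the text file\n")
    with open("payload.bin", "wb") as downloaded_file:
        downloaded_file.write(b"\x00")
```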

convert-hf-to-gguf.py

Lines changed: 6 additions & 6 deletions

```diff
@@ -622,7 +622,7 @@ def _set_vocab_sentencepiece(self):
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -1850,7 +1850,7 @@ def set_vocab(self):
 
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -1925,7 +1925,7 @@ def set_gguf_parameters(self):
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -2578,7 +2578,7 @@ def get_tensors(self):
 
             yield name, data
 
-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
        tokenizer_class = 'BertTokenizer'
        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_class = json.load(f)['tokenizer_class']
@@ -2651,7 +2651,7 @@ def set_vocab(self):
            added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
            for token_id, token_json in added_tokens_decoder.items():
                token_id = int(token_id)
-               if (token_id >= vocab_size):
+               if token_id >= vocab_size:
                    logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue
 
@@ -2882,7 +2882,7 @@ def set_vocab(self):
                added_tokens_json = json.load(f)
                for key in added_tokens_json:
                    token_id = added_tokens_json[key]
-                   if (token_id >= vocab_size):
+                   if token_id >= vocab_size:
                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                        continue
 
```
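The `set_vocab` signature change drops catch-all parameters the method never used. Assuming the override is only ever called with no extra arguments (only this hunk is shown, so that is an inference), the narrower signature is equivalent, and a stray argument now fails loudly instead of being silently ignored. A self-contained sketch of the same pattern with hypothetical classes:

```python
class Base:
    def set_vocab(self) -> None:
        print("base vocab")

class BertLike(Base):
    # Before: def set_vocab(self, *args, **kwargs) accepted (and ignored)
    # anything callers passed. After: the override matches the base
    # signature, so unexpected arguments raise a TypeError immediately.
    def set_vocab(self) -> None:
        print("bert vocab")

BertLike().set_vocab()        # fine
# BertLike().set_vocab(123)   # TypeError: takes 1 positional argument
```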

convert-llama-ggml-to-gguf.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -132,6 +132,10 @@ def load(self, data, offset):
 
 
 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
```
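One note on the lines added to `GGMLModel`: annotations like `file_format: GGMLFormat` declare the attributes and their types for readers and static checkers, but unlike an assignment they do not initialize anything at runtime; the attributes only exist once they are actually assigned. A minimal sketch of the distinction, using a stand-in enum since `GGMLFormat` is defined elsewhere in the script:

```python
from enum import IntEnum

class Format(IntEnum):      # stand-in for GGMLFormat
    GGML = 0
    GGJT = 1

class Model:
    file_format: Format     # annotation only: no attribute is created here
    format_version: int

    def load(self) -> None:
        self.file_format = Format.GGJT    # the actual assignment happens here
        self.format_version = 3

m = Model()
print(hasattr(m, "file_format"))          # False until load() runs
m.load()
print(m.file_format, m.format_version)    # Format.GGJT 3
```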
```diff
@@ -290,7 +294,7 @@ def add_vocab(self, gguf_writer):
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
```
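Renaming `idx` to `_` documents that the index is unused. A possible follow-up, not part of this commit, would be to drop `enumerate` entirely, since iterating over `vo.all_tokens()` directly yields the same tuples; sketched below with stand-in data in place of `all_tokens()`:

```python
# Stand-in data shaped like vo.all_tokens(): (bytes, score, token_type) tuples.
all_tokens = [(b"hello", -1.5, 1), (b"world", -2.0, 1)]

tokens, scores, toktypes = [], [], []
# No index is needed, so plain iteration replaces enumerate() and the `_` slot.
for vbytes, score, ttype in all_tokens:
    tokens.append(vbytes)
    scores.append(score)
    toktypes.append(ttype)

print(tokens, scores, toktypes)
```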
