Stylistic adjustments for python scripts #8233

Merged

merged 1 commit into from Jul 22, 2024
12 changes: 6 additions & 6 deletions convert_hf_to_gguf.py
@@ -737,7 +737,7 @@ def _create_vocab_sentencepiece(self):
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
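
Note: this is the recurring change in this file. Python's `if` takes a bare expression, so the C-style parentheses around the condition are redundant (pylint flags them as `superfluous-parens`). A minimal before/after sketch with illustrative values:

```python
vocab_size = 32000  # illustrative value, not taken from a real model
token_id = 32005

# before: legal, but the parentheses add nothing
if (token_id >= vocab_size):
    print("out of range")

# after: the idiomatic form used throughout this patch
if token_id >= vocab_size:
    print("out of range")
```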

@@ -2005,7 +2005,7 @@ def set_vocab(self):

for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

@@ -2081,7 +2081,7 @@ def set_gguf_parameters(self):

# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
- if (rope_scaling is None):
+ if rope_scaling is None:
return

scale = max_pos_embds / orig_max_pos_embds
@@ -2728,7 +2728,7 @@ def get_tensors(self):

yield name, data

- def set_vocab(self, *args, **kwargs):
+ def set_vocab(self):
tokenizer_class = 'BertTokenizer'
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_class = json.load(f)['tokenizer_class']
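
Here the unused catch-all parameters are dropped so the override takes no arguments, like the other `set_vocab` implementations in this file. A sketch of the pattern; the class names are illustrative, not the converter's exact hierarchy:

```python
class Model:
    def set_vocab(self) -> None:
        raise NotImplementedError

class BertLikeModel(Model):
    # before: `def set_vocab(self, *args, **kwargs)` accepted arguments
    # that were never read or forwarded; the clean override matches the base
    def set_vocab(self) -> None:
        print("building vocab")
```
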
@@ -2876,7 +2876,7 @@ def set_vocab(self):
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

@@ -3125,7 +3125,7 @@ def set_vocab(self):
added_tokens_json = json.load(f)
for key in added_tokens_json:
token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
+ if token_id >= vocab_size:
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue

12 changes: 6 additions & 6 deletions convert_hf_to_gguf_update.py
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
- chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+ CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
token = sys.argv[1]
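
The rename to `CHK_TXT` follows the PEP 8 convention that module-level constants are UPPER_CASE; the string itself is unchanged, so only the references below need updating. An abbreviated sketch of how the constant feeds the tokenizer hash (the `tokenizer_fingerprint` wrapper is hypothetical):

```python
from hashlib import sha256

# module-level constant per PEP 8; the real string stresses the pre-tokenizer
CHK_TXT = "example pre-tokenizer stress text"  # abbreviated for readability

def tokenizer_fingerprint(tokenizer) -> str:
    # mirrors the script's chkhsh computation
    chktok = tokenizer.encode(CHK_TXT)
    return sha256(str(chktok).encode()).hexdigest()
```
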
@@ -100,8 +100,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- with open(save_path, 'wb') as f:
-     f.write(response.content)
+ with open(save_path, 'wb') as downloaded_file:
+     downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")


@@ -160,7 +160,7 @@ def download_model(model):
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded

- chktok = tokenizer.encode(chktxt)
+ chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()

logger.info(f"model: {name}")
@@ -192,7 +192,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer

- chktxt = {repr(chktxt)}
+ chktxt = {repr(CHK_TXT)}

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -288,7 +288,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
- chktxt,
+ CHK_TXT,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
6 changes: 5 additions & 1 deletion convert_llama_ggml_to_gguf.py
@@ -132,6 +132,10 @@ def load(self, data, offset):


class GGMLModel:
+
+ file_format: GGMLFormat
+ format_version: int
+
def __init__(self):
self.hyperparameters = None
self.vocab = None
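
The two bare annotations document the types of attributes assigned later during loading, without binding class-level values; they mainly help type checkers and readers. A minimal sketch of the idiom (the one-member `GGMLFormat` stub is illustrative):

```python
from enum import IntEnum

class GGMLFormat(IntEnum):  # stand-in for the script's real format enum
    GGML = 0

class GGMLModel:
    # bare annotations: recorded in __annotations__, no value is bound,
    # so instances still assign these attributes at load time
    file_format: GGMLFormat
    format_version: int

print(GGMLModel.__annotations__)
# {'file_format': <enum 'GGMLFormat'>, 'format_version': <class 'int'>}
```
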
@@ -290,7 +294,7 @@ def add_vocab(self, gguf_writer):
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+ for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
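
Using `_` signals that the loop index is intentionally unused. Since nothing consumes the index any more, an equivalent simplification (not what this patch does) would drop `enumerate` altogether:

```python
# stand-in data; the real `vo.all_tokens()` yields (bytes, score, type) triples
all_tokens = [(b"<s>", 0.0, 1), (b"</s>", 0.0, 1)]

tokens, scores, toktypes = [], [], []
for vbytes, score, ttype in all_tokens:  # no index needed, so no enumerate
    tokens.append(vbytes)
    scores.append(score)
    toktypes.append(ttype)
```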