Skip to content

Commit 9853f2c

Browse files
authored
convert-falcon-hf-to-gguf.py : fix special token mapping
1 parent 7bbbf38 commit 9853f2c

File tree

1 file changed

+17
-32
lines changed

1 file changed

+17
-32
lines changed

convert-falcon-hf-to-gguf.py

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -164,38 +164,23 @@ def count_model_parts(dir_model: str) -> int:
164164
gguf_writer.add_token_scores(scores)
165165
gguf_writer.add_token_types(toktypes)
166166

167-
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
168-
print("gguf: get special token ids")
169-
170-
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
171-
tokenizer_config = json.load(f)
172-
173-
# find special token ids
174-
175-
if "bos_token" in tokenizer_config:
176-
for key in tokenizer_json["added_tokens"]:
177-
if key["content"] == tokenizer_config["bos_token"]:
178-
gguf_writer.add_bos_token_id(key["id"])
179-
180-
if "eos_token" in tokenizer_config:
181-
for key in tokenizer_json["added_tokens"]:
182-
if key["content"] == tokenizer_config["eos_token"]:
183-
gguf_writer.add_eos_token_id(key["id"])
184-
185-
if "unk_token" in tokenizer_config:
186-
for key in tokenizer_json["added_tokens"]:
187-
if key["content"] == tokenizer_config["unk_token"]:
188-
gguf_writer.add_unk_token_id(key["id"])
189-
190-
if "sep_token" in tokenizer_config:
191-
for key in tokenizer_json["added_tokens"]:
192-
if key["content"] == tokenizer_config["sep_token"]:
193-
gguf_writer.add_sep_token_id(key["id"])
194-
195-
if "pad_token" in tokenizer_config:
196-
for key in tokenizer_json["added_tokens"]:
197-
if key["content"] == tokenizer_config["pad_token"]:
198-
gguf_writer.add_pad_token_id(key["id"])
167+
print("gguf: get special token ids")
168+
# Look for special tokens in config.json
169+
170+
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
171+
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
172+
173+
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
174+
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
175+
176+
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
177+
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
178+
179+
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
180+
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
181+
182+
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
183+
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
199184

200185

201186
# TENSORS

0 commit comments

Comments (0)