
Commit d4df785

convert_hf : reduce usages of the UNKNOWN token type

1 parent d6fe269 commit d4df785
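
This commit narrows the meaning of UNKNOWN: placeholder vocab slots are now typed UNUSED, and UNKNOWN is left for the tokenizer's actual unk token. For reference, a minimal sketch of the token-type enum the hunks below refer to; the values mirror SentencePiece's token types, but verify against convert_hf_to_gguf.py before relying on them:

from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):
    NORMAL       = 1  # ordinary vocabulary piece
    UNKNOWN      = 2  # the tokenizer's real <unk> token
    CONTROL      = 3  # special tokens such as <s>, </s>
    USER_DEFINED = 4  # pieces added on top of the base vocab
    UNUSED       = 5  # padding slots that hold no real token
    BYTE         = 6  # byte-fallback pieces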

File tree

1 file changed: convert_hf_to_gguf.py

Lines changed: 12 additions & 13 deletions
@@ -634,7 +634,7 @@ def _create_vocab_sentencepiece(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -677,7 +677,7 @@ def _create_vocab_sentencepiece(self):
         for token_id, token_data in added_tokens_decoder.items():
             token_id = int(token_id)
             token: str = token_data["content"]
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token.encode("utf-8")
             if token_data.get("special") or self.does_token_look_special(token):
                 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
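
Why the substitution is safe: _create_vocab_sentencepiece first fills every slot with a [PAD{i}] placeholder and then overwrites the ids the tokenizer actually defines, so UNUSED is the accurate type for anything left untouched. The guard in the second hunk then reads "if this slot already holds a real token, the added token must match it". The remaining set_vocab hunks below apply the same substitution in the per-model converters. A self-contained sketch of the pattern, with a toy vocab and hypothetical values that are not from the repo:

from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):  # trimmed to the members used here
    NORMAL = 1
    UNKNOWN = 2
    UNUSED = 5

vocab_size = 4                       # assumed tiny vocab for the example
defined = {0: "<unk>", 1: "hello"}   # ids the tokenizer actually knows

tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
toktypes = [SentencePieceTokenTypes.UNUSED] * vocab_size

for token_id, piece in defined.items():
    tokens[token_id] = piece.encode("utf-8")
    toktypes[token_id] = (SentencePieceTokenTypes.UNKNOWN if piece == "<unk>"
                          else SentencePieceTokenTypes.NORMAL)

# Slots 2 and 3 keep their [PAD*] text and stay UNUSED instead of being
# mislabeled UNKNOWN; an added token may overwrite them without the assert.
for token_id, content in {2: "<added>"}.items():
    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
        assert tokens[token_id] == content.encode("utf-8")
    tokens[token_id] = content.encode("utf-8")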
@@ -1916,7 +1916,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -1961,7 +1961,7 @@ def set_vocab(self):
         for token_id, foken_data in added_tokens_decoder.items():
             token_id = int(token_id)
             token = foken_data["content"].encode("utf-8")
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token
             tokens[token_id] = token
             scores[token_id] = -1000.0
@@ -1977,7 +1977,7 @@ def set_vocab(self):
         for foken_data in added_tokens:
             token_id = int(foken_data["id"])
             token = foken_data["content"].encode("utf-8")
-            if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+            if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                 assert tokens[token_id] == token
             tokens[token_id] = token
             scores[token_id] = -1000.0
@@ -2766,7 +2766,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -3021,7 +3021,7 @@ def set_vocab(self):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
             piece = tokenizer.IdToPiece(token_id)
@@ -3239,15 +3239,14 @@ def set_vocab_chatglm3(self):
             if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)
 
-            if len(piece) == 0:
-                text = f"[PAD{token_id}]".encode("utf-8")
-
             if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 if piece in special_tokens:
-                    # show special tokens in prompt
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                    toktype = SentencePieceTokenTypes.CONTROL
+                elif len(piece) == 0:
+                    text = f"[PAD{token_id}]".encode("utf-8")
+                    toktype = SentencePieceTokenTypes.UNUSED
                 else:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
                 tokens.append(text)
                 scores.append(score)
                 toktypes.append(toktype)
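
The set_vocab_chatglm3 hunk additionally reorders how ids past the end of the sp_model vocab are classified: known special tokens become CONTROL (previously USER_DEFINED), empty pieces become UNUSED pads, and everything else becomes USER_DEFINED (previously UNKNOWN). A hypothetical helper, not present in the repo, that captures the new order:

from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):  # trimmed to the members used here
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5

def classify_extra_piece(piece: str, special_tokens: set[str]) -> SentencePieceTokenTypes:
    # Classification for token_id >= sp_model.vocab_size() after this commit.
    if piece in special_tokens:
        return SentencePieceTokenTypes.CONTROL       # e.g. "<|user|>"
    if len(piece) == 0:
        return SentencePieceTokenTypes.UNUSED        # empty slot: a pad token
    return SentencePieceTokenTypes.USER_DEFINED      # other added pieces

assert classify_extra_piece("<|user|>", {"<|user|>"}) == SentencePieceTokenTypes.CONTROL
assert classify_extra_piece("", {"<|user|>"}) == SentencePieceTokenTypes.UNUSED
assert classify_extra_piece("extra", {"<|user|>"}) == SentencePieceTokenTypes.USER_DEFINED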
