Skip to content

Commit 92e41ec

Browse files
Update log to only print when input and output characters are different
1 parent d8d2f37 commit 92e41ec

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,8 +529,10 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
529 529       # used for `\n` / `\t` have been manually added in the added tokens
530 530       # To avoid unexpected issues - we make sure to encode single-char tokens
531 531       if len(token) == 1:
532     -         logger.info("Ecode-Decode special characters using AutoTokenizer")
    532 +         previous_token = token
533 533           token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
    534 +         if previous_token != token:
    535 +             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
534 536
535 537       if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
536 538           toktypes.append(gguf.TokenType.CONTROL)

0 commit comments

Comments
 (0)