convert_hf : reduce usages of UNKNOWN for InternLM2

compilade · compilade · commit 1caa20fc7a4b · 2024-07-10T17:33:04.000-04:00
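This makes the changes from #8321 more consistent with the other changes made here: InternLM2's `[UNUSED…]` placeholder tokens are now typed as UNUSED instead of UNKNOWN, and the added-token consistency checks in set_vocab compare against UNUSED accordingly.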
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -2189,7 +2189,7 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.BYTE
             # take care of ununsed raw token
             if piece.startswith('[UNUSED'):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = SentencePieceTokenTypes.UNUSED
 
             tokens.append(text)
             scores.append(score)
@@ -2219,7 +2219,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
@@ -2238,7 +2238,7 @@ def set_vocab(self):
                     if token == chat_eos_token:
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                         assert(tokens[token_id] == token)
                     tokens[token_id] = token
                     scores[token_id] = -1000.0