Skip to content

Commit 6e351e0

Browse files
committed
convert_hf : identify which user-defined tokens are control tokens
Only used in _set_vocab_gpt2() for now.
1 parent 56df1fc commit 6e351e0

File tree

2 files changed

+15
-10
lines changed

2 files changed

+15
-10
lines changed

convert_hf_to_gguf.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,18 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
373373
except KeyError:
374374
raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
375375

376+
def does_token_look_special(self, token: str) -> bool:
377+
# Some models mark some added tokens which ought to be control tokens as not special.
378+
# (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
379+
is_known_special = token in (
380+
"<pad>", # deepseek-coder
381+
"<mask>", "<2mass>", "[@BOS@]", # gemma{,-2}
382+
)
383+
# TODO: should these be marked as UNUSED instead?
384+
is_known_special = is_known_special or (token.startswith("<unused") and token.endswith(">")) # gemma{,-2}
385+
386+
return is_known_special or (token.startswith(("<|", "<｜")) and token.endswith(("|>", "｜>")))
387+
376388
# used for GPT-2 BPE and WordPiece vocabs
377389
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
378390
tokens: list[str] = []
@@ -393,8 +405,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
393405
tokens.append(f"[PAD{i}]")
394406
toktypes.append(gguf.TokenType.USER_DEFINED)
395407
elif reverse_vocab[i] in added_vocab:
396-
tokens.append(reverse_vocab[i])
397-
if tokenizer.added_tokens_decoder[i].special:
408+
token: str = reverse_vocab[i]
409+
tokens.append(token)
410+
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
398411
toktypes.append(gguf.TokenType.CONTROL)
399412
else:
400413
toktypes.append(gguf.TokenType.USER_DEFINED)

src/llama.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5512,14 +5512,6 @@ static void llm_load_vocab(
55125512
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
55135513
}
55145514
}
5515-
5516-
if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && !token_data.text.empty() &&
5517-
token_data.text.front() == '<' && token_data.text.back() == '>') {
5518-
// Some models mark some added tokens which ought to be control tokens as not special.
5519-
// (e.g. command-r, command-r-plus, deepseek-coder)
5520-
// TODO: should this be fixed in the convert script instead?
5521-
token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;
5522-
}
55235515
}
55245516
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
55255517

0 commit comments

Comments
 (0)