Skip to content

Commit 15a275d

Browse files
committed
fix(chat): Add HFTokenizerChatFormatter and use it for HF tokenizers
This will allow the jinja2 templates for HF tokenizers to be applied without needing to hard-code the formatter logic. This will likely need to be duplicated in the embedded code version of chat. Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 978f598 commit 15a275d

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

torchchat/generate.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,15 @@ def encode_dialog_prompt(self, dialog) -> List[int]:
125125
return tokens
126126

127127

128+
class HFTokenizerChatFormatter(_ChatFormatter):
    """Chat formatter that delegates prompt construction to the HF
    tokenizer's built-in (jinja2) chat template instead of hard-coding
    the formatting logic for each model family.
    """

    def encode_dialog_prompt(self, dialog) -> List[int]:
        """Render *dialog* through the tokenizer's chat template (with the
        generation prompt appended) and tokenize the rendered output.

        NOTE(review): assumes ``apply_chat_template`` returns rendered text
        suitable for ``encode`` — confirm against the tokenizer wrapper.
        """
        template_output = self.tokenizer.apply_chat_template(
            dialog, add_generation_prompt=True
        )
        return self.tokenizer.encode(template_output)
135+
136+
128137
@dataclass
129138
class GeneratorArgs:
130139
prompt: Optional[str] = (
@@ -286,6 +295,10 @@ def __init__(
286295
logging.debug(
287296
"Llama3 model detected in chat mode. Using updated sentence schemas"
288297
)
298+
elif self.tokenizer_args.is_hf_tokenizer:
299+
if not self.tokenizer.has_chat_template():
300+
raise ValueError("Tokenizer must have a chat template")
301+
self.chat_formatter = HFTokenizerChatFormatter(self.tokenizer)
289302
else:
290303
self.chat_formatter = Llama2ChatFormatter(self.tokenizer)
291304

0 commit comments

Comments
 (0)