
Commit d624ed3

fix: Remove trailing \n from llama3 <|eot_id|>
There's inconsistency in the documentation on whether or not there should be a \n after <|eot_id|>, but this maintains consistency with previous formatting.

Branch: GraniteCodeSupport
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 4882899 commit d624ed3
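
In concrete terms, the change is whether the prompt ends in a bare end-of-turn marker or in the marker plus a newline. A minimal sketch of the single-user case (the message text below is a placeholder, not a fixture from the repo):

user_msg = "Hello!"  # placeholder message text, not a value from the repo

# Old expectation: a trailing newline after the end-of-turn marker.
old_prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>\n"

# New expectation: the prompt ends at the bare <|eot_id|>.
new_prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"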

File tree

2 files changed (+12, -23 lines)

tests/test_chat_formatters.py

Lines changed: 11 additions & 22 deletions
@@ -139,44 +139,33 @@ def test_llama2_chat_formatter(messages, expected):
         # single user message (no system prompt)
         (MSGS_NO_SYS, f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
 
-{USER1}<|eot_id|>
-"""),
+{USER1}<|eot_id|>"""),
         # sys, usr
         (MSGS_SYS_USR, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
 
-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>
 
-{USER1}<|eot_id|>
-"""),
+{USER1}<|eot_id|>"""),
         # sys, usr, asst
         (MSGS_SYS_USR_ASST, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
 
-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>
 
-{USER1}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-{ASSISTANT1}<|eot_id|>
-"""),
+{ASSISTANT1}<|eot_id|>"""),
         # sys, usr, asst, usr, asst
         (MSGS_MULTI_TURN, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
 
-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>
 
-{USER1}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-{ASSISTANT1}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{ASSISTANT1}<|eot_id|><|start_header_id|>user<|end_header_id|>
 
-{USER2}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER2}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-{ASSISTANT2}<|eot_id|>
-"""),
+{ASSISTANT2}<|eot_id|>"""),
     ]
 )
 @pytest.mark.parametrize("add_generation_prompt", [True, False])
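
Read as a whole, the updated expectations pin the layout down: each role header is followed by one blank line before the message body, and <|eot_id|> runs directly into the next <|start_header_id|> with no newline between turns. A rough sketch of the sys + usr case, with placeholder values standing in for the test fixtures:

SYSTEM_PROMPT = "You are a helpful assistant."  # placeholder for the test fixture
USER1 = "Hello!"                                # placeholder for the test fixture

expected_sys_usr = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    f"{SYSTEM_PROMPT}<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    f"{USER1}<|eot_id|>"
)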

torchchat/generate.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]:
                         self.tokenizer.encode(content["text"], bos=False, eos=False)
                     )
 
-        tokens.append(self.tokenizer.special_tokens["<|eot_id|>\n"])
+        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
         return tokens
 
     def encode_dialog_prompt(
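
The key point on the tokenizer side is that the special-token table is keyed by the bare "<|eot_id|>" string, so the lookup must not carry a trailing \n. A minimal sketch of how a per-message token stream is assembled under that assumption (simplified, not the full torchchat _encode_message):

from typing import List

def encode_message_sketch(tokenizer, text: str) -> List[int]:
    # Encode the message body without BOS/EOS, as in the context lines above.
    tokens = tokenizer.encode(text, bos=False, eos=False)
    # Append the end-of-turn marker using the bare key present in special_tokens.
    tokens.append(tokenizer.special_tokens["<|eot_id|>"])
    return tokens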
