@@ -11,7 +11,7 @@
import time
from dataclasses import dataclass
from pathlib import Path
- from typing import Optional, Tuple
+ from typing import Optional, Tuple, List

import torch
import torch._dynamo.config
@@ -30,6 +30,37 @@
logger = logging.getLogger(__name__)

B_INST, E_INST = "[INST]", "[/INST]"
+ B_SYS, E_SYS = "<<SYS>>", "<</SYS>>"
+
+ class ChatFormat:
+     def __init__(self, tokenizer):
+         self.tokenizer = tokenizer
+
+     def encode_header(self, message) -> List[int]:
+         tokens = []
+         tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
+         tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
+         tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
+         tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
+         return tokens
+
+     def encode_message(self, message) -> List[int]:
+         tokens = self.encode_header(message)
+         tokens.extend(
+             self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
+         )
+         tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
+         return tokens
+
+     def encode_dialog_prompt(self, dialog) -> List[int]:
+         tokens = []
+         tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
+         for message in dialog:
+             tokens.extend(self.encode_message(message))
+         # Add the start of an assistant message for the model to complete.
+         tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
+         return tokens
+


@dataclass
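To make the new chat schema concrete, here is a minimal sketch (not part of the patch) of the layout ChatFormat.encode_dialog_prompt builds, given the ChatFormat class added above. The special-token names come from that class; the toy tokenizer is an assumption purely for illustration:

# Illustrative only: a stand-in tokenizer so ChatFormat can be exercised in isolation.
class ToyTokenizer:
    special_tokens = {
        "<|begin_of_text|>": 0,
        "<|start_header_id|>": 1,
        "<|end_header_id|>": 2,
        "<|eot_id|>": 3,
    }

    def encode(self, text, bos=False, eos=False):
        # A real tokenizer returns subword ids; raw byte values are enough to show the layout.
        return list(text.encode("utf-8"))

fmt = ChatFormat(ToyTokenizer())
tokens = fmt.encode_dialog_prompt([
    {"role": "system", "content": "Be brief."},
    {"role": "user", "content": "Hi"},
])
# Layout: <|begin_of_text|>, then for each message
#   <|start_header_id|> role <|end_header_id|> "\n\n" content <|eot_id|>
# followed by a bare assistant header so generation starts at the reply.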
@@ -173,21 +204,35 @@ def decode_n_tokens(
    num_new_tokens: int,
    need_probs: bool,
    callback=lambda _: _,
+     eos_token_id: int = 2,
+     eot_id: Optional[int] = None,
    **sampling_kwargs,
):
    new_tokens, new_probs = [], []
-     for _ in range(num_new_tokens):
+     encountered_eos = False
+     for i in range(num_new_tokens - 1):  # -1 to leave room to append an EoS token if one isn't generated naturally
        # Actually better for Inductor to codegen attention here
        with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
            next_token, next_prob = decode_one_token(
-                 model, cur_token, input_pos, need_probs=need_probs, **sampling_kwargs
+                 model, cur_token.clone(), input_pos, need_probs=need_probs, **sampling_kwargs
            )
            input_pos += 1
            new_tokens.append(next_token.clone())
            callback(new_tokens[-1])
            if need_probs:
                new_probs.append(next_prob.clone())
            cur_token = next_token.view(1, -1)
+             # stop early if we sampled EoS (or the Llama 3 <|eot_id|>)
+             if next_token.item() == eos_token_id or (eot_id is not None and next_token.item() == eot_id):
+                 encountered_eos = True
+                 _, _ = decode_one_token(model, cur_token, input_pos, need_probs, **sampling_kwargs)
+                 input_pos += 1
+                 break
+     if not encountered_eos:
+         eos_token = torch.tensor([eos_token_id if eot_id is None else eot_id], dtype=cur_token.dtype, device=cur_token.device)
+         new_tokens.append(eos_token.clone())
+         _, _ = decode_one_token(model, eos_token.view(1, -1), input_pos, need_probs, **sampling_kwargs)
+         input_pos += 1

    return new_tokens, new_probs
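The early-exit logic above guarantees the returned token list is terminated: either the loop stops because an EoS/<|eot_id|> token was sampled, or a terminator is appended once the budget runs out. A standalone sketch of the same stopping rule, with a hypothetical sample_next() standing in for decode_one_token:

# Sketch of the stopping rule only; sample_next is a placeholder, not part of the patch.
def generate_until_stop(sample_next, budget, eos_token_id, eot_id=None):
    tokens = []
    for _ in range(budget - 1):  # reserve one slot for a forced terminator
        tok = sample_next()
        tokens.append(tok)
        if tok == eos_token_id or (eot_id is not None and tok == eot_id):
            return tokens  # stopped naturally
    tokens.append(eos_token_id if eot_id is None else eot_id)  # forced terminator
    return tokens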
@@ -265,40 +310,39 @@ def generate(
    max_new_tokens: int,
    *,
    chat_mode: bool,
+     start_pos: int = 0,
    draft_model: Transformer,
    speculate_k: Optional[int] = 8,
    sequential_prefill=True,
    callback=lambda x: x,
+     tokenizer=None,
+     max_seq_length: int,
+     is_llama3_model: bool = False,
    **sampling_kwargs,
) -> torch.Tensor:
    """
    Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
    """
-
    is_speculative = draft_model is not None
+     device, dtype = prompt.device, prompt.dtype
+
    # create an empty tensor of the expected final shape and fill in the current tokens
    T = prompt.size(0)
+     max_new_tokens = min(max_new_tokens, max_seq_length - start_pos - T)
    T_new = T + max_new_tokens
-     if chat_mode:
-         max_seq_length = 350
-     else:
-         max_seq_length = min(T_new, model.config.block_size)
-
-     device, dtype = prompt.device, prompt.dtype
-     max_seq_length = (
-         max_seq_length + speculate_k + 1 if is_speculative else max_seq_length
-     )
-     model = model.to(device=device)
-     with torch.device(device):
-         model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
-         if is_speculative and draft_model is not model:
-             draft_model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
+     # set up caches only on the first inference
+     if start_pos == 0:
+         model = model.to(device=device)
+         with torch.device(device):
+             model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
+             if is_speculative and draft_model is not model:
+                 draft_model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)

    # create an empty tensor of the expected final shape and fill in the current tokens
    empty = torch.empty(T_new, dtype=dtype, device=device)
    empty[:T] = prompt
    seq = empty
-     input_pos = torch.arange(0, T, device=device, dtype=torch.int)
+     input_pos = torch.arange(start_pos, T + start_pos, device=device, dtype=torch.int)

    next_token = prefill(
        model,
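The start_pos plumbing above is what lets the KV cache persist across chat turns: caches are built only on the first call, the new prompt occupies positions start_pos through start_pos + T - 1, and the per-call generation budget shrinks to whatever room is left in the cache. A rough sketch of that bookkeeping with made-up lengths, assuming the caller advances start_pos by the size of the returned sequence as the patch does later in _main:

# Illustrative position bookkeeping across two chat turns (all lengths are invented).
max_seq_length = 2048
requested_new_tokens = 512
start_pos = 0
for prompt_len, reply_len in [(30, 120), (12, 80)]:
    budget = min(requested_new_tokens, max_seq_length - start_pos - prompt_len)
    prompt_positions = range(start_pos, start_pos + prompt_len)  # mirrors torch.arange(start_pos, T + start_pos)
    start_pos += prompt_len + reply_len  # the returned seq holds prompt + generated tokens
    print(f"budget={budget}, first prompt position={prompt_positions.start}, next start_pos={start_pos}")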
@@ -317,12 +361,13 @@ def generate(
    )
    seq[T] = next_token

-     input_pos = torch.tensor([T], device=device, dtype=torch.int)
-     accept_counts = [0] * (speculate_k + 1)
+     num_tokens_generated = 0
+     input_pos = torch.tensor([start_pos + T], device=device, dtype=torch.int)
+     accept_counts = [0] * (speculate_k + 1)  # a list of zeros of length speculate_k + 1

    if is_speculative:
        input_pos = input_pos.item()  # for speculative decoding easier to keep on host
-         while input_pos < T_new - 1:
+         while input_pos < max_new_tokens - 1:
            cur_token = next_token.view(())

            next_tokens = speculative_decode(
@@ -344,9 +389,12 @@ def generate(
            max_new_tokens - 1,
            callback=callback,
            need_probs=False,
+             eos_token_id=tokenizer.eos_id() if tokenizer else 2,
+             eot_id=tokenizer.special_tokens["<|eot_id|>"] if is_llama3_model else None,
            **sampling_kwargs,
        )
-         seq[T + 1:] = torch.cat(generated_tokens)
+         seq[T + 1:T + 1 + len(generated_tokens)] = torch.cat(generated_tokens)
+         seq = seq[:T + 1 + len(generated_tokens)]  # If we don't generate all the way to max_new_tokens, slice off the extra space we allocated.

    generate_stats = {"accept_counts": accept_counts}
    return seq, generate_stats
@@ -359,8 +407,6 @@ def encode_tokens(tokenizer, string, bos=True, device="cpu"):
    return torch.tensor(tokens, dtype=torch.int, device=device)


- B_INST, E_INST = "[INST]", "[/INST]"
-

def get_device_info(name: str) -> str:
    import platform
@@ -430,6 +476,12 @@ def _main(

    tokenizer = _initialize_tokenizer(tokenizer_args)

+     # Right now the assumption is that only llama3 uses the tiktoken tokenizer, and that it must use it.
+     # Piggybacking off of this flag for now to identify llama3 models without prompting the user.
+     is_llama3_model = tokenizer_args.is_tiktoken
+     if generator_args.chat_mode and is_llama3_model:
+         logging.debug("Llama3 model detected in chat mode. Using updated sentence schemas")
+
    builder_args.setup_caches = False
    model = _initialize_model(builder_args, quantize, tokenizer)

@@ -484,21 +536,65 @@ def _main(
    if generator_args.compile_prefill:
        prefill = torch.compile(prefill, fullgraph=True, dynamic=True)

+     system_prompt = None
+     # Set up our max_seq_length
+     if generator_args.chat_mode:
+         max_seq_length = 2048
+         print(f"Entering Chat Mode. Will continue chatting back and forth with the language model until the model's max context length of {max_seq_length} tokens is hit or until the user says /bye")
+         get_system_prompt = input("Do you want to enter a system prompt? Enter y for yes and anything else for no. \n")
+         if get_system_prompt == "y" or get_system_prompt == "Y":
+             system_prompt = input("What is your system prompt? \n")
+         if is_llama3_model:
+             chat_formatter = ChatFormat(tokenizer)
+     else:
+         max_seq_length = min(encoded.size(0) + generator_args.max_new_tokens, model.config.block_size)
+
+
+     max_seq_length = (
+         max_seq_length + speculate_k + 1 if draft_model is not None else max_seq_length
+     )
+
    aggregate_metrics = {
        "tokens_per_sec": [],
        "accept_counts": [],
    }
    start = -1 if generator_args.compile else 0
+     start_pos = 0

-     for i in range(start, generator_args.num_samples):
+
+     # arbitrarily large number, as chat mode runs until max_seq_length is hit or the user exits
+     num_samples = generator_args.num_samples if not generator_args.chat_mode else 100000
+     i = -1  # long loop and I'm scared someone will add a continue in it, so start at -1 and increment at the start
+     while i < num_samples:
+         i += 1
        device_sync(device=builder_args.device)
        if i >= 0 and generator_args.chat_mode:
            prompt = input("What is your prompt? \n")
-             if builder_args.is_chat_model:
-                 prompt = f"{B_INST} {prompt.strip()} {E_INST}"
-             encoded = encode_tokens(
-                 tokenizer, prompt, bos=True, device=builder_args.device
-             )
+             if prompt == "/bye":
+                 print("Exiting Chat.\n")
+                 break
+             if not is_llama3_model:
+                 if system_prompt is not None:
+                     prompt = f"{B_INST} {B_SYS}\n{system_prompt.strip()}\n{E_SYS}\n\n{prompt.strip()} {E_INST}"
+                     system_prompt = None  # can only provide the system prompt on the first interaction
+                 else:
+                     prompt = f"{B_INST} {prompt.strip()} {E_INST}"
+                 encoded = encode_tokens(
+                     tokenizer, prompt, bos=True, device=builder_args.device
+                 )
+             else:
+                 if system_prompt is not None:
+                     encoded = chat_formatter.encode_dialog_prompt([{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}])
+                     system_prompt = None
+                 elif i == 0:
+                     encoded = chat_formatter.encode_dialog_prompt([{"role": "user", "content": prompt}])
+                 else:
+                     encoded = chat_formatter.encode_message({"role": "user", "content": prompt})
+                     encoded.extend(chat_formatter.encode_header({"role": "assistant", "content": ""}))
+                 encoded = torch.tensor(encoded, dtype=torch.int, device=builder_args.device)
+             if encoded.size(0) + start_pos > max_seq_length:
+                 print("This prompt would take us past the max_seq_length. Ending Conversation.")
+                 break

        if generator_args.chat_mode and i >= 0:
            buffer = []
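For non-Llama-3 chat models the turn is wrapped in the Llama 2 style [INST]/<<SYS>> string before tokenization, while Llama 3 goes through ChatFormat and gets header/eot special tokens instead of sentinel strings. A quick illustration of the string path (the prompt values are placeholders):

# Illustrative only: shows the strings the non-llama3 branch builds.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>", "<</SYS>>"

system_prompt = "You are terse."      # placeholder
user_prompt = "What is KV caching?"   # placeholder

first_turn = f"{B_INST} {B_SYS}\n{system_prompt.strip()}\n{E_SYS}\n\n{user_prompt.strip()} {E_INST}"
later_turn = f"{B_INST} {user_prompt.strip()} {E_INST}"
print(first_turn)
print(later_turn)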
@@ -510,7 +606,7 @@ def callback(
            ):
                if done_generating:
                    return
-                 buffer.append(tokenizer.decode([period_id] + x.tolist())[1:])
+                 buffer.append(tokenizer.decode([period_id] + x.tolist())[1:])  # I think this results in the first output token being dropped from the display, which is wrong.
                if x.item() == tokenizer.eos_id():
                    done_generating = True
                if len(buffer) == 4 or done_generating:
@@ -545,8 +641,13 @@ def callback(x):
                temperature=generator_args.temperature,
                top_k=generator_args.top_k,
                sequential_prefill=generator_args.sequential_prefill,
+                 start_pos=start_pos,
+                 tokenizer=tokenizer,
+                 max_seq_length=max_seq_length,
+                 is_llama3_model=is_llama3_model,
            )
            aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
+             start_pos += y.size(0)
        if i == -1:
            logging.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
            continue
@@ -569,6 +670,11 @@ def callback(x):
            f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
        )
        logging.info(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
+
+         if start_pos >= max_seq_length:
+             print("Max Sequence Length Reached. Ending Conversation.")
+             break
+
    print("==========")
    if is_speculative:
        counts_aggregated = [sum(i) for i in zip(*aggregate_metrics["accept_counts"])]