
Commit 5f0a14a

Update eager runner to support AttentionSink (#7149)
* Transform model to be able to use Attention Sink

  Pull Request resolved: #6700

  This PR adds the functions needed to transform the model so that it can use Attention Sink.

  ghstack-source-id: 256108077
  @exported-using-ghexport
  Differential Revision: [D65571289](https://our.internmc.facebook.com/intern/diff/D65571289/)

* Update eager runner to support AttentionSink

  Pull Request resolved: #6921

  This PR updates the eager runner to support AttentionSink. It also fixes issues in the `chat_completion` function so that the position id is handled properly.

  ghstack-source-id: 256108078
  Differential Revision: [D66076486](https://our.internmc.facebook.com/intern/diff/D66076486/)

* Add eval for attention sink (#7150)

  Pull Request resolved: #7070

  This PR adds a function to evaluate the model's perplexity when AttentionSink is enabled. It is mostly copied from https://github.com/mit-han-lab/streaming-llm/blob/main/examples/eval_long_ppl.py, which the AttentionSink paper uses to evaluate perplexity with AttentionSink enabled.

  ghstack-source-id: 256108079
  @exported-using-ghexport
  Differential Revision: [D66474732](https://our.internmc.facebook.com/intern/diff/D66474732/)

Co-authored-by: Lunwen He <[email protected]>
1 parent aa67cd9 commit 5f0a14a
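For intuition about what the transformation enables, here is a minimal sketch of the Attention Sink cache policy (an illustration only, not the ExecuTorch implementation; sink_size and window_size follow the names used in the evaluation code later in this diff): the cache keeps the first sink_size positions plus a sliding window of the most recent window_size positions, which is why the eval code asserts max_seq_length == sink_size + window_size.

# Illustration only, not the ExecuTorch implementation.
def kept_positions(num_tokens: int, sink_size: int, window_size: int) -> list:
    # Keep the first `sink_size` positions plus the most recent `window_size`.
    if num_tokens <= sink_size + window_size:
        return list(range(num_tokens))
    return list(range(sink_size)) + list(range(num_tokens - window_size, num_tokens))

# With sink_size=4 and window_size=8, after 20 tokens the cache covers
# positions 0-3 plus 12-19.
print(kept_positions(20, sink_size=4, window_size=8))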

File tree

5 files changed: +97 -13 lines changed

examples/models/llama/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -150,6 +150,8 @@ runtime.python_library(
         "@EXECUTORCH_CLIENTS",
     ],
     deps = [
+        "fbsource//third-party/pypi/tqdm:tqdm",
+        "fbsource//third-party/pypi/datasets:datasets",
         "fbsource//third-party/pypi/lm-eval:lm-eval",
         "fbsource//third-party/pypi/tiktoken:tiktoken",
         ":export_library",

examples/models/llama/eval_llama.py

Lines changed: 9 additions & 2 deletions
@@ -10,7 +10,11 @@

 import torch

-from .eval_llama_lib import build_args_parser, eval_llama
+from .eval_llama_lib import (
+    build_args_parser,
+    eval_llama,
+    eval_llama_with_attention_sink,
+)

 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -24,7 +28,10 @@ def main() -> None:
     args = parser.parse_args()
     # Overrides this arg, because evaluation requires full logits.
     args.generate_full_logits = True
-    eval_llama(modelname, args)  # pyre-ignore
+    if args.use_attention_sink:
+        eval_llama_with_attention_sink(modelname, args)  # pyre-ignore
+    else:
+        eval_llama(modelname, args)  # pyre-ignore


 if __name__ == "__main__":

examples/models/llama/eval_llama_lib.py

Lines changed: 64 additions & 0 deletions
@@ -10,6 +10,8 @@
 from typing import Optional, Union

 import torch
+
+from datasets import load_dataset
 from executorch.examples.models.llama.export_llama_lib import (
     get_quantizer_and_quant_params,
 )
@@ -21,6 +23,8 @@
 )
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from lm_eval.evaluator import simple_evaluate
+from torch.nn import CrossEntropyLoss
+from tqdm import tqdm

 from .evaluate.eager_eval import EagerEvalWrapper

@@ -280,6 +284,9 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="Save the checkpoint after source transformations, for other evaluation platform to run the same checkpoint.",
     )

+    # Set of parameters specific to AttentionSink.
+    parser.add_argument("--attention_sink_eval_tokens", type=int, default=0)
+
     return parser


@@ -309,3 +316,60 @@ def eval_llama(

     for task, res in eval_results["results"].items():
         print(f"{task}: {res}")
+
+
+def eval_llama_with_attention_sink(model_name: str, args: argparse.ArgumentParser):
+    """
+    Evaluate the model's perplexity when AttentionSink is enabled.
+
+    This is mostly copied from https://github.com/mit-han-lab/streaming-llm/blob/main/examples/eval_long_ppl.py
+    """
+    assert args.use_attention_sink is not None  # pyre-ignore [16]
+    assert args.attention_sink_eval_tokens > 0  # pyre-ignore [16]
+    attention_sink_params = args.use_attention_sink.split(",")
+    assert len(attention_sink_params) == 3
+    sink_size = int(attention_sink_params[0])
+    window_size = int(attention_sink_params[1])
+
+    assert args.max_seq_length == sink_size + window_size  # pyre-ignore [16]
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    manager: LLMEdgeManager = _prepare_for_llama_export(args)
+    model = manager.model.eval().to(device=device)
+    tokenizer = get_tokenizer(args.tokenizer_path)  # pyre-ignore [16]
+
+    eval_data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+
+    nlls = []
+    loss_fn = CrossEntropyLoss(reduction="none")
+    progress_bar = tqdm(total=args.attention_sink_eval_tokens)
+    input_pos = 0
+    while input_pos < args.attention_sink_eval_tokens:
+        for text in eval_data["text"]:  # pyre-ignore [16]
+            tokens = tokenizer.encode(text, bos=False, eos=False)
+            if len(tokens) <= 0:
+                continue
+            with torch.no_grad():
+                num_tokens = min(
+                    len(tokens) - 1, args.attention_sink_eval_tokens - input_pos
+                )
+                logits = model(
+                    torch.tensor(
+                        [tokens[:num_tokens]], dtype=torch.int64, device=device
+                    ),
+                    torch.tensor([input_pos], dtype=torch.int64, device=device),
+                ).squeeze(dim=0)
+                neg_log_likelihood = loss_fn(
+                    logits,
+                    torch.tensor(
+                        [tokens[1 : num_tokens + 1]], dtype=torch.int64, device=device
+                    ).view(-1),
+                )
+                nlls.append(neg_log_likelihood)
+                input_pos += num_tokens
+                progress_bar.update(num_tokens)
+            if input_pos >= args.attention_sink_eval_tokens:
+                break
+    ppl = torch.exp(torch.cat(nlls).mean())
+    print(f"Perplexity: {ppl.item()}")
+    return ppl.item()
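To make the bookkeeping in eval_llama_with_attention_sink easier to follow, here is a tiny standalone sketch of the perplexity math (illustrative only; random tensors stand in for real model output): each evaluated position contributes one negative log-likelihood via CrossEntropyLoss(reduction="none"), and perplexity is the exponential of the mean NLL.

# Standalone sketch of the perplexity computation (illustrative only).
import torch
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss(reduction="none")
nlls = []

# Pretend the model produced logits for 5 positions over a 10-token vocabulary.
logits = torch.randn(5, 10)
targets = torch.randint(0, 10, (5,))  # the true "next token" at each position

nlls.append(loss_fn(logits, targets))  # one NLL per evaluated token
ppl = torch.exp(torch.cat(nlls).mean())
print(f"Perplexity: {ppl.item()}")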

examples/models/llama/runner/eager.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,11 @@ def execute_runner(runner_class: Type[LlamaRunner]) -> None:
     with torch.no_grad():
         runner = runner_class(args)  # pyre-ignore: Missing argument [20]
         generated_tokens = (
-            runner.chat_completion(temperature=args.temperature)
+            runner.chat_completion(
+                max_seq_len=1000000 if args.use_attention_sink else args.max_seq_length,
+                temperature=args.temperature,
+                show_progress=args.show_tokens,
+            )
             if args.chat
             else runner.text_completion(
                 prompt=args.prompt,
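The 1000000 value passed for max_seq_len may look arbitrary; the way I read the change (treat the numbers below as illustrative assumptions), AttentionSink bounds the KV cache at sink_size + window_size entries, so the generation budget no longer has to match the cache size and a very large cap simply stands in for "unbounded".

# Illustrative sketch: the cache footprint stays bounded no matter how many
# tokens have been generated (the sink_size/window_size values are made up).
def cache_entries(tokens_seen: int, sink_size: int, window_size: int) -> int:
    return min(tokens_seen, sink_size + window_size)

for n in (10, 1000, 1000000):
    print(n, "tokens seen ->", cache_entries(n, sink_size=4, window_size=2044), "cache entries")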

examples/models/llama/runner/generation.py

Lines changed: 17 additions & 10 deletions
@@ -168,18 +168,19 @@ def text_completion(

     def chat_completion(
         self,
+        max_seq_len: int,
         temperature: float = 0.6,
         top_p: float = 0.9,
+        show_progress: bool = False,
     ) -> List[int]:
         """
         Perform multi-turn chat with the language model.

         Args:
-            prompt (str): Text prompt for completion.
+            max_seq_len (int): Maximum number of tokens to generate for each prompt.
             temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
             top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
-            echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
-
+            show_progress (bool, optional): Flag indicating whether to show number of tokens generated.
         Returns:
             Generated list of tokens.

@@ -188,20 +189,26 @@ def chat_completion(
         """
         exit_prompt = "exit"
         tokens = []
+        pre_stop_token = []
         prompt = input("Me: ")
         while prompt and prompt != exit_prompt:
             print("LLM: ", end="", flush=True)
-            new_tokens = self.generate(
-                prompt_tokens=self.tokenizer.encode(
-                    self._format_prompt(prompt), bos=True, eos=False
-                ),
-                max_seq_len=self.max_seq_len,
+            prompt_tokens = self.tokenizer.encode(
+                self._format_prompt(prompt), bos=True, eos=False
+            )
+            generated_tokens = self.generate(
+                prompt_tokens=pre_stop_token + prompt_tokens,
+                max_seq_len=max_seq_len,
                 temperature=temperature,
                 top_p=top_p,
-                echo=True,
+                echo=False,
                 pos_base=len(tokens) - 1 if len(tokens) > 0 else 0,
             )
-            tokens.extend(new_tokens)
+            pre_stop_token = generated_tokens[-1:]
+            tokens.extend(prompt_tokens)
+            tokens.extend(generated_tokens)
+            if show_progress:
+                print(f"[Generated {len(tokens)} tokens]")
             prompt = input("Me: ")
         return tokens

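As a rough illustration of the new multi-turn bookkeeping (a toy loop, not the runner API; fake_generate stands in for self.generate), each turn resumes the KV cache at pos_base = len(tokens) - 1 and re-feeds the last generated token together with the next prompt:

# Toy simulation of the updated chat loop's position handling (illustrative).
tokens = []
pre_stop_token = []

def fake_generate(prompt_tokens, pos_base):
    # Stand-in for self.generate(); returns two dummy token ids.
    return [100 + pos_base, 101 + pos_base]

for prompt_tokens in ([1, 2, 3], [4, 5]):  # two simulated user turns
    pos_base = len(tokens) - 1 if len(tokens) > 0 else 0
    generated_tokens = fake_generate(pre_stop_token + prompt_tokens, pos_base)
    pre_stop_token = generated_tokens[-1:]  # carried into the next turn
    tokens.extend(prompt_tokens)
    tokens.extend(generated_tokens)

print(tokens)  # prompts and generated tokens accumulated across both turns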
