
Commit 7b76f0f

Make TorchTune Llama model KV cache compatible in eager (#6643)
1 parent e384c1a commit 7b76f0f

8 files changed, +327 -41 lines changed

examples/models/llama/runner/eager.py

Lines changed: 7 additions & 6 deletions
@@ -6,14 +6,13 @@
 
 import argparse
 import json
-from typing import Optional
+from typing import Optional, Type
 
 import torch
 
 from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
-    TORCHTUNE_DEFINED_MODELS,
 )
 from executorch.examples.models.llama.runner.generation import LlamaRunner
 from executorch.extension.llm.export.builder import LLMEdgeManager
@@ -33,7 +32,6 @@ def __init__(self, args):
             max_batch_size=1,
             use_kv_cache=args.use_kv_cache,
             vocab_size=params["vocab_size"],
-            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
             device="cuda" if torch.cuda.is_available() else "cpu",
         )
         manager: LLMEdgeManager = _prepare_for_llama_export(args)
@@ -79,11 +77,10 @@ def build_args_parser() -> argparse.ArgumentParser:
     return parser
 
 
-def main() -> None:
+def execute_runner(runner_class: Type[LlamaRunner]) -> None:
     parser = build_args_parser()
     args = parser.parse_args()
-
-    runner = EagerLlamaRunner(args)
+    runner = runner_class(args)
     generated_tokens = (
         runner.chat_completion(temperature=args.temperature)
         if args.chat
@@ -97,5 +94,9 @@ def main() -> None:
     print(f"Generated {len(generated_tokens)} tokens: {generated_tokens}")
 
 
+def main() -> None:
+    execute_runner(EagerLlamaRunner)
+
+
 if __name__ == "__main__":
     main()  # pragma: no cover
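
The refactor above turns the module's old main() into a reusable execute_runner() helper, so a backend-specific runner only has to subclass LlamaRunner and be handed to it. A minimal sketch of that extension point (not part of the commit; MyEagerRunner is hypothetical, and the args fields mirror the ones this commit's runners read from the shared parser):

import json
from typing import Optional

import torch

from executorch.examples.models.llama.runner.eager import execute_runner
from executorch.examples.models.llama.runner.generation import LlamaRunner


class MyEagerRunner(LlamaRunner):
    """Hypothetical runner, shown only to illustrate the execute_runner() hook."""

    def __init__(self, args):
        # Same wiring the real eager runners in this commit use.
        with open(args.params, "r") as f:
            params = json.loads(f.read())
        super().__init__(
            tokenizer_path=args.tokenizer_path,
            max_seq_len=args.max_seq_length,
            max_batch_size=1,
            use_kv_cache=args.use_kv_cache,
            vocab_size=params["vocab_size"],
        )

    def forward(
        self,
        tokens: Optional[torch.LongTensor] = None,
        input_pos: Optional[torch.LongTensor] = None,
        mask: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # A real runner would run its model here and return logits.
        raise NotImplementedError


if __name__ == "__main__":
    execute_runner(MyEagerRunner)  # parses CLI args, builds the runner, generates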

examples/models/llama/runner/generation.py

Lines changed: 3 additions & 12 deletions
@@ -53,7 +53,6 @@ def __init__(
         max_batch_size: int,
         use_kv_cache: bool,
         vocab_size: int,
-        has_full_logits: bool = False,
         device: str = "cpu",
     ):
         """
@@ -65,14 +64,12 @@ def __init__(
             max_batch_size: max batch size.
             use_kv_cache: whether to use a KV cache.
             vocab_size: number of items in the vocab.
-            has_full_logits: whether the model returns the full logits or only returns the last logit.
             device: device to run the runner on.
         """
         self.max_seq_len = max_seq_len
         self.max_batch_size = max_batch_size
         self.use_kv_cache = use_kv_cache
         self.tokenizer = get_tokenizer(tokenizer_path)
-        self.has_full_logits = has_full_logits
         self.device = device
         assert vocab_size == self.tokenizer.n_words
 
@@ -93,7 +90,7 @@ def generate( # noqa: C901
         echo: bool = False,
         pos_base: int = 0,
     ) -> List[int]:
-        # prefill
+        # Prefill
        logits = self.forward(
             tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device),
             input_pos=(
@@ -103,10 +100,7 @@
             ),
         )
 
-        if self.has_full_logits:
-            current_token = next_token(logits[:, -1, :], temperature, top_p)
-        else:
-            current_token = next_token(logits, temperature, top_p)
+        current_token = next_token(logits, temperature, top_p)
         print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
         tokens = prompt_tokens + [current_token]
 
@@ -128,10 +122,7 @@
                 )
 
             # If the logits aren't already clipped to only contain the last logit, clip them.
-            if self.has_full_logits:
-                current_token = next_token(logits[:, -1, :], temperature, top_p)
-            else:
-                current_token = next_token(logits, temperature, top_p)
+            current_token = next_token(logits, temperature, top_p)
             tokens.append(current_token)
 
             if current_token == self.tokenizer.eos_id or (
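
With has_full_logits gone, the base LlamaRunner always hands next_token a logits row for the last position only; models that emit logits for every position (the TorchTune-defined ones) now do the [:, -1, :] slice in their own subclass, shown further down in TorchTuneLlamaRunner. For orientation, here is a rough sketch of a temperature/top-p sampler with the same call shape as next_token(logits, temperature, top_p); this is an assumption about its behavior, not the repository's implementation:

import torch


def sample_next_token(last_logits: torch.Tensor, temperature: float, top_p: float) -> int:
    """Illustrative stand-in for next_token: expects logits of shape [1, vocab_size]."""
    if temperature <= 0:
        # Greedy decoding.
        return int(torch.argmax(last_logits, dim=-1).item())
    probs = torch.softmax(last_logits / temperature, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Drop tokens outside the top-p nucleus, then renormalize and sample.
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return int(sorted_idx.gather(-1, choice).item())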

examples/models/llama/runner/native.py

Lines changed: 0 additions & 1 deletion
@@ -41,7 +41,6 @@ def __init__(self, args):
             max_batch_size=1,
             use_kv_cache=args.kv_cache,
             vocab_size=params["vocab_size"],
-            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
         )
         self.model = _load_for_executorch(args.pte)
 
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+from typing import Optional
+
+import torch
+
+from executorch.examples.models.llama.export_llama_lib import _prepare_for_llama_export
+from executorch.examples.models.llama.runner.eager import execute_runner
+from executorch.examples.models.llama3_2_vision.runner.generation import (
+    TorchTuneLlamaRunner,
+)
+from executorch.extension.llm.export import LLMEdgeManager
+
+
+class EagerLlamaRunner(TorchTuneLlamaRunner):
+    """
+    Runs llama in eager mode with provided checkpoint file.
+    """
+
+    def __init__(self, args):
+        with open(args.params, "r") as f:
+            params = json.loads(f.read())
+        super().__init__(
+            tokenizer_path=args.tokenizer_path,
+            max_seq_len=args.max_seq_length,
+            max_batch_size=1,
+            use_kv_cache=args.use_kv_cache,
+            vocab_size=params["vocab_size"],
+            device="cuda" if torch.cuda.is_available() else "cpu",
+        )
+        manager: LLMEdgeManager = _prepare_for_llama_export(args)
+        self.model = manager.model.eval().to(device=self.device)
+
+    def forward(
+        self,
+        tokens: Optional[torch.LongTensor] = None,
+        input_pos: Optional[torch.LongTensor] = None,
+        mask: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        return self.model.forward(tokens=tokens, input_pos=input_pos, mask=mask)
+
+
+def main() -> None:
+    execute_runner(EagerLlamaRunner)
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+
+import torch
+from executorch.examples.models.llama.runner.generation import LlamaRunner, next_token
+
+
+class TorchTuneLlamaRunner(LlamaRunner):
+    def __init__(
+        self,
+        tokenizer_path: str,
+        max_seq_len: int,
+        max_batch_size: int,
+        use_kv_cache: bool,
+        vocab_size: int,
+        device: str = "cpu",
+    ):
+        super().__init__(
+            tokenizer_path,
+            max_seq_len,
+            max_batch_size,
+            use_kv_cache,
+            vocab_size,
+            device,
+        )
+
+        self.causal_mask = torch.tril(
+            torch.ones(
+                size=(max_seq_len, max_seq_len),
+                dtype=torch.bool,
+            )
+        )
+        self.input_pos = torch.arange(max_seq_len)
+
+    def generate(  # noqa: C901
+        self,
+        prompt_tokens: List[int],
+        max_seq_len: int,
+        temperature: float = 0.8,
+        top_p: float = 0.9,
+        echo: bool = False,
+    ) -> List[int]:
+        # Prefill
+        seq_len = len(prompt_tokens)
+        input_pos = self.input_pos[None, :seq_len]
+        mask = self.causal_mask[None, :seq_len]
+        if self.use_kv_cache:
+            logits = self.forward(
+                tokens=torch.tensor(
+                    [prompt_tokens], dtype=torch.long, device=self.device
+                ),
+                input_pos=input_pos,
+                mask=mask,
+            )
+        else:
+            logits = self.forward(
+                tokens=torch.tensor(
+                    [prompt_tokens], dtype=torch.long, device=self.device
+                ),
+            )
+
+        # Only need the last logit.
+        current_token = next_token(logits[:, -1, :], temperature, top_p)
+        print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+        tokens = prompt_tokens + [current_token]
+
+        while len(tokens) < max_seq_len:
+            mask = self.causal_mask[None, seq_len, None, :]
+            input_pos = self.input_pos[None, seq_len, None]
+            if self.use_kv_cache:
+                logits = self.forward(
+                    tokens=torch.tensor(
+                        [[current_token]], dtype=torch.long, device=self.device
+                    ),
+                    input_pos=input_pos,
+                    mask=mask,
+                )
+            else:
+                logits = self.forward(
+                    tokens=torch.tensor([tokens], dtype=torch.long, device=self.device),
+                )
+
+            # Only need the last logit.
+            current_token = next_token(logits[:, -1, :], temperature, top_p)
+            tokens.append(current_token)
+
+            if current_token == self.tokenizer.eos_id or (
+                hasattr(self.tokenizer, "stop_tokens")
+                and current_token in self.tokenizer.stop_tokens
+            ):
+                break
+
+            print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
+            seq_len += 1
+
+        return tokens if echo else tokens[len(prompt_tokens) :]
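
The KV-cache compatibility this commit adds is concentrated in generate() above: the runner precomputes a boolean causal mask and a position index once, then slices them per step, so prefill sends the whole prompt while each decode step sends a single new token, its position, and one mask row. A quick shape check of that slicing (illustrative only, with max_seq_len shrunk to 8):

import torch

max_seq_len = 8
causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))
input_pos = torch.arange(max_seq_len)

# Prefill over a 3-token prompt: the first seq_len rows of the causal mask.
seq_len = 3
print(causal_mask[None, :seq_len].shape)          # torch.Size([1, 3, 8])
print(input_pos[None, :seq_len].shape)            # torch.Size([1, 3])

# One decode step for the token at position seq_len: a single mask row and position.
print(causal_mask[None, seq_len, None, :].shape)  # torch.Size([1, 1, 8])
print(input_pos[None, seq_len, None].shape)       # torch.Size([1, 1])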
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+from typing import Optional
+
+import torch
+
+from executorch.examples.models.llama.export_llama_lib import (
+    EXECUTORCH_DEFINED_MODELS,
+    TORCHTUNE_DEFINED_MODELS,
+)
+from executorch.examples.models.llama3_2_vision.runner.generation import (
+    TorchTuneLlamaRunner,
+)
+
+from executorch.extension.pybindings.portable_lib import _load_for_executorch
+
+# Load custom ops and quantized ops.
+from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
+
+# Note: import this after portable_lib
+from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa # usort: skip
+from executorch.kernels import quantized  # noqa
+
+
+class NativeLlamaRunner(TorchTuneLlamaRunner):
+    """
+    Runs llama via ExecuTorch with provided pte file.
+    """
+
+    def __init__(self, args):
+        with open(args.params, "r") as f:
+            params = json.loads(f.read())
+        super().__init__(
+            tokenizer_path=args.tokenizer,
+            max_seq_len=args.max_len,
+            max_batch_size=1,
+            use_kv_cache=args.kv_cache,
+            vocab_size=params["vocab_size"],
+        )
+        self.model = _load_for_executorch(args.pte)
+        self.use_kv_cache = args.kv_cache
+
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        input_pos: Optional[torch.Tensor] = None,
+        mask: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        return (
+            self.model.forward((tokens, input_pos, mask))
+            if self.use_kv_cache
+            else self.model.forward((tokens,))
+        )[0]
+
+
+def build_args_parser() -> argparse.ArgumentParser:
+    # TODO: merge these with build_args_parser from export_llama_lib.
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model",
+        default="llama3",
+        choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS,
+    )
+
+    parser.add_argument(
+        "-f",
+        "--pte",
+        type=str,
+        default=None,
+        help="path to exported executorch .pte file",
+    )
+
+    parser.add_argument(
+        "-p", "--params", type=str, default=None, help="model params file"
+    )
+
+    parser.add_argument(
+        "-t",
+        "--tokenizer",
+        type=str,
+        default=None,
+    )
+
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="Hello",
+    )
+
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.6,
+    )
+
+    parser.add_argument(
+        "-kv",
+        "--kv_cache",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--max_len",
+        type=int,
+        default=128,
+        help="Maximum length of the generated response sequence.",
+    )
+
+    return parser
+
+
+def main() -> None:
+    parser = build_args_parser()
+    args = parser.parse_args()
+    runner = NativeLlamaRunner(args)
+    generated_tokens = runner.text_completion(
+        prompt=args.prompt,
+        temperature=args.temperature,
+    )
+    print(f"Response: {generated_tokens}")
+
+
+if __name__ == "__main__":
+    main()  # pragma: no cover
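
Compared with the eager runner, the main difference is forward(): the loaded .pte module is called with a positional tuple and returns a list of outputs, while the eager runner passes tokens, input_pos, and mask as keyword arguments. A hedged sketch of driving the runner from Python instead of the CLI; all file paths are placeholders, and NativeLlamaRunner is assumed to be importable from wherever this new module lives:

from argparse import Namespace

# Placeholder values that mirror the CLI flags defined in build_args_parser above.
args = Namespace(
    pte="llama3_2_vision.pte",
    params="params.json",
    tokenizer="tokenizer.model",
    prompt="Hello",
    temperature=0.6,
    kv_cache=True,
    max_len=128,
)

runner = NativeLlamaRunner(args)  # class defined in the new file above
tokens = runner.text_completion(prompt=args.prompt, temperature=args.temperature)
print(f"Response: {tokens}")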
