
Commit 1dd12f0

Merge branch 'jz/native-runner-tt' into jz/tt-llama-3
2 parents: 310b3a3 + 37011d3

5 files changed: +89 / -30 lines


examples/models/llama/runner/eager.py

Lines changed: 5 additions & 7 deletions
@@ -10,10 +10,10 @@
 
 import torch
 
-from examples.models.llama.llama_transformer import ModelArgs
 from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
+    TORCHTUNE_DEFINED_MODELS,
 )
 from executorch.examples.models.llama.runner.generation import LlamaRunner
 from executorch.extension.llm.export import LLMEdgeManager
@@ -27,15 +27,13 @@ class EagerLlamaRunner(LlamaRunner):
     def __init__(self, args):
         with open(args.params, "r") as f:
             params = json.loads(f.read())
-        model_args: ModelArgs = ModelArgs(
+        super().__init__(
+            tokenizer_path=args.tokenizer_path,
             max_seq_len=args.max_seq_length,
             max_batch_size=1,
             use_kv_cache=args.use_kv_cache,
-            **params,
-        )
-        super().__init__(
-            tokenizer_path=args.tokenizer_path,
-            model_args=model_args,
+            vocab_size=params["vocab_size"],
+            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
             device="cuda" if torch.cuda.is_available() else "cpu",
         )
         manager: LLMEdgeManager = _prepare_for_llama_export(args)
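
Not part of the commit, but as a reading aid: the base runner now takes plain scalars instead of a ModelArgs object, so this subclass only reads "vocab_size" out of the params JSON rather than unpacking the whole dict. A minimal sketch of that call pattern, with an illustrative params payload and tokenizer path (neither is taken from this commit):

import json

# Illustrative params payload; the runner now only consumes "vocab_size" from it.
params = json.loads('{"dim": 2048, "n_layers": 16, "vocab_size": 128256}')

# Keyword arguments matching the updated LlamaRunner.__init__ signature above.
runner_kwargs = dict(
    tokenizer_path="tokenizer.model",  # illustrative path
    max_seq_len=128,
    max_batch_size=1,
    use_kv_cache=True,
    vocab_size=params["vocab_size"],
    has_full_logits=False,  # True only for TorchTune-defined models
    device="cpu",
)
print(runner_kwargs)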

examples/models/llama/runner/generation.py

Lines changed: 46 additions & 9 deletions
@@ -9,7 +9,6 @@
 
 import torch
 
-from executorch.examples.models.llama.llama_transformer import ModelArgs
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 
 
@@ -51,11 +50,35 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
 
 
 class LlamaRunner(ABC):
-    def __init__(self, tokenizer_path: str, model_args: ModelArgs, device: str = "cpu"):
-        self.params = model_args
+    def __init__(
+        self,
+        tokenizer_path: str,
+        max_seq_len: int,
+        max_batch_size: int,
+        use_kv_cache: bool,
+        vocab_size: int,
+        has_full_logits: bool = False,
+        device: str = "cpu",
+    ):
+        """
+        Constructor.
+
+        Args:
+            tokenizer_path: path to tokenizer.model file.
+            max_seq_len: max length of the output sequence, after which the output will be clipped.
+            max_batch_size: max batch size.
+            use_kv_cache: whether to use a KV cache.
+            vocab_size: number of items in the vocab.
+            has_full_logits: whether the model returns the full logits or only returns the last logit.
+            device: device to run the runner on.
+        """
+        self.max_seq_len = max_seq_len
+        self.max_batch_size = max_batch_size
+        self.use_kv_cache = use_kv_cache
         self.tokenizer = get_tokenizer(tokenizer_path)
-        assert model_args.vocab_size == self.tokenizer.n_words
+        self.has_full_logits = has_full_logits
         self.device = device
+        assert vocab_size == self.tokenizer.n_words
 
     @abstractmethod
     def forward(
@@ -77,16 +100,22 @@ def generate( # noqa: C901
             tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device),
             input_pos=(
                 torch.tensor([0], dtype=torch.long, device=self.device)
-                if self.params.use_kv_cache
+                if self.use_kv_cache
                 else None
             ),
         )
 
-        current_token = next_token(logits, temperature, top_p)
+        current_token = next_token(logits[:, -1, :], temperature, top_p)
+        if self.has_full_logits:
+            current_token = next_token(logits[:, -1, :], temperature, top_p)
+        else:
+            current_token = next_token(logits, temperature, top_p)
         tokens = prompt_tokens + [current_token]
 
-        while len(tokens) < self.params.max_seq_len:
-            if self.params.use_kv_cache:
+        i = 0
+        while len(tokens) < self.max_seq_len:
+            print(f"{i} out of {self.max_seq_len} max tokens generated")
+            if self.use_kv_cache:
                 logits = self.forward(
                     tokens=torch.tensor(
                         [[current_token]], dtype=torch.long, device=self.device
@@ -99,13 +128,21 @@ def generate( # noqa: C901
                 logits = self.forward(
                     tokens=torch.tensor([tokens], dtype=torch.long, device=self.device),
                 )
-            current_token = next_token(logits, temperature, top_p)
+
+            # If the logits aren't already clipped to only contain the last logit, clip them.
+            if self.has_full_logits:
+                current_token = next_token(logits[:, -1, :], temperature, top_p)
+            else:
+                current_token = next_token(logits, temperature, top_p)
+
             if current_token == self.tokenizer.eos_id or (
                 hasattr(self.tokenizer, "stop_tokens")
                 and current_token in self.tokenizer.stop_tokens
             ):
                 break
+
             tokens.append(current_token)
+            i += 1
 
         return tokens if echo else tokens[len(prompt_tokens) :]
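
Context for the has_full_logits branches above: per the new docstring, some models return the full logits for every position while others return only the last logit, and only the former need the [:, -1, :] slice before sampling. A minimal shape sketch (toy sizes, not from this commit):

import torch

batch, seq_len, vocab = 1, 5, 32  # toy sizes for illustration

full_logits = torch.randn(batch, seq_len, vocab)  # model returns logits for every position
last_only = torch.randn(batch, vocab)             # model returns only the last position

# has_full_logits=True: keep only the final position before sampling.
last_from_full = full_logits[:, -1, :]
assert last_from_full.shape == last_only.shape == (batch, vocab)

# Greedy pick for illustration; the runner's next_token() also supports temperature/top-p.
print(torch.argmax(last_from_full, dim=-1).item())
print(torch.argmax(last_only, dim=-1).item())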

examples/models/llama/runner/native.py

Lines changed: 30 additions & 13 deletions
@@ -10,18 +10,22 @@
 
 import torch
 
-from examples.models.llama.llama_transformer import ModelArgs
+from executorch.examples.models.llama.export_llama_lib import (
+    EXECUTORCH_DEFINED_MODELS,
+    TORCHTUNE_DEFINED_MODELS,
+)
+
 from executorch.extension.pybindings.portable_lib import _load_for_executorch
 
 # Load custom ops and quantized ops.
 from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
 
+from executorch.examples.models.llama.runner.generation import LlamaRunner
+
 # Note: import this after portable_lib
-from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa # usort: skip
+# from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa # usort: skip
 from executorch.kernels import quantized  # noqa
 
-from .generation import LlamaRunner
-
 
 class NativeLlamaRunner(LlamaRunner):
     """
@@ -31,30 +35,44 @@ class NativeLlamaRunner(LlamaRunner):
     def __init__(self, args):
         with open(args.params, "r") as f:
             params = json.loads(f.read())
-        model_args: ModelArgs = ModelArgs(
+        super().__init__(
+            tokenizer_path=args.tokenizer,
             max_seq_len=args.max_len,
             max_batch_size=1,
             use_kv_cache=args.kv_cache,
-            **params,
+            vocab_size=params["vocab_size"],
+            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
         )
-        super().__init__(tokenizer_path=args.tokenizer, model_args=model_args)
         self.model = _load_for_executorch(args.pte)
 
     def forward(
         self,
         tokens: Optional[torch.LongTensor] = None,
         input_pos: Optional[torch.LongTensor] = None,
     ) -> torch.Tensor:
-        return (
-            self.model.forward((tokens, input_pos))
-            if input_pos is not None
-            else self.model.forward((tokens,))
-        )[0]
+        # TODO: in LlamaRunner there is a generate function that automatically generates
+        # input_pos tensor and inputs it into the model. Atm TorchTune models use
+        # kwargs for the input_pos, so we will need to make some changes. At least
+        # for the time being, we can run the non-kv cache version of the Torchtune
+        # model with just the tokens like below.
+        return (self.model.forward((tokens,)))[0]
+        # return (
+        #     self.model.forward((tokens, input_pos))
+        #     if input_pos is not None
+        #     else self.model.forward((tokens,))
+        # )[0]
 
 
 def build_args_parser() -> argparse.ArgumentParser:
+    # TODO: merge these with build_args_parser from export_llama_lib.
     parser = argparse.ArgumentParser()
 
+    parser.add_argument(
+        "--model",
+        default="llama",
+        choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS,
+    )
+
     parser.add_argument(
         "-f",
         "--pte",
@@ -89,7 +107,6 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "-kv",
         "--kv_cache",
-        default=True,
         action="store_true",
     )
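
A note on the --kv_cache change in the last hunk: removing default=True from a store_true flag means the KV cache is now off unless the flag is passed explicitly, whereas before it was effectively always on. A small self-contained argparse sketch of that behavior (independent of this repo):

import argparse

parser = argparse.ArgumentParser()
# With action="store_true" and no explicit default, the value is False unless the flag is given.
parser.add_argument("-kv", "--kv_cache", action="store_true")

print(parser.parse_args([]).kv_cache)       # False: disabled by default now
print(parser.parse_args(["-kv"]).kv_cache)  # True: enabled only when requested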

examples/models/llama3_2_vision/model.py

Lines changed: 3 additions & 1 deletion
@@ -40,7 +40,9 @@ class Llama3_2Decoder(EagerModelBase):
 
     def __init__(self, **kwargs):
         # Set member vars from kwargs.
-        self.max_seq_len = kwargs.get("max_seq_len", 8192)  # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
+        self.max_seq_len = kwargs.get(
+            "max_seq_len", 8192
+        )  # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
         self.encoder_max_seq_len = kwargs.get(
             "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)
         )  # Same as above.

extension/llm/export/builder.py

Lines changed: 5 additions & 0 deletions
@@ -194,6 +194,11 @@ def export(self) -> "LLMEdgeManager":
                 strict=True,
             ).module()
         else:
+            print("Exporting with:")
+            print(f"inputs: {self.example_inputs}")
+            print(f"kwargs: {self.example_kwarg_inputs}")
+            print(f"dynamic shapes: {dynamic_shape}")
+
             # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as
             # `Module`.
             self.pre_autograd_graph_module = export_for_training(
