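Fix: add the missing comma after `has_full_logits` in the eager runner, remove the stale `self.model_name` assignment from the generation runner, and update the native runner (import from `llama` instead of `llama2`, pass `has_full_logits`, add a `--model` argument)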

jackzhxng · jackzhxng · commit 96d579858d94 · 2024-10-31T15:01:55.000-07:00
diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py
@@ -34,7 +34,7 @@ def __init__(self, args):
             max_batch_size=1,
             use_kv_cache=args.use_kv_cache,
             vocab_size=params["vocab_size"],
-            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS
+            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
             device="cuda" if torch.cuda.is_available() else "cpu",
         )
         manager: LLMEdgeManager = _prepare_for_llama_export(args)
diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py
@@ -73,7 +73,6 @@ def __init__(
         has_full_logits: whether the model returns the full logits or only returns the last logit.
         device: device to run the runner on.
         """
-        self.model_name = model
         self.max_seq_len = max_seq_len
         self.max_batch_size = max_batch_size
         self.use_kv_cache = use_kv_cache
diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py
@@ -10,12 +10,14 @@
 
 import torch
 
+from executorch.examples.models.llama.export_llama_lib import EXECUTORCH_DEFINED_MODELS, TORCHTUNE_DEFINED_MODELS
+
 from executorch.extension.pybindings.portable_lib import _load_for_executorch
 
 # Load custom ops and quantized ops.
 from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
 
-from executorch.examples.models.llama2.runner.generation import LlamaRunner
+from executorch.examples.models.llama.runner.generation import LlamaRunner
 
 # Note: import this after portable_lib
 # from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa # usort: skip
@@ -36,6 +38,7 @@ def __init__(self, args):
             max_batch_size=1,
             use_kv_cache=args.kv_cache,
             vocab_size=params["vocab_size"],
+            has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,
         )
         self.model = _load_for_executorch(args.pte)
 
@@ -58,8 +61,15 @@ def forward(
 
 
 def build_args_parser() -> argparse.ArgumentParser:
+    # TODO: merge these with build_args_parser from export_llama_lib.
     parser = argparse.ArgumentParser()
 
+    parser.add_argument(
+        "--model",
+        default="llama",
+        choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS,
+    )
+
     parser.add_argument(
         "-f",
         "--pte",

Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ def __init__(self, args):`
`34`	`34`	`max_batch_size=1,`
`35`	`35`	`use_kv_cache=args.use_kv_cache,`
`36`	`36`	`vocab_size=params["vocab_size"],`
`37`		`- has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS`
	`37`	`+ has_full_logits=args.model in TORCHTUNE_DEFINED_MODELS,`
`38`	`38`	`device="cuda" if torch.cuda.is_available() else "cpu",`
`39`	`39`	`)`
`40`	`40`	`manager: LLMEdgeManager = _prepare_for_llama_export(args)`