
Commit 37011d3

Clean up

1 parent f275e2e
4 files changed: +15 -8 lines changed

examples/models/llama/runner/eager.py

Lines changed: 0 additions & 1 deletion

@@ -10,7 +10,6 @@
 
 import torch
 
-from examples.models.llama.llama_transformer import ModelArgs
 from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,

examples/models/llama/runner/generation.py

Lines changed: 8 additions & 5 deletions

@@ -9,7 +9,6 @@
 
 import torch
 
-from executorch.examples.models.llama.llama_transformer import ModelArgs
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 
 
@@ -63,7 +62,7 @@ def __init__(
     ):
         """
         Constructor.
-
+
         Args:
             tokenizer_path: path to tokenizer.model file.
             max_seq_len: max length of the output sequence, after which the output will be clipped.
@@ -100,13 +99,17 @@ def generate( # noqa: C901
         logits = self.forward(
             tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device),
             input_pos=(
-                torch.tensor([0], dtype=torch.long, device=self.device) if self.use_kv_cache else None
+                torch.tensor([0], dtype=torch.long, device=self.device)
+                if self.use_kv_cache
+                else None
             ),
         )
 
-        # TODO: accomodate TorchTune model, which doesn't
-        # make an optimization of dropping all logits but the last.
         current_token = next_token(logits[:, -1, :], temperature, top_p)
+        if self.has_full_logits:
+            current_token = next_token(logits[:, -1, :], temperature, top_p)
+        else:
+            current_token = next_token(logits, temperature, top_p)
         tokens = prompt_tokens + [current_token]
 
         i = 0
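
For context on the new has_full_logits branch: a model that keeps logits for every position returns a [batch, seq_len, vocab] tensor, so sampling has to slice out the last position, while a model that drops all logits but the last (the optimization the removed TODO referred to) already returns a [batch, vocab] tensor. A minimal sketch of the shape difference, with an illustrative vocab size that is not from this commit:

# Illustrative shapes only; the vocab size is a made-up placeholder.
import torch

vocab = 32000

# Model that keeps full logits (the has_full_logits == True case):
full = torch.randn(1, 7, vocab)    # [batch, seq_len, vocab]
last_from_full = full[:, -1, :]    # slice the final position -> [batch, vocab]

# Model that drops all logits but the last position:
last_only = torch.randn(1, vocab)  # already [batch, vocab]; no slicing needed

# next_token receives the same shape either way.
assert last_from_full.shape == last_only.shape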

examples/models/llama/runner/native.py

Lines changed: 4 additions & 1 deletion

@@ -10,7 +10,10 @@
 
 import torch
 
-from executorch.examples.models.llama.export_llama_lib import EXECUTORCH_DEFINED_MODELS, TORCHTUNE_DEFINED_MODELS
+from executorch.examples.models.llama.export_llama_lib import (
+    EXECUTORCH_DEFINED_MODELS,
+    TORCHTUNE_DEFINED_MODELS,
+)
 
 from executorch.extension.pybindings.portable_lib import _load_for_executorch
 
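
Aside from the formatting split, nothing changes in this file. For readers unfamiliar with the two imported names: EXECUTORCH_DEFINED_MODELS and TORCHTUNE_DEFINED_MODELS are lists of model-name strings defined in export_llama_lib, and a runner can branch on membership. A hedged sketch, in which only the two imported lists are real and the helper function is hypothetical:

# Hypothetical helper; only the imported lists come from export_llama_lib.
from executorch.examples.models.llama.export_llama_lib import (
    EXECUTORCH_DEFINED_MODELS,
    TORCHTUNE_DEFINED_MODELS,
)

def model_family(name: str) -> str:
    if name in EXECUTORCH_DEFINED_MODELS:
        return "executorch"
    if name in TORCHTUNE_DEFINED_MODELS:
        return "torchtune"
    raise ValueError(f"unknown model: {name}")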

examples/models/llama3_2_vision/model.py

Lines changed: 3 additions & 1 deletion

@@ -40,7 +40,9 @@ class Llama3_2Decoder(EagerModelBase):
 
     def __init__(self, **kwargs):
         # Set member vars from kwargs.
-        self.max_seq_len = kwargs.get("max_seq_len", 8192)  # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
+        self.max_seq_len = kwargs.get(
+            "max_seq_len", 8192
+        )  # Trained to be a lot larger, but this value is kept small because of static kv cache at the moment.
         self.encoder_max_seq_len = kwargs.get(
             "encoder_max_seq_len", int(4 * (448 / 14) ** 2 + 1)
         )  # Same as above.
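
The encoder_max_seq_len default evaluates to 4097. Assuming the constants mean a 448-pixel tile split into 14-pixel ViT patches across 4 tiles plus one class token (a reading the diff itself does not spell out), the arithmetic is:

# Assumed reading: 448-px tile, 14-px patches, 4 tiles, +1 class token.
448 / 14                       # = 32 patches per side
(448 / 14) ** 2                # = 1024 patch tokens per tile
4 * (448 / 14) ** 2            # = 4096 tokens across 4 tiles
int(4 * (448 / 14) ** 2 + 1)   # = 4097, the default used above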
