Skip to content

Commit 0fdd6ba

Browse files
committed
Update
[ghstack-poisoned]
1 parent c2a0002 commit 0fdd6ba

File tree

4 files changed

+711
-642
lines changed

4 files changed

+711
-642
lines changed

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ python3 torchchat.py generate llama3 --prompt "write me a story about a boy and
118118

119119
For more information run `python3 torchchat.py generate --help`
120120

121-
122121
### Browser
123122

124123
[skip default]: begin

eval.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
# This source code is licensed under the license found in the
55
# LICENSE file in the root directory of this source tree.
66
import argparse
7-
import time
8-
from typing import Optional
7+
from typing import Callable, Optional
98

109
import torch
1110
import torch._dynamo.config
@@ -20,7 +19,6 @@
2019
from build.model import Transformer
2120
from build.utils import set_precision
2221
from cli import add_arguments_for_verb, arg_init
23-
from generate import encode_tokens, model_forward
2422
from utils.measure_time import measure_time
2523

2624
torch._dynamo.config.automatic_dynamic_shapes = True
@@ -85,11 +83,17 @@ def __init__(
8583
self,
8684
model: Transformer,
8785
tokenizer,
86+
model_forward: Optional[Callable] = None,
8887
max_seq_length: Optional[int] = None,
8988
device="cpu",
9089
):
9190
super().__init__(device=device)
9291
self._model = model
92+
self._model_forward = (
93+
model_forward
94+
if model_forward is not None
95+
else lambda x, input_pos: model(x, input_pos)
96+
)
9397
self._tokenizer = tokenizer
9498
self._device = torch.device(device)
9599
self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
@@ -116,11 +120,8 @@ def device(self):
116120
return self._device
117121

118122
def tok_encode(self, string: str, **kwargs):
119-
encoded = encode_tokens(self._tokenizer, string, bos=True, device=self._device)
120-
# encoded is a pytorch tensor, but some internal logic in the
121-
# eval harness expects it to be a list instead
122-
# TODO: verify this for multi-batch as well
123-
encoded = encoded.tolist()
123+
bos_id = self._tokenizer.bos_id()
124+
encoded = [bos_id] + self._tokenizer.encode(string)
124125
return encoded
125126

126127
def tok_decode(self, tokens):
@@ -142,7 +143,7 @@ def _model_call(self, inps):
142143
)
143144
x = seq.index_select(0, input_pos).view(1, -1)
144145
with measure_time(message=None) as measure:
145-
logits = model_forward(self._model, x, input_pos)
146+
logits = self._model_forward(x, input_pos)
146147
self.times.append(measure.get_time())
147148
return logits
148149

@@ -153,6 +154,7 @@ def _model_generate(self, context, max_length, eos_token_id):
153154
@torch.no_grad()
154155
def eval(
155156
model: Transformer,
157+
model_forward: Callable,
156158
tokenizer,
157159
tasks: Optional[list] = None,
158160
limit: Optional[int] = None,
@@ -176,7 +178,11 @@ def eval(
176178
tasks = ["wikitext"]
177179

178180
model_eval_wrapper = GPTFastEvalWrapper(
179-
model, tokenizer, max_seq_length, device=device
181+
model,
182+
tokenizer,
183+
model_forward=model_forward,
184+
max_seq_length=max_seq_length,
185+
device=device,
180186
)
181187

182188
try:
@@ -231,11 +237,12 @@ def main(args) -> None:
231237
)
232238
tokenizer_args.validate_model(model)
233239

240+
model_forward = lambda x, input_pos: model(x, input_pos) # noqa
241+
234242
if compile:
235243
assert not (
236244
builder_args.dso_path or builder_args.pte_path
237245
), "cannot compile exported model"
238-
global model_forward
239246
model_forward = torch.compile(
240247
model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True
241248
)
@@ -244,6 +251,7 @@ def main(args) -> None:
244251
with measure_time("Time to run eval: {time:.02f}s."):
245252
result = eval(
246253
model.to(device),
254+
model_forward,
247255
tokenizer,
248256
tasks,
249257
limit,

0 commit comments

Comments (0)