
adding run time info to eval and cleaning up output #422


Merged
merged 3 commits on Apr 25, 2024
Changes from all commits
31 changes: 27 additions & 4 deletions docs/evaluation.md
@@ -1,7 +1,30 @@

# Model evaluation
Evaluation Features
===================

TODO(jerry):
Add documentation about `torchchat eval` explaining the process and options.
Torchchat provides functionality for evaluating your language model on a variety of tasks, using the [lm-evaluation-harness](https://github.com/facebookresearch/lm_eval) library.

[#339](https://github.com/pytorch/torchchat/issues/339)
Usage
-----

The evaluation mode of the `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library, such as "wikitext". You can specify the task(s) to evaluate with the `--tasks` option and limit the number of evaluation samples with the `--limit` option. If no task is specified, evaluation defaults to "wikitext".

**Examples**

Running the wikitext task, limited to 10 samples
```
python3 torchchat.py eval stories15M --tasks wikitext --limit 10
```

Running an exported model
```
# First export the model to a .pte file:
# python3 torchchat.py export stories15M --output-pte-path stories15M.pte
python3 torchchat.py eval --pte-path stories15M.pte
```

Running multiple tasks and calling eval.py directly:
```
python3 eval.py --pte-path stories15M.pte --tasks wikitext hellaswag
```

For more information and a list of available tasks and metrics, see [lm-evaluation-harness](https://github.com/facebookresearch/lm_eval).
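
The change to `eval.py` below also cleans up how per-task metrics are printed: instead of dumping the raw results dictionary, each metric is printed on its own line, with numbers rounded to four decimals. The following is a minimal, self-contained sketch of that formatting; the results dictionary here is hypothetical example data, not actual torchchat output.

```
# Standalone sketch of the per-task metric formatting applied after this change.
# The results dictionary below is hypothetical example data.
results = {
    "wikitext": {"word_perplexity": 12.3456, "byte_perplexity": 1.6789, "alias": "wikitext"},
    "hellaswag": {"acc": 0.4321, "acc_norm": 0.4567},
}

for task, res in results.items():
    print(f"{task}:")
    for metric, val in res.items():
        if val != "N/A":
            # Strings (e.g. aliases) print as-is; numbers are rounded to 4 decimals.
            print(f"  {metric}: {val if isinstance(val, str) else f'{val:0.4f}'}")
```
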
16 changes: 13 additions & 3 deletions eval.py
@@ -28,7 +28,7 @@
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.triton.cudagraphs = True
torch._dynamo.config.cache_size_limit = 100000

import time

try:
import lm_eval
@@ -107,6 +107,7 @@ def __init__(
self._tokenizer = tokenizer
self._device = torch.device(device)
self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
self.times = []

@property
def eot_token_id(self):
@@ -154,7 +155,9 @@ def _model_call(self, inps):
)
)
x = seq.index_select(0, input_pos).view(1, -1)
start = time.time()
logits = model_forward(self._model, x, input_pos)
self.times.append(time.time() - start)
return logits

def _model_generate(self, context, max_length, eos_token_id):
@@ -205,6 +208,7 @@ def eval(
task_dict,
limit=limit,
)
eval_results["times"] = model_eval_wrapper.times
return eval_results


@@ -260,7 +264,10 @@ def main(args) -> None:
max_seq_length,
device=builder_args.device,
)
print(f"Time to run eval: {time.time() - t1:.02f} seconds.")
print(f"Time to run eval: {time.time() - t1:.02f}s.")
times = torch.tensor(result["times"])
print(f"Time in model.forward: {times.sum():.02f}s, over {times.numel()} model evaluations")
print(f"forward run time stats - Median: {times.median():.02f}s Min: {times.min():.02f}s Max: {times.max():.02f}s")
if builder_args.dso_path:
print(f"For model {builder_args.dso_path}")
elif builder_args.pte_path:
@@ -273,7 +280,10 @@ def main(args) -> None:
raise RuntimeError("Well That's Fine. How did we get here")

for task, res in result["results"].items():
print(f"{task}: {res}")
print(f"{task}:")
for metric, val in res.items():
if val != "N/A":
print(f" {metric}: {val if isinstance(val, str) else f'{val:0.4f}'}")


if __name__ == "__main__":
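
For reference, here is a minimal, self-contained sketch of the timing pattern this PR adds to `_model_call` and `main`: wall-clock time is recorded around each forward call, collected in a list, and summarized at the end. `fake_forward` is a stand-in for the real `model_forward` call so the snippet runs on its own.

```
# Sketch of the per-forward timing added in this PR; fake_forward is a
# placeholder for model_forward(self._model, x, input_pos).
import time

import torch


def fake_forward():
    # Simulate the cost of one forward pass.
    time.sleep(0.01)


times = []
for _ in range(10):
    start = time.time()
    fake_forward()
    times.append(time.time() - start)

t = torch.tensor(times)
print(f"Time in model.forward: {t.sum():.02f}s, over {t.numel()} model evaluations")
print(f"forward run time stats - Median: {t.median():.02f}s Min: {t.min():.02f}s Max: {t.max():.02f}s")
```
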