
Commit d405f0a

HDCharles authored and malfet committed
adding run time info to eval and cleaning up output (#422)
* adding run time info to eval and cleaning up output

Summary: output now includes info on the model run time distribution and a cleaned-up result output.

Test Plan:
python eval.py --checkpoint-path checkpoints/$MODEL_REPO/model.pth \
    --dtype bfloat16 --device cuda

Time to run eval: 53.31s.
Time in model.forward: 20.29s, over 186 model evaluations
forward run time stats - Median: 0.10s Min: 0.04s Max: 2.18s
For model checkpoints/meta-llama/Llama-2-7b-hf/model.pth
wikitext:
 word_perplexity,none: 9.1649
 byte_perplexity,none: 1.5133
 bits_per_byte,none: 0.5977
 alias: wikitext

* Adding evaluation.md content

Summary: see added content

Test Plan: n/a

* docs update

Summary: removing install instructions
1 parent dad64b4 commit d405f0a

File tree

2 files changed: +40 −7 lines changed


docs/evaluation.md

Lines changed: 27 additions & 4 deletions
@@ -1,7 +1,30 @@
 
-# Model evaluation
+Evaluation Features
+===================
 
-TODO(jerry):
-Add documentation about `torchchat eval` explaining the process and options.
+Torchchat provides evaluation functionality for your language model on a variety of tasks using the [lm-evaluation-harness](https://github.com/facebookresearch/lm_eval) library.
 
-[#339](https://github.com/pytorch/torchchat/issues/339)
+Usage
+-----
+
+The evaluation mode of the `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library, such as "wikitext". You can specify the task(s) to evaluate with the `--tasks` option, and limit the number of evaluated examples with the `--limit` option. If no task is specified, evaluation defaults to "wikitext".
+
+**Examples**
+
+Running wikitext for 10 iterations:
+```
+python3 torchchat.py eval stories15M --tasks wikitext --limit 10
+```
+
+Running an exported model:
+```
+# python3 torchchat.py export stories15M --output-pte-path stories15M.pte
+python3 torchchat.py eval --pte-path stories15M.pte
+```
+
+Running multiple tasks and calling eval.py directly:
+```
+python3 eval.py --pte-path stories15M.pte --tasks wikitext hellaswag
+```
+
+For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/facebookresearch/lm_eval).
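
The per-forward timings behind the new summary lines are returned by `eval()` under `result["times"]` (see the eval.py diff below). A minimal sketch of how those summary statistics are derived from that list; the `summarize_times` name is hypothetical, while the tensor calls mirror `main()` in eval.py:

```python
import torch

def summarize_times(result):
    # result["times"] holds one wall-clock duration (in seconds) per
    # model.forward call, collected during evaluation.
    times = torch.tensor(result["times"])
    print(f"Time in model.forward: {times.sum():.02f}s, over {times.numel()} model evaluations")
    print(
        f"forward run time stats - Median: {times.median():.02f}s "
        f"Min: {times.min():.02f}s Max: {times.max():.02f}s"
    )
```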

eval.py

Lines changed: 13 additions & 3 deletions
@@ -28,7 +28,7 @@
 torch._inductor.config.epilogue_fusion = False
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
-
+import time
 
 try:
     import lm_eval
@@ -108,6 +108,7 @@ def __init__(
         self._tokenizer = tokenizer
         self._device = torch.device(device)
         self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
+        self.times = []
 
     @property
     def eot_token_id(self):
@@ -155,7 +156,9 @@ def _model_call(self, inps):
             )
         )
         x = seq.index_select(0, input_pos).view(1, -1)
+        start = time.time()
         logits = model_forward(self._model, x, input_pos)
+        self.times.append(time.time() - start)
         return logits
 
     def _model_generate(self, context, max_length, eos_token_id):
@@ -206,6 +209,7 @@ def eval(
         task_dict,
         limit=limit,
     )
+    eval_results["times"] = model_eval_wrapper.times
     return eval_results
 
 
@@ -261,7 +265,10 @@ def main(args) -> None:
         max_seq_length,
         device=builder_args.device,
     )
-    print(f"Time to run eval: {time.time() - t1:.02f} seconds.")
+    print(f"Time to run eval: {time.time() - t1:.02f}s.")
+    times = torch.tensor(result["times"])
+    print(f"Time in model.forward: {times.sum():.02f}s, over {times.numel()} model evaluations")
+    print(f"forward run time stats - Median: {times.median():.02f}s Min: {times.min():.02f}s Max: {times.max():.02f}s")
     if builder_args.dso_path:
         print(f"For model {builder_args.dso_path}")
     elif builder_args.pte_path:
@@ -274,7 +281,10 @@ def main(args) -> None:
         raise RuntimeError("Well That's Fine. How did we get here")
 
     for task, res in result["results"].items():
-        print(f"{task}: {res}")
+        print(f"{task}:")
+        for metric, val in res.items():
+            if val != "N/A":
+                print(f" {metric}: {val if isinstance(val, str) else f'{val:0.4f}'}")
 
 
 if __name__ == "__main__":
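
The timing added to `_model_call` above is a plain wrap-and-append pattern around `model_forward`. A self-contained illustration of the same idea (not part of this commit; `record_call_times` is a hypothetical helper name):

```python
import time

def record_call_times(fn, times):
    # Return a wrapper around fn that appends each call's wall-clock
    # duration (in seconds) to the `times` list.
    def wrapped(*args, **kwargs):
        start = time.time()
        result = fn(*args, **kwargs)
        times.append(time.time() - start)
        return result
    return wrapped

# Usage sketch: collect per-call durations, then summarize.
times = []
square = record_call_times(lambda x: x * x, times)
square(3)
print(f"{len(times)} call(s), total {sum(times):.06f}s")
```

With `self.times` populated this way, `main` can build `torch.tensor(result["times"])` and print the median/min/max exactly as the diff above does.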
