
Commit e95aa9d

helunwencser authored and facebook-github-bot committed
add option to run mmlu with 5 shots (#6146)
Summary:

Pull Request resolved: #6146

This PR does the following changes:
- add a `--num_fewshot` option, which is required for running the MMLU task with 5 shots
- set the default value of `--limit` to None so that we can actually run all examples
- update `eval_llama` to call `simple_evaluate`, which is a wrapper around `evaluate` that does some extra work for us, like building the task dict

Test Plan:

Make sure WikiText perplexity for Llama 3.2 1B stays the same before and after the change.

Before the change, run eval_llama for Llama 3.2 1B with limit set to None:
```
wikitext: {'word_perplexity,none': 12.78246428138387, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.610432252171856, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6874479705552373, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
```

After the change, run eval_llama for Llama 3.2 1B:
```
wikitext: {'word_perplexity,none': 12.78246428138387, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.610432252171856, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6874479705552373, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
```

Make sure that lm_eval (v0.4.2, which is used by eval_llama) and eval_llama report similar numbers for Llama 3.2 1B and 3B BF16 on the MMLU task with 5 shots.

Example command for lm_eval:
```
lm_eval --model hf \
  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \
  --tasks mmlu \
  --device cuda \
  -f 5 \
  --batch_size auto
```

Example command for eval_llama:
```
python -m examples.models.llama2.eval_llama \
  -c /home/lunwenh/models/1B_Instruct/consolidated.00.pth \
  -p /home/lunwenh/models/1B_Instruct/params.json \
  -t /home/lunwenh/models/1B_Instruct/tokenizer.model \
  -kv \
  -d bf16 \
  --tasks mmlu \
  -f 5 \
  --max_seq_length 2048
```

imported-using-ghimport

Reviewed By: mergennachin
Differential Revision: D64215268
Pulled By: helunwencser
fbshipit-source-id: 606dd279201c4165cf8d218da50cef1457288ed6
1 parent 61c501c commit e95aa9d
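
For orientation, here is a minimal sketch of the call pattern this commit adopts, assuming the lm_eval v0.4.2 `simple_evaluate` API; the `run_harness` helper name is illustrative only, and `eval_wrapper` stands in for the `EagerEvalWrapper` built by `gen_eval_wrapper()`:

```python
# Minimal sketch of the call pattern adopted here (lm_eval v0.4.2 assumed).
# `run_harness` is a hypothetical helper, not part of this commit.
from typing import Optional

import torch
from lm_eval.api.model import LM
from lm_eval.evaluator import simple_evaluate


def run_harness(
    eval_wrapper: LM,                    # e.g. the EagerEvalWrapper from gen_eval_wrapper()
    tasks: list,                         # e.g. ["mmlu"] or ["wikitext"]
    num_fewshot: Optional[int] = None,   # 5 for MMLU 5-shot
    limit: Optional[int] = None,         # None => evaluate every sample
) -> dict:
    # simple_evaluate wraps the lower-level evaluate(): it resolves the task
    # dict and applies the few-shot setting, so callers skip that bookkeeping.
    with torch.no_grad():
        return simple_evaluate(
            model=eval_wrapper,
            tasks=tasks,
            num_fewshot=num_fewshot,
            limit=limit,
        )
```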

File tree: 3 files changed (+22, -50 lines)


examples/models/llama2/eval_llama_lib.py

Lines changed: 21 additions & 8 deletions
@@ -21,8 +21,9 @@
 )
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from lm_eval.api.model import LM
+from lm_eval.evaluator import simple_evaluate

-from .evaluate.eager_eval import EagerEvalWrapper, evaluate_model
+from .evaluate.eager_eval import EagerEvalWrapper

 from .export_llama_lib import (
     _prepare_for_llama_export,
@@ -246,9 +247,19 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="list of lm-eluther tasks to evaluate usage: --tasks task1 task2",
     )
     parser.add_argument(
-        "--limit", type=int, default=5, help="number of samples to evalulate"
+        "--limit",
+        type=int,
+        default=None,
+        help="number of samples to evalulate. If not set, evaluate all samples",
+    )
+    parser.add_argument(
+        "-f",
+        "--num_fewshot",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
     )
-
     # Add additional args specific to eval via an ET Runner
     # Note: For initial integration, the tokenizer.model is also required
     parser.add_argument(
@@ -281,11 +292,13 @@ def eval_llama(
     eval_wrapper = gen_eval_wrapper(model_name, args)

     # Evaluate the model
-    eval_results = evaluate_model(
-        eval_wrapper,
-        args.tasks,  # pyre-ignore
-        args.limit,  # pyre-ignore
-    )
+    with torch.no_grad():
+        eval_results = simple_evaluate(
+            model=eval_wrapper,
+            tasks=args.tasks,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `tasks`
+            num_fewshot=args.num_fewshot,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `num_fewshot`
+            limit=args.limit,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `limit`
+        )

     for task, res in eval_results["results"].items():
         print(f"{task}: {res}")

examples/models/llama2/evaluate/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -4,9 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from .eager_eval import EagerEvalWrapper, evaluate_model
+from .eager_eval import EagerEvalWrapper

 __all__ = [
-    "evaluate_model",
     "EagerEvalWrapper",
 ]

examples/models/llama2/evaluate/eager_eval.py

Lines changed: 0 additions & 40 deletions
@@ -7,17 +7,13 @@

 from typing import Optional, Union

-import lm_eval
 import torch
 from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
 from executorch.extension.llm.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
 )

-from lm_eval.api.model import LM
-from lm_eval.evaluator import evaluate
 from lm_eval.models.huggingface import HFLM as eval_wrapper
-from lm_eval.tasks import get_task_dict

 from torch import nn

@@ -79,39 +75,3 @@ def _model_call(self, inps):

     def _model_generate(self, context, max_length, eos_token_id):
         raise Exception("unimplemented")
-
-
-@torch.no_grad()
-def evaluate_model(
-    eval_wrapper: LM,
-    tasks: Optional[list] = None,
-    limit: Optional[int] = None,
-) -> dict:
-    """
-    Evaluates a language model on a specified task using the lm-evaluation-harness library.
-
-    Args:
-        eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
-        tasks: Optional[list]: The names of the evaluation tasks to perform.
-        limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
-
-    Returns:
-        eval_results (dict): A dictionary of evaluation results for the specified task(s).
-    """
-
-    if tasks is None:
-        tasks = ["wikitext"]
-
-    if "hendrycks_test" in tasks:
-        tasks.remove("hendrycks_test")
-        tasks += list(
-            lm_eval.tasks.hendrycks_test.create_all_tasks().keys()  # pyre-ignore
-        )
-    task_dict = get_task_dict(tasks)
-
-    eval_results = evaluate(
-        eval_wrapper,
-        task_dict,
-        limit=limit,
-    )
-    return eval_results
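
For contrast, a condensed sketch of the manual pipeline the deleted `evaluate_model()` implemented, mirroring the removed code above (the `hendrycks_test` expansion is omitted, and the `evaluate_manually` name is purely illustrative); with `simple_evaluate`, this task-dict plumbing happens inside the harness:

```python
# Condensed form of the removed helper, shown only to illustrate what
# simple_evaluate now does internally: build the task dict, then call the
# lower-level evaluate(). Mirrors the deleted code; not part of this commit.
from typing import Optional

import torch
from lm_eval.api.model import LM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict


@torch.no_grad()
def evaluate_manually(
    eval_wrapper: LM,
    tasks: Optional[list] = None,
    limit: Optional[int] = None,
) -> dict:
    if tasks is None:
        tasks = ["wikitext"]
    task_dict = get_task_dict(tasks)  # simple_evaluate does this for us
    return evaluate(eval_wrapper, task_dict, limit=limit)
```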
