
from typing import Optional, Union

-import lm_eval
import torch
from executorch.examples.models.llama2.export_llama_lib import (
    get_quantizer_and_quant_params,
)
+from executorch.examples.models.llama2.source_transformation.quantize import (
+    EagerEvalWrapper,
+    evaluate_model,
+)
from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
from executorch.examples.models.llama2.tokenizer.tokenizer import (
    Tokenizer as SentencePieceTokenizer,
)

from lm_eval.api.model import LM
-from lm_eval.evaluator import evaluate
-from lm_eval.models.huggingface import HFLM as eval_wrapper
-from lm_eval.tasks import get_task_dict
-
-from torch import nn

from .builder import LlamaEdgeManager
from .export_llama_lib import (
    ...
)


-class EagerEvalWrapper(eval_wrapper):
-    """
-    A wrapper class based on GPTFast, providing integration with the lm-evaluation-harness library.
-    """
-
-    def __init__(
-        self,
-        model: nn.Module,
-        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
-        max_seq_length: Optional[int] = None,
-        use_kv_cache: bool = False,
-    ):
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        super().__init__(device=device)
-        self._model = model
-        self._tokenizer = tokenizer
-        self._device = torch.device(device)
-        self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
-        self._use_kv_cache = use_kv_cache
-
-    @property
-    def eot_token_id(self):
-        return self._tokenizer.eos_id
-
-    @property
-    def max_length(self):
-        return self._max_seq_length
-
-    @property
-    def max_gen_toks(self):
-        return 50
-
-    @property
-    def batch_size(self):
-        return 1
-
-    @property
-    def device(self):
-        return self._device
-
-    def tok_encode(self, string: str, **kwargs):
-        tokens = self._tokenizer.encode(string, bos=True, eos=False)
-        encoded = torch.tensor(tokens, dtype=torch.int, device=self.device)
-        # encoded is a pytorch tensor, but some internal logic in the
-        # eval harness expects it to be a list instead
-        # TODO: verify this for multi-batch as well
-        encoded = encoded.tolist()
-        return encoded
-
-    def tok_decode(self, tokens):
-        decoded = self._tokenizer.decode(tokens)
-        return decoded
-
-    def _model_call(self, inps):
-        if self._use_kv_cache:
-            pos_tensor = torch.arange(
-                self._max_seq_length, dtype=torch.int64, device=self.device
-            )
-
-            # Batch process the whole sequence.
-            logits = self._model(inps[:, : self._max_seq_length], pos_tensor)
-            return logits
-        else:
-            return self._model(inps)
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        raise Exception("unimplemented")
-
-
class ETPybindEvalWrapper(EagerEvalWrapper):
    """
    A wrapper class for ExecuTorch py-binded integration with the

@@ -165,40 +94,6 @@ def _model_call(self, inps):
        pass


-@torch.no_grad()
-def eval(
-    eval_wrapper: LM,
-    tasks: Optional[list] = None,
-    limit: Optional[int] = None,
-) -> dict:
-    """
-    Evaluates a language model on a specified task using the lm-evaluation-harness library.
-
-    Args:
-        eval_wrapper (LM): A LM wrapper class compatible with lm-evaluation-harness evaluation
-        task (str): The name of the evaluation task to perform.
-        limit (Optional[int]): The maximum number of samples to evaluate (None for all available).
-
-    Returns:
-        eval_results (dict): A dictionary of evaluation results for the specified task(s).
-    """
-
-    if tasks is None:
-        tasks = ["wikitext"]
-
-    if "hendrycks_test" in tasks:
-        tasks.remove("hendrycks_test")
-        tasks += list(lm_eval.tasks.hendrycks_test.create_all_tasks().keys())
-    task_dict = get_task_dict(tasks)
-
-    eval_results = evaluate(
-        eval_wrapper,
-        task_dict,
-        limit=limit,
-    )
-    return eval_results
-
-
def gen_eval_wrapper(
    model_name: str,
    args: argparse.ArgumentParser,

@@ -307,7 +202,7 @@ def eval_llama(
    eval_wrapper = gen_eval_wrapper(model_name, args)

    # Evaluate the model
-    eval_results = eval(
+    eval_results = evaluate_model(
        eval_wrapper,
        args.tasks,
        args.limit,
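
For context: after this refactor, callers import EagerEvalWrapper and evaluate_model from the new module instead of defining them here. Below is a minimal usage sketch, not taken from this commit: it assumes evaluate_model keeps the signature of the removed eval() shown above (eval_wrapper, tasks, limit), and the model and tokenizer values are hypothetical placeholders.

import torch

from executorch.examples.models.llama2.source_transformation.quantize import (
    EagerEvalWrapper,
    evaluate_model,
)
from executorch.examples.models.llama2.tokenizer.tokenizer import (
    Tokenizer as SentencePieceTokenizer,
)

# Hypothetical inputs: a real eager Llama nn.Module and a tokenizer file
# would normally come from the export pipeline.
model: torch.nn.Module = ...  # placeholder for an eager Llama model
tokenizer = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")

# max_seq_length=None falls back to 2048 inside the wrapper;
# use_kv_cache=False exercises the plain forward path of _model_call.
wrapper = EagerEvalWrapper(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=None,
    use_kv_cache=False,
)

# Mirrors the removed eval(): tasks=None defaults to ["wikitext"] and
# limit=None evaluates every available sample.
results = evaluate_model(wrapper, tasks=["wikitext"], limit=None)
print(results)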