
Commit 780ed25

lucylq authored and facebook-github-bot committed
Add tiktoken to eval (#3044)
Summary:
Pull Request resolved: #3044

Test Plan: Imported from GitHub, without a `Test Plan:` line.

```
python -m examples.models.llama2.eval_llama --pte llama3_4_ckpts_x.pte -p ../llama-models/llama3/params_less.json -t ../llama-models/llama3/tokenizer.model --max_seq_len=127 --limit 5

wikitext: {'word_perplexity,none': 22.00035213493939, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.8289244201951567, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.8709954573378033, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}
```

Reviewed By: larryliu0820

Differential Revision: D56163999

Pulled By: lucylq

fbshipit-source-id: db255a6e49a3e9b6db92c9f94fe9e7fcb475c924
1 parent 49d1f02 commit 780ed25
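For readers skimming the diff below: the core of the change is that `gen_eval_wrapper` no longer hard-codes `SentencePieceProcessor`, but probes the file at `--tokenizer_path` and falls back to the tiktoken-based tokenizer (used by Llama 3) when SentencePiece loading fails. A minimal sketch of that selection logic, lifted from the diff; the standalone function name `load_tokenizer` is illustrative and not part of the commit, and the imports assume the ExecuTorch source tree at this revision:

```python
from typing import Union

# Imports exactly as added by this commit (ExecuTorch tree assumed).
from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
from executorch.examples.models.llama2.tokenizer.tokenizer import (
    Tokenizer as SentencePieceTokenizer,
)


def load_tokenizer(tokenizer_path: str) -> Union[SentencePieceTokenizer, Tiktoken]:
    # SentencePieceTokenizer raises if the file is not a valid SentencePiece
    # model (e.g. a Llama 3 tiktoken file), which triggers the fallback.
    try:
        return SentencePieceTokenizer(model_path=tokenizer_path)
    except Exception:
        print("Using Tiktokenizer")
        return Tiktoken(model_path=tokenizer_path)
```

Catching a bare `Exception` keeps the probe format-agnostic: any failure to parse the file as a SentencePiece model is treated as "try tiktoken instead".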

File tree

1 file changed: +18 -8 lines changed

examples/models/llama2/eval_llama_lib.py

Lines changed: 18 additions & 8 deletions
```diff
@@ -6,16 +6,22 @@
 
 
 import argparse
-from typing import Optional
+
+from typing import Optional, Union
 
 import lm_eval
 import torch
 
+from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
+from executorch.examples.models.llama2.tokenizer.tokenizer import (
+    Tokenizer as SentencePieceTokenizer,
+)
+
 from lm_eval.api.model import LM
 from lm_eval.evaluator import evaluate
 from lm_eval.models.huggingface import HFLM as eval_wrapper
 from lm_eval.tasks import get_task_dict
-from sentencepiece import SentencePieceProcessor
+
 from torch import nn
 
 from .builder import LlamaEdgeManager
@@ -33,7 +39,7 @@ class GPTFastEvalWrapper(eval_wrapper):
     def __init__(
         self,
         model: nn.Module,
-        tokenizer: SentencePieceProcessor,
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
         max_seq_length: Optional[int] = None,
     ):
         super().__init__()
@@ -46,7 +52,7 @@ def __init__(
 
     @property
     def eot_token_id(self):
-        return self._tokenizer.eos_id()
+        return self._tokenizer.eos_id
 
     @property
     def max_length(self):
@@ -65,7 +71,7 @@ def device(self):
         return self._device
 
     def tok_encode(self, string: str, **kwargs):
-        tokens = [self._tokenizer.bos_id()] + self._tokenizer.encode(string)
+        tokens = self._tokenizer.encode(string, bos=True, eos=False)
         encoded = torch.tensor(tokens, dtype=torch.int, device=self.device)
         # encoded is a pytorch tensor, but some internal logic in the
         # eval harness expects it to be a list instead
@@ -93,7 +99,7 @@ class ETEagerEvalWrapper(GPTFastEvalWrapper):
     def __init__(
         self,
         model: str,
-        tokenizer: SentencePieceProcessor,
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
         max_seq_length: Optional[int] = None,
     ):
         super().__init__(None, tokenizer, max_seq_length)
@@ -120,7 +126,7 @@ class ETRunnerEvalWrapper(GPTFastEvalWrapper):
     def __init__(
         self,
         model: str,
-        tokenizer: SentencePieceProcessor,
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
         tokenizer_bin: str,
         max_seq_length: Optional[int] = None,
     ):
@@ -183,7 +189,11 @@ def gen_eval_wrapper(
     Returns:
         eval_wrapper (LM): A wrapper interface for the lm-evaluation-harness library.
     """
-    tokenizer = SentencePieceProcessor(model_file=str(args.tokenizer_path))
+    try:
+        tokenizer = SentencePieceTokenizer(model_path=str(args.tokenizer_path))
+    except Exception:
+        print("Using Tiktokenizer")
+        tokenizer = Tiktoken(model_path=str(args.tokenizer_path))
 
     # ExecuTorch Binary Evaluation
     if (model := args.pte) is not None:
```
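The `Union[SentencePieceTokenizer, Tiktoken]` annotations work because the two classes share a duck-typed surface that the `eot_token_id` and `tok_encode` hunks now lean on: `eos_id` is read as an attribute or property rather than called as a method (hence the dropped parentheses), and `encode` accepts `bos`/`eos` flags, so the wrapper no longer prepends `bos_id()` by hand. A hedged sketch of that contract as a `typing.Protocol`; the Protocol itself is not in the codebase, and `bos_id` being a plain attribute is an assumption inferred from the removed `bos_id()` call:

```python
from typing import List, Protocol


class LlamaTokenizer(Protocol):
    """Surface both tokenizer wrappers are assumed to share (names from the diff)."""

    bos_id: int  # assumption: attribute, mirroring eos_id below
    eos_id: int  # read without parentheses in eot_token_id after this change

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Tokenize s, optionally adding BOS/EOS ids (call site: bos=True, eos=False)."""
        ...
```

Either concrete tokenizer then satisfies `tok_encode`'s single call, `self._tokenizer.encode(string, bos=True, eos=False)`.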
