-
Notifications
You must be signed in to change notification settings - Fork 250
Rm tiktoken flag #424
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rm tiktoken flag #424
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,30 +139,61 @@ def from_speculative_args(cls, args): # -> BuilderArgs: | |
@dataclass | ||
class TokenizerArgs: | ||
tokenizer_path: Optional[Union[Path, str]] = None | ||
is_sentencepiece: bool = True | ||
is_sentencepiece: bool = False | ||
is_tiktoken: bool = False | ||
t: Optional[Any] = None | ||
|
||
def __post_init__(self): | ||
try: | ||
from tokenizer.tiktoken import Tokenizer as TiktokenTokenizer | ||
|
||
self.t = TiktokenTokenizer( | ||
model_path=str(self.tokenizer_path) | ||
) | ||
self.is_tiktoken = True | ||
self.is_sentencepiece = False | ||
return | ||
except: | ||
pass | ||
|
||
try: | ||
from sentencepiece import SentencePieceProcessor | ||
|
||
self.t = SentencePieceProcessor( | ||
model_file=str(self.tokenizer_path) | ||
) | ||
self.is_tiktoken = False | ||
self.is_sentencepiece = True | ||
return | ||
except: | ||
pass | ||
|
||
self.is_tiktoken = False | ||
self.is_sentencepiece = False | ||
self.t = None | ||
return | ||
|
||
|
||
def validate_model( | ||
self, | ||
model: Transformer, | ||
model_description: str = "model", | ||
): | ||
) -> None: | ||
if model is None: | ||
return | ||
|
||
use_tiktoken = model.config.use_tiktoken | ||
is_tiktoken = self.is_tiktoken | ||
condition = False # not (self.is_tiktoken == model.config.use_tiktoken) or not (self.is_sentencepiece == not model.config.use_tiktoken) | ||
|
||
if use_tiktoken is None: | ||
model.config.use_tiktoken = is_tiktoken | ||
elif use_tiktoken != is_tiktoken: | ||
if condition: | ||
raise RuntimeError( | ||
f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)} for {model_description}" | ||
"test" # f"model-specified tokenizer ({tokenizer_setting_to_name(model.config.use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(self.is_tiktoken)} for {model_description}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we add details to tell the user how to resolve the issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I'll uncomment this in a followup PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Put another way: "you gave me the wrong tokenizer, give me the right one. you must specify the tokenizer that corresponds to this model, not some random file you picked up from the side of the road.". It's hard to put it clearly, and politely |
||
) | ||
|
||
return | ||
|
||
@classmethod | ||
def from_args(cls, args): # -> TokenizerArgs: | ||
is_sentencepiece = True | ||
is_sentencepiece = False | ||
is_tiktoken = False | ||
|
||
if args.tokenizer_path: | ||
|
@@ -185,28 +216,16 @@ def from_args(cls, args): # -> TokenizerArgs: | |
if not tokenizer_path.is_file(): | ||
raise RuntimeError(f"did not find tokenizer at {tokenizer_path}") | ||
|
||
if args.tiktoken: | ||
is_sentencepiece = False | ||
is_tiktoken = True | ||
|
||
return cls( | ||
tokenizer_path=tokenizer_path, | ||
is_sentencepiece=is_sentencepiece, | ||
is_tiktoken=is_tiktoken, | ||
t=None, | ||
) | ||
|
||
|
||
def _initialize_tokenizer(tokenizer_args: TokenizerArgs): | ||
if tokenizer_args.is_sentencepiece: | ||
from sentencepiece import SentencePieceProcessor | ||
|
||
return SentencePieceProcessor(model_file=str(tokenizer_args.tokenizer_path)) | ||
elif tokenizer_args.is_tiktoken: | ||
from tokenizer.tiktoken import Tokenizer as TiktokenTokenizer | ||
|
||
return TiktokenTokenizer(model_path=str(tokenizer_args.tokenizer_path)) | ||
else: | ||
raise RuntimeError("must specify a valid tokenizer in TokenizerArgs") | ||
return tokenizer_args.t | ||
|
||
|
||
torch._inductor.config.coordinate_descent_tuning = True | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will indeed work, but this will be somewhat slow. Is there a way to check some magic from a file or something?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wow, tiktoken is a simple space-separated base64 token + token number format
For example, token 125355 is encoded as
INGF0LDRgNCw0LrRgtC10YDQuNGB0YLQuNC60Lg=
and
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that's very characteristic of modern formats lolz. Or uunet of the days long gone.
Def was my first choice, but we're running out of time. I wish
file $filename
worked on these things! Def should write a checker, although the risk is that the format changes and we end up having to chase implementations. It's optimized for llama3 (there's no overhead for that one).