Skip to content

Commit bafff76

Browse files
committed
feat(builder): Add support for using the TokenizersTokenizer in builder
Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 99de873 commit bafff76

File tree

1 file changed

+35
-5
lines changed

1 file changed

+35
-5
lines changed

torchchat/cli/builder.py

Lines changed: 35 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,7 @@ class TokenizerArgs:
193193
tokenizer_path: Optional[Union[Path, str]] = None
194194
is_sentencepiece: bool = False
195195
is_tiktoken: bool = False
196+
is_tokenizers: bool = False
196197
t: Optional[Any] = None
197198

198199
def __post_init__(self):
@@ -202,6 +203,7 @@ def __post_init__(self):
202203
self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
203204
self.is_tiktoken = True
204205
self.is_sentencepiece = False
206+
self.is_tokenizers = False
205207
return
206208
except:
207209
pass
@@ -212,12 +214,25 @@ def __post_init__(self):
212214
self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
213215
self.is_tiktoken = False
214216
self.is_sentencepiece = True
217+
self.is_tokenizers = False
218+
return
219+
except:
220+
pass
221+
222+
try:
223+
from tokenizer.tokenizers import TokenizersTokenizer
224+
225+
self.t = TokenizersTokenizer(str(self.tokenizer_path))
226+
self.is_tiktoken = False
227+
self.is_sentencepiece = False
228+
self.is_tokenizers = True
215229
return
216230
except:
217231
pass
218232

219233
self.is_tiktoken = False
220234
self.is_sentencepiece = False
235+
self.is_tokenizers = False
221236
self.t = None
222237
return
223238

@@ -229,16 +244,27 @@ def validate_model(
229244
if model is None:
230245
return
231246

232-
if self.is_tiktoken == self.is_sentencepiece:
247+
if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
233248
raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
234249

235250
is_tiktoken = self.is_tiktoken
236251
is_sentencepiece = self.is_sentencepiece
252+
is_tokenizers = self.is_tokenizers
237253
use_tiktoken = model.config.use_tiktoken
254+
use_tokenizers = model.config.use_tokenizers
255+
use_sentencepiece = not (use_tiktoken or use_tokenizers)
238256

239-
if not (is_tiktoken == use_tiktoken) or not (is_sentencepiece != use_tiktoken):
257+
if (
258+
(is_tiktoken and not use_tiktoken) or
259+
(is_tokenizers and not use_tokenizers) or
260+
(is_sentencepiece and not use_sentencepiece)
261+
):
240262
raise RuntimeError(
241-
f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)}) does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) for {model_description}"
263+
"model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
264+
tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
265+
tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
266+
model_description,
267+
)
242268
)
243269

244270
return
@@ -594,5 +620,9 @@ def _initialize_model(
594620
return model
595621

596622

597-
def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
598-
return "TikToken" if tiktoken else "SentencePiece"
623+
def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str:
624+
if tiktoken:
625+
return "TikToken"
626+
if tokenizers:
627+
return "Tokenizers"
628+
return "SentencePiece"

0 commit comments

Comments (0)