@@ -204,7 +204,7 @@ class TokenizerArgs:
204
204
tokenizer_path : Optional [Union [Path , str ]] = None
205
205
is_sentencepiece : bool = False
206
206
is_tiktoken : bool = False
207
- is_tokenizers : bool = False
207
+ is_hf_tokenizer : bool = False
208
208
t : Optional [Any ] = None
209
209
210
210
def __post_init__ (self ):
@@ -214,7 +214,7 @@ def __post_init__(self):
214
214
self .t = TiktokenTokenizer (model_path = str (self .tokenizer_path ))
215
215
self .is_tiktoken = True
216
216
self .is_sentencepiece = False
217
- self .is_tokenizers = False
217
+ self .is_hf_tokenizer = False
218
218
return
219
219
except :
220
220
pass
@@ -225,25 +225,25 @@ def __post_init__(self):
225
225
self .t = SentencePieceProcessor (model_file = str (self .tokenizer_path ))
226
226
self .is_tiktoken = False
227
227
self .is_sentencepiece = True
228
- self .is_tokenizers = False
228
+ self .is_hf_tokenizer = False
229
229
return
230
230
except :
231
231
pass
232
232
233
233
try :
234
- from tokenizer .tokenizers import TokenizersTokenizer
234
+ from tokenizer .hf_tokenizer import HFTokenizer
235
235
236
- self .t = TokenizersTokenizer (str (self .tokenizer_path ))
236
+ self .t = HFTokenizer (str (self .tokenizer_path ))
237
237
self .is_tiktoken = False
238
238
self .is_sentencepiece = False
239
- self .is_tokenizers = True
239
+ self .is_hf_tokenizer = True
240
240
return
241
241
except :
242
242
pass
243
243
244
244
self .is_tiktoken = False
245
245
self .is_sentencepiece = False
246
- self .is_tokenizers = False
246
+ self .is_hf_tokenizer = False
247
247
self .t = None
248
248
return
249
249
@@ -255,25 +255,25 @@ def validate_model(
255
255
if model is None :
256
256
return
257
257
258
- if len ( list ( filter ( lambda x : x , [self .is_tiktoken , self .is_tokenizers , self .is_sentencepiece ])) ) != 1 :
258
+ if sum ( [self .is_tiktoken , self .is_hf_tokenizer , self .is_sentencepiece ]) != 1 :
259
259
raise RuntimeError (f"no tokenizer was found at { self .tokenizer_path } " )
260
260
261
261
is_tiktoken = self .is_tiktoken
262
262
is_sentencepiece = self .is_sentencepiece
263
- is_tokenizers = self .is_tokenizers
263
+ is_hf_tokenizer = self .is_hf_tokenizer
264
264
use_tiktoken = model .config .use_tiktoken
265
- use_tokenizers = model .config .use_tokenizers
266
- use_sentencepiece = not (use_tiktoken or use_tokenizers )
265
+ use_hf_tokenizer = model .config .use_hf_tokenizer
266
+ use_sentencepiece = not (use_tiktoken or use_hf_tokenizer )
267
267
268
268
if (
269
269
(is_tiktoken and not use_tiktoken ) or
270
- (is_tokenizers and not use_tokenizers ) or
270
+ (is_hf_tokenizer and not use_hf_tokenizer ) or
271
271
(is_sentencepiece and not use_sentencepiece )
272
272
):
273
273
raise RuntimeError (
274
274
"model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}" .format (
275
- tokenizer_setting_to_name (use_tiktoken , use_tokenizers ),
276
- tokenizer_setting_to_name (is_tiktoken , is_tokenizers ),
275
+ tokenizer_setting_to_name (use_tiktoken , use_hf_tokenizer ),
276
+ tokenizer_setting_to_name (is_tiktoken , is_hf_tokenizer ),
277
277
model_description ,
278
278
)
279
279
)
0 commit comments