Commit 4a20f69

fix(hf_tokenizer): Rename to HFTokenizer and corresponding flags
#1251 Branch: TokenizersTokenizer-1251
Co-Authored-By: [email protected]
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 9d9a4a7 commit 4a20f69

File tree: 3 files changed, +22 -22 lines changed

tokenizer/tokenizers.py renamed to tokenizer/hf_tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -16,9 +16,9 @@
 from .base import TokenizerBase
 
 
-class TokenizersTokenizer(TokenizerBase):
+class HFTokenizer(TokenizerBase):
     """
-    Wrapper around the `tokenizers` library for API compatibility
+    Wrapper around the Huggingface `tokenizers` library for API compatibility
     """
 
     def __init__(self, file_path: str):
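Only the class name and docstring change in this file; the wrapper body is untouched by the diff. For context, a minimal sketch of what such a module looks like after the rename, assuming the standard `tokenizers` loading API (`Tokenizer.from_file`); the encode/decode method names are illustrative, not taken from this commit:

from typing import List

from tokenizers import Tokenizer

from .base import TokenizerBase


class HFTokenizer(TokenizerBase):
    """
    Wrapper around the Huggingface `tokenizers` library for API compatibility
    """

    def __init__(self, file_path: str):
        # Load a serialized tokenizer.json as produced by Huggingface tooling
        self._tokenizer = Tokenizer.from_file(file_path)

    def encode(self, text: str) -> List[int]:
        # encode() returns an Encoding object; .ids holds the token ids
        return self._tokenizer.encode(text).ids

    def decode(self, ids: List[int]) -> str:
        return self._tokenizer.decode(ids)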

torchchat/cli/builder.py

Lines changed: 14 additions & 14 deletions
@@ -204,7 +204,7 @@ class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
     is_sentencepiece: bool = False
     is_tiktoken: bool = False
-    is_tokenizers: bool = False
+    is_hf_tokenizer: bool = False
     t: Optional[Any] = None
 
     def __post_init__(self):
@@ -214,7 +214,7 @@ def __post_init__(self):
             self.t = TiktokenTokenizer(model_path=str(self.tokenizer_path))
             self.is_tiktoken = True
             self.is_sentencepiece = False
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass
@@ -225,25 +225,25 @@ def __post_init__(self):
             self.t = SentencePieceProcessor(model_file=str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = True
-            self.is_tokenizers = False
+            self.is_hf_tokenizer = False
             return
         except:
             pass
 
         try:
-            from tokenizer.tokenizers import TokenizersTokenizer
+            from tokenizer.hf_tokenizer import HFTokenizer
 
-            self.t = TokenizersTokenizer(str(self.tokenizer_path))
+            self.t = HFTokenizer(str(self.tokenizer_path))
             self.is_tiktoken = False
             self.is_sentencepiece = False
-            self.is_tokenizers = True
+            self.is_hf_tokenizer = True
             return
         except:
             pass
 
         self.is_tiktoken = False
         self.is_sentencepiece = False
-        self.is_tokenizers = False
+        self.is_hf_tokenizer = False
         self.t = None
         return
 
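The __post_init__ hunks above keep the existing trial-and-error detection: tiktoken, then sentencepiece, then the Huggingface `tokenizers` wrapper are tried in turn, and the first backend that loads the file sets its (renamed) flag. A hypothetical usage sketch; the path below is illustrative:

from torchchat.cli.builder import TokenizerArgs

# __post_init__ probes each backend; a tokenizer.json file loads via the
# HF wrapper, so the renamed flag ends up True.
args = TokenizerArgs(tokenizer_path="checkpoints/llama/tokenizer.json")  # illustrative path
if args.is_hf_tokenizer:
    print("Loaded with the Huggingface `tokenizers` backend")
elif args.t is None:
    print("No tokenizer backend could load this file")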
@@ -255,25 +255,25 @@ def validate_model(
         if model is None:
             return
 
-        if len(list(filter(lambda x: x, [self.is_tiktoken, self.is_tokenizers, self.is_sentencepiece]))) != 1:
+        if sum([self.is_tiktoken, self.is_hf_tokenizer, self.is_sentencepiece]) != 1:
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")
 
         is_tiktoken = self.is_tiktoken
         is_sentencepiece = self.is_sentencepiece
-        is_tokenizers = self.is_tokenizers
+        is_hf_tokenizer = self.is_hf_tokenizer
         use_tiktoken = model.config.use_tiktoken
-        use_tokenizers = model.config.use_tokenizers
-        use_sentencepiece = not (use_tiktoken or use_tokenizers)
+        use_hf_tokenizer = model.config.use_hf_tokenizer
+        use_sentencepiece = not (use_tiktoken or use_hf_tokenizer)
 
         if (
             (is_tiktoken and not use_tiktoken) or
-            (is_tokenizers and not use_tokenizers) or
+            (is_hf_tokenizer and not use_hf_tokenizer) or
             (is_sentencepiece and not use_sentencepiece)
         ):
             raise RuntimeError(
                 "model-specified tokenizer ({}) does not match provided tokenizer ({}) for {}".format(
-                    tokenizer_setting_to_name(use_tiktoken, use_tokenizers),
-                    tokenizer_setting_to_name(is_tiktoken, is_tokenizers),
+                    tokenizer_setting_to_name(use_tiktoken, use_hf_tokenizer),
+                    tokenizer_setting_to_name(is_tiktoken, is_hf_tokenizer),
                     model_description,
                 )
             )
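Beyond the rename, the validate_model hunk simplifies the exactly-one-backend guard: Python bools are ints, so sum() counts the flags that are set, replacing the older len(list(filter(...))) idiom. A standalone illustration of the equivalence:

flags = [True, False, False]  # is_tiktoken, is_hf_tokenizer, is_sentencepiece

# Old and new guards agree: both count the flags that are set.
assert len(list(filter(lambda x: x, flags))) == sum(flags) == 1

# With zero or multiple flags set, both forms reject the configuration.
assert sum([False, False, False]) != 1
assert sum([True, True, False]) != 1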

torchchat/model.py

Lines changed: 6 additions & 6 deletions
@@ -272,7 +272,7 @@ class TransformerArgs:
     ffn_dim_multiplier: Optional[int] = None
     # Select the desired tokenizer. Defaults to sentencepiece
     use_tiktoken: bool = False
-    use_tokenizers: bool = False
+    use_hf_tokenizer: bool = False
     max_seq_length: int = 8192
     rope_scaling: Optional[Dict[str, Any]] = None
     # For pipeline parallel
@@ -329,14 +329,14 @@ class ModelArgs:
     model_type: ModelType
     transformer_args: Dict[str, Dict[str, Any]]
     use_tiktoken: bool
-    use_tokenizers: bool
+    use_hf_tokenizer: bool
 
     def __init__(
         self,
         transformer_args: Dict[str, Dict[str, Any]],
         model_type: ModelType = ModelType.TextOnly,
         use_tiktoken: bool = False,
-        use_tokenizers: bool = False,
+        use_hf_tokenizer: bool = False,
     ) -> None:
         self._sanity_check(transformer_args, model_type)
 
@@ -345,7 +345,7 @@ def __init__(
 
         # Model-level attributes
         self.use_tiktoken = use_tiktoken
-        self.use_tokenizers = use_tokenizers
+        self.use_hf_tokenizer = use_hf_tokenizer
 
     def _sanity_check(
         self,
@@ -372,8 +372,8 @@ def from_params(cls, params_path):
         }
 
         use_tiktoken = loaded_params.get("use_tiktoken", False)
-        use_tokenizers = loaded_params.get("use_tokenizers", False)
-        return cls(transformer_args, model_type, use_tiktoken, use_tokenizers)
+        use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False)
+        return cls(transformer_args, model_type, use_tiktoken, use_hf_tokenizer)
 
     @classmethod
     def from_table(cls, name: str):
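One consequence of the rename in from_params: the loader now reads the use_hf_tokenizer key (defaulting to False), so a params file selects the HF backend with the new key, while a legacy file still keyed on use_tokenizers falls back to the default. A sketch with illustrative params content:

loaded_params = {"use_tiktoken": False, "use_hf_tokenizer": True}  # illustrative contents

# Mirrors the lookups in ModelArgs.from_params after this commit.
use_tiktoken = loaded_params.get("use_tiktoken", False)
use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False)
assert use_hf_tokenizer and not use_tiktoken

# A legacy file keyed on the old name no longer selects the HF backend.
legacy = {"use_tokenizers": True}
assert legacy.get("use_hf_tokenizer", False) is False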
