Commit 1390ad4

feat(tokenizers): Add a python impl of the Tokenizer interface using tokenizers
This allows all HF tokenizers to be supported in the Python layer. It will need significant work to offer similar compatibility at the C++ layer.

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent c66e2fb commit 1390ad4

File tree

1 file changed: 64 additions, 0 deletions

tokenizer/tokenizers.py

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Standard
from typing import List
import json

# Third Party
from tokenizers import Tokenizer

# Local
from .base import TokenizerBase


class TokenizersTokenizer(TokenizerBase):
    """
    Wrapper around the `tokenizers` library for API compatibility
    """

    def __init__(self, file_path: str):
        self._tokenizer = Tokenizer.from_file(file_path)
        # The BOS and EOS tokens are not easily visible from the tokenizer
        # object itself, so we extract the BOS token at construction with a
        # sample call (add_special_tokens=True prepends it).
        self._bos_token = self._tokenizer.encode("Test", add_special_tokens=True).ids[0]
        # There is no explicit EOS token in many tokenizers, so we look for a
        # single special added token that most resembles an EOS token.
        self._eos_token = None
        tok_content = json.loads(self._tokenizer.to_str())
        end_toks = [
            tok for tok in tok_content["added_tokens"]
            if tok["special"] and "end" in tok["content"]
        ]
        assert end_toks, "Unable to find an EOS token in the added tokens"
        if len(end_toks) == 1:
            self._eos_token = end_toks[0]["id"]
        else:
            # Multiple "end" tokens (e.g. end-of-turn vs end-of-text): prefer
            # the one that marks the end of the text stream.
            end_text_toks = [
                tok for tok in end_toks if "text" in tok["content"]
            ]
            if len(end_text_toks) == 1:
                self._eos_token = end_text_toks[0]["id"]
        assert self._eos_token is not None, "Unable to find an EOS token in the added tokens"

    def encode(
        self,
        s: str,
        *,
        bos: bool = False,
        eos: bool = False,
    ) -> List[int]:
        # add_special_tokens controls the tokenizer's own special-token
        # handling (the BOS prefix); EOS is appended manually if the
        # tokenizer did not already add it.
        res = self._tokenizer.encode(s, add_special_tokens=bos).ids
        if eos and (not res or res[-1] != self._eos_token):
            res.append(self._eos_token)
        return res

    def decode(self, ids: List[int]) -> str:
        return self._tokenizer.decode(ids)

    def bos_id(self) -> int:
        return self._bos_token

    def eos_id(self) -> int:
        return self._eos_token
```
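
For reference, a minimal usage sketch of the new class (not part of the commit). The import path follows the file location in this commit, and `tokenizer.json` stands in for a hypothetical serialized HF tokenizer file on disk:

```python
# Minimal usage sketch (not part of the commit). Assumes the package layout
# makes tokenizer/tokenizers.py importable as below, and that
# "tokenizer.json" is a serialized HF tokenizer file on disk.
from tokenizer.tokenizers import TokenizersTokenizer

tok = TokenizersTokenizer("tokenizer.json")
ids = tok.encode("Hello, world", bos=True, eos=True)
print(ids)              # token ids, ending with tok.eos_id()
print(tok.decode(ids))  # decodes back to text (special tokens skipped by default)
print(tok.bos_id(), tok.eos_id())
```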
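
The EOS heuristic in `__init__` scans the tokenizer's serialized JSON rather than any dedicated API. A sketch of what that scan sees, with illustrative (not actual) ids and Llama-3-style token names:

```python
# Sketch of the JSON shape the EOS heuristic relies on (illustrative values).
import json
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
tok_content = json.loads(tokenizer.to_str())

# Each entry of "added_tokens" is a dict along the lines of:
#   {"id": 128001, "content": "<|end_of_text|>", "special": True, ...}
# The constructor keeps the special entries whose content contains "end",
# then narrows to the one containing "text" if several match.
for tok in tok_content["added_tokens"]:
    if tok["special"] and "end" in tok["content"]:
        print(tok["id"], tok["content"])
```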
