Skip to content

Commit c66e2fb

Browse files
committed
feat(tokenizer): Add an abstract base class for additional tokenizer support
Branch: GraniteCodeSupport
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent c840070 commit c66e2fb

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

tokenizer/base.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
"""
7+
Abstract base class for all tokenizer classes in Python, matching the C++ interface.
8+
"""
9+
10+
# Standard
11+
from abc import ABC, abstractmethod
12+
from typing import List
13+
14+
15+
class TokenizerBase(ABC):
16+
__doc__ = __doc__
17+
18+
@abstractmethod
19+
def encode(self, s: str, *, bos: bool = False, eos: bool = False) -> List[int]:
20+
"""Encode the given string and optionally include bos/eos tokens"""
21+
22+
@abstractmethod
23+
def decode(self, ids: List[int]) -> str:
24+
"""Decode the given token ids into a string"""
25+
26+
@abstractmethod
27+
def bos_id(self) -> int:
28+
"""The id of the begin-of-string token"""
29+
30+
@abstractmethod
31+
def eos_id(self) -> int:
32+
"""The id of the end-of-string token"""

tokenizer/tiktoken.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import tiktoken
2424
from tiktoken.load import load_tiktoken_bpe
2525

26+
from .base import TokenizerBase
27+
2628

2729
logger = getLogger(__name__)
2830

@@ -38,7 +40,7 @@ class Message(TypedDict):
3840
Dialog = Sequence[Message]
3941

4042

41-
class Tokenizer:
43+
class Tokenizer(TokenizerBase):
4244
"""
4345
tokenizing and encoding/decoding text using the Tiktoken tokenizer.
4446
"""

0 commit comments

Comments
 (0)