
Commit 1b35a23

larryliu0820 authored and facebook-github-bot committed
Migrate extension/llm/tokenizer python users to use the new repo (#22)
Summary: Migrate usages of `fbcode//executorch/extension/llm/tokenizer:tokenizer_py` to `fbcode//pytorch/tokenizers/pytorch_tokenizers:tokenizers`.

Differential Revision: D69820450
1 parent b3ba207 · commit 1b35a23
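For downstream Buck files, the migration is a one-line dependency swap. A minimal sketch, assuming a hypothetical consumer target:

```
python_binary(
    name = "my_llm_tool",  # hypothetical consumer target
    srcs = ["my_llm_tool.py"],
    deps = [
        # Before: "fbcode//executorch/extension/llm/tokenizer:tokenizer_py",
        "fbcode//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
    ],
)
```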

File tree

8 files changed: +481, -118 lines


pytorch_tokenizers/TARGETS

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain xplat-only targets.

load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()

python_library(
    name = "hf_tokenizer",
    srcs = ["hf_tokenizer.py"],
    labels = ["autodeps2_generated"],
    deps = [
        "fbsource//third-party/pypi/tokenizers:tokenizers",
    ],
)

pytorch_tokenizers/__init__.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


from typing import Optional

from .hf_tokenizer import HuggingFaceTokenizer
from .llama2c import Llama2cTokenizer
from .tiktoken import TiktokenTokenizer

__all__ = ["TiktokenTokenizer", "Llama2cTokenizer", "HuggingFaceTokenizer"]


def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
    if tokenizer_path.endswith(".json"):
        tokenizer = HuggingFaceTokenizer(tokenizer_path, tokenizer_config_path)
    else:
        try:
            tokenizer = Llama2cTokenizer(model_path=str(tokenizer_path))
        except Exception:
            print("Using TiktokenTokenizer")
            tokenizer = TiktokenTokenizer(model_path=str(tokenizer_path))
    return tokenizer
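A quick usage sketch of the dispatch logic above (file paths are hypothetical): a `.json` path routes to `HuggingFaceTokenizer`, anything else tries `Llama2cTokenizer` first and falls back to `TiktokenTokenizer`.

```
from pytorch_tokenizers import get_tokenizer

# Hypothetical artifact paths; any tokenizer.json exported from HuggingFace works.
tokenizer = get_tokenizer("tokenizer.json", "tokenizer_config.json")
ids = tokenizer.encode("Hello, world!", bos=True, eos=False)
print(ids, tokenizer.decode(ids))
```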

pytorch_tokenizers/hf_tokenizer.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from typing import List, Optional

from tokenizers import Tokenizer


class HuggingFaceTokenizer:
    """
    Tokenizing and encoding/decoding text using the Hugging Face tokenizer.
    """

    def __init__(self, model_path: str, config_path: Optional[str] = None):
        """
        Initializes the Tokenizer with a tokenizer.json from HuggingFace.

        Args:
            model_path (str): The path to the HuggingFace tokenizer.json file.
        """
        assert os.path.isfile(model_path), model_path

        self.model = tokenizer = Tokenizer.from_file(model_path)

        self.n_words: int = tokenizer.get_vocab_size()
        if config_path:
            with open(config_path) as f:
                tokenizer_config = json.load(f)
            self.bos_id = (
                self.model.token_to_id(tokenizer_config["bos_token"])
                if tokenizer_config["bos_token"]
                else None
            )
            self.eos_id = self.model.token_to_id(tokenizer_config["eos_token"])
        else:  # Fallback guess.
            self.bos_id = self.model.token_to_id("<|begin_of_text|>")
            self.eos_id = self.model.token_to_id("<|endoftext|>")

        self.stop_tokens = [
            self.eos_id,
        ]

    def encode(self, s: str, *, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        return self.model.encode(s).ids

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)

    def decode_token(self, t: int) -> str:
        return self.model.decode([t])
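The class can also be used directly. Note that `encode` accepts `bos`/`eos` keywords but does not apply them, and the BOS/EOS ids come either from `tokenizer_config.json` or from the hardcoded fallback guesses. A minimal sketch with hypothetical file paths:

```
from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer

tok = HuggingFaceTokenizer("tokenizer.json", config_path="tokenizer_config.json")
print(tok.n_words, tok.bos_id, tok.eos_id)

ids = tok.encode("hello world", bos=False, eos=False)
print(tok.decode(ids))
```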

pytorch_tokenizers/llama2c.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os
import struct
from typing import List

from sentencepiece import SentencePieceProcessor as SentencePieceProcessor


class Llama2cTokenizer:
    def __init__(self, model_path: str):
        assert os.path.isfile(
            model_path
        ), f"Need a valid tokenizer model path but got {model_path}"
        # pyre-fixme[28]: Unexpected keyword argument `model_file` to call `SentencePieceProcessor.__init__`.
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        logging.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_piece_size`.
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `decode`.
        return self.sp_model.decode(t)

    def decode_token(self, t: int) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `decode`.
        return self.sp_model.decode(t)

    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
        """
        Export tokenizer.model to another serialization format. Here we do some
        lightweight processing, such as optionally prepending a padding token,
        prepending the max token length, and replacing '▁' with a space.

        The binary format is:
        1. vocab size: int32
        2. bos token id: int32
        3. eos token id: int32
        4. max token length: int32
        5. score: float32, len of bytes: int32, token bytes: [byte] for each token

        :param output_path: output path of the new binary.
        :param prepend_padding: a boolean to control if we want to prepend a padding token.

        :return: None
        """

        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []

        if prepend_padding:
            # Here we use the default padding token and its score.
            tokens.append("<pad>".encode("utf-8"))
            scores.append(-1)

        for i in range(self.n_words):
            # decode the token and light postprocessing
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
            t = self.sp_model.id_to_piece(i)
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
            s = self.sp_model.get_score(i)
            # sentencepiece uses '<s>' as BOS and '</s>' as EOS
            if i == self.bos_id:
                t = "<s>"
            elif i == self.eos_id:
                t = "</s>"
            t = t.replace("▁", " ")  # sentencepiece uses this character as whitespace
            b = t.encode("utf-8")  # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = 0 if not tokens else max(len(t) for t in tokens)

        # write to a binary file
        with open(output_path, "wb") as f:
            # write the vocab size, bos/eos ids and max token length
            f.write(
                struct.pack(
                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
                )
            )
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
        logging.info(f"Wrote tokenizer to {output_path}")
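The exported binary can be read back by walking the same layout in reverse: a 16-byte `IIII` header, then one `fI` pair (float32 score, uint32 byte length) followed by the token bytes per entry. A minimal reader sketch, not part of this commit; it reads entries until EOF, since the vocab size in the header does not count a prepended padding token:

```
import struct


def read_exported_tokenizer(path: str):
    """Sketch: parse a binary produced by Llama2cTokenizer.export."""
    with open(path, "rb") as f:
        n_words, bos_id, eos_id, max_token_length = struct.unpack("IIII", f.read(16))
        entries = []
        while True:
            header = f.read(8)  # float32 score + uint32 byte length
            if len(header) < 8:
                break
            score, length = struct.unpack("fI", header)
            entries.append((score, f.read(length)))
    return n_words, bos_id, eos_id, max_token_length, entries
```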

pytorch_tokenizers/targets.bzl

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """
    runtime.python_library(
        name = "tokenizers",
        srcs = [
            "__init__.py",
            "llama2c.py",
            "tiktoken.py",
            "hf_tokenizer.py",
        ],
        base_module = "pytorch_tokenizers",
        visibility = [
            "//executorch/examples/...",
            "//executorch/extension/llm/export/...",
            "//bento/...",
            "//bento_kernels/...",
            "//pytorch/tokenizers/...",
            "@EXECUTORCH_CLIENTS",
        ],
        _is_external_target = True,
        external_deps = [
            "sentencepiece-py",
        ],
        deps = [
            "fbsource//third-party/pypi/tiktoken:tiktoken",
            "fbsource//third-party/pypi/tokenizers:tokenizers",
        ],
    )
