
Commit c340b78

Migrate helios' usage of extension/llm/tokenizer to pytorch/tokenizers
Differential Revision: D69885635
Pull Request resolved: #23
1 parent 0763945 commit c340b78


6 files changed: +193 -0 lines changed


tools/tokenize_tool/CMakeLists.txt renamed to examples/tokenize_tool/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 #
 # This source code is licensed under the BSD-style license found in the LICENSE
 # file in the root directory of this source tree.
+# @lint-ignore-every LICENSELINT
 
 file(GLOB source_files ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 get_filename_component(tool_name ${CMAKE_CURRENT_SOURCE_DIR} NAME)

tools/tokenize_tool/main.cpp renamed to examples/tokenize_tool/main.cpp

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
 
 /**
  * This is a simple tool to instantiate a tokenizer and run it over some text.
File renamed without changes.
Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# @lint-ignore-every LICENSELINT


# Script to rewrite tokenizer model given by sentencepiece to llama2.c format, with lightweight
# postprocessing logic. The output can be consumed by llama2c_tokenizer.cpp.

import argparse
import logging
import os
import struct
from typing import List

from sentencepiece import SentencePieceProcessor as SentencePieceProcessor


class Tokenizer:
    def __init__(self, model_path: str):
        assert os.path.isfile(
            model_path
        ), f"Need a valid tokenizer model path but got {model_path}"
        # pyre-fixme[28]: Unexpected keyword argument `model_file` to call `SentencePieceProcessor.__init__`.
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        logging.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_piece_size`.
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        return self.sp_model.decode(t)

    def decode_token(self, t: int) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        return self.sp_model.decode(t)

    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
        """
        Export tokenizer.model to another serialization format. Here we did some lightweight
        processing such as supporting prepend padding token, prepend max token length and
        replace '_' back to empty space.

        The binary format is:
        1. vocab size: int32
        2. bos token id: int32
        3. eos token id: int32
        4. max token length: int32
        5. score: float32, len of bytes: int32, token bytes: [byte] for each token

        :param output_path: output path of the new binary.
        :param prepend_padding: a boolean to control if we want to prepend a padding token.

        :return: None
        """

        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []

        if prepend_padding:
            # Here we use the default padding token and its score.
            tokens.append("<pad>".encode("utf-8"))
            scores.append(-1)

        for i in range(self.n_words):
            # decode the token and light postprocessing
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
            t = self.sp_model.id_to_piece(i)
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
            s = self.sp_model.get_score(i)
            # sentencepiece use '<s>' as BOS and '</s>' for EOS
            if i == self.bos_id:
                t = "<s>"
            elif i == self.eos_id:
                t = "</s>"
            t = t.replace("▁", " ")  # sentencepiece uses this character as whitespace
            b = t.encode("utf-8")  # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = 0 if not tokens else max(len(t) for t in tokens)

        # write to a binary file
        with open(output_path, "wb") as f:
            # write the vocab size, bos/eos ids and max token length
            f.write(
                struct.pack(
                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
                )
            )
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
                f.write(bytes)
        logging.info(f"Wrote tokenizer to {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tokenizer-model",
        type=str,
        default="tokenizer.model",
        help="path to tokenizer model, given by sentencepiece",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        type=str,
        default=None,
        help="output path of postprocessed tokenizer model",
    )
    parser.add_argument(
        "-p",
        "--prepend-padding",
        action="store_true",
        help="whether to prepend a padding token to the beginning of the tokenizer",
    )

    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)

    output_path = (
        args.output_path
        if args.output_path
        else args.tokenizer_model.replace(".model", ".bin")
    )
    t.export(output_path, prepend_padding=args.prepend_padding)
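
For reference, here is a minimal read-back sketch (not part of this commit) that parses the .bin layout documented in export() above: a header of four uint32 values (vocab size, BOS id, EOS id, max token length), followed by one record per token consisting of a float32 score, a uint32 byte length, and the raw token bytes. The helper name read_llama2c_tokenizer is illustrative only. Note that when --prepend-padding is used, the file appears to contain one more record than the vocab size written in the header, so the sketch reads records until EOF rather than trusting the header count.

import struct
from typing import List, Tuple


def read_llama2c_tokenizer(path: str) -> Tuple[int, int, int, int, List[Tuple[float, bytes]]]:
    """Parse a .bin file produced by Tokenizer.export() (illustrative helper, not an API)."""
    with open(path, "rb") as f:
        # Header: vocab size, BOS id, EOS id, max token length (four uint32s).
        header_fmt = struct.Struct("IIII")
        n_words, bos_id, eos_id, max_token_length = header_fmt.unpack(f.read(header_fmt.size))

        entries: List[Tuple[float, bytes]] = []
        record_fmt = struct.Struct("fI")
        while True:
            head = f.read(record_fmt.size)
            if len(head) < record_fmt.size:
                break  # end of file
            # Per-token record: score (float32), byte length (uint32), then the raw token bytes.
            score, length = record_fmt.unpack(head)
            entries.append((score, f.read(length)))
    return n_words, bos_id, eos_id, max_token_length, entries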
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """
    runtime.python_library(
        name = "convert_lib",
        srcs = [
            "__init__.py",
            "convert.py",
        ],
        base_module = "pytorch_tokenizers.tools.llama2c",
        visibility = [
            "//executorch/examples/...",
            "//executorch/extension/llm/export/...",
            "//bento/...",
            "//bento_kernels/...",
            "@EXECUTORCH_CLIENTS",
        ],
        _is_external_target = True,
        external_deps = [
            "sentencepiece-py",
        ],
    )

    runtime.python_binary(
        name = "convert",
        main_module = "pytorch_tokenizers.tools.llama2c.convert",
        visibility = [
            "//executorch/examples/...",
            "fbsource//xplat/executorch/examples/...",
        ],
        _is_external_target = True,
        deps = [
            ":convert_lib",
        ],
    )
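
Because convert_lib is mapped to base_module pytorch_tokenizers.tools.llama2c, the conversion can also be driven programmatically rather than through the convert binary. A minimal sketch, assuming the package is importable from the Python path and a SentencePiece tokenizer.model is available locally:

# Assumes pytorch_tokenizers is importable and tokenizer.model exists on disk.
from pytorch_tokenizers.tools.llama2c.convert import Tokenizer

t = Tokenizer("tokenizer.model")
# Writes the llama2.c-style binary that llama2c_tokenizer.cpp consumes.
t.export("tokenizer.bin", prepend_padding=False)

The convert python_binary above wraps the same module as a CLI entry point (main_module = "pytorch_tokenizers.tools.llama2c.convert"), so the -t/-o/-p flags defined in the argparse block should apply there as well.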
