Commit 70a84b6

larryliu0820 authored and facebook-github-bot committed
Buckify tokenizers (#17)
Summary:
X-link: pytorch/executorch#8408
Pull Request resolved: #17

So that it can be used by ET internally.

Reviewed By: jackzhxng

Differential Revision: D69509028
1 parent f2fc3d6 commit 70a84b6

17 files changed (+314, -45 lines)

TARGETS

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()

include/detail/bpe_tokenizer_base.h

Lines changed: 4 additions & 0 deletions
@@ -5,11 +5,15 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

 // Base class for all BPE tokenizer implementations
 #pragma once

 // Standard
+#include <memory>
+#include <optional>
+#include <string>
 #include <unordered_map>
 #include <vector>

include/pre_tokenizer.h

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once

 // Standard

@@ -41,6 +43,8 @@ class PreTokenizer {
   */
  virtual std::vector<std::string> pre_tokenize(
      re2::StringPiece input) const = 0;
+
+  virtual ~PreTokenizer() = default;
 }; // end class PreTokenizer

 // -- Factory ------------------------------------------------------------------

include/sentencepiece.h

Lines changed: 2 additions & 1 deletion
@@ -5,8 +5,9 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

-// A tokenizer that works with sentencepiece.
+// A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once

 #include <memory>

include/token_decoder.h

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once

 // Standard

@@ -45,6 +47,9 @@ class TokenDecoder {
   */
  virtual std::string decode(re2::StringPiece token) const = 0;

+  // virtual destructor
+  virtual ~TokenDecoder() = default;
+
 }; // end class TokenDecoder

 // -- Factory ------------------------------------------------------------------

src/bpe_tokenizer_base.cpp

Lines changed: 7 additions & 5 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "detail/bpe_tokenizer_base.h"

 // Standard

@@ -56,7 +58,7 @@ static std::vector<uint64_t> _byte_pair_merge(
     if (rank) {
       // usize::MAX is a sentinel value and cannot be a valid rank
       if (*rank == _max_size()) {
-        fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
+        TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
       }
       parts[i].second = *rank;
     }

@@ -177,8 +179,8 @@ BPETokenizerBase::encode_with_special_token_(
     } catch (const std::out_of_range&) {
       // Should never go here, since special pattern includes all special
       // chars.
-      fprintf(stderr, "unknown special token: %s\n", special->c_str());
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown special token: %s\n", special->c_str());
+      return Error::EncodeFailure;
     }

     tokens.push_back(token);

@@ -259,8 +261,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
     if (iter != special_token_decoder_.end()) {
       token_bytes = iter->second;
     } else {
-      fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
+      return Error::DecodeFailure;
     }
   }
   _decode(token_bytes, ret);
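The net effect of this file's changes is that malformed input no longer kills the process: encode_with_special_token_ now reports Error::EncodeFailure and decode reports Error::DecodeFailure through TK_LOG instead of fprintf plus exit(EXIT_FAILURE). A minimal call-site sketch, assuming the Result type exposes ok()/get() accessors in the style of ExecuTorch's runtime Result (the accessor names are not shown in this diff):

#include <cstdint>
#include <string>

#include "detail/bpe_tokenizer_base.h"

// Hypothetical caller, not part of this commit: with Error returns instead
// of exit(EXIT_FAILURE), an unknown token becomes recoverable.
std::string decode_or_empty(
    tokenizers::BPETokenizerBase& tok, uint64_t prev, uint64_t cur) {
  auto piece = tok.decode(prev, cur);  // Result<std::string>
  if (!piece.ok()) {
    return "";  // e.g. Error::DecodeFailure; the embedder decides what to do
  }
  return piece.get();
}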

src/hf_tokenizer.cpp

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "hf_tokenizer.h"

 // Standard

@@ -256,7 +258,11 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
   if (_decoder) {
     ret += _decoder->decode(input);
   } else {
+#ifdef _USE_INTERNAL_STRING_VIEW
+    ret += input.as_string();
+#else
     ret += input;
+#endif
   }
 }
src/tiktoken.cpp

Lines changed: 4 additions & 0 deletions
@@ -183,7 +183,11 @@ Error Tiktoken::_encode(
 }

 void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
+#ifdef _USE_INTERNAL_STRING_VIEW
+  ret += input.as_string();
+#else
   ret += input;
+#endif
 }

 template <typename T>
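HFTokenizer::_decode and Tiktoken::_decode now carry the same preprocessor guard, which exists because the internal (-D_USE_INTERNAL_STRING_VIEW) StringPiece is a string_view-like type without re2::StringPiece's implicit std::string append. A sketch of the shared pattern as a standalone helper (illustrative only; the helper and the header path are assumptions, not part of the commit):

#include <string>

#include <re2/stringpiece.h>

// Guarded append used verbatim in both _decode overrides above.
static void append_piece(re2::StringPiece input, std::string& ret) {
#ifdef _USE_INTERNAL_STRING_VIEW
  ret += input.as_string();  // internal type needs an explicit conversion
#else
  ret += input;  // OSS re2::StringPiece appends directly
#endif
}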

src/token_decoder.cpp

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "token_decoder.h"

 // Standard

@@ -60,7 +62,7 @@ static std::string format(const char* fmt, ...) {
   int size = vsnprintf(NULL, 0, fmt, ap);
   // GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
   std::vector<char> buf(size + 1);
-  int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+  // int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
   // GGML_ASSERT(size2 == size);
   va_end(ap2);
   va_end(ap);

targets.bzl

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "headers",
+        exported_headers = subdir_glob([
+            ("include", "*.h"),
+            ("include", "**/*.h"),
+        ]),
+        header_namespace = "",
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "sentencepiece",
+        srcs = [
+            "src/sentencepiece.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        deps = [
+            "fbsource//third-party/sentencepiece:sentencepiece",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "src/tiktoken.cpp",
+            "src/bpe_tokenizer_base.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "unicode",
+        srcs = [
+            "third-party/llama.cpp-unicode/src/unicode.cpp",
+            "third-party/llama.cpp-unicode/src/unicode-data.cpp",
+        ],
+        exported_headers = subdir_glob([
+            ("third-party/llama.cpp-unicode/include", "*.h"),
+        ]),
+        header_namespace = "",
+    )
+
+    runtime.cxx_library(
+        name = "hf_tokenizer",
+        srcs = [
+            "src/hf_tokenizer.cpp",
+            "src/bpe_tokenizer_base.cpp",
+            "src/pre_tokenizer.cpp",
+            "src/token_decoder.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+            ":unicode",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+            "nlohmann_json",
+        ],
+    )
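With these rules in place, any Buck target covered by the @EXECUTORCH_CLIENTS visibility group can depend on the tokenizers like any other cxx_library. A hypothetical client rule (the rule name, source file, and label prefix below are made up for illustration; substitute the cell/path where this library actually lives):

runtime.cxx_library(
    name = "tokenizer_user",  # hypothetical client target
    srcs = ["tokenizer_user.cpp"],
    deps = [
        # Illustrative label; use the real package path in your repo.
        "//pytorch/tokenizers:hf_tokenizer",
    ],
)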

test/resources/test_bpe_tokenizer.bin

16 Bytes (binary file not shown)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+tet 0

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA== 18446744073709551616

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA==10

test/test_sentencepiece.cpp

Lines changed: 20 additions & 4 deletions
@@ -5,12 +5,28 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

-#include "gtest/gtest.h"
+#ifdef TOKENIZERS_FB_BUCK
+#include <TestResourceUtils/TestResourceUtils.h>
+#endif
+#include <gtest/gtest.h>
 #include "sentencepiece.h"

 namespace tokenizers {

+namespace {
+static inline std::string _get_resource_path(const std::string& name) {
+#ifdef TOKENIZERS_FB_BUCK
+  return facebook::xplat::testing::getPathForTestResource(
+      "test/resources/" + name);
+#else
+  return std::getenv("RESOURCES_PATH") + std::string("/") + name;
+#endif
+}
+
+} // namespace
+
 TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
   SPTokenizer tokenizer;
   std::string text = "Hello world!";

@@ -26,7 +42,7 @@ TEST(SPTokenizerTest, TestDecodeWithoutLoad) {

 TEST(SPTokenizerTest, TestLoad) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
 }

@@ -39,7 +55,7 @@ TEST(SPTokenizerTest, TestLoadInvalidPath) {

 TEST(SPTokenizerTest, TestEncode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::string text = "Hello world!";

@@ -54,7 +70,7 @@ TEST(SPTokenizerTest, TestEncode) {

 TEST(SPTokenizerTest, TestDecode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::vector<uint64_t> tokens = {1, 15043, 3186, 29991};
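The new _get_resource_path helper is what lets the same test build under both systems: under Buck (TOKENIZERS_FB_BUCK) resources resolve through TestResourceUtils, while the OSS branch now reads RESOURCES_PATH from the environment at run time instead of receiving it as a compile-time definition. One practical consequence: std::getenv returns a null pointer when the variable is unset, so OSS test runners need RESOURCES_PATH exported (pointing at the test/resources directory) before the binary starts.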
