
Commit 96a7954

larryliu0820 authored and facebook-github-bot committed
Buckify tokenizers (#17)
Summary:
Pull Request resolved: #17

Buckify the tokenizers so that they can be used by ExecuTorch (ET) internally.

Reviewed By: jackzhxng

Differential Revision: D69509028
1 parent f2fc3d6 commit 96a7954

17 files changed: +297, -45 lines

TARGETS

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()

include/detail/bpe_tokenizer_base.h

Lines changed: 3 additions & 0 deletions

@@ -10,6 +10,9 @@
 #pragma once

 // Standard
+#include <memory>
+#include <optional>
+#include <string>
 #include <unordered_map>
 #include <vector>

include/pre_tokenizer.h

Lines changed: 2 additions & 0 deletions

@@ -41,6 +41,8 @@ class PreTokenizer {
   */
  virtual std::vector<std::string> pre_tokenize(
      re2::StringPiece input) const = 0;
+
+ virtual ~PreTokenizer() = default;
 }; // end class PreTokenizer

 // -- Factory ------------------------------------------------------------------

include/sentencepiece.h

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

-// A tokenizer that works with sentencepiece.
+// A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once

 #include <memory>

include/token_decoder.h

Lines changed: 3 additions & 0 deletions

@@ -45,6 +45,9 @@ class TokenDecoder {
   */
  virtual std::string decode(re2::StringPiece token) const = 0;

+ // virtual destructor
+ virtual ~TokenDecoder() = default;
+
 }; // end class TokenDecoder

 // -- Factory ------------------------------------------------------------------

src/bpe_tokenizer_base.cpp

Lines changed: 5 additions & 5 deletions

@@ -56,7 +56,7 @@ static std::vector<uint64_t> _byte_pair_merge(
     if (rank) {
       // usize::MAX is a sentinel value and cannot be a valid rank
       if (*rank == _max_size()) {
-        fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
+        TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
       }
       parts[i].second = *rank;
     }
@@ -177,8 +177,8 @@ BPETokenizerBase::encode_with_special_token_(
     } catch (const std::out_of_range&) {
       // Should never go here, since special pattern includes all special
       // chars.
-      fprintf(stderr, "unknown special token: %s\n", special->c_str());
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown special token: %s\n", special->c_str());
+      return Error::EncodeFailure;
     }

     tokens.push_back(token);
@@ -259,8 +259,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
     if (iter != special_token_decoder_.end()) {
       token_bytes = iter->second;
     } else {
-      fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
+      return Error::DecodeFailure;
     }
   }
   _decode(token_bytes, ret);
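Replacing fprintf(stderr, ...) plus exit(EXIT_FAILURE) with TK_LOG and an Error return value makes unknown-token failures recoverable by the caller instead of terminating the host process. Below is a minimal caller-side sketch of the new contract; append_token is a hypothetical helper, and the error()/get() accessors on Result<T> are assumed to follow the ExecuTorch convention:

#include <cstdint>
#include <string>

// Hypothetical helper (not part of this commit): decode one token pair and
// propagate failures rather than letting the library kill the process.
Error append_token(
    BPETokenizerBase& tokenizer,
    uint64_t prev,
    uint64_t cur,
    std::string& out) {
  Result<std::string> piece = tokenizer.decode(prev, cur);
  if (piece.error() != Error::Ok) {
    return piece.error(); // e.g. Error::DecodeFailure for an unknown id
  }
  out += piece.get(); // get() is assumed per ExecuTorch's Result<T>
  return Error::Ok;
}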

src/hf_tokenizer.cpp

Lines changed: 4 additions & 0 deletions

@@ -256,7 +256,11 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
   if (_decoder) {
     ret += _decoder->decode(input);
   } else {
+#ifdef _USE_INTERNAL_STRING_VIEW
+    ret += input.as_string();
+#else
     ret += input;
+#endif
   }
 }

src/tiktoken.cpp

Lines changed: 4 additions & 0 deletions

@@ -183,7 +183,11 @@ Error Tiktoken::_encode(
 }

 void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
+#ifdef _USE_INTERNAL_STRING_VIEW
+  ret += input.as_string();
+#else
   ret += input;
+#endif
 }

 template <typename T>
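Both _decode overrides get the same compile-time guard: builds defining _USE_INTERNAL_STRING_VIEW evidently use a StringPiece variant with no implicit std::string conversion, so the copy must be requested explicitly via as_string(). A sketch of the pattern in isolation; append_piece is an illustrative stand-in, not code from this commit:

#include <string>

// Illustrative only: the conditional append factored into one place. The
// assumption is that the internal StringPiece exposes as_string(), while
// the OSS re2::StringPiece appends to std::string directly.
template <typename StringViewT>
void append_piece(std::string& ret, const StringViewT& input) {
#ifdef _USE_INTERNAL_STRING_VIEW
  ret += input.as_string(); // explicit copy for the internal type
#else
  ret += input; // implicit conversion in the OSS build
#endif
}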

src/token_decoder.cpp

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ static std::string format(const char* fmt, ...) {
   int size = vsnprintf(NULL, 0, fmt, ap);
   // GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
   std::vector<char> buf(size + 1);
-  int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+  // int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
   // GGML_ASSERT(size2 == size);
   va_end(ap2);
   va_end(ap);

targets.bzl

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "headers",
+        exported_headers = subdir_glob([
+            ("include", "*.h"),
+            ("include", "**/*.h"),
+        ]),
+        header_namespace = "",
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "sentencepiece",
+        srcs = [
+            "src/sentencepiece.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        deps = [
+            "fbsource//third-party/sentencepiece:sentencepiece",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "src/tiktoken.cpp",
+            "src/bpe_tokenizer_base.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "unicode",
+        srcs = [
+            "third-party/llama.cpp-unicode/src/unicode.cpp",
+            "third-party/llama.cpp-unicode/src/unicode-data.cpp",
+        ],
+        exported_headers = subdir_glob([
+            ("third-party/llama.cpp-unicode/include", "*.h"),
+        ]),
+        header_namespace = "",
+    )
+
+    runtime.cxx_library(
+        name = "hf_tokenizer",
+        srcs = [
+            "src/hf_tokenizer.cpp",
+            "src/bpe_tokenizer_base.cpp",
+            "src/pre_tokenizer.cpp",
+            "src/token_decoder.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+            ":unicode",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_external_deps = [
+            "re2",
+            "nlohmann_json",
+        ],
+    )

test/resources/test_bpe_tokenizer.bin

16 Bytes
Binary file not shown.

(Three additional one-line test resource files follow; file names not shown.)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+tet 0

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA== 18446744073709551616

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA==10

test/test_sentencepiece.cpp

Lines changed: 19 additions & 4 deletions

@@ -6,11 +6,26 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include "gtest/gtest.h"
+#ifdef TOKENIZERS_FB_BUCK
+#include <TestResourceUtils/TestResourceUtils.h>
+#endif
+#include <gtest/gtest.h>
 #include "sentencepiece.h"

 namespace tokenizers {

+namespace {
+static inline std::string _get_resource_path(const std::string& name) {
+#ifdef TOKENIZERS_FB_BUCK
+  return facebook::xplat::testing::getPathForTestResource(
+      "test/resources/" + name);
+#else
+  return std::getenv("RESOURCES_PATH") + std::string("/") + name;
+#endif
+}
+
+} // namespace
+
 TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
   SPTokenizer tokenizer;
   std::string text = "Hello world!";
@@ -26,7 +41,7 @@ TEST(SPTokenizerTest, TestDecodeWithoutLoad) {

 TEST(SPTokenizerTest, TestLoad) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
 }
@@ -39,7 +54,7 @@ TEST(SPTokenizerTest, TestLoadInvalidPath) {

 TEST(SPTokenizerTest, TestEncode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::string text = "Hello world!";
@@ -54,7 +69,7 @@ TEST(SPTokenizerTest, TestEncode) {

 TEST(SPTokenizerTest, TestDecode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::vector<uint64_t> tokens = {1, 15043, 3186, 29991};
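One caveat with the OSS branch of _get_resource_path: std::getenv returns nullptr when RESOURCES_PATH is unset, and concatenating a null char* is undefined behavior. A defensive variant, purely as a sketch (the guard is not part of this commit):

#include <cstdlib>
#include <string>

#include <gtest/gtest.h>

// Sketch only: same helper with an explicit check for an unset variable.
static std::string get_resource_path_checked(const std::string& name) {
  const char* root = std::getenv("RESOURCES_PATH"); // may be nullptr
  if (root == nullptr) {
    ADD_FAILURE() << "RESOURCES_PATH is not set";
    return std::string();
  }
  return std::string(root) + "/" + name;
}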
