Skip to content

Commit 03744ce

Browse files
authored
Buckify tokenizers
Differential Revision: D69509028 Pull Request resolved: #17
1 parent f2fc3d6 commit 03744ce

23 files changed

+369
-94
lines changed

CMakeLists.txt

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,25 +32,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
3232
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
3333

3434
file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
35-
file(GLOB unicode_source_files ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
36-
add_library(tokenizers STATIC ${tokenizers_source_files} ${unicode_source_files})
35+
file(GLOB unicode_source_files
36+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
37+
add_library(tokenizers STATIC ${tokenizers_source_files}
38+
${unicode_source_files})
3739

3840
# Using abseil from sentencepiece/third_party
3941
target_include_directories(
40-
tokenizers PUBLIC
41-
${CMAKE_CURRENT_SOURCE_DIR}/include
42-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
43-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
44-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
45-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
46-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
42+
tokenizers
43+
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
44+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
45+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
46+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
47+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
48+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
4749

4850
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
4951

5052
# Build test
5153
if(TOKENIZERS_BUILD_TEST)
52-
enable_testing()
53-
include(FetchContent)
54+
enable_testing()
55+
include(FetchContent)
5456
# CMAKE
5557
FetchContent_Declare(
5658
googletest
@@ -63,20 +65,22 @@ if(TOKENIZERS_BUILD_TEST)
6365
FetchContent_MakeAvailable(googletest)
6466

6567
# Collect every test/test_*.cpp. CONFIGURE_DEPENDS asks the build to re-check
# the glob so newly added test files are picked up without a manual re-configure.
file(GLOB test_source_files CONFIGURE_DEPENDS
     ${CMAKE_CURRENT_SOURCE_DIR}/test/test_*.cpp)
68+
69+
set(test_env "RESOURCES_PATH=${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
6670
foreach(test_source_file ${test_source_files})
67-
get_filename_component(test_name ${test_source_file} NAME_WE)
68-
message(STATUS "Configuring unit test ${test_name}")
69-
add_executable(${test_name} ${test_source_file})
70-
target_include_directories(${test_name} PRIVATE
71-
GTEST_INCLUDE_PATH
72-
${CMAKE_CURRENT_SOURCE_DIR}/include
73-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
74-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
75-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
76-
)
77-
target_link_libraries(${test_name} gtest_main tokenizers)
78-
target_compile_definitions(${test_name} PRIVATE RESOURCES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
79-
add_test(${test_name} "${test_name}")
71+
get_filename_component(test_name ${test_source_file} NAME_WE)
72+
message(STATUS "Configuring unit test ${test_name}")
73+
add_executable(${test_name} ${test_source_file})
74+
target_include_directories(
75+
${test_name}
76+
# The bare word GTEST_INCLUDE_PATH was not a variable expansion: it only added
# a literal relative directory of that name, so it is dropped here.
PRIVATE
77+
${CMAKE_CURRENT_SOURCE_DIR}/include
78+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
79+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
80+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
81+
# Test executables are leaf targets, so their link dependencies stay PRIVATE.
target_link_libraries(${test_name} PRIVATE gtest_main tokenizers)
82+
# NAME/COMMAND form: CTest replaces the executable target name with the path of
# the built binary, which the legacy positional signature does not do.
add_test(NAME ${test_name} COMMAND ${test_name})
83+
# Quote the value so it is always passed as exactly one argument, whatever
# characters the source directory path contains.
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT "${test_env}")
8084
endforeach()
8185
endif()
8286

TARGETS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Any targets that should be shared between fbcode and xplat must be defined in
2+
# targets.bzl. This file can contain fbcode-only targets.
3+
4+
load(":targets.bzl", "define_common_targets")
5+
6+
oncall("executorch")
7+
8+
define_common_targets()

include/detail/bpe_tokenizer_base.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
89

910
// Base class for all BPE tokenizer implementations
1011
#pragma once
1112

1213
// Standard
14+
#include <memory>
15+
#include <optional>
16+
#include <string>
1317
#include <unordered_map>
1418
#include <vector>
1519

include/pre_tokenizer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
9+
810
#pragma once
911

1012
// Standard
@@ -41,6 +43,8 @@ class PreTokenizer {
4143
*/
4244
virtual std::vector<std::string> pre_tokenize(
4345
re2::StringPiece input) const = 0;
46+
47+
virtual ~PreTokenizer() = default;
4448
}; // end class PreTokenizer
4549

4650
// -- Factory ------------------------------------------------------------------

include/sentencepiece.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
89

9-
// A tokenizer that works with sentencepiece.
10+
// A tokenizer that works with sentencepiece. Used by Llama2.
1011
#pragma once
1112

1213
#include <memory>

include/token_decoder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
9+
810
#pragma once
911

1012
// Standard
@@ -45,6 +47,9 @@ class TokenDecoder {
4547
*/
4648
virtual std::string decode(re2::StringPiece token) const = 0;
4749

50+
// virtual destructor
51+
virtual ~TokenDecoder() = default;
52+
4853
}; // end class TokenDecoder
4954

5055
// -- Factory ------------------------------------------------------------------

src/bpe_tokenizer_base.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
9+
810
#include "detail/bpe_tokenizer_base.h"
911

1012
// Standard
@@ -56,7 +58,7 @@ static std::vector<uint64_t> _byte_pair_merge(
5658
if (rank) {
5759
// usize::MAX is a sentinel value and cannot be a valid rank
5860
if (*rank == _max_size()) {
59-
fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
61+
TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
6062
}
6163
parts[i].second = *rank;
6264
}
@@ -177,8 +179,8 @@ BPETokenizerBase::encode_with_special_token_(
177179
} catch (const std::out_of_range&) {
178180
// Should never go here, since special pattern includes all special
179181
// chars.
180-
fprintf(stderr, "unknown special token: %s\n", special->c_str());
181-
exit(EXIT_FAILURE);
182+
TK_LOG(Error, "unknown special token: %s\n", special->c_str());
183+
return Error::EncodeFailure;
182184
}
183185

184186
tokens.push_back(token);
@@ -259,8 +261,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
259261
if (iter != special_token_decoder_.end()) {
260262
token_bytes = iter->second;
261263
} else {
262-
fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
263-
exit(EXIT_FAILURE);
264+
TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
265+
return Error::DecodeFailure;
264266
}
265267
}
266268
_decode(token_bytes, ret);

src/hf_tokenizer.cpp

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
9+
810
#include "hf_tokenizer.h"
911

1012
// Standard
@@ -127,17 +129,17 @@ Error HFTokenizer::load(const std::string& path) {
127129
// If a tokenizer config file is found, parse it to look up the eos/bos tokens
128130
if (!model_config_json.empty()) {
129131
// Load it and parse it as json
130-
std::ifstream file(model_config_json);
131-
if (!file) {
132+
std::ifstream config_file(model_config_json);
133+
if (!config_file) {
132134
fprintf(stderr, "failed to open encoder file: %s\n", path.c_str());
133135
return Error::LoadFailure;
134136
}
135-
std::string contents(
136-
(std::istreambuf_iterator<char>(file)),
137+
std::string config_contents(
138+
(std::istreambuf_iterator<char>(config_file)),
137139
std::istreambuf_iterator<char>());
138-
json parsed_json;
140+
json parsed_config_json;
139141
try {
140-
parsed_json = json::parse(contents);
142+
parsed_config_json = json::parse(config_contents);
141143
} catch (const json::exception& e) {
142144
std::cout << "Error parsing model config json json file: " << e.what()
143145
<< std::endl;
@@ -146,8 +148,8 @@ Error HFTokenizer::load(const std::string& path) {
146148

147149
// Pull out the token strings
148150
try {
149-
const std::string bos_token = parsed_json.at("bos_token");
150-
const std::string eos_token = parsed_json.at("eos_token");
151+
const std::string bos_token = parsed_config_json.at("bos_token");
152+
const std::string eos_token = parsed_config_json.at("eos_token");
151153
const auto& bos_it = special_token_encoder_.find(bos_token);
152154
const auto& eos_it = special_token_encoder_.find(eos_token);
153155
if (bos_it == special_token_encoder_.end()) {
@@ -256,7 +258,11 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
256258
if (_decoder) {
257259
ret += _decoder->decode(input);
258260
} else {
261+
#ifdef _USE_INTERNAL_STRING_VIEW
262+
ret += input.as_string();
263+
#else
259264
ret += input;
265+
#endif
260266
}
261267
}
262268

src/pre_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ namespace {
129129

130130
// Standard GPT2 regex
131131
// https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
132-
static const std::string GPT2_EXPR =
132+
constexpr char GPT2_EXPR[] =
133133
R"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)";
134134

135135
} // namespace

src/tiktoken.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,11 @@ Error Tiktoken::_encode(
183183
}
184184

185185
void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
186+
#ifdef _USE_INTERNAL_STRING_VIEW
187+
ret += input.as_string();
188+
#else
186189
ret += input;
190+
#endif
187191
}
188192

189193
template <typename T>

src/token_decoder.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8+
// @lint-ignore-every LICENSELINT
9+
810
#include "token_decoder.h"
911

1012
// Standard
@@ -60,7 +62,7 @@ static std::string format(const char* fmt, ...) {
6062
int size = vsnprintf(NULL, 0, fmt, ap);
6163
// GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
6264
std::vector<char> buf(size + 1);
63-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
65+
// int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
6466
// GGML_ASSERT(size2 == size);
6567
va_end(ap2);
6668
va_end(ap);

targets.bzl

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
2+
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
3+
4+
def define_common_targets():
5+
"""Defines targets that should be shared between fbcode and xplat.
6+
7+
The directory containing this targets.bzl file should also contain both
8+
TARGETS and BUCK files that call this function.
9+
"""
10+
11+
runtime.cxx_library(
12+
name = "headers",
13+
exported_headers = subdir_glob([
14+
("include", "*.h"),
15+
("include", "**/*.h"),
16+
]),
17+
header_namespace = "",
18+
visibility = [
19+
"@EXECUTORCH_CLIENTS",
20+
],
21+
)
22+
23+
runtime.cxx_library(
24+
name = "sentencepiece",
25+
srcs = [
26+
"src/sentencepiece.cpp",
27+
],
28+
exported_deps = [
29+
":headers",
30+
],
31+
visibility = [
32+
"@EXECUTORCH_CLIENTS",
33+
],
34+
compiler_flags = [
35+
"-D_USE_INTERNAL_STRING_VIEW",
36+
],
37+
external_deps = [
38+
"sentencepiece",
39+
],
40+
)
41+
42+
runtime.cxx_library(
43+
name = "tiktoken",
44+
srcs = [
45+
"src/tiktoken.cpp",
46+
"src/bpe_tokenizer_base.cpp",
47+
],
48+
exported_deps = [
49+
":headers",
50+
],
51+
visibility = [
52+
"@EXECUTORCH_CLIENTS",
53+
],
54+
compiler_flags = [
55+
"-D_USE_INTERNAL_STRING_VIEW",
56+
],
57+
exported_external_deps = [
58+
"re2",
59+
],
60+
)
61+
62+
runtime.cxx_library(
63+
name = "unicode",
64+
srcs = [
65+
"third-party/llama.cpp-unicode/src/unicode.cpp",
66+
"third-party/llama.cpp-unicode/src/unicode-data.cpp",
67+
],
68+
exported_headers = subdir_glob([
69+
("third-party/llama.cpp-unicode/include", "*.h"),
70+
]),
71+
header_namespace = "",
72+
)
73+
74+
runtime.cxx_library(
75+
name = "hf_tokenizer",
76+
srcs = [
77+
"src/hf_tokenizer.cpp",
78+
"src/bpe_tokenizer_base.cpp",
79+
"src/pre_tokenizer.cpp",
80+
"src/token_decoder.cpp",
81+
],
82+
exported_deps = [
83+
":headers",
84+
":unicode",
85+
],
86+
visibility = [
87+
"@EXECUTORCH_CLIENTS",
88+
],
89+
compiler_flags = [
90+
"-D_USE_INTERNAL_STRING_VIEW",
91+
],
92+
exported_external_deps = [
93+
"re2",
94+
"nlohmann_json",
95+
],
96+
)

test/resources/test_bpe_tokenizer.bin

16 Bytes
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
tet 0
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ICAgICAgIA== 18446744073709551616
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ICAgICAgIA==10

test/test_pre_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ using namespace tokenizers;
1919

2020
// Helpers /////////////////////////////////////////////////////////////////////
2121

22-
void assert_split_match(
22+
static void assert_split_match(
2323
const PreTokenizer& ptok,
2424
const std::string& prompt,
2525
const std::vector<std::string>& expected) {

0 commit comments

Comments
 (0)