Skip to content

Commit ffd4749

Browse files
authored
Gate regex lookahead in cmake behind compile flag
Differential Revision: D73530475 Pull Request resolved: #59
1 parent 13abc73 commit ffd4749

File tree

3 files changed

+64
-24
lines changed

3 files changed

+64
-24
lines changed

CMakeLists.txt

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,56 @@ project(Tokenizers)
1818

1919
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
2020
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
21+
option(SUPPORT_REGEX_LOOKAHEAD
22+
"Support regex lookahead patterns (requires PCRE2)" OFF)
2123

2224
# Ignore weak attribute warning
2325
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
2426

2527
set(ABSL_ENABLE_INSTALL ON)
2628
set(ABSL_PROPAGATE_CXX_STD ON)
29+
2730
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
2831
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
32+
2933
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
3034
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
3135
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
3236

3337
# Configure PCRE2
34-
set(PCRE2_BUILD_PCRE2_8 ON)
35-
set(PCRE2_BUILD_PCRE2_16 OFF)
36-
set(PCRE2_BUILD_PCRE2_32 OFF)
37-
set(PCRE2_BUILD_TESTS OFF)
38-
set(PCRE2_BUILD_PCRE2GREP OFF)
39-
set(PCRE2_BUILD_PCRE2TEST OFF)
40-
set(PCRE2_BUILD_PCRE2GPERF OFF)
41-
set(PCRE2_BUILD_DOCS OFF)
42-
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
43-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
38+
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
39+
set(PCRE2_BUILD_PCRE2_8 ON)
40+
set(PCRE2_BUILD_PCRE2_16 OFF)
41+
set(PCRE2_BUILD_PCRE2_32 OFF)
42+
set(PCRE2_BUILD_TESTS OFF)
43+
set(PCRE2_BUILD_PCRE2GREP OFF)
44+
set(PCRE2_BUILD_PCRE2TEST OFF)
45+
set(PCRE2_BUILD_PCRE2GPERF OFF)
46+
set(PCRE2_BUILD_DOCS OFF)
47+
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
48+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
49+
endif()
4450

4551
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
4652

4753
file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
54+
set(tokenizers_source_files
55+
${CMAKE_CURRENT_SOURCE_DIR}/src/bpe_tokenizer_base.cpp
56+
${CMAKE_CURRENT_SOURCE_DIR}/src/hf_tokenizer.cpp
57+
${CMAKE_CURRENT_SOURCE_DIR}/src/llama2c_tokenizer.cpp
58+
${CMAKE_CURRENT_SOURCE_DIR}/src/pre_tokenizer.cpp
59+
${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp
60+
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
61+
${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
62+
${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
63+
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
64+
)
65+
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
66+
list(APPEND
67+
tokenizers_source_files
68+
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
69+
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
70+
endif()
4871
file(GLOB unicode_source_files
4972
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
5073
add_library(tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +81,16 @@ target_include_directories(
5881
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
5982
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
6083
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
61-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
62-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
84+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
85+
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
6386

64-
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
87+
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
88+
target_include_directories(tokenizers
89+
PUBLIC
90+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
91+
target_link_libraries(tokenizers PUBLIC pcre2-8)
92+
target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
93+
endif()
6594

6695
# Build test
6796
if(TOKENIZERS_BUILD_TEST)

src/regex.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#ifdef SUPPORT_REGEX_LOOKAHEAD
910
#include <pytorch/tokenizers/pcre2_regex.h>
11+
#endif
1012
#include <pytorch/tokenizers/re2_regex.h>
1113
#include <pytorch/tokenizers/regex.h>
1214
#include <pytorch/tokenizers/std_regex.h>
@@ -19,8 +21,8 @@ namespace tokenizers {
1921

2022
/**
2123
* @brief Factory function that creates a regex object using RE2 if possible.
22-
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
23-
* PCRE2 fails.
24+
* Falls back to PCRE2 if RE2 rejects the pattern and
25+
* SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
2426
*/
2527
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
2628
// Try RE2 first
@@ -30,6 +32,15 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
3032
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
3133
}
3234

35+
#ifndef SUPPORT_REGEX_LOOKAHEAD
36+
std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
37+
<< "\n";
38+
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
39+
std::cerr
40+
<< "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
41+
<< std::endl;
42+
return tokenizers::Error::LoadFailure;
43+
#else
3344
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
3445
// RE2 doesn't support some Perl features, try PCRE2
3546
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
@@ -56,6 +67,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
5667
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
5768
return tokenizers::Error::LoadFailure;
5869
}
70+
#endif
5971
}
6072

6173
} // namespace tokenizers

targets.bzl

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def define_common_targets():
1414
name = "headers",
1515
exported_headers = subdir_glob([
1616
("include", "pytorch/tokenizers/*.h"),
17-
]),
17+
], exclude = ["pcre2_regex.h", "std_regex.h"]),
1818
visibility = [
1919
"@EXECUTORCH_CLIENTS",
2020
"//pytorch/tokenizers/...",
@@ -23,20 +23,19 @@ def define_common_targets():
2323
platforms = PLATFORMS,
2424
)
2525

26+
# TODO: add target for regex which does lookahead with pcre2
27+
# by adding "-DSUPPORT_REGEX_LOOKAHEAD" as a compiler flag
28+
# and including pcre2 dependencies.
2629
runtime.cxx_library(
2730
name = "regex",
28-
srcs = ["src/regex.cpp"] + glob([
29-
"src/*_regex.cpp",
30-
]),
31-
exported_headers = subdir_glob([
32-
("include", "pytorch/tokenizers/regex.h"),
33-
("include", "pytorch/tokenizers/*_regex.h"),
34-
]),
31+
srcs = [
32+
"src/regex.cpp",
33+
"src/re2_regex.cpp",
34+
],
3535
exported_deps = [
3636
":headers",
3737
],
3838
exported_external_deps = [
39-
"pcre2",
4039
"re2",
4140
],
4241
visibility = ["//pytorch/tokenizers/..."],

0 commit comments

Comments
 (0)