@@ -18,33 +18,56 @@ project(Tokenizers)
18
18
19
19
# User-facing build switches. All of them default to OFF.
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
# When ON, PCRE2 is added to the build further down in this file.
option(SUPPORT_REGEX_LOOKAHEAD
       "Support regex lookahead patterns (requires PCRE2)" OFF)
21
23
22
24
# Silence attribute warnings (e.g. from weak attributes) for all C++ code
# configured after this point.
string(APPEND CMAKE_CXX_FLAGS " -Wno-attributes")
24
26
25
27
# Settings read by the bundled Abseil build.
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)

# Remember the caller's PIC setting, then force PIC on while the bundled
# third-party libraries are configured. The saved value is restored later.
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Order is preserved: abseil-cpp first, then re2, then sentencepiece.
foreach(_third_party_dir IN ITEMS abseil-cpp re2 sentencepiece)
  add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/third-party/${_third_party_dir}")
endforeach()
32
36
33
37
# Configure PCRE2. It is only added when lookahead support or the tests are
# requested.
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
  # Build only the 8-bit library.
  set(PCRE2_BUILD_PCRE2_8 ON)
  # Switch off the other code-unit widths and all auxiliary PCRE2 outputs.
  foreach(_pcre2_switch IN ITEMS
          PCRE2_BUILD_PCRE2_16
          PCRE2_BUILD_PCRE2_32
          PCRE2_BUILD_TESTS
          PCRE2_BUILD_PCRE2GREP
          PCRE2_BUILD_PCRE2TEST
          PCRE2_BUILD_PCRE2GPERF
          PCRE2_BUILD_DOCS
          PCRE2_BUILD_LIBPCRE2_PDB)
    set(${_pcre2_switch} OFF)
  endforeach()
  add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2")
endif()

# Put back the PIC setting that was saved before the third-party directories
# were added.
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
46
52
47
53
# Sources for the tokenizers library, listed explicitly.
# A file(GLOB ...) over src/*.cpp used to populate this same variable right
# before this set(); its result was overwritten immediately, so it was removed.
set(tokenizers_source_files
    ${CMAKE_CURRENT_SOURCE_DIR}/src/bpe_tokenizer_base.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/hf_tokenizer.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/llama2c_tokenizer.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/pre_tokenizer.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
)

# These two regex sources are compiled only when lookahead support or the
# tests are requested, the same condition under which PCRE2 is added.
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
  list(APPEND tokenizers_source_files
       ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
endif()
48
71
# Collect every .cpp file in the bundled llama.cpp-unicode source directory.
file(GLOB unicode_source_files
     "${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp")
50
73
add_library (tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +81,16 @@ target_include_directories(
58
81
${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece/src
59
82
${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2
60
83
${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include
61
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/include
62
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src )
84
+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/include )
85
# Libraries that every configuration of tokenizers links against.
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

# PCRE2 headers, library and the SUPPORT_REGEX_LOOKAHEAD define are attached
# only when lookahead support or the tests are requested. All three are PUBLIC,
# so they propagate to targets that link against tokenizers.
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
  target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
  target_include_directories(
    tokenizers PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src")
  target_link_libraries(tokenizers PUBLIC pcre2-8)
endif()
65
94
66
95
# Build test
67
96
if (TOKENIZERS_BUILD_TEST )
0 commit comments