@@ -19,12 +19,11 @@ using namespace tokenizers;
19
19
20
20
// Helpers /////////////////////////////////////////////////////////////////////
21
21
22
- static void assert_split_match (
23
- const PreTokenizer& ptok,
24
- const std::string& prompt,
25
- const std::vector<std::string>& expected) {
22
+ static void assert_split_match (const PreTokenizer &ptok,
23
+ const std::string &prompt,
24
+ const std::vector<std::string> &expected) {
26
25
re2::StringPiece prompt_view (prompt);
27
- const auto & got = ptok.pre_tokenize (prompt_view);
26
+ const auto & got = ptok.pre_tokenize (prompt_view);
28
27
EXPECT_EQ (expected.size (), got.size ());
29
28
for (auto i = 0 ; i < got.size (); ++i) {
30
29
EXPECT_EQ (expected[i], got[i]);
@@ -35,16 +34,14 @@ static void assert_split_match(
35
34
class RegexPreTokenizerTest : public ::testing::Test {};
36
35
37
36
// Test the basic construction
38
- TEST_F (RegexPreTokenizerTest, Construct) {
39
- RegexPreTokenizer ptok (" [0-9]+" );
40
- }
37
+ TEST_F (RegexPreTokenizerTest, Construct) { RegexPreTokenizer ptok (" [0-9]+" ); }
41
38
42
39
// Test basic splitting using the expression for Tiktoken
43
40
TEST_F (RegexPreTokenizerTest, TiktokenExpr) {
44
41
RegexPreTokenizer ptok (
45
42
R"( (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)" );
46
- assert_split_match (
47
- ptok, " How are you doing? " , {" How" , " are" , " you" , " doing" , " ?" });
43
+ assert_split_match (ptok, " How are you doing? " ,
44
+ {" How" , " are" , " you" , " doing" , " ?" });
48
45
}
49
46
50
47
// DigitsPreTokenizer //////////////////////////////////////////////////////////
@@ -54,18 +51,15 @@ class DigitsPreTokenizerTest : public ::testing::Test {};
54
51
TEST_F (DigitsPreTokenizerTest, IndividualDigits) {
55
52
DigitsPreTokenizer ptok (true );
56
53
assert_split_match (
57
- ptok,
58
- " The number 1 then 234 then 5." ,
54
+ ptok, " The number 1 then 234 then 5." ,
59
55
{" The number " , " 1" , " then " , " 2" , " 3" , " 4" , " then " , " 5" , " ." });
60
56
}
61
57
62
58
// Test digit splitting with contiguous digits
63
59
TEST_F (DigitsPreTokenizerTest, ContiguousDigits) {
64
60
DigitsPreTokenizer ptok (false );
65
- assert_split_match (
66
- ptok,
67
- " The number 1 then 234 then 5." ,
68
- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
61
+ assert_split_match (ptok, " The number 1 then 234 then 5." ,
62
+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
69
63
}
70
64
71
65
// ByteLevelPreTokenizer ///////////////////////////////////////////////////////
@@ -75,8 +69,7 @@ TEST_F(ByteLevelPreTokenizerTest, PreTokenizeDefault) {
75
69
ByteLevelPreTokenizer ptok;
76
70
assert_split_match (ptok, " Hello World" , {" ĠHello" , " ĠWorld" });
77
71
assert_split_match (
78
- ptok,
79
- " The number 1 then 234 then 5." ,
72
+ ptok, " The number 1 then 234 then 5." ,
80
73
{" ĠThe" , " Ġnumber" , " Ġ1" , " Ġthen" , " Ġ234" , " Ġthen" , " Ġ5" , " ." });
81
74
}
82
75
@@ -97,22 +90,9 @@ TEST_F(SequencePreTokenizerTest, PreTokenizeDigitAndByteLevel) {
97
90
PreTokenizer::Ptr dptok (new DigitsPreTokenizer (true ));
98
91
PreTokenizer::Ptr bptok (new ByteLevelPreTokenizer (false ));
99
92
SequencePreTokenizer ptok ({dptok, bptok});
100
- assert_split_match (
101
- ptok,
102
- " The number 1 then 234 then 5." ,
103
- {" The" ,
104
- " Ġnumber" ,
105
- " Ġ" ,
106
- " 1" ,
107
- " Ġthen" ,
108
- " Ġ" ,
109
- " 2" ,
110
- " 3" ,
111
- " 4" ,
112
- " Ġthen" ,
113
- " Ġ" ,
114
- " 5" ,
115
- " ." });
93
+ assert_split_match (ptok, " The number 1 then 234 then 5." ,
94
+ {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
95
+ " Ġthen" , " Ġ" , " 5" , " ." });
116
96
}
117
97
118
98
// PreTokenizerConfig //////////////////////////////////////////////////////////
@@ -152,14 +132,12 @@ TEST_F(PreTokenizerConfigTest, AllTypesFailureCases) {
152
132
153
133
// Sequence
154
134
EXPECT_THROW (PreTokenizerConfig (" Sequence" ).create (), std::runtime_error);
155
- EXPECT_THROW (
156
- PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
157
- std::runtime_error);
158
- EXPECT_THROW (
159
- PreTokenizerConfig (" Sequence" )
160
- .set_pretokenizers ({PreTokenizerConfig (" Split" )})
161
- .create (),
162
- std::runtime_error);
135
+ EXPECT_THROW (PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
136
+ std::runtime_error);
137
+ EXPECT_THROW (PreTokenizerConfig (" Sequence" )
138
+ .set_pretokenizers ({PreTokenizerConfig (" Split" )})
139
+ .create (),
140
+ std::runtime_error);
163
141
164
142
// Unsupported
165
143
EXPECT_THROW (PreTokenizerConfig (" Unsupported" ).create (), std::runtime_error);
@@ -183,22 +161,9 @@ TEST_F(PreTokenizerConfigTest, ParseJson) {
183
161
}},
184
162
})
185
163
.create ();
186
- assert_split_match (
187
- *ptok,
188
- " The number 1 then 234 then 5." ,
189
- {" The" ,
190
- " Ġnumber" ,
191
- " Ġ" ,
192
- " 1" ,
193
- " Ġthen" ,
194
- " Ġ" ,
195
- " 2" ,
196
- " 3" ,
197
- " 4" ,
198
- " Ġthen" ,
199
- " Ġ" ,
200
- " 5" ,
201
- " ." });
164
+ assert_split_match (*ptok, " The number 1 then 234 then 5." ,
165
+ {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
166
+ " Ġthen" , " Ġ" , " 5" , " ." });
202
167
}
203
168
204
169
TEST_F (PreTokenizerConfigTest, ParseJsonOptionalKey) {
@@ -208,10 +173,8 @@ TEST_F(PreTokenizerConfigTest, ParseJsonOptionalKey) {
208
173
{" type" , " Digits" },
209
174
})
210
175
.create ();
211
- assert_split_match (
212
- *ptok,
213
- " The number 1 then 234 then 5." ,
214
- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
176
+ assert_split_match (*ptok, " The number 1 then 234 then 5." ,
177
+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
215
178
}
216
179
217
180
TEST_F (PreTokenizerConfigTest, Split) {
0 commit comments