Skip to content

Commit 38d54b3

Browse files
author
jaime-m-p
committed
tests: skip unicode surrogates and undefined
1 parent 0cf2989 commit 38d54b3

File tree

2 files changed

+8
-12
lines changed

2 files changed

+8
-12
lines changed

tests/test-tokenizer-1-bpe.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,10 @@ int main(int argc, char **argv) {
117117
for (int i = 0; i < nthread; ++i) {
118118
threads[i] = std::thread([i, nthread, ctx, &errcode]() {
119119
for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
120-
//if (!( // NOLINT
121-
// (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
122-
// (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
123-
// (cp < 0x1c || cp > 0x1e) &&
124-
// (cp < 0xd800 || cp > 0xdfff) &&
125-
// (cp < 0x00040000 || cp >= 0x000e0000)
126-
// )) {
127-
// continue;
128-
//}
120+
if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
121+
(0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
122+
continue;
123+
}
129124

130125
std::string str = unicode_cpt_to_utf8(cp);
131126
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);

tests/test-tokenizer-1-spm.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,10 @@ int main(int argc, char ** argv) {
8787
for (int i = 0; i < nthread; ++i) {
8888
threads[i] = std::thread([i, nthread, ctx, &errcode]() {
8989
for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
90-
//if (cp >= 0xd800 && cp <= 0xdfff) {
91-
// continue;
92-
//}
90+
if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
91+
(0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
92+
continue;
93+
}
9394

9495
std::string str = unicode_cpt_to_utf8(cp);
9596
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);

0 commit comments

Comments
 (0)