Skip to content

Commit 95a0df5

Browse files
author
jaime-m-p
committed
Bugfix: custom regexes split undefined Unicode codepoints
1 parent 12e2c31 commit 95a0df5

File tree

1 file changed

+8
-10
lines changed

1 file changed

+8
-10
lines changed

unicode.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
232232
};
233233

234234
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
235-
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
236-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
235+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
237236
};
238237

239238
size_t _prev_end = offset_ini;
@@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
295294
continue;
296295
}
297296
// regex: <space>?[^\s\p{L}\p{N}]+
298-
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
297+
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
299298
pos += (cpt == ' ');
300-
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
299+
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
301300
flags2 = _get_flags(++pos);
302301
}
303302
_add_token(pos);
@@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
351350
};
352351

353352
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
354-
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
355-
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
353+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
356354
};
357355

358356
size_t _prev_end = offset_ini;
@@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
394392
}
395393
}
396394

397-
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
398-
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
395+
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
396+
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
399397
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
400398
pos++;
401399
while (_get_flags(pos).is_letter) {
@@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
421419

422420
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
423421
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
424-
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
422+
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
425423
pos += (cpt == ' ');
426-
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
424+
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
427425
flags2 = _get_flags(++pos);
428426
}
429427
uint32_t cpt2 = _get_cpt(pos);

0 commit comments

Comments
 (0)