@@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
232
232
};
233
233
234
234
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
235
- static const codepoint_flags undef (codepoint_flags::UNDEFINED);
236
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : undef;
235
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags{};
237
236
};
238
237
239
238
size_t _prev_end = offset_ini;
@@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
295
294
continue ;
296
295
}
297
296
// regex: <space>?[^\s\p{L}\p{N}]+
298
- if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
297
+ if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
299
298
pos += (cpt == ' ' );
300
- while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
299
+ while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
301
300
flags2 = _get_flags (++pos);
302
301
}
303
302
_add_token (pos);
@@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
351
350
};
352
351
353
352
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
354
- static const codepoint_flags undef (codepoint_flags::UNDEFINED);
355
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : undef;
353
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags (cpts[pos]) : codepoint_flags{};
356
354
};
357
355
358
356
size_t _prev_end = offset_ini;
@@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
394
392
}
395
393
}
396
394
397
- // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
398
- if (!(cpt == '\r' || cpt == '\n' || /* flags.is_letter | */ flags.is_number )) {
395
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+
396
+ if (!(cpt == '\r' || cpt == '\n' || flags.is_number )) {
399
397
if (flags.is_letter || _get_flags (pos+1 ).is_letter ) { // one or more letters
400
398
pos++;
401
399
while (_get_flags (pos).is_letter ) {
@@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
421
419
422
420
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
423
421
auto flags2 = (cpt == ' ' ? _get_flags (pos+1 ) : flags);
424
- if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2. is_undefined )) {
422
+ if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) { // test the lookahead flags (flags2), same as the loop condition below
425
423
pos += (cpt == ' ' );
426
- while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
424
+ while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number ) && flags2.as_uint ( )) {
427
425
flags2 = _get_flags (++pos);
428
426
}
429
427
uint32_t cpt2 = _get_cpt (pos);
0 commit comments