@@ -226,7 +226,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
226
226
assert (offset_end <= cpts.size ());
227
227
start = offset_end;
228
228
229
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
229
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
// Bounds-checked read of cpts: yields cpts[pos] when pos lies in the
// half-open window [offset_ini, offset_end), and 0 for any position
// outside it. The change shown here only switches the declared return
// type from char32_t to uint32_t; the body is untouched.
230
230
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0 ;
231
231
};
232
232
@@ -253,18 +253,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253
253
};
254
254
255
255
for (size_t pos = offset_ini; pos < offset_end; /* pos++*/ ) {
256
- const char32_t cpt = _get_cpt (pos);
256
+ const uint32_t cpt = _get_cpt (pos);
257
257
const auto flags = _get_flags (pos);
258
258
259
259
// regex: 's|'t|'re|'ve|'m|'ll|'d
260
260
if (cpt == ' \' ' && pos+1 < offset_end) {
261
- char32_t cpt_next = _get_cpt (pos+1 );
261
+ uint32_t cpt_next = _get_cpt (pos+1 );
262
262
if (cpt_next == ' s' || cpt_next == ' t' || cpt_next == ' m' || cpt_next == ' d' ) {
263
263
pos += _add_token (pos+2 );
264
264
continue ;
265
265
}
266
266
if (pos+2 < offset_end) {
267
- char32_t cpt_next_next = _get_cpt (pos+2 );
267
+ uint32_t cpt_next_next = _get_cpt (pos+2 );
268
268
if ((cpt_next == ' r' && cpt_next_next == ' e' ) ||
269
269
(cpt_next == ' v' && cpt_next_next == ' e' ) ||
270
270
(cpt_next == ' l' && cpt_next_next == ' l' )) {
@@ -344,7 +344,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
344
344
assert (offset_end <= cpts.size ());
345
345
start = offset_end;
346
346
347
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
347
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
// Bounds-checked read of cpts: yields cpts[pos] when pos lies in the
// half-open window [offset_ini, offset_end), and 0 for any position
// outside it. The change shown here only switches the declared return
// type from char32_t to uint32_t; the body is untouched.
348
348
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0 ;
349
349
};
350
350
@@ -371,18 +371,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371
371
};
372
372
373
373
for (size_t pos = offset_ini; pos < offset_end; /* pos++*/ ) {
374
- const char32_t cpt = _get_cpt (pos);
374
+ const uint32_t cpt = _get_cpt (pos);
375
375
const auto flags = _get_flags (pos);
376
376
377
377
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
378
378
if (cpt == ' \' ' && pos+1 < offset_end) {
379
- char32_t cpt_next = unicode_tolower (_get_cpt (pos+1 ));
379
+ uint32_t cpt_next = unicode_tolower (_get_cpt (pos+1 ));
380
380
if (cpt_next == ' s' || cpt_next == ' t' || cpt_next == ' m' || cpt_next == ' d' ) {
381
381
pos += _add_token (pos+2 );
382
382
continue ;
383
383
}
384
384
if (pos+2 < offset_end) {
385
- char32_t cpt_next_next = unicode_tolower (_get_cpt (pos+2 ));
385
+ uint32_t cpt_next_next = unicode_tolower (_get_cpt (pos+2 ));
386
386
if ((cpt_next == ' r' && cpt_next_next == ' e' ) ||
387
387
(cpt_next == ' v' && cpt_next_next == ' e' ) ||
388
388
(cpt_next == ' l' && cpt_next_next == ' l' )) {
@@ -424,7 +424,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
424
424
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined )) {
425
425
flags2 = _get_flags (++pos);
426
426
}
427
- char32_t cpt2 = _get_cpt (pos);
427
+ uint32_t cpt2 = _get_cpt (pos);
428
428
while (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
429
429
cpt2 = _get_cpt (++pos);
430
430
}
@@ -435,7 +435,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
435
435
size_t num_whitespaces = 0 ;
436
436
size_t last_end_r_or_n = 0 ;
437
437
while (_get_flags (pos+num_whitespaces).is_whitespace ) {
438
- char32_t cpt2 = _get_cpt (pos+num_whitespaces);
438
+ uint32_t cpt2 = _get_cpt (pos+num_whitespaces);
439
439
if (cpt2 == ' \r ' || cpt2 == ' \n ' ) {
440
440
last_end_r_or_n = pos + num_whitespaces + 1 ;
441
441
}
@@ -626,7 +626,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
626
626
return map.at (utf8);
627
627
}
628
628
629
- char32_t unicode_tolower (char32_t cp) {
629
+ uint32_t unicode_tolower (uint32_t cp) {
// Looks cp up in unicode_map_lowercase and returns the mapped value;
// when the map has no entry for cp, cp is returned unchanged.
// The change shown here only switches the parameter and return type
// from char32_t to uint32_t; the lookup itself is untouched.
630
630
auto it = unicode_map_lowercase.find (cp);
631
631
return it == unicode_map_lowercase.end () ? cp : it->second ;
632
632
}
0 commit comments