Skip to content

Commit 5b15ab9

Browse files
committed
Fix mb_check_encoding treating a block as valid ASCII when the previous block ended halfway through a UTF-8 character
1 parent 0bd4ac0 commit 5b15ab9

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

ext/mbstring/mbstring.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4896,6 +4896,11 @@ static bool mb_fast_check_utf8_default(zend_string *str)
48964896

48974897
static const int8_t _verror[] = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
48984898

4899+
/* Bit mask. Even when a block holds only single-byte characters, one of the last bytes of the previous block may have started a multi-byte character. */
4900+
static const int8_t _bad_mask[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -16, -32, -64};
4901+
/* Mask vector applied to the previous block's raw bytes to detect such an unfinished multi-byte character */
4902+
int8x16_t bad_mask = vld1q_s8(_bad_mask);
4903+
48994904
/* error flag vector */
49004905
int8x16_t has_error = vdupq_n_s8(0);
49014906
struct processed_utf_bytes previous = {.rawbytes = vdupq_n_s8(0),
@@ -4907,9 +4912,18 @@ static bool mb_fast_check_utf8_default(zend_string *str)
49074912
/* All bytes are 0x7F or lower, so this block is ASCII */
49084913
uint8x16_t is_ascii = vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0x7F));
49094914
if (vmaxvq_u8(is_ascii) == 0) {
4910-
previous.rawbytes = vdupq_n_s8(0);
4911-
previous.high_nibbles = vdupq_n_s8(0);
4912-
previous.carried_continuations = vdupq_n_s8(0);
4915+
/* Even if this block only contains single-byte characters, there may have been a
4916+
* multi-byte character at the end of the previous block, which was supposed to
4917+
* have continuation bytes in this block.
4918+
* This bitmask will pick out a 2/3/4 byte character starting from the last byte of
4919+
* the previous block, a 3/4 byte starting from the 2nd last, or a 4 byte starting from the 3rd last
4920+
*/
4921+
uint8x16_t bad = vceqq_s8(vandq_s8(previous.rawbytes, bad_mask), bad_mask);
4922+
4923+
if (vmaxvq_u8(bad) != 0) {
4924+
return false;
4925+
}
4926+
49134927
continue;
49144928
}
49154929
neon_check_utf8_bytes(current_bytes, &previous, &has_error);

ext/mbstring/tests/utf_encodings.phpt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,8 @@ $truncated16byte = [
820820
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xef\xbf",
821821
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0",
822822
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf",
823-
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf\xbf"
823+
"k\x08`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0\xbf\xbf",
824+
"0123456789abcd\xe3\x810123456789abcdef"
824825
];
825826
foreach ($truncated16byte as $trunc) {
826827
if (mb_check_encoding($trunc, 'UTF-8'))

0 commit comments

Comments
 (0)