@@ -4896,6 +4896,11 @@ static bool mb_fast_check_utf8_default(zend_string *str)
4896
4896
4897
4897
static const int8_t _verror [] = {9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 1 };
4898
4898
4899
+ /* Bit mask used when a block is all single-byte characters: picks out a multi-byte lead byte near the end of the previous block that still expects continuation bytes. */
4900
+ static const int8_t _bad_mask [] = {-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -16 , -32 , -64 };
4901
+ /* The mask above, loaded as a NEON vector */
4902
+ int8x16_t bad_mask = vld1q_s8 (_bad_mask );
4903
+
4899
4904
/* error flag vector */
4900
4905
int8x16_t has_error = vdupq_n_s8 (0 );
4901
4906
struct processed_utf_bytes previous = {.rawbytes = vdupq_n_s8 (0 ),
@@ -4907,9 +4912,18 @@ static bool mb_fast_check_utf8_default(zend_string *str)
4907
4912
/* All bytes are lower than 0x7F, it is ASCII */
4908
4913
uint8x16_t is_ascii = vqsubq_u8 (vreinterpretq_u8_s8 (current_bytes ), vdupq_n_u8 (0x7F ));
4909
4914
if (vmaxvq_u8 (is_ascii ) == 0 ) {
4910
- previous .rawbytes = vdupq_n_s8 (0 );
4911
- previous .high_nibbles = vdupq_n_s8 (0 );
4912
- previous .carried_continuations = vdupq_n_s8 (0 );
4915
+ /* Even if this block only contains single-byte characters, there may have been a
4916
+ * multi-byte character at the end of the previous block, which was supposed to
4917
+ * have continuation bytes in this block.
4918
+ * This bitmask will pick out a 2/3/4 byte character starting from the last byte of
4919
+ * the previous block, a 3/4 byte starting from the 2nd last, or a 4 byte starting from the 3rd last
4920
+ */
4921
+ uint8x16_t bad = vceqq_s8 (vandq_s8 (previous .rawbytes , bad_mask ), bad_mask );
4922
+
4923
+ if (vmaxvq_u8 (bad ) != 0 ) {
4924
+ return false;
4925
+ }
4926
+
4913
4927
continue ;
4914
4928
}
4915
4929
neon_check_utf8_bytes (current_bytes , & previous , & has_error );
0 commit comments