Implement mb_str_split using fast text conversion filters

alexdowad · alexdowad · commit 88c99afdac62 · 2023-01-03T09:02:21.000+02:00
There is no great difference between the old and new code for text
encodings which either 1) use a fixed number of bytes per codepoint or
2) for which we have an 'mblen' table which enables us to find the
length of a multi-byte character using a table lookup indexed by the
first byte value.

The big difference is for other text encodings, where we have to
actually decode the string to split it. For such text encodings,
such as ISO-2022-JP and UTF-16, I measured a speedup of 50%-120% over
the previous implementation.
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -18,6 +18,8 @@
 */
 
 /* {{{ includes */
+#include <limits.h>
+
 #include "libmbfl/config.h"
 #include "php.h"
 #include "php_ini.h"
@@ -1625,171 +1627,121 @@ PHP_FUNCTION(mb_output_handler)
 	}
 }
 
-/* {{{ Convert a multibyte string to an array. If split_length is specified,
- break the string down into chunks each split_length characters long. */
-
-/* structure to pass split params to the callback */
-struct mbfl_split_params {
-	zval *return_value; /* php function return value structure pointer */
-	mbfl_string *result_string; /* string to store result chunk */
-	size_t mb_chunk_length; /* actual chunk length in chars */
-	size_t split_length; /* split length in chars */
-	mbfl_convert_filter *next_filter; /* widechar to encoding converter */
-};
-
-/* callback function to fill split array */
-static int mbfl_split_output(int c, void *data)
-{
-	struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
-
-	(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
-
-	if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
-		mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
-		mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
-		mbfl_string *chunk = params->result_string;
-		mbfl_memory_device_result(device, chunk); /* make chunk */
-		add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
-		efree(chunk->val);
-		params->mb_chunk_length = 0; /* reset mb_chunk size */
-	}
-
-	return 0;
-}
-
 PHP_FUNCTION(mb_str_split)
 {
 	zend_string *str, *encoding = NULL;
-	size_t mb_len, chunks, chunk_len;
-	const char *p, *last; /* pointer for the string cursor and last string char */
-	mbfl_string string, result_string;
-	const mbfl_encoding *mbfl_encoding;
-	zend_long split_length = 1;
+	zend_long split_len = 1;
 
 	ZEND_PARSE_PARAMETERS_START(1, 3)
 		Z_PARAM_STR(str)
 		Z_PARAM_OPTIONAL
-		Z_PARAM_LONG(split_length)
+		Z_PARAM_LONG(split_len)
 		Z_PARAM_STR_OR_NULL(encoding)
 	ZEND_PARSE_PARAMETERS_END();
 
-	if (split_length <= 0) {
+	if (split_len <= 0) {
 		zend_argument_value_error(2, "must be greater than 0");
 		RETURN_THROWS();
+	} else if (split_len > UINT_MAX / 4) {
+		zend_argument_value_error(2, "is too large");
+		RETURN_THROWS();
 	}
 
-	/* fill mbfl_string structure */
-	string.val = (unsigned char *) ZSTR_VAL(str);
-	string.len = ZSTR_LEN(str);
-	string.encoding = php_mb_get_encoding(encoding, 3);
-	if (!string.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
+	if (!enc) {
 		RETURN_THROWS();
 	}
 
 	if (ZSTR_LEN(str) == 0) {
 		RETURN_EMPTY_ARRAY();
 	}
 
-	p = ZSTR_VAL(str); /* string cursor pointer */
-	last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
-
-	mbfl_encoding = string.encoding;
-
-	/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
-	if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
-		mb_len = string.len;
-		chunk_len = (size_t)split_length; /* chunk length in bytes */
-	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
-		mb_len = string.len / 2;
-		chunk_len = split_length * 2;
-	} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
-		mb_len = string.len / 4;
-		chunk_len = split_length * 4;
-	} else if (mbfl_encoding->mblen_table != NULL) {
-		/* second scenario: variable width encodings with length table */
-		char unsigned const *mbtab = mbfl_encoding->mblen_table;
-
-		/* assume that we have 1-bytes characters */
-		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
-
-		while (p < last) { /* split cycle work until the cursor has reached the last byte */
-			char const *chunk_p = p; /* chunk first byte pointer */
-			chunk_len = 0; /* chunk length in bytes */
-			zend_long char_count;
-
-			for (char_count = 0; char_count < split_length && p < last; ++char_count) {
-				char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
-				chunk_len += m;
-				p += m;
+	unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
+
+	unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
+	if (char_len) {
+		unsigned int chunk_len = char_len * split_len;
+		unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
+		array_init_size(return_value, chunks);
+		while (p < e) {
+			add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
+			p += chunk_len;
+		}
+	} else if (enc->mblen_table) {
+		unsigned char const *mbtab = enc->mblen_table;
+
+		/* Assume that we have 1-byte characters */
+		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
+
+		while (p < e) {
+			unsigned char *chunk = p; /* start of chunk */
+
+			for (int char_count = 0; char_count < split_len && p < e; char_count++) {
+				p += mbtab[*p];
+			}
+			if (p > e) {
+				p = e; /* ensure chunk is in bounds */
 			}
-			if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
-			add_next_index_stringl(return_value, chunk_p, chunk_len);
+			add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
 		}
-		return;
 	} else {
-		/* third scenario: other multibyte encodings */
-		mbfl_convert_filter *filter, *decoder;
-
-		/* assume that we have 1-bytes characters */
-		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
-
-		/* decoder filter to decode wchar to encoding */
-		mbfl_memory_device device;
-		mbfl_memory_device_init(&device, split_length + 1, 0);
-
-		decoder = mbfl_convert_filter_new(
-				&mbfl_encoding_wchar,
-				string.encoding,
-				mbfl_memory_device_output,
-				NULL,
-				&device);
-		/* assert that nothing is wrong with the decoder */
-		ZEND_ASSERT(decoder != NULL);
-
-		/* wchar filter */
-		mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
-		struct mbfl_split_params params = { /* init callback function params structure */
-			.return_value = return_value,
-			.result_string = &result_string,
-			.mb_chunk_length = 0,
-			.split_length = (size_t)split_length,
-			.next_filter = decoder,
-		};
-
-		filter = mbfl_convert_filter_new(
-				string.encoding,
-				&mbfl_encoding_wchar,
-				mbfl_split_output,
-				NULL,
-				&params);
-		/* assert that nothing is wrong with the filter */
-		ZEND_ASSERT(filter != NULL);
-
-		while (p < last - 1) { /* cycle each byte except last with callback function */
-			(*filter->filter_function)(*p++, filter);
-		}
-		params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
-		(*filter->filter_function)(*p++, filter); /* process last char */
-
-		mbfl_convert_filter_delete(decoder);
-		mbfl_convert_filter_delete(filter);
-		mbfl_memory_device_clear(&device);
-		return;
-	}
-
-	/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
-	chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
-	array_init_size(return_value, chunks);
-	if (chunks != 0) {
-		zend_long i;
-
-		for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
-			add_next_index_stringl(return_value, p, chunk_len);
-		}
-		add_next_index_stringl(return_value, p, last - p);
+		/* Assume that we have 1-byte characters */
+		array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
+
+		uint32_t wchar_buf[128];
+		size_t in_len = ZSTR_LEN(str);
+		unsigned int state = 0, char_count = 0;
+
+		mb_convert_buf buf;
+
+		while (in_len) {
+			size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
+			ZEND_ASSERT(out_len <= 128);
+			size_t i = 0;
+
+			/* Is there some output remaining from the previous iteration? */
+			if (char_count) {
+				if (out_len >= split_len - char_count) {
+					/* Finish off an incomplete chunk from previous iteration
+					 * ('buf' was already initialized; we don't need to do it again) */
+					enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
+					i += split_len - char_count;
+					char_count = 0;
+					add_next_index_str(return_value, mb_convert_buf_result(&buf));
+				} else {
+					/* Output from this iteration is not enough to finish the next chunk;
+					 * output what we can, and leave 'buf' to be used again on next iteration */
+					enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
+					char_count += out_len;
+					continue;
+				}
+			}
+
+			while (i < out_len) {
+				/* Prepare for the next chunk */
+				mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
+
+				if (out_len - i >= split_len) {
+					enc->from_wchar(wchar_buf + i, split_len, &buf, true);
+					i += split_len;
+					add_next_index_str(return_value, mb_convert_buf_result(&buf));
+				} else {
+					/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
+					 * leave them for the next iteration */
+					enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
+					char_count = out_len - i;
+					break;
+				}
+			}
+		}
+
+		if (char_count) {
+			/* The main loop above has finished processing the input string, but
+			 * has left a partial chunk in 'buf' */
+			add_next_index_str(return_value, mb_convert_buf_result(&buf));
+		}
 	}
 }
-/* }}} */
 
 static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
 {
diff --git a/ext/mbstring/tests/mb_str_split_error_conditions.phpt b/ext/mbstring/tests/mb_str_split_error_conditions.phpt
@@ -26,8 +26,34 @@ try {
     echo $e->getMessage() . \PHP_EOL;
 }
 
+// For UTF-8, error markers are not inserted
+echo "== INVALID UTF-8 ==\n";
+$array = mb_str_split("abc\xFFabc", 2, "UTF-8");
+echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
+
+// For most other encodings, they are
+echo "== INVALID HZ ==\n";
+// The last string emitted by mb_str_split will include '?' as an error marker,
+// since ά cannot be represented in HZ
+$array = mb_str_split(mb_convert_encoding("ελληνικά", "HZ", "UTF-8"), 2, "HZ");
+echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
+
+// HTML entity error markers
+mb_substitute_character("entity");
+echo "== INVALID HZ IN 'ENTITY' ERROR OUTPUT MODE ==\n";
+// The output here will actually include an HTML entity #x3AC;
+// It will be split into segments of 2 characters each by mb_str_split
+$array = mb_str_split(mb_convert_encoding("ελληνικά", "HZ", "UTF-8"), 2, "HZ");
+echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
+
 ?>
 --EXPECT--
 mb_str_split(): Argument #2 ($length) must be greater than 0
 mb_str_split(): Argument #2 ($length) must be greater than 0
 mb_str_split(): Argument #3 ($encoding) must be a valid encoding, "BAD_ENCODING" given
+== INVALID UTF-8 ==
+[6162, 63ff, 6162, 63]
+== INVALID HZ ==
+[7e7b2645264b7e7d, 7e7b264b26477e7d, 7e7b264d26497e7d, 7e7b264a7e7d3f]
+== INVALID HZ IN 'ENTITY' ERROR OUTPUT MODE ==
+[7e7b2645264b7e7d, 7e7b264b26477e7d, 7e7b264d26497e7d, 7e7b264a7e7d26, 2378, 3341, 433b]
diff --git a/ext/mbstring/tests/mb_str_split_other.phpt b/ext/mbstring/tests/mb_str_split_other.phpt