Skip to content

Commit 88c99af

Browse files
committed
Implement mb_str_split using fast text conversion filters
For text encodings which either 1) use a fixed number of bytes per codepoint, or 2) have an 'mblen' table which lets us find the length of a multi-byte character with a table lookup indexed by the first byte value, there is no great difference between the old and new code. The big difference is for all other text encodings, where we have to actually decode the string in order to split it. For those encodings (ISO-2022-JP and UTF-16, for example), I measured a speedup of 50%-120% over the previous implementation.
1 parent a9a6720 commit 88c99af

File tree

3 files changed

+233
-139
lines changed

3 files changed

+233
-139
lines changed

ext/mbstring/mbstring.c

Lines changed: 91 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
*/
1919

2020
/* {{{ includes */
21+
#include <limits.h>
22+
2123
#include "libmbfl/config.h"
2224
#include "php.h"
2325
#include "php_ini.h"
@@ -1625,171 +1627,121 @@ PHP_FUNCTION(mb_output_handler)
16251627
}
16261628
}
16271629

1628-
/* {{{ Convert a multibyte string to an array. If split_length is specified,
1629-
break the string down into chunks each split_length characters long. */
1630-
1631-
/* structure to pass split params to the callback */
1632-
struct mbfl_split_params {
1633-
zval *return_value; /* php function return value structure pointer */
1634-
mbfl_string *result_string; /* string to store result chunk */
1635-
size_t mb_chunk_length; /* actual chunk length in chars */
1636-
size_t split_length; /* split length in chars */
1637-
mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1638-
};
1639-
1640-
/* callback function to fill split array */
1641-
static int mbfl_split_output(int c, void *data)
1642-
{
1643-
struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1644-
1645-
(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1646-
1647-
if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1648-
mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1649-
mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1650-
mbfl_string *chunk = params->result_string;
1651-
mbfl_memory_device_result(device, chunk); /* make chunk */
1652-
add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1653-
efree(chunk->val);
1654-
params->mb_chunk_length = 0; /* reset mb_chunk size */
1655-
}
1656-
1657-
return 0;
1658-
}
1659-
16601630
PHP_FUNCTION(mb_str_split)
16611631
{
16621632
zend_string *str, *encoding = NULL;
1663-
size_t mb_len, chunks, chunk_len;
1664-
const char *p, *last; /* pointer for the string cursor and last string char */
1665-
mbfl_string string, result_string;
1666-
const mbfl_encoding *mbfl_encoding;
1667-
zend_long split_length = 1;
1633+
zend_long split_len = 1;
16681634

16691635
ZEND_PARSE_PARAMETERS_START(1, 3)
16701636
Z_PARAM_STR(str)
16711637
Z_PARAM_OPTIONAL
1672-
Z_PARAM_LONG(split_length)
1638+
Z_PARAM_LONG(split_len)
16731639
Z_PARAM_STR_OR_NULL(encoding)
16741640
ZEND_PARSE_PARAMETERS_END();
16751641

1676-
if (split_length <= 0) {
1642+
if (split_len <= 0) {
16771643
zend_argument_value_error(2, "must be greater than 0");
16781644
RETURN_THROWS();
1645+
} else if (split_len > UINT_MAX / 4) {
1646+
zend_argument_value_error(2, "is too large");
1647+
RETURN_THROWS();
16791648
}
16801649

1681-
/* fill mbfl_string structure */
1682-
string.val = (unsigned char *) ZSTR_VAL(str);
1683-
string.len = ZSTR_LEN(str);
1684-
string.encoding = php_mb_get_encoding(encoding, 3);
1685-
if (!string.encoding) {
1650+
const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1651+
if (!enc) {
16861652
RETURN_THROWS();
16871653
}
16881654

16891655
if (ZSTR_LEN(str) == 0) {
16901656
RETURN_EMPTY_ARRAY();
16911657
}
16921658

1693-
p = ZSTR_VAL(str); /* string cursor pointer */
1694-
last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */
1695-
1696-
mbfl_encoding = string.encoding;
1697-
1698-
/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
1699-
if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1700-
mb_len = string.len;
1701-
chunk_len = (size_t)split_length; /* chunk length in bytes */
1702-
} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */
1703-
mb_len = string.len / 2;
1704-
chunk_len = split_length * 2;
1705-
} else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */
1706-
mb_len = string.len / 4;
1707-
chunk_len = split_length * 4;
1708-
} else if (mbfl_encoding->mblen_table != NULL) {
1709-
/* second scenario: variable width encodings with length table */
1710-
char unsigned const *mbtab = mbfl_encoding->mblen_table;
1711-
1712-
/* assume that we have 1-bytes characters */
1713-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1714-
1715-
while (p < last) { /* split cycle work until the cursor has reached the last byte */
1716-
char const *chunk_p = p; /* chunk first byte pointer */
1717-
chunk_len = 0; /* chunk length in bytes */
1718-
zend_long char_count;
1719-
1720-
for (char_count = 0; char_count < split_length && p < last; ++char_count) {
1721-
char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */
1722-
chunk_len += m;
1723-
p += m;
1659+
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1660+
1661+
unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1662+
if (char_len) {
1663+
unsigned int chunk_len = char_len * split_len;
1664+
unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1665+
array_init_size(return_value, chunks);
1666+
while (p < e) {
1667+
add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1668+
p += chunk_len;
1669+
}
1670+
} else if (enc->mblen_table) {
1671+
unsigned char const *mbtab = enc->mblen_table;
1672+
1673+
/* Assume that we have 1-byte characters */
1674+
array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1675+
1676+
while (p < e) {
1677+
unsigned char *chunk = p; /* start of chunk */
1678+
1679+
for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1680+
p += mbtab[*p];
1681+
}
1682+
if (p > e) {
1683+
p = e; /* ensure chunk is in bounds */
17241684
}
1725-
if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */
1726-
add_next_index_stringl(return_value, chunk_p, chunk_len);
1685+
add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
17271686
}
1728-
return;
17291687
} else {
1730-
/* third scenario: other multibyte encodings */
1731-
mbfl_convert_filter *filter, *decoder;
1732-
1733-
/* assume that we have 1-bytes characters */
1734-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1735-
1736-
/* decoder filter to decode wchar to encoding */
1737-
mbfl_memory_device device;
1738-
mbfl_memory_device_init(&device, split_length + 1, 0);
1739-
1740-
decoder = mbfl_convert_filter_new(
1741-
&mbfl_encoding_wchar,
1742-
string.encoding,
1743-
mbfl_memory_device_output,
1744-
NULL,
1745-
&device);
1746-
/* assert that nothing is wrong with the decoder */
1747-
ZEND_ASSERT(decoder != NULL);
1748-
1749-
/* wchar filter */
1750-
mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1751-
struct mbfl_split_params params = { /* init callback function params structure */
1752-
.return_value = return_value,
1753-
.result_string = &result_string,
1754-
.mb_chunk_length = 0,
1755-
.split_length = (size_t)split_length,
1756-
.next_filter = decoder,
1757-
};
1758-
1759-
filter = mbfl_convert_filter_new(
1760-
string.encoding,
1761-
&mbfl_encoding_wchar,
1762-
mbfl_split_output,
1763-
NULL,
1764-
&params);
1765-
/* assert that nothing is wrong with the filter */
1766-
ZEND_ASSERT(filter != NULL);
1767-
1768-
while (p < last - 1) { /* cycle each byte except last with callback function */
1769-
(*filter->filter_function)(*p++, filter);
1770-
}
1771-
params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1772-
(*filter->filter_function)(*p++, filter); /* process last char */
1773-
1774-
mbfl_convert_filter_delete(decoder);
1775-
mbfl_convert_filter_delete(filter);
1776-
mbfl_memory_device_clear(&device);
1777-
return;
1778-
}
1779-
1780-
/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
1781-
chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
1782-
array_init_size(return_value, chunks);
1783-
if (chunks != 0) {
1784-
zend_long i;
1785-
1786-
for (i = 0; i < chunks - 1; p += chunk_len, ++i) {
1787-
add_next_index_stringl(return_value, p, chunk_len);
1788-
}
1789-
add_next_index_stringl(return_value, p, last - p);
1688+
/* Assume that we have 1-byte characters */
1689+
array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1690+
1691+
uint32_t wchar_buf[128];
1692+
size_t in_len = ZSTR_LEN(str);
1693+
unsigned int state = 0, char_count = 0;
1694+
1695+
mb_convert_buf buf;
1696+
1697+
while (in_len) {
1698+
size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1699+
ZEND_ASSERT(out_len <= 128);
1700+
size_t i = 0;
1701+
1702+
/* Is there some output remaining from the previous iteration? */
1703+
if (char_count) {
1704+
if (out_len >= split_len - char_count) {
1705+
/* Finish off an incomplete chunk from previous iteration
1706+
* ('buf' was already initialized; we don't need to do it again) */
1707+
enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1708+
i += split_len - char_count;
1709+
char_count = 0;
1710+
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1711+
} else {
1712+
/* Output from this iteration is not enough to finish the next chunk;
1713+
* output what we can, and leave 'buf' to be used again on next iteration */
1714+
enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1715+
char_count += out_len;
1716+
continue;
1717+
}
1718+
}
1719+
1720+
while (i < out_len) {
1721+
/* Prepare for the next chunk */
1722+
mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
1723+
1724+
if (out_len - i >= split_len) {
1725+
enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1726+
i += split_len;
1727+
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1728+
} else {
1729+
/* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1730+
* leave them for the next iteration */
1731+
enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1732+
char_count = out_len - i;
1733+
break;
1734+
}
1735+
}
1736+
}
1737+
1738+
if (char_count) {
1739+
/* The main loop above has finished processing the input string, but
1740+
* has left a partial chunk in 'buf' */
1741+
add_next_index_str(return_value, mb_convert_buf_result(&buf));
1742+
}
17901743
}
17911744
}
1792-
/* }}} */
17931745

17941746
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
17951747
{

ext/mbstring/tests/mb_str_split_error_conditions.phpt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,34 @@ try {
2626
echo $e->getMessage() . \PHP_EOL;
2727
}
2828

29+
// For UTF-8, error markers are not inserted
30+
echo "== INVALID UTF-8 ==\n";
31+
$array = mb_str_split("abc\xFFabc", 2, "UTF-8");
32+
echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
33+
34+
// For most other encodings, they are
35+
echo "== INVALID HZ ==\n";
36+
// The last string emitted by mb_str_split will include '?' as an error marker,
37+
// since ά cannot be represented in HZ
38+
$array = mb_str_split(mb_convert_encoding("ελληνικά", "HZ", "UTF-8"), 2, "HZ");
39+
echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
40+
41+
// HTML entity error markers
42+
mb_substitute_character("entity");
43+
echo "== INVALID HZ IN 'ENTITY' ERROR OUTPUT MODE ==\n";
44+
// The output here will actually include an HTML entity &#x3AC;
45+
// It will be split into segments of 2 characters each by mb_str_split
46+
$array = mb_str_split(mb_convert_encoding("ελληνικά", "HZ", "UTF-8"), 2, "HZ");
47+
echo "[", implode(', ', array_map('bin2hex', $array)), "]\n";
48+
2949
?>
3050
--EXPECT--
3151
mb_str_split(): Argument #2 ($length) must be greater than 0
3252
mb_str_split(): Argument #2 ($length) must be greater than 0
3353
mb_str_split(): Argument #3 ($encoding) must be a valid encoding, "BAD_ENCODING" given
54+
== INVALID UTF-8 ==
55+
[6162, 63ff, 6162, 63]
56+
== INVALID HZ ==
57+
[7e7b2645264b7e7d, 7e7b264b26477e7d, 7e7b264d26497e7d, 7e7b264a7e7d3f]
58+
== INVALID HZ IN 'ENTITY' ERROR OUTPUT MODE ==
59+
[7e7b2645264b7e7d, 7e7b264b26477e7d, 7e7b264d26497e7d, 7e7b264a7e7d26, 2378, 3341, 433b]

0 commit comments

Comments
 (0)