|
18 | 18 | */
|
19 | 19 |
|
20 | 20 | /* {{{ includes */
|
| 21 | +#include <limits.h> |
| 22 | + |
21 | 23 | #include "libmbfl/config.h"
|
22 | 24 | #include "php.h"
|
23 | 25 | #include "php_ini.h"
|
@@ -1625,171 +1627,121 @@ PHP_FUNCTION(mb_output_handler)
|
1625 | 1627 | }
|
1626 | 1628 | }
|
1627 | 1629 |
|
1628 |
| -/* {{{ Convert a multibyte string to an array. If split_length is specified, |
1629 |
| - break the string down into chunks each split_length characters long. */ |
1630 |
| - |
1631 |
| -/* structure to pass split params to the callback */ |
1632 |
| -struct mbfl_split_params { |
1633 |
| - zval *return_value; /* php function return value structure pointer */ |
1634 |
| - mbfl_string *result_string; /* string to store result chunk */ |
1635 |
| - size_t mb_chunk_length; /* actual chunk length in chars */ |
1636 |
| - size_t split_length; /* split length in chars */ |
1637 |
| - mbfl_convert_filter *next_filter; /* widechar to encoding converter */ |
1638 |
| -}; |
1639 |
| - |
1640 |
| -/* callback function to fill split array */ |
1641 |
| -static int mbfl_split_output(int c, void *data) |
1642 |
| -{ |
1643 |
| - struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */ |
1644 |
| - |
1645 |
| - (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */ |
1646 |
| - |
1647 |
| - if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */ |
1648 |
| - mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */ |
1649 |
| - mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */ |
1650 |
| - mbfl_string *chunk = params->result_string; |
1651 |
| - mbfl_memory_device_result(device, chunk); /* make chunk */ |
1652 |
| - add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */ |
1653 |
| - efree(chunk->val); |
1654 |
| - params->mb_chunk_length = 0; /* reset mb_chunk size */ |
1655 |
| - } |
1656 |
| - |
1657 |
| - return 0; |
1658 |
| -} |
1659 |
| - |
1660 | 1630 | PHP_FUNCTION(mb_str_split)
|
1661 | 1631 | {
|
1662 | 1632 | zend_string *str, *encoding = NULL;
|
1663 |
| - size_t mb_len, chunks, chunk_len; |
1664 |
| - const char *p, *last; /* pointer for the string cursor and last string char */ |
1665 |
| - mbfl_string string, result_string; |
1666 |
| - const mbfl_encoding *mbfl_encoding; |
1667 |
| - zend_long split_length = 1; |
| 1633 | + zend_long split_len = 1; |
1668 | 1634 |
|
1669 | 1635 | ZEND_PARSE_PARAMETERS_START(1, 3)
|
1670 | 1636 | Z_PARAM_STR(str)
|
1671 | 1637 | Z_PARAM_OPTIONAL
|
1672 |
| - Z_PARAM_LONG(split_length) |
| 1638 | + Z_PARAM_LONG(split_len) |
1673 | 1639 | Z_PARAM_STR_OR_NULL(encoding)
|
1674 | 1640 | ZEND_PARSE_PARAMETERS_END();
|
1675 | 1641 |
|
1676 |
| - if (split_length <= 0) { |
| 1642 | + if (split_len <= 0) { |
1677 | 1643 | zend_argument_value_error(2, "must be greater than 0");
|
1678 | 1644 | RETURN_THROWS();
|
| 1645 | + } else if (split_len > UINT_MAX / 4) { |
| 1646 | + zend_argument_value_error(2, "is too large"); |
| 1647 | + RETURN_THROWS(); |
1679 | 1648 | }
|
1680 | 1649 |
|
1681 |
| - /* fill mbfl_string structure */ |
1682 |
| - string.val = (unsigned char *) ZSTR_VAL(str); |
1683 |
| - string.len = ZSTR_LEN(str); |
1684 |
| - string.encoding = php_mb_get_encoding(encoding, 3); |
1685 |
| - if (!string.encoding) { |
| 1650 | + const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3); |
| 1651 | + if (!enc) { |
1686 | 1652 | RETURN_THROWS();
|
1687 | 1653 | }
|
1688 | 1654 |
|
1689 | 1655 | if (ZSTR_LEN(str) == 0) {
|
1690 | 1656 | RETURN_EMPTY_ARRAY();
|
1691 | 1657 | }
|
1692 | 1658 |
|
1693 |
| - p = ZSTR_VAL(str); /* string cursor pointer */ |
1694 |
| - last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */ |
1695 |
| - |
1696 |
| - mbfl_encoding = string.encoding; |
1697 |
| - |
1698 |
| - /* first scenario: 1,2,4-bytes fixed width encodings (head part) */ |
1699 |
| - if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */ |
1700 |
| - mb_len = string.len; |
1701 |
| - chunk_len = (size_t)split_length; /* chunk length in bytes */ |
1702 |
| - } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS2) { /* 2 bytes */ |
1703 |
| - mb_len = string.len / 2; |
1704 |
| - chunk_len = split_length * 2; |
1705 |
| - } else if (mbfl_encoding->flag & MBFL_ENCTYPE_WCS4) { /* 4 bytes */ |
1706 |
| - mb_len = string.len / 4; |
1707 |
| - chunk_len = split_length * 4; |
1708 |
| - } else if (mbfl_encoding->mblen_table != NULL) { |
1709 |
| - /* second scenario: variable width encodings with length table */ |
1710 |
| - char unsigned const *mbtab = mbfl_encoding->mblen_table; |
1711 |
| - |
1712 |
| - /* assume that we have 1-bytes characters */ |
1713 |
| - array_init_size(return_value, (string.len + split_length) / split_length); /* round up */ |
1714 |
| - |
1715 |
| - while (p < last) { /* split cycle work until the cursor has reached the last byte */ |
1716 |
| - char const *chunk_p = p; /* chunk first byte pointer */ |
1717 |
| - chunk_len = 0; /* chunk length in bytes */ |
1718 |
| - zend_long char_count; |
1719 |
| - |
1720 |
| - for (char_count = 0; char_count < split_length && p < last; ++char_count) { |
1721 |
| - char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */ |
1722 |
| - chunk_len += m; |
1723 |
| - p += m; |
| 1659 | + unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str); |
| 1660 | + |
| 1661 | + unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4); |
| 1662 | + if (char_len) { |
| 1663 | + unsigned int chunk_len = char_len * split_len; |
| 1664 | + unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */ |
| 1665 | + array_init_size(return_value, chunks); |
| 1666 | + while (p < e) { |
| 1667 | + add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p)); |
| 1668 | + p += chunk_len; |
| 1669 | + } |
| 1670 | + } else if (enc->mblen_table) { |
| 1671 | + unsigned char const *mbtab = enc->mblen_table; |
| 1672 | + |
| 1673 | + /* Assume that we have 1-byte characters */ |
| 1674 | + array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len); |
| 1675 | + |
| 1676 | + while (p < e) { |
| 1677 | + unsigned char *chunk = p; /* start of chunk */ |
| 1678 | + |
| 1679 | + for (int char_count = 0; char_count < split_len && p < e; char_count++) { |
| 1680 | + p += mbtab[*p]; |
| 1681 | + } |
| 1682 | + if (p > e) { |
| 1683 | + p = e; /* ensure chunk is in bounds */ |
1724 | 1684 | }
|
1725 |
| - if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */ |
1726 |
| - add_next_index_stringl(return_value, chunk_p, chunk_len); |
| 1685 | + add_next_index_stringl(return_value, (const char*)chunk, p - chunk); |
1727 | 1686 | }
|
1728 |
| - return; |
1729 | 1687 | } else {
|
1730 |
| - /* third scenario: other multibyte encodings */ |
1731 |
| - mbfl_convert_filter *filter, *decoder; |
1732 |
| - |
1733 |
| - /* assume that we have 1-bytes characters */ |
1734 |
| - array_init_size(return_value, (string.len + split_length) / split_length); /* round up */ |
1735 |
| - |
1736 |
| - /* decoder filter to decode wchar to encoding */ |
1737 |
| - mbfl_memory_device device; |
1738 |
| - mbfl_memory_device_init(&device, split_length + 1, 0); |
1739 |
| - |
1740 |
| - decoder = mbfl_convert_filter_new( |
1741 |
| - &mbfl_encoding_wchar, |
1742 |
| - string.encoding, |
1743 |
| - mbfl_memory_device_output, |
1744 |
| - NULL, |
1745 |
| - &device); |
1746 |
| - /* assert that nothing is wrong with the decoder */ |
1747 |
| - ZEND_ASSERT(decoder != NULL); |
1748 |
| - |
1749 |
| - /* wchar filter */ |
1750 |
| - mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */ |
1751 |
| - struct mbfl_split_params params = { /* init callback function params structure */ |
1752 |
| - .return_value = return_value, |
1753 |
| - .result_string = &result_string, |
1754 |
| - .mb_chunk_length = 0, |
1755 |
| - .split_length = (size_t)split_length, |
1756 |
| - .next_filter = decoder, |
1757 |
| - }; |
1758 |
| - |
1759 |
| - filter = mbfl_convert_filter_new( |
1760 |
| - string.encoding, |
1761 |
| - &mbfl_encoding_wchar, |
1762 |
| - mbfl_split_output, |
1763 |
| - NULL, |
1764 |
| - &params); |
1765 |
| - /* assert that nothing is wrong with the filter */ |
1766 |
| - ZEND_ASSERT(filter != NULL); |
1767 |
| - |
1768 |
| - while (p < last - 1) { /* cycle each byte except last with callback function */ |
1769 |
| - (*filter->filter_function)(*p++, filter); |
1770 |
| - } |
1771 |
| - params.mb_chunk_length = split_length - 1; /* force to finish current chunk */ |
1772 |
| - (*filter->filter_function)(*p++, filter); /* process last char */ |
1773 |
| - |
1774 |
| - mbfl_convert_filter_delete(decoder); |
1775 |
| - mbfl_convert_filter_delete(filter); |
1776 |
| - mbfl_memory_device_clear(&device); |
1777 |
| - return; |
1778 |
| - } |
1779 |
| - |
1780 |
| - /* first scenario: 1,2,4-bytes fixed width encodings (tail part) */ |
1781 |
| - chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */ |
1782 |
| - array_init_size(return_value, chunks); |
1783 |
| - if (chunks != 0) { |
1784 |
| - zend_long i; |
1785 |
| - |
1786 |
| - for (i = 0; i < chunks - 1; p += chunk_len, ++i) { |
1787 |
| - add_next_index_stringl(return_value, p, chunk_len); |
1788 |
| - } |
1789 |
| - add_next_index_stringl(return_value, p, last - p); |
| 1688 | + /* Assume that we have 1-byte characters */ |
| 1689 | + array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len); |
| 1690 | + |
| 1691 | + uint32_t wchar_buf[128]; |
| 1692 | + size_t in_len = ZSTR_LEN(str); |
| 1693 | + unsigned int state = 0, char_count = 0; |
| 1694 | + |
| 1695 | + mb_convert_buf buf; |
| 1696 | + |
| 1697 | + while (in_len) { |
| 1698 | + size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state); |
| 1699 | + ZEND_ASSERT(out_len <= 128); |
| 1700 | + size_t i = 0; |
| 1701 | + |
| 1702 | + /* Is there some output remaining from the previous iteration? */ |
| 1703 | + if (char_count) { |
| 1704 | + if (out_len >= split_len - char_count) { |
| 1705 | + /* Finish off an incomplete chunk from previous iteration |
| 1706 | + * ('buf' was already initialized; we don't need to do it again) */ |
| 1707 | + enc->from_wchar(wchar_buf, split_len - char_count, &buf, true); |
| 1708 | + i += split_len - char_count; |
| 1709 | + char_count = 0; |
| 1710 | + add_next_index_str(return_value, mb_convert_buf_result(&buf)); |
| 1711 | + } else { |
| 1712 | + /* Output from this iteration is not enough to finish the next chunk; |
| 1713 | + * output what we can, and leave 'buf' to be used again on next iteration */ |
| 1714 | + enc->from_wchar(wchar_buf, out_len, &buf, !in_len); |
| 1715 | + char_count += out_len; |
| 1716 | + continue; |
| 1717 | + } |
| 1718 | + } |
| 1719 | + |
| 1720 | + while (i < out_len) { |
| 1721 | + /* Prepare for the next chunk */ |
| 1722 | + mb_convert_buf_init(&buf, split_len, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode)); |
| 1723 | + |
| 1724 | + if (out_len - i >= split_len) { |
| 1725 | + enc->from_wchar(wchar_buf + i, split_len, &buf, true); |
| 1726 | + i += split_len; |
| 1727 | + add_next_index_str(return_value, mb_convert_buf_result(&buf)); |
| 1728 | + } else { |
| 1729 | + /* The remaining codepoints in wchar_buf aren't enough to finish a chunk; |
| 1730 | + * leave them for the next iteration */ |
| 1731 | + enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len); |
| 1732 | + char_count = out_len - i; |
| 1733 | + break; |
| 1734 | + } |
| 1735 | + } |
| 1736 | + } |
| 1737 | + |
| 1738 | + if (char_count) { |
| 1739 | + /* The main loop above has finished processing the input string, but |
| 1740 | + * has left a partial chunk in 'buf' */ |
| 1741 | + add_next_index_str(return_value, mb_convert_buf_result(&buf)); |
| 1742 | + } |
1790 | 1743 | }
|
1791 | 1744 | }
|
1792 |
| -/* }}} */ |
1793 | 1745 |
|
1794 | 1746 | static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
|
1795 | 1747 | {
|
|
0 commit comments