Skip to content

Commit 8856b3a

Browse files
committed
Merge branch 'pull-request/1974' into PHP-5.6
* pull-request/1974: Fix #68447: grapheme_extract takes an extra trailing character
2 parents 5049ef2 + df683fa commit 8856b3a

File tree

2 files changed

+60
-53
lines changed

2 files changed

+60
-53
lines changed

ext/intl/grapheme/grapheme_string.c

Lines changed: 32 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
702702
static inline int32_t
703703
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
704704
{
705-
int pos = 0, prev_pos = 0;
706-
int ret_pos = 0, prev_ret_pos = 0;
705+
int pos = 0;
706+
int ret_pos = 0;
707+
int break_pos, prev_break_pos;
708+
int count = 0;
707709

708710
while ( 1 ) {
709711
pos = ubrk_next(bi);
@@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
712714
break;
713715
}
714716

715-
/* if we are beyond our limit, then the loop is done */
716-
if ( pos > csize ) {
717-
break;
718-
}
717+
for ( break_pos = ret_pos; break_pos < pos; ) {
718+
count++;
719+
prev_break_pos = break_pos;
720+
U8_FWD_1(pstr, break_pos, str_len);
719721

720-
/* update our pointer in the original UTF-8 buffer by as many characters
721-
as ubrk_next iterated over */
722-
723-
prev_ret_pos = ret_pos;
724-
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
722+
if ( prev_break_pos == break_pos ) {
723+
/* something wrong - malformed utf8? */
724+
csize = 0;
725+
break;
726+
}
727+
}
725728

726-
if ( prev_ret_pos == ret_pos ) {
727-
/* something wrong - malformed utf8? */
729+
/* if we are beyond our limit, then the loop is done */
730+
if ( count > csize ) {
728731
break;
729732
}
730733

731-
prev_pos = pos;
734+
ret_pos = break_pos;
732735
}
733736

734737
return ret_pos;
@@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
739742
static inline int32_t
740743
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
741744
{
742-
int pos = 0, prev_pos = 0;
743-
int ret_pos = 0, prev_ret_pos = 0;
745+
int pos = 0;
746+
int ret_pos = 0;
744747

745748
while ( 1 ) {
746749
pos = ubrk_next(bi);
@@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
749752
break;
750753
}
751754

752-
prev_ret_pos = ret_pos;
753-
U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
754-
755-
if ( ret_pos > bsize ) {
756-
ret_pos = prev_ret_pos;
757-
break;
758-
}
759-
760-
if ( prev_ret_pos == ret_pos ) {
761-
/* something wrong - malformed utf8? */
755+
if ( pos > bsize ) {
762756
break;
763757
}
764758

765-
prev_pos = pos;
759+
ret_pos = pos;
766760
}
767761

768762
return ret_pos;
@@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
773767
static inline int32_t
774768
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
775769
{
776-
int pos = 0, next_pos = 0;
770+
int next_pos = 0;
777771
int ret_pos = 0;
778772

779773
while ( size ) {
@@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
782776
if ( UBRK_DONE == next_pos ) {
783777
break;
784778
}
785-
pos = next_pos;
779+
ret_pos = next_pos;
786780
size--;
787781
}
788782

789-
/* pos is one past the last UChar - and represent the number of code units to
790-
advance in the utf-8 buffer
791-
*/
792-
793-
U8_FWD_N(pstr, ret_pos, str_len, pos);
794-
795783
return ret_pos;
796784
}
797785
/* }}} */
@@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
810798
Function to extract a sequence of default grapheme clusters */
811799
PHP_FUNCTION(grapheme_extract)
812800
{
813-
unsigned char *str, *pstr;
814-
UChar *ustr;
815-
int str_len, ustr_len;
816-
long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
817-
long lstart = 0; /* starting position in str in bytes */
801+
char *str, *pstr;
802+
UText ut = UTEXT_INITIALIZER;
803+
size_t str_len;
804+
zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
805+
zend_long lstart = 0; /* starting position in str in bytes */
818806
int32_t start = 0;
819807
long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
820808
UErrorCode status;
@@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract)
900888
RETURN_STRINGL(((char *)pstr), nsize, 1);
901889
}
902890

903-
/* convert the strings to UTF-16. */
904-
ustr = NULL;
905-
ustr_len = 0;
906891
status = U_ZERO_ERROR;
907-
intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
892+
utext_openUTF8(&ut, pstr, str_len, &status);
908893

909894
if ( U_FAILURE( status ) ) {
910895
/* Set global error code. */
911896
intl_error_set_code( NULL, status TSRMLS_CC );
912897

913898
/* Set error messages. */
914-
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
915-
916-
if ( NULL != ustr )
917-
efree( ustr );
899+
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
918900

919901
RETURN_FALSE;
920902
}
@@ -923,18 +905,15 @@ PHP_FUNCTION(grapheme_extract)
923905
status = U_ZERO_ERROR;
924906
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
925907

926-
ubrk_setText(bi, ustr, ustr_len, &status);
927-
908+
ubrk_setUText(bi, &ut, &status);
928909
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
929910
can't back up. So, we will not do anything. */
930911

931912
/* now we need to find the end of the chunk the user wants us to return */
932913

933914
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
934915

935-
if (ustr) {
936-
efree(ustr);
937-
}
916+
utext_close(&ut);
938917
ubrk_close(bi);
939918

940919
if ( NULL != next ) {

ext/intl/tests/bug68447.phpt

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
--TEST--
2+
Bug #68447: grapheme_extract take an extra trailing character
3+
--SKIPIF--
4+
<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
5+
--FILE--
6+
<?php
7+
$katsushikaku = "葛󠄁飾区";
8+
echo grapheme_extract($katsushikaku, 1) . "\n";
9+
10+
$haiyore = "這󠄀いよれ";
11+
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
12+
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
13+
echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
14+
echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
15+
echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
16+
echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
17+
echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
18+
echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
19+
--EXPECT--
20+
葛󠄁
21+
這󠄀
22+
這󠄀い
23+
24+
這󠄀
25+
這󠄀い
26+
27+
這󠄀
28+
這󠄀い

0 commit comments

Comments
 (0)