@@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
702
702
static inline int32_t
703
703
grapheme_extract_charcount_iter (UBreakIterator * bi , int32_t csize , unsigned char * pstr , int32_t str_len )
704
704
{
705
- int pos = 0 , prev_pos = 0 ;
706
- int ret_pos = 0 , prev_ret_pos = 0 ;
705
+ int pos = 0 ;
706
+ int ret_pos = 0 ;
707
+ int break_pos , prev_break_pos ;
708
+ int count = 0 ;
707
709
708
710
while ( 1 ) {
709
711
pos = ubrk_next (bi );
@@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
712
714
break ;
713
715
}
714
716
715
- /* if we are beyond our limit, then the loop is done */
716
- if ( pos > csize ) {
717
- break ;
718
- }
717
+ for ( break_pos = ret_pos ; break_pos < pos ; ) {
718
+ count ++ ;
719
+ prev_break_pos = break_pos ;
720
+ U8_FWD_1 ( pstr , break_pos , str_len );
719
721
720
- /* update our pointer in the original UTF-8 buffer by as many characters
721
- as ubrk_next iterated over */
722
-
723
- prev_ret_pos = ret_pos ;
724
- U8_FWD_N (pstr , ret_pos , str_len , pos - prev_pos );
722
+ if ( prev_break_pos == break_pos ) {
723
+ /* something wrong - malformed utf8? */
724
+ csize = 0 ;
725
+ break ;
726
+ }
727
+ }
725
728
726
- if ( prev_ret_pos == ret_pos ) {
727
- /* something wrong - malformed utf8? */
729
+ /* if we are beyond our limit, then the loop is done */
730
+ if ( count > csize ) {
728
731
break ;
729
732
}
730
733
731
- prev_pos = pos ;
734
+ ret_pos = break_pos ;
732
735
}
733
736
734
737
return ret_pos ;
@@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
739
742
static inline int32_t
740
743
grapheme_extract_bytecount_iter (UBreakIterator * bi , int32_t bsize , unsigned char * pstr , int32_t str_len )
741
744
{
742
- int pos = 0 , prev_pos = 0 ;
743
- int ret_pos = 0 , prev_ret_pos = 0 ;
745
+ int pos = 0 ;
746
+ int ret_pos = 0 ;
744
747
745
748
while ( 1 ) {
746
749
pos = ubrk_next (bi );
@@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
749
752
break ;
750
753
}
751
754
752
- prev_ret_pos = ret_pos ;
753
- U8_FWD_N (pstr , ret_pos , str_len , pos - prev_pos );
754
-
755
- if ( ret_pos > bsize ) {
756
- ret_pos = prev_ret_pos ;
757
- break ;
758
- }
759
-
760
- if ( prev_ret_pos == ret_pos ) {
761
- /* something wrong - malformed utf8? */
755
+ if ( pos > bsize ) {
762
756
break ;
763
757
}
764
758
765
- prev_pos = pos ;
759
+ ret_pos = pos ;
766
760
}
767
761
768
762
return ret_pos ;
@@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
773
767
static inline int32_t
774
768
grapheme_extract_count_iter (UBreakIterator * bi , int32_t size , unsigned char * pstr , int32_t str_len )
775
769
{
776
- int pos = 0 , next_pos = 0 ;
770
+ int next_pos = 0 ;
777
771
int ret_pos = 0 ;
778
772
779
773
while ( size ) {
@@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
782
776
if ( UBRK_DONE == next_pos ) {
783
777
break ;
784
778
}
785
- pos = next_pos ;
779
+ ret_pos = next_pos ;
786
780
size -- ;
787
781
}
788
782
789
- /* pos is one past the last UChar - and represent the number of code units to
790
- advance in the utf-8 buffer
791
- */
792
-
793
- U8_FWD_N (pstr , ret_pos , str_len , pos );
794
-
795
783
return ret_pos ;
796
784
}
797
785
/* }}} */
@@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
810
798
Function to extract a sequence of default grapheme clusters */
811
799
PHP_FUNCTION (grapheme_extract )
812
800
{
813
- unsigned char * str , * pstr ;
814
- UChar * ustr ;
815
- int str_len , ustr_len ;
816
- long size ; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
817
- long lstart = 0 ; /* starting position in str in bytes */
801
+ char * str , * pstr ;
802
+ UText ut = UTEXT_INITIALIZER ;
803
+ size_t str_len ;
804
+ zend_long size ; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
805
+ zend_long lstart = 0 ; /* starting position in str in bytes */
818
806
int32_t start = 0 ;
819
807
long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT ;
820
808
UErrorCode status ;
@@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract)
900
888
RETURN_STRINGL (((char * )pstr ), nsize , 1 );
901
889
}
902
890
903
- /* convert the strings to UTF-16. */
904
- ustr = NULL ;
905
- ustr_len = 0 ;
906
891
status = U_ZERO_ERROR ;
907
- intl_convert_utf8_to_utf16 ( & ustr , & ustr_len , ( char * ) pstr , str_len , & status );
892
+ utext_openUTF8 ( & ut , pstr , str_len , & status );
908
893
909
894
if ( U_FAILURE ( status ) ) {
910
895
/* Set global error code. */
911
896
intl_error_set_code ( NULL , status TSRMLS_CC );
912
897
913
898
/* Set error messages. */
914
- intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 TSRMLS_CC );
915
-
916
- if ( NULL != ustr )
917
- efree ( ustr );
899
+ intl_error_set_custom_msg ( NULL , "Error opening UTF-8 text" , 0 TSRMLS_CC );
918
900
919
901
RETURN_FALSE ;
920
902
}
@@ -923,18 +905,15 @@ PHP_FUNCTION(grapheme_extract)
923
905
status = U_ZERO_ERROR ;
924
906
bi = grapheme_get_break_iterator (u_break_iterator_buffer , & status TSRMLS_CC );
925
907
926
- ubrk_setText (bi , ustr , ustr_len , & status );
927
-
908
+ ubrk_setUText (bi , & ut , & status );
928
909
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
929
910
can't back up. So, we will not do anything. */
930
911
931
912
/* now we need to find the end of the chunk the user wants us to return */
932
913
933
914
ret_pos = (* grapheme_extract_iters [extract_type ])(bi , size , pstr , str_len );
934
915
935
- if (ustr ) {
936
- efree (ustr );
937
- }
916
+ utext_close (& ut );
938
917
ubrk_close (bi );
939
918
940
919
if ( NULL != next ) {
0 commit comments