@@ -11088,7 +11088,7 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b
11088
11088
continue ;
11089
11089
} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max ) {
11090
11090
if (w == 0x1F9 ) {
11091
- s = 0xA8Bf ;
11091
+ s = 0xA8BF ;
11092
11092
} else {
11093
11093
s = ucs_a1_cp936_table [w - ucs_a1_cp936_table_min ];
11094
11094
}
@@ -11560,6 +11560,319 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
11560
11560
MB_CONVERT_BUF_STORE (buf , out , limit );
11561
11561
}
11562
11562
11563
/* Overrides for the two-byte codes 0xFE50..0xFEA0 when decoding GB18030-2022.
 *
 * mb_gb18030_2022_to_wchar indexes this table with `w - 0x5DD0`, where
 * `w = (lead - 0x81)*192 + (trail - 0x40)`; for lead byte 0xFE that is simply
 * `trail - 0x50`. A zero entry means "no override": the decoder then falls
 * through to its regular table lookup. Only the non-zero slots are listed;
 * every other slot is implicitly zero. */
static const unsigned short gb18030_2022_pua_tbl3[0x51] = {
	[0x01] = 0xE816, /* 0xFE51 */
	[0x02] = 0xE817, /* 0xFE52 */
	[0x03] = 0xE818, /* 0xFE53 */
	[0x09] = 0x9FB4, /* 0xFE59 */
	[0x11] = 0x9FB5, /* 0xFE61 */
	[0x16] = 0x9FB6, /* 0xFE66 */
	[0x17] = 0x9FB7, /* 0xFE67 */
	[0x1C] = 0xE831, /* 0xFE6C */
	[0x1D] = 0x9FB8, /* 0xFE6D */
	[0x26] = 0xE83B, /* 0xFE76 */
	[0x2E] = 0x9FB9, /* 0xFE7E */
	[0x40] = 0x9FBA, /* 0xFE90 */
	[0x41] = 0xE855, /* 0xFE91 */
	[0x50] = 0x9FBB, /* 0xFEA0 */
};
11578
+
11579
+ static size_t mb_gb18030_2022_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state )
11580
+ {
11581
+ unsigned char * p = * in , * e = p + * in_len ;
11582
+ uint32_t * out = buf , * limit = buf + bufsize ;
11583
+
11584
+ while (p < e && out < limit ) {
11585
+ unsigned char c = * p ++ ;
11586
+
11587
+ if (c < 0x80 ) {
11588
+ * out ++ = c ;
11589
+ } else if (c == 0x80 || c == 0xFF ) {
11590
+ * out ++ = MBFL_BAD_INPUT ;
11591
+ } else {
11592
+ if (p == e ) {
11593
+ * out ++ = MBFL_BAD_INPUT ;
11594
+ break ;
11595
+ }
11596
+ unsigned char c2 = * p ++ ;
11597
+
11598
+ if (((c >= 0x81 && c <= 0x84 ) || (c >= 0x90 && c <= 0xE3 )) && c2 >= 0x30 && c2 <= 0x39 ) {
11599
+ if (p >= e ) {
11600
+ * out ++ = MBFL_BAD_INPUT ;
11601
+ break ;
11602
+ }
11603
+ unsigned char c3 = * p ++ ;
11604
+
11605
+ if (c3 >= 0x81 && c3 <= 0xFE && p < e ) {
11606
+ unsigned char c4 = * p ++ ;
11607
+
11608
+ if (c4 >= 0x30 && c4 <= 0x39 ) {
11609
+ if (c >= 0x90 && c <= 0xE3 ) {
11610
+ unsigned int w = ((((c - 0x90 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 )))* 10 + (c4 - 0x30 ) + 0x10000 ;
11611
+ * out ++ = (w > 0x10FFFF ) ? MBFL_BAD_INPUT : w ;
11612
+ } else {
11613
+ /* Unicode BMP */
11614
+ unsigned int w = (((c - 0x81 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 ))* 10 + (c4 - 0x30 );
11615
+ if (w == 0x98A4 ) {
11616
+ * out ++ = 0xE78D ;
11617
+ } else if (w == 0x98A6 ) {
11618
+ * out ++ = 0xE78E ;
11619
+ } else if (w == 0x98A5 ) {
11620
+ * out ++ = 0xE78F ;
11621
+ } else if (w >= 0x98A7 && w <= 0x98AD ) {
11622
+ * out ++ = w + (0xE790 - 0x98A7 );
11623
+ } else if (w == 0x1D21 ) {
11624
+ * out ++ = 0xE7C7 ;
11625
+ } else if (w == 0x4A71 ) {
11626
+ * out ++ = 0xE81E ;
11627
+ } else if (w == 0x4A72 ) {
11628
+ * out ++ = 0xE826 ;
11629
+ } else if (w >= 0x4A73 && w <= 0x4A74 ) {
11630
+ * out ++ = w + (0xE82B - 0x4A73 );
11631
+ } else if (w == 0x4A75 ) {
11632
+ * out ++ = 0xE832 ;
11633
+ } else if (w == 0x4A76 ) {
11634
+ * out ++ = 0xE843 ;
11635
+ } else if (w == 0x4A77 ) {
11636
+ * out ++ = 0xE854 ;
11637
+ } else if (w == 0x4A78 ) {
11638
+ * out ++ = 0xE864 ;
11639
+ } else if (w <= 0x99FB ) {
11640
+ * out ++ = w + mbfl_gb_uni_ofst [mbfl_bisec_srch (w , mbfl_gb2uni_tbl , mbfl_gb_uni_max )];
11641
+ } else {
11642
+ * out ++ = MBFL_BAD_INPUT ;
11643
+ }
11644
+ }
11645
+ } else {
11646
+ * out ++ = MBFL_BAD_INPUT ;
11647
+ }
11648
+ } else {
11649
+ * out ++ = MBFL_BAD_INPUT ;
11650
+ }
11651
+ } else if (((c >= 0xAA && c <= 0xAF ) || (c >= 0xF8 && c <= 0xFE )) && (c2 >= 0xA1 && c2 <= 0xFE )) {
11652
+ /* UDA part 1, 2: U+E000-U+E4C5 */
11653
+ * out ++ = 94 * (c >= 0xF8 ? c - 0xF2 : c - 0xAA ) + (c2 - 0xA1 ) + 0xE000 ;
11654
+ } else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F ) {
11655
+ /* UDA part 3: U+E4C6-U+E765 */
11656
+ * out ++ = 96 * (c - 0xA1 ) + c2 - (c2 >= 0x80 ? 0x41 : 0x40 ) + 0xE4C6 ;
11657
+ } else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF ) {
11658
+ unsigned int w = (c - 0x81 )* 192 + c2 - 0x40 ;
11659
+
11660
+ if (w >= 0x192B ) {
11661
+ if (w <= 0x1EBE ) {
11662
+ if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55 ) && w != 0x1E7F ) {
11663
+ * out ++ = gb18030_2022_pua_tbl1 [w - 0x192B ];
11664
+ continue ;
11665
+ }
11666
+ } else if (w >= 0x413A ) {
11667
+ if (w <= 0x413E ) {
11668
+ * out ++ = cp936_pua_tbl2 [w - 0x413A ];
11669
+ continue ;
11670
+ } else if (w >= 0x5DD0 && w <= 0x5E20 ) {
11671
+ unsigned int c = gb18030_2022_pua_tbl3 [w - 0x5DD0 ];
11672
+ if (c ) {
11673
+ * out ++ = c ;
11674
+ continue ;
11675
+ }
11676
+ }
11677
+ }
11678
+ }
11679
+
11680
+ if ((c >= 0x81 && c <= 0xA9 ) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1 ) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0 )) {
11681
+ ZEND_ASSERT (w < cp936_ucs_table_size );
11682
+ * out ++ = cp936_ucs_table [w ];
11683
+ } else {
11684
+ * out ++ = MBFL_BAD_INPUT ;
11685
+ }
11686
+ } else {
11687
+ * out ++ = MBFL_BAD_INPUT ;
11688
+ }
11689
+ }
11690
+ }
11691
+
11692
+ * in_len = e - p ;
11693
+ * in = p ;
11694
+ return out - buf ;
11695
+ }
11696
+
11697
+ static void mb_wchar_to_gb18030_2022 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end )
11698
+ {
11699
+ unsigned char * out , * limit ;
11700
+ MB_CONVERT_BUF_LOAD (buf , out , limit );
11701
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
11702
+
11703
+ while (len -- ) {
11704
+ uint32_t w = * in ++ ;
11705
+ unsigned int s = 0 ;
11706
+
11707
+ if (w == 0 ) {
11708
+ out = mb_convert_buf_add (out , 0 );
11709
+ continue ;
11710
+ } else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max ) {
11711
+ if (w == 0x1F9 ) {
11712
+ s = 0xA8BF ;
11713
+ } else {
11714
+ s = ucs_a1_cp936_table [w - ucs_a1_cp936_table_min ];
11715
+ }
11716
+ } else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max ) {
11717
+ if (w == 0x20AC ) { /* Euro sign */
11718
+ s = 0xA2E3 ;
11719
+ } else {
11720
+ s = ucs_a2_cp936_table [w - ucs_a2_cp936_table_min ];
11721
+ }
11722
+ } else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max ) {
11723
+ s = ucs_a3_cp936_table [w - ucs_a3_cp936_table_min ];
11724
+ } else if (w >= 0x9FB4 && w <= 0x9FBB ) {
11725
+ /* Newly mapped in GB18030-2022 */
11726
+ if (w == 0x9FB4 ) {
11727
+ s = 0xFE59 ;
11728
+ } else if (w == 0x9FB5 ) {
11729
+ s = 0xFE61 ;
11730
+ } else if (w == 0x9FB6 ) {
11731
+ s = 0xFE66 ;
11732
+ } else if (w == 0x9FB7 ) {
11733
+ s = 0xFE67 ;
11734
+ } else if (w == 0x9FB8 ) {
11735
+ s = 0xFE6D ;
11736
+ } else if (w == 0x9FB9 ) {
11737
+ s = 0xFE7E ;
11738
+ } else if (w == 0x9FBA ) {
11739
+ s = 0xFE90 ;
11740
+ } else {
11741
+ s = 0xFEA0 ;
11742
+ }
11743
+ } else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max ) {
11744
+ s = ucs_i_cp936_table [w - ucs_i_cp936_table_min ];
11745
+ } else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max ) {
11746
+ /* U+F900-U+FA2F CJK Compatibility Ideographs */
11747
+ if (w == 0xF92C ) {
11748
+ s = 0xFD9C ;
11749
+ } else if (w == 0xF979 ) {
11750
+ s = 0xFD9D ;
11751
+ } else if (w == 0xF995 ) {
11752
+ s = 0xFD9E ;
11753
+ } else if (w == 0xF9E7 ) {
11754
+ s = 0xFD9F ;
11755
+ } else if (w == 0xF9F1 ) {
11756
+ s = 0xFDA0 ;
11757
+ } else if (w >= 0xFA0C && w <= 0xFA29 ) {
11758
+ s = ucs_ci_s_cp936_table [w - 0xFA0C ];
11759
+ }
11760
+ } else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max ) {
11761
+ /* CJK Compatibility Forms */
11762
+ s = ucs_cf_cp936_table [w - ucs_cf_cp936_table_min ];
11763
+ } else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max ) {
11764
+ /* U+FE50-U+FE6F Small Form Variants */
11765
+ s = ucs_sfv_cp936_table [w - ucs_sfv_cp936_table_min ];
11766
+ } else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max ) {
11767
+ /* U+FF00-U+FFFF HW/FW Forms */
11768
+ if (w == 0xFF04 ) {
11769
+ s = 0xA1E7 ;
11770
+ } else if (w == 0xFF5E ) {
11771
+ s = 0xA1AB ;
11772
+ } else if (w >= 0xFF01 && w <= 0xFF5D ) {
11773
+ s = w - 0xFF01 + 0xA3A1 ;
11774
+ } else if (w >= 0xFFE0 && w <= 0xFFE5 ) {
11775
+ s = ucs_hff_s_cp936_table [w - 0xFFE0 ];
11776
+ }
11777
+ } else if (w >= 0xE000 && w <= 0xE864 ) {
11778
+ /* PUA */
11779
+ if (w < 0xE766 ) {
11780
+ if (w < 0xE4C6 ) {
11781
+ unsigned int c1 = w - 0xE000 ;
11782
+ s = (c1 % 94 ) + 0xA1 ;
11783
+ c1 /= 94 ;
11784
+ s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2 )) << 8 ;
11785
+ } else {
11786
+ unsigned int c1 = w - 0xE4C6 ;
11787
+ s = ((c1 / 96 ) + 0xA1 ) << 8 ;
11788
+ c1 %= 96 ;
11789
+ s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40 );
11790
+ }
11791
+ } else {
11792
+ /* U+E766-U+E864 */
11793
+ unsigned int k1 = 0 , k2 = mbfl_gb18030_2022_pua_tbl_max ;
11794
+ while (k1 < k2 ) {
11795
+ unsigned int k = (k1 + k2 ) >> 1 ;
11796
+ if (w < mbfl_gb18030_2022_pua_tbl [k ][0 ]) {
11797
+ k2 = k ;
11798
+ } else if (w > mbfl_gb18030_2022_pua_tbl [k ][1 ]) {
11799
+ k1 = k + 1 ;
11800
+ } else {
11801
+ s = w - mbfl_gb18030_2022_pua_tbl [k ][0 ] + mbfl_gb18030_2022_pua_tbl [k ][2 ];
11802
+ break ;
11803
+ }
11804
+ }
11805
+ }
11806
+ } else if (w >= 0xFE10 && w <= 0xFE19 ) {
11807
+ /* Newly mapped codepoints in GB18030-2022 */
11808
+ if (w == 0xFE11 ) {
11809
+ s = 0xA6DB ;
11810
+ } else if (w == 0xFE12 ) {
11811
+ s = 0xA6DA ;
11812
+ } else if (w <= 0xFE16 ) {
11813
+ s = w - (0xFE10 - 0xA6D9 );
11814
+ } else if (w <= 0xFE18 ) {
11815
+ s = w - (0xFE17 - 0xA6EC );
11816
+ } else {
11817
+ s = 0xA6F3 ;
11818
+ }
11819
+ } else if (w == 0x1E3F ) {
11820
+ /* Newly mapped codepoint in GB18030-2022 */
11821
+ s = 0xA8BC ;
11822
+ }
11823
+
11824
+ /* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
11825
+ * do a binary search in a table of differing codepoints to see if we have one */
11826
+ if (!s && w >= mbfl_gb18030_c_tbl_key [0 ] && w <= mbfl_gb18030_c_tbl_key [mbfl_gb18030_c_tbl_max - 1 ]) {
11827
+ int i = mbfl_bisec_srch2 (w , mbfl_gb18030_c_tbl_key , mbfl_gb18030_c_tbl_max );
11828
+ if (i >= 0 ) {
11829
+ s = mbfl_gb18030_c_tbl_val [i ];
11830
+ }
11831
+ }
11832
+
11833
+ /* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
11834
+ if (!s && w >= 0x80 && w <= 0xFFFF ) {
11835
+ /* BMP */
11836
+ int i = mbfl_bisec_srch (w , mbfl_uni2gb2022_tbl , mbfl_gb2022_uni_max );
11837
+ if (i >= 0 ) {
11838
+ unsigned int c1 = w - mbfl_gb2022_uni_ofst [i ];
11839
+ s = (c1 % 10 ) + 0x30 ;
11840
+ c1 /= 10 ;
11841
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
11842
+ c1 /= 126 ;
11843
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
11844
+ c1 /= 10 ;
11845
+ s |= (c1 + 0x81 ) << 24 ;
11846
+ }
11847
+ } else if (w >= 0x10000 && w <= 0x10FFFF ) {
11848
+ /* Code set 3: Unicode U+10000-U+10FFFF */
11849
+ unsigned int c1 = w - 0x10000 ;
11850
+ s = (c1 % 10 ) + 0x30 ;
11851
+ c1 /= 10 ;
11852
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
11853
+ c1 /= 126 ;
11854
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
11855
+ c1 /= 10 ;
11856
+ s |= (c1 + 0x90 ) << 24 ;
11857
+ }
11858
+
11859
+ if (!s ) {
11860
+ MB_CONVERT_ERROR (buf , out , limit , w , mb_wchar_to_gb18030 );
11861
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
11862
+ } else if (s < 0x80 ) {
11863
+ out = mb_convert_buf_add (out , s );
11864
+ } else if (s > 0xFFFFFF ) {
11865
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 4 );
11866
+ out = mb_convert_buf_add4 (out , (s >> 24 ) & 0xFF , (s >> 16 ) & 0xFF , (s >> 8 ) & 0xFF , s & 0xFF );
11867
+ } else {
11868
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 2 );
11869
+ out = mb_convert_buf_add2 (out , (s >> 8 ) & 0xFF , s & 0xFF );
11870
+ }
11871
+ }
11872
+
11873
+ MB_CONVERT_BUF_STORE (buf , out , limit );
11874
+ }
11875
+
11563
11876
/* Step through a GB18030 string one character at a time. Find the last position at or
11564
11877
* before `limit` which falls directly after the end of a (single or multi-byte) character */
11565
11878
static zend_always_inline unsigned char * step_through_gb18030_str (unsigned char * p , unsigned char * limit )
@@ -11673,6 +11986,21 @@ const mbfl_encoding mbfl_encoding_cp936 = {
11673
11986
NULL ,
11674
11987
};
11675
11988
11989
/* Encoding descriptor for GB18030-2022.
 * The initializer is positional and the declaration of mbfl_encoding is not
 * visible in this part of the file, so the per-slot notes below are assumptions
 * to be confirmed against that declaration. */
+ const mbfl_encoding mbfl_encoding_gb18030_2022 = {
11990
+ mbfl_no_encoding_gb18030_2022 , /* presumably the identifier constant for this encoding */
11991
+ "GB18030-2022" , /* presumably the encoding's name */
11992
+ "GB18030-2022" , /* presumably a second (e.g. MIME) name - TODO confirm */
11993
+ NULL ,
11994
+ NULL ,
11995
+ MBFL_ENCTYPE_GL_UNSAFE , /* flag value; meaning not shown here */
11996
+ NULL ,
11997
+ NULL ,
11998
+ mb_gb18030_2022_to_wchar , /* decoder: GB18030-2022 bytes to codepoints */
11999
+ mb_wchar_to_gb18030_2022 , /* encoder: codepoints to GB18030-2022 bytes */
12000
+ NULL ,
12001
+ mb_cut_gb18030 , /* reuses the routine named for plain GB18030 rather than a 2022-specific one */
12002
+ };
12003
+
11676
12004
/*
11677
12005
* BIG5/CP950
11678
12006
*/
0 commit comments