34
34
#include "unicode_table_gb18030.h"
35
35
36
36
static int mbfl_filt_conv_gb18030_wchar_flush (mbfl_convert_filter * filter );
37
+ static size_t mb_gb18030_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state );
38
+ static void mb_wchar_to_gb18030 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end );
37
39
38
40
/* NULL-terminated list of alternative spellings for the GB18030 encoding name.
 * NOTE(review): presumably referenced from the mbfl_encoding_gb18030 descriptor below -- confirm. */
static const char * mbfl_encoding_gb18030_aliases [] = {"gb-18030" , "gb-18030-2000" , NULL };
39
41
@@ -46,8 +48,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
46
48
MBFL_ENCTYPE_GL_UNSAFE ,
47
49
& vtbl_gb18030_wchar ,
48
50
& vtbl_wchar_gb18030 ,
49
- NULL ,
50
- NULL
51
+ mb_gb18030_to_wchar ,
52
+ mb_wchar_to_gb18030
51
53
};
52
54
53
55
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
@@ -382,3 +384,229 @@ int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
382
384
383
385
return 0 ;
384
386
}
387
+
388
+ static size_t mb_gb18030_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state )
389
+ {
390
+ unsigned char * p = * in , * e = p + * in_len ;
391
+ uint32_t * out = buf , * limit = buf + bufsize ;
392
+
393
+ while (p < e && out < limit ) {
394
+ unsigned char c = * p ++ ;
395
+
396
+ if (c < 0x80 ) {
397
+ * out ++ = c ;
398
+ } else if (c > 0x80 && c < 0xFF && p < e ) {
399
+ unsigned char c2 = * p ++ ;
400
+ unsigned int s = (c << 8 ) | c2 ;
401
+
402
+ if (((c >= 0x81 && c <= 0x84 ) || (c >= 0x90 && c <= 0xE3 )) && c2 >= 0x30 && c2 <= 0x39 ) {
403
+ if (p >= e ) {
404
+ * out ++ = MBFL_BAD_INPUT ;
405
+ break ;
406
+ }
407
+ unsigned char c3 = * p ++ ;
408
+
409
+ if (c3 >= 0x81 && c3 <= 0xFE && p < e ) {
410
+ unsigned char c4 = * p ++ ;
411
+
412
+ if (c4 >= 0x30 && c4 <= 0x39 ) {
413
+ if (c >= 0x90 && c <= 0xE3 ) {
414
+ unsigned int w = ((((c - 0x90 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 )))* 10 + (c4 - 0x30 ) + 0x10000 ;
415
+ * out ++ = (w > 0x10FFFF ) ? MBFL_BAD_INPUT : w ;
416
+ } else {
417
+ /* Unicode BMP */
418
+ unsigned int w = (((c - 0x81 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 ))* 10 + (c4 - 0x30 );
419
+ if (w <= 39419 ) {
420
+ * out ++ = w + mbfl_gb_uni_ofst [mbfl_bisec_srch (w , mbfl_gb2uni_tbl , mbfl_gb_uni_max )];
421
+ } else {
422
+ * out ++ = MBFL_BAD_INPUT ;
423
+ }
424
+ }
425
+ } else {
426
+ * out ++ = MBFL_BAD_INPUT ;
427
+ }
428
+ } else {
429
+ * out ++ = MBFL_BAD_INPUT ;
430
+ }
431
+ } else if (((c >= 0xAA && c <= 0xAF ) || (c >= 0xF8 && c <= 0xFE )) && (c2 >= 0xA1 && c2 <= 0xFE )) {
432
+ /* UDA part 1, 2: U+E000-U+E4C5 */
433
+ * out ++ = 94 * (c >= 0xF8 ? c - 0xF2 : c - 0xAA ) + (c2 - 0xA1 ) + 0xE000 ;
434
+ } else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F ) {
435
+ /* UDA part 3: U+E4C6-U+E765 */
436
+ * out ++ = 96 * (c - 0xA1 ) + c2 - (c2 >= 0x80 ? 0x41 : 0x40 ) + 0xE4C6 ;
437
+ } else {
438
+ if ((s >= 0xA2AB && s <= 0xA9FE ) || (s >= 0xD7FA && s <= 0xD7FE ) || (s >= 0xFE50 && s <= 0xFEA0 )) {
439
+ for (int i = 0 ; i < mbfl_gb18030_pua_tbl_max ; i ++ ) {
440
+ if (s >= mbfl_gb18030_pua_tbl [i ][2 ] && s <= mbfl_gb18030_pua_tbl [i ][2 ] + mbfl_gb18030_pua_tbl [i ][1 ] - mbfl_gb18030_pua_tbl [i ][0 ]) {
441
+ * out ++ = s - mbfl_gb18030_pua_tbl [i ][2 ] + mbfl_gb18030_pua_tbl [i ][0 ];
442
+ goto next_iteration ;
443
+ }
444
+ }
445
+ }
446
+
447
+ if ((c >= 0xA1 && c <= 0xA9 && c2 >= 0xA1 && c2 <= 0xFE ) ||
448
+ (c >= 0xB0 && c <= 0xf7 && c2 >= 0xa1 && c2 <= 0xfe ) ||
449
+ (c >= 0x81 && c <= 0xa0 && c2 >= 0x40 && c2 <= 0xfe && c2 != 0x7f ) ||
450
+ (c >= 0xAA && c <= 0xfe && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7f ) ||
451
+ (c >= 0xA8 && c <= 0xa9 && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7F )) {
452
+ unsigned int w = (c - 0x81 )* 192 + c2 - 0x40 ;
453
+ ZEND_ASSERT (w < cp936_ucs_table_size );
454
+ * out ++ = cp936_ucs_table [w ];
455
+ } else {
456
+ * out ++ = MBFL_BAD_INPUT ;
457
+ }
458
+ }
459
+ } else {
460
+ * out ++ = MBFL_BAD_INPUT ;
461
+ }
462
+ next_iteration : ;
463
+ }
464
+
465
+ * in_len = e - p ;
466
+ * in = p ;
467
+ return out - buf ;
468
+ }
469
+
470
+ static void mb_wchar_to_gb18030 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end )
471
+ {
472
+ unsigned char * out , * limit ;
473
+ MB_CONVERT_BUF_LOAD (buf , out , limit );
474
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
475
+
476
+ while (len -- ) {
477
+ uint32_t w = * in ++ ;
478
+ unsigned int s = 0 ;
479
+
480
+ if (w == 0 ) {
481
+ out = mb_convert_buf_add (out , 0 );
482
+ continue ;
483
+ } else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max ) {
484
+ if (w == 0x1F9 ) {
485
+ s = 0xA8Bf ;
486
+ } else {
487
+ s = ucs_a1_cp936_table [w - ucs_a1_cp936_table_min ];
488
+ }
489
+ } else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max ) {
490
+ if (w == 0x20AC ) { /* Euro sign */
491
+ s = 0xA2E3 ;
492
+ } else {
493
+ s = ucs_a2_cp936_table [w - ucs_a2_cp936_table_min ];
494
+ }
495
+ } else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max ) {
496
+ s = ucs_a3_cp936_table [w - ucs_a3_cp936_table_min ];
497
+ } else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max ) {
498
+ s = ucs_i_cp936_table [w - ucs_i_cp936_table_min ];
499
+ } else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max ) {
500
+ /* U+F900-U+FA2F CJK Compatibility Ideographs */
501
+ if (w == 0xF92C ) {
502
+ s = 0xFD9C ;
503
+ } else if (w == 0xF979 ) {
504
+ s = 0xFD9D ;
505
+ } else if (w == 0xF995 ) {
506
+ s = 0xFD9E ;
507
+ } else if (w == 0xF9E7 ) {
508
+ s = 0xFD9F ;
509
+ } else if (w == 0xF9F1 ) {
510
+ s = 0xFDA0 ;
511
+ } else if (w >= 0xFA0C && w <= 0xFA29 ) {
512
+ s = ucs_ci_s_cp936_table [w - 0xFA0C ];
513
+ }
514
+ } else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max ) {
515
+ /* CJK Compatibility Forms */
516
+ s = ucs_cf_cp936_table [w - ucs_cf_cp936_table_min ];
517
+ } else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max ) {
518
+ /* U+FE50-U+FE6F Small Form Variants */
519
+ s = ucs_sfv_cp936_table [w - ucs_sfv_cp936_table_min ];
520
+ } else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max ) {
521
+ /* U+FF00-U+FFFF HW/FW Forms */
522
+ if (w == 0xFF04 ) {
523
+ s = 0xA1E7 ;
524
+ } else if (w == 0xFF5E ) {
525
+ s = 0xA1AB ;
526
+ } else if (w >= 0xFF01 && w <= 0xFF5D ) {
527
+ s = w - 0xFF01 + 0xA3A1 ;
528
+ } else if (w >= 0xFFE0 && w <= 0xFFE5 ) {
529
+ s = ucs_hff_s_cp936_table [w - 0xFFE0 ];
530
+ }
531
+ } else if (w >= 0xE000 && w <= 0xE864 ) {
532
+ /* PUA */
533
+ if (w < 0xE766 ) {
534
+ if (w < 0xE4C6 ) {
535
+ unsigned int c1 = w - 0xE000 ;
536
+ s = (c1 % 94 ) + 0xA1 ;
537
+ c1 /= 94 ;
538
+ s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2 )) << 8 ;
539
+ } else {
540
+ unsigned int c1 = w - 0xE4C6 ;
541
+ s = ((c1 / 96 ) + 0xA1 ) << 8 ;
542
+ c1 %= 96 ;
543
+ s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40 );
544
+ }
545
+ } else {
546
+ /* U+E766-U+E864 */
547
+ unsigned int k1 = 0 , k2 = mbfl_gb18030_pua_tbl_max ;
548
+ while (k1 < k2 ) {
549
+ unsigned int k = (k1 + k2 ) >> 1 ;
550
+ if (w < mbfl_gb18030_pua_tbl [k ][0 ]) {
551
+ k2 = k ;
552
+ } else if (w > mbfl_gb18030_pua_tbl [k ][1 ]) {
553
+ k1 = k + 1 ;
554
+ } else {
555
+ s = w - mbfl_gb18030_pua_tbl [k ][0 ] + mbfl_gb18030_pua_tbl [k ][2 ];
556
+ break ;
557
+ }
558
+ }
559
+ }
560
+ }
561
+
562
+ /* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
563
+ * do a binary search in a table of differing codepoints to see if we have one */
564
+ if (!s && w >= mbfl_gb18030_c_tbl_key [0 ] && w <= mbfl_gb18030_c_tbl_key [mbfl_gb18030_c_tbl_max - 1 ]) {
565
+ int i = mbfl_bisec_srch2 (w , mbfl_gb18030_c_tbl_key , mbfl_gb18030_c_tbl_max );
566
+ if (i >= 0 ) {
567
+ s = mbfl_gb18030_c_tbl_val [i ];
568
+ }
569
+ }
570
+
571
+ /* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
572
+ if (!s && w >= 0x80 && w <= 0xFFFF ) {
573
+ /* BMP */
574
+ int i = mbfl_bisec_srch (w , mbfl_uni2gb_tbl , mbfl_gb_uni_max );
575
+ if (i >= 0 ) {
576
+ unsigned int c1 = w - mbfl_gb_uni_ofst [i ];
577
+ s = (c1 % 10 ) + 0x30 ;
578
+ c1 /= 10 ;
579
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
580
+ c1 /= 126 ;
581
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
582
+ c1 /= 10 ;
583
+ s |= (c1 + 0x81 ) << 24 ;
584
+ }
585
+ } else if (w >= 0x10000 && w <= 0x10FFFF ) {
586
+ /* Code set 3: Unicode U+10000-U+10FFFF */
587
+ unsigned int c1 = w - 0x10000 ;
588
+ s = (c1 % 10 ) + 0x30 ;
589
+ c1 /= 10 ;
590
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
591
+ c1 /= 126 ;
592
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
593
+ c1 /= 10 ;
594
+ s |= (c1 + 0x90 ) << 24 ;
595
+ }
596
+
597
+ if (!s ) {
598
+ MB_CONVERT_ERROR (buf , out , limit , w , mb_wchar_to_gb18030 );
599
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
600
+ } else if (s < 0x80 ) {
601
+ out = mb_convert_buf_add (out , s );
602
+ } else if (s > 0xFFFFFF ) {
603
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 4 );
604
+ out = mb_convert_buf_add4 (out , (s >> 24 ) & 0xFF , (s >> 16 ) & 0xFF , (s >> 8 ) & 0xFF , s & 0xFF );
605
+ } else {
606
+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 2 );
607
+ out = mb_convert_buf_add2 (out , (s >> 8 ) & 0xFF , s & 0xFF );
608
+ }
609
+ }
610
+
611
+ MB_CONVERT_BUF_STORE (buf , out , limit );
612
+ }
0 commit comments