Skip to content

Commit cb4626c

Browse files
committed
Implement fast text conversion interface for GB18030
1 parent 3e8088d commit cb4626c

File tree

1 file changed

+230
-2
lines changed

1 file changed

+230
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

Lines changed: 230 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
#include "unicode_table_gb18030.h"
3535

3636
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter);
37+
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
38+
static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3739

3840
/* Alias names for the GB18030 encoding; the list is terminated by NULL */
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
3941

@@ -46,8 +48,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
4648
MBFL_ENCTYPE_GL_UNSAFE,
4749
&vtbl_gb18030_wchar,
4850
&vtbl_wchar_gb18030,
49-
NULL,
50-
NULL
51+
mb_gb18030_to_wchar,
52+
mb_wchar_to_gb18030
5153
};
5254

5355
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
@@ -382,3 +384,229 @@ int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
382384

383385
return 0;
384386
}
387+
388+
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
389+
{
390+
unsigned char *p = *in, *e = p + *in_len;
391+
uint32_t *out = buf, *limit = buf + bufsize;
392+
393+
while (p < e && out < limit) {
394+
unsigned char c = *p++;
395+
396+
if (c < 0x80) {
397+
*out++ = c;
398+
} else if (c > 0x80 && c < 0xFF && p < e) {
399+
unsigned char c2 = *p++;
400+
unsigned int s = (c << 8) | c2;
401+
402+
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
403+
if (p >= e) {
404+
*out++ = MBFL_BAD_INPUT;
405+
break;
406+
}
407+
unsigned char c3 = *p++;
408+
409+
if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
410+
unsigned char c4 = *p++;
411+
412+
if (c4 >= 0x30 && c4 <= 0x39) {
413+
if (c >= 0x90 && c <= 0xE3) {
414+
unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
415+
*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
416+
} else {
417+
/* Unicode BMP */
418+
unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
419+
if (w <= 39419) {
420+
*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
421+
} else {
422+
*out++ = MBFL_BAD_INPUT;
423+
}
424+
}
425+
} else {
426+
*out++ = MBFL_BAD_INPUT;
427+
}
428+
} else {
429+
*out++ = MBFL_BAD_INPUT;
430+
}
431+
} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
432+
/* UDA part 1, 2: U+E000-U+E4C5 */
433+
*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
434+
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
435+
/* UDA part 3: U+E4C6-U+E765 */
436+
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
437+
} else {
438+
if ((s >= 0xA2AB && s <= 0xA9FE) || (s >= 0xD7FA && s <= 0xD7FE) || (s >= 0xFE50 && s <= 0xFEA0)) {
439+
for (int i = 0; i < mbfl_gb18030_pua_tbl_max; i++) {
440+
if (s >= mbfl_gb18030_pua_tbl[i][2] && s <= mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][1] - mbfl_gb18030_pua_tbl[i][0]) {
441+
*out++ = s - mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][0];
442+
goto next_iteration;
443+
}
444+
}
445+
}
446+
447+
if ((c >= 0xA1 && c <= 0xA9 && c2 >= 0xA1 && c2 <= 0xFE) ||
448+
(c >= 0xB0 && c <= 0xf7 && c2 >= 0xa1 && c2 <= 0xfe) ||
449+
(c >= 0x81 && c <= 0xa0 && c2 >= 0x40 && c2 <= 0xfe && c2 != 0x7f) ||
450+
(c >= 0xAA && c <= 0xfe && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7f) ||
451+
(c >= 0xA8 && c <= 0xa9 && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7F)) {
452+
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
453+
ZEND_ASSERT(w < cp936_ucs_table_size);
454+
*out++ = cp936_ucs_table[w];
455+
} else {
456+
*out++ = MBFL_BAD_INPUT;
457+
}
458+
}
459+
} else {
460+
*out++ = MBFL_BAD_INPUT;
461+
}
462+
next_iteration: ;
463+
}
464+
465+
*in_len = e - p;
466+
*in = p;
467+
return out - buf;
468+
}
469+
470+
static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
471+
{
472+
unsigned char *out, *limit;
473+
MB_CONVERT_BUF_LOAD(buf, out, limit);
474+
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
475+
476+
while (len--) {
477+
uint32_t w = *in++;
478+
unsigned int s = 0;
479+
480+
if (w == 0) {
481+
out = mb_convert_buf_add(out, 0);
482+
continue;
483+
} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
484+
if (w == 0x1F9) {
485+
s = 0xA8Bf;
486+
} else {
487+
s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
488+
}
489+
} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
490+
if (w == 0x20AC) { /* Euro sign */
491+
s = 0xA2E3;
492+
} else {
493+
s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
494+
}
495+
} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
496+
s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
497+
} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
498+
s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
499+
} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
500+
/* U+F900-U+FA2F CJK Compatibility Ideographs */
501+
if (w == 0xF92C) {
502+
s = 0xFD9C;
503+
} else if (w == 0xF979) {
504+
s = 0xFD9D;
505+
} else if (w == 0xF995) {
506+
s = 0xFD9E;
507+
} else if (w == 0xF9E7) {
508+
s = 0xFD9F;
509+
} else if (w == 0xF9F1) {
510+
s = 0xFDA0;
511+
} else if (w >= 0xFA0C && w <= 0xFA29) {
512+
s = ucs_ci_s_cp936_table[w - 0xFA0C];
513+
}
514+
} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
515+
/* CJK Compatibility Forms */
516+
s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
517+
} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
518+
/* U+FE50-U+FE6F Small Form Variants */
519+
s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
520+
} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
521+
/* U+FF00-U+FFFF HW/FW Forms */
522+
if (w == 0xFF04) {
523+
s = 0xA1E7;
524+
} else if (w == 0xFF5E) {
525+
s = 0xA1AB;
526+
} else if (w >= 0xFF01 && w <= 0xFF5D) {
527+
s = w - 0xFF01 + 0xA3A1;
528+
} else if (w >= 0xFFE0 && w <= 0xFFE5) {
529+
s = ucs_hff_s_cp936_table[w - 0xFFE0];
530+
}
531+
} else if (w >= 0xE000 && w <= 0xE864) {
532+
/* PUA */
533+
if (w < 0xE766) {
534+
if (w < 0xE4C6) {
535+
unsigned int c1 = w - 0xE000;
536+
s = (c1 % 94) + 0xA1;
537+
c1 /= 94;
538+
s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
539+
} else {
540+
unsigned int c1 = w - 0xE4C6;
541+
s = ((c1 / 96) + 0xA1) << 8;
542+
c1 %= 96;
543+
s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
544+
}
545+
} else {
546+
/* U+E766-U+E864 */
547+
unsigned int k1 = 0, k2 = mbfl_gb18030_pua_tbl_max;
548+
while (k1 < k2) {
549+
unsigned int k = (k1 + k2) >> 1;
550+
if (w < mbfl_gb18030_pua_tbl[k][0]) {
551+
k2 = k;
552+
} else if (w > mbfl_gb18030_pua_tbl[k][1]) {
553+
k1 = k + 1;
554+
} else {
555+
s = w - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2];
556+
break;
557+
}
558+
}
559+
}
560+
}
561+
562+
/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
563+
* do a binary search in a table of differing codepoints to see if we have one */
564+
if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
565+
int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
566+
if (i >= 0) {
567+
s = mbfl_gb18030_c_tbl_val[i];
568+
}
569+
}
570+
571+
/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
572+
if (!s && w >= 0x80 && w <= 0xFFFF) {
573+
/* BMP */
574+
int i = mbfl_bisec_srch(w, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
575+
if (i >= 0) {
576+
unsigned int c1 = w - mbfl_gb_uni_ofst[i];
577+
s = (c1 % 10) + 0x30;
578+
c1 /= 10;
579+
s |= ((c1 % 126) + 0x81) << 8;
580+
c1 /= 126;
581+
s |= ((c1 % 10) + 0x30) << 16;
582+
c1 /= 10;
583+
s |= (c1 + 0x81) << 24;
584+
}
585+
} else if (w >= 0x10000 && w <= 0x10FFFF) {
586+
/* Code set 3: Unicode U+10000-U+10FFFF */
587+
unsigned int c1 = w - 0x10000;
588+
s = (c1 % 10) + 0x30;
589+
c1 /= 10;
590+
s |= ((c1 % 126) + 0x81) << 8;
591+
c1 /= 126;
592+
s |= ((c1 % 10) + 0x30) << 16;
593+
c1 /= 10;
594+
s |= (c1 + 0x90) << 24;
595+
}
596+
597+
if (!s) {
598+
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030);
599+
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
600+
} else if (s < 0x80) {
601+
out = mb_convert_buf_add(out, s);
602+
} else if (s > 0xFFFFFF) {
603+
MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
604+
out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
605+
} else {
606+
MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
607+
out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
608+
}
609+
}
610+
611+
MB_CONVERT_BUF_STORE(buf, out, limit);
612+
}

0 commit comments

Comments
 (0)