Skip to content

Commit ffbddc4

Browse files
committed
Optimize conversion of GB18030 to Unicode
As with CP936, iterating over the PUA table and looking for matches in it was a significant bottleneck for GB18030 decoding (though not as severe a bottleneck as for CP936, since more is involved in GB18030 decoding than in CP936 decoding).

Here are some benchmark results after optimizing out that bottleneck:

- GB18030, medium - to UTF-16BE - faster by 60.71% (0.0007 vs 0.0017)
- GB18030, medium - to UTF-8 - faster by 59.88% (0.0007 vs 0.0017)
- GB18030, long - to UTF-8 - faster by 44.91% (0.0669 vs 0.1214)
- GB18030, long - to UTF-16BE - faster by 43.05% (0.0672 vs 0.1181)
- GB18030, short - to UTF-8 - faster by 27.22% (0.0003 vs 0.0004)
- GB18030, short - to UTF-16BE - faster by 26.98% (0.0003 vs 0.0004)

(The 'short' test strings had 0-5 codepoints each, 'medium' ~100 codepoints, and 'long' ~10,000 codepoints. For each benchmark, the test harness cycled through all the test strings 40,000 times.)
1 parent 703725e commit ffbddc4

File tree

1 file changed

+45
-17
lines changed

1 file changed

+45
-17
lines changed

ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,22 @@ int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
388388
return 0;
389389
}
390390

391+
/* Direct-lookup table for the two-byte GB18030 codes 0xFE50..0xFEA0.
 *
 * The decoder indexes it with (w - 0x5DD0), where
 * w = (lead_byte - 0x81)*192 + trail_byte - 0x40, so index 0 is 0xFE50 and
 * index 0x50 (the final entry) is 0xFEA0; 81 entries in total.
 *
 * A non-zero entry is the codepoint emitted for that byte pair. A zero entry
 * means this table has no mapping for it, and the decoder carries on to its
 * regular cp936_ucs_table lookup instead. */
static const unsigned short gb18030_pua_tbl3[] = {
392+
/* 0xFE50 (index 0) */
393+
0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
394+
0x0000,0xE81E,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
395+
0x0000,0xE826,0x0000,0x0000,0x0000,0x0000,0xE82B,0xE82C,
396+
0x0000,0x0000,0x0000,0x0000,0xE831,0xE832,0x0000,0x0000,
397+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
398+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE843,0x0000,
399+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
400+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
401+
0xE854,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
402+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
403+
/* 0xFEA0 (index 0x50, last entry) */
404+
0xE864
405+
};
406+
391407
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
392408
{
393409
unsigned char *p = *in, *e = p + *in_len;
@@ -398,9 +414,14 @@ static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
398414

399415
if (c < 0x80) {
400416
*out++ = c;
401-
} else if (c > 0x80 && c < 0xFF && p < e) {
417+
} else if (c == 0x80 || c == 0xFF) {
418+
*out++ = MBFL_BAD_INPUT;
419+
} else {
420+
if (p == e) {
421+
*out++ = MBFL_BAD_INPUT;
422+
break;
423+
}
402424
unsigned char c2 = *p++;
403-
unsigned int s = (c << 8) | c2;
404425

405426
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
406427
if (p >= e) {
@@ -437,32 +458,39 @@ static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
437458
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
438459
/* UDA part 3: U+E4C6-U+E765 */
439460
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
440-
} else {
441-
if ((s >= 0xA2AB && s <= 0xA9FE) || (s >= 0xD7FA && s <= 0xD7FE) || (s >= 0xFE50 && s <= 0xFEA0)) {
442-
for (int i = 0; i < mbfl_gb18030_pua_tbl_max; i++) {
443-
if (s >= mbfl_gb18030_pua_tbl[i][2] && s <= mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][1] - mbfl_gb18030_pua_tbl[i][0]) {
444-
*out++ = s - mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][0];
445-
goto next_iteration;
461+
} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
462+
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
463+
464+
if (w >= 0x192B) {
465+
if (w <= 0x1EBE) {
466+
if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
467+
*out++ = cp936_pua_tbl1[w - 0x192B];
468+
continue;
469+
}
470+
} else if (w >= 0x413A) {
471+
if (w <= 0x413E) {
472+
*out++ = cp936_pua_tbl2[w - 0x413A];
473+
continue;
474+
} else if (w >= 0x5DD0 && w <= 0x5E20) {
475+
unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
476+
if (c) {
477+
*out++ = c;
478+
continue;
479+
}
446480
}
447481
}
448482
}
449483

450-
if ((c >= 0xA1 && c <= 0xA9 && c2 >= 0xA1 && c2 <= 0xFE) ||
451-
(c >= 0xB0 && c <= 0xf7 && c2 >= 0xa1 && c2 <= 0xfe) ||
452-
(c >= 0x81 && c <= 0xa0 && c2 >= 0x40 && c2 <= 0xfe && c2 != 0x7f) ||
453-
(c >= 0xAA && c <= 0xfe && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7f) ||
454-
(c >= 0xA8 && c <= 0xa9 && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7F)) {
455-
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
484+
if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
456485
ZEND_ASSERT(w < cp936_ucs_table_size);
457486
*out++ = cp936_ucs_table[w];
458487
} else {
459488
*out++ = MBFL_BAD_INPUT;
460489
}
490+
} else {
491+
*out++ = MBFL_BAD_INPUT;
461492
}
462-
} else {
463-
*out++ = MBFL_BAD_INPUT;
464493
}
465-
next_iteration: ;
466494
}
467495

468496
*in_len = e - p;

0 commit comments

Comments
 (0)