Skip to content

Commit e916273

Browse files
committed
Add mbstring support for GB18030-2022 text encoding
The previous version of the GB18030 standard was published in 2005. This commit adds support for the updated (2022) version of this text encoding. The existing GB18030 implementation has been left unchanged for backwards compatibility; users who want to use the new standard must explicitly indicate that the desired text encoding is 'GB18030-2022'. The document which defines GB18030-2022, published by the government of the People's Republic of China, defines three levels of standards compliance. This implementation is intended to achieve Implementation Level 3, which is the highest level of compliance. Experts in the GB18030 standard are requested to assess this implementation and report any deviation from the standard.
1 parent 693e1be commit e916273

File tree

8 files changed

+64518
-1
lines changed

8 files changed

+64518
-1
lines changed

NEWS

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ Intl:
3535

3636
MBString:
3737
. Added mb_trim, mb_ltrim and mb_rtrim. (Yuya Hamada)
38+
. Added support for the newest version of the GB18030 text encoding,
39+
GB18030-2022. The existing implementation of GB18030 is unchanged;
40+
users who wish to use the updated implementation must explicitly
41+
specify 'GB18030-2022' as the desired text encoding. The standards
42+
document published by the People's Republic of China authorities
43+
defines 3 levels of standards compliance; MBString's implementation
44+
achieves Implementation Level 3, which is the highest level. Experts
45+
on the GB18030 standard are requested to assess MBString's handling
46+
of GB18030-2022 text and report any bugs via GitHub. (Alex Dowad)
3847

3948
Opcache:
4049
. Added large shared segments support for FreeBSD. (David Carlier)

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 329 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11088,7 +11088,7 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b
1108811088
continue;
1108911089
} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
1109011090
if (w == 0x1F9) {
11091-
s = 0xA8Bf;
11091+
s = 0xA8BF;
1109211092
} else {
1109311093
s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
1109411094
}
@@ -11560,6 +11560,319 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
1156011560
MB_CONVERT_BUF_STORE(buf, out, limit);
1156111561
}
1156211562

11563+
/* Override table consulted by mb_gb18030_2022_to_wchar for the two-byte codes
 * 0xFE50..0xFEA0 (lead byte 0xFE, trail byte 0x50..0xA0). The decoder indexes it
 * with (w - 0x5DD0), where w = (lead - 0x81)*192 + (trail - 0x40).
 * A non-zero entry is the codepoint emitted directly for that code; a zero entry
 * means "no override here", and the decoder falls through to its general
 * two-byte table lookup. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
11564+
/* Entries for 0xFE50..0xFE9F, eight codes per row, starting at 0xFE50 */
11565+
0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
11566+
0x0000,0x9FB4,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
11567+
0x0000,0x9FB5,0x0000,0x0000,0x0000,0x0000,0x9FB6,0x9FB7,
11568+
0x0000,0x0000,0x0000,0x0000,0xE831,0x9FB8,0x0000,0x0000,
11569+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
11570+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x9FB9,0x0000,
11571+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
11572+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
11573+
0x9FBA,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
11574+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
11575+
/* Final entry: 0xFEA0 (index 0x50, the last index the decoder uses) */
11576+
0x9FBB
11577+
};
11578+
11579+
static size_t mb_gb18030_2022_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
11580+
{
11581+
unsigned char *p = *in, *e = p + *in_len;
11582+
uint32_t *out = buf, *limit = buf + bufsize;
11583+
11584+
while (p < e && out < limit) {
11585+
unsigned char c = *p++;
11586+
11587+
if (c < 0x80) {
11588+
*out++ = c;
11589+
} else if (c == 0x80 || c == 0xFF) {
11590+
*out++ = MBFL_BAD_INPUT;
11591+
} else {
11592+
if (p == e) {
11593+
*out++ = MBFL_BAD_INPUT;
11594+
break;
11595+
}
11596+
unsigned char c2 = *p++;
11597+
11598+
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
11599+
if (p >= e) {
11600+
*out++ = MBFL_BAD_INPUT;
11601+
break;
11602+
}
11603+
unsigned char c3 = *p++;
11604+
11605+
if (c3 >= 0x81 && c3 <= 0xFE && p < e) {
11606+
unsigned char c4 = *p++;
11607+
11608+
if (c4 >= 0x30 && c4 <= 0x39) {
11609+
if (c >= 0x90 && c <= 0xE3) {
11610+
unsigned int w = ((((c - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c4 - 0x30) + 0x10000;
11611+
*out++ = (w > 0x10FFFF) ? MBFL_BAD_INPUT : w;
11612+
} else {
11613+
/* Unicode BMP */
11614+
unsigned int w = (((c - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c4 - 0x30);
11615+
if (w == 0x98A4) {
11616+
*out++ = 0xE78D;
11617+
} else if (w == 0x98A6) {
11618+
*out++ = 0xE78E;
11619+
} else if (w == 0x98A5) {
11620+
*out++ = 0xE78F;
11621+
} else if (w >= 0x98A7 && w <= 0x98AD) {
11622+
*out++ = w + (0xE790 - 0x98A7);
11623+
} else if (w == 0x1D21) {
11624+
*out++ = 0xE7C7;
11625+
} else if (w == 0x4A71) {
11626+
*out++ = 0xE81E;
11627+
} else if (w == 0x4A72) {
11628+
*out++ = 0xE826;
11629+
} else if (w >= 0x4A73 && w <= 0x4A74) {
11630+
*out++ = w + (0xE82B - 0x4A73);
11631+
} else if (w == 0x4A75) {
11632+
*out++ = 0xE832;
11633+
} else if (w == 0x4A76) {
11634+
*out++ = 0xE843;
11635+
} else if (w == 0x4A77) {
11636+
*out++ = 0xE854;
11637+
} else if (w == 0x4A78) {
11638+
*out++ = 0xE864;
11639+
} else if (w <= 0x99FB) {
11640+
*out++ = w + mbfl_gb_uni_ofst[mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max)];
11641+
} else {
11642+
*out++ = MBFL_BAD_INPUT;
11643+
}
11644+
}
11645+
} else {
11646+
*out++ = MBFL_BAD_INPUT;
11647+
}
11648+
} else {
11649+
*out++ = MBFL_BAD_INPUT;
11650+
}
11651+
} else if (((c >= 0xAA && c <= 0xAF) || (c >= 0xF8 && c <= 0xFE)) && (c2 >= 0xA1 && c2 <= 0xFE)) {
11652+
/* UDA part 1, 2: U+E000-U+E4C5 */
11653+
*out++ = 94*(c >= 0xF8 ? c - 0xF2 : c - 0xAA) + (c2 - 0xA1) + 0xE000;
11654+
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
11655+
/* UDA part 3: U+E4C6-U+E765 */
11656+
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
11657+
} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
11658+
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
11659+
11660+
if (w >= 0x192B) {
11661+
if (w <= 0x1EBE) {
11662+
if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
11663+
*out++ = gb18030_2022_pua_tbl1[w - 0x192B];
11664+
continue;
11665+
}
11666+
} else if (w >= 0x413A) {
11667+
if (w <= 0x413E) {
11668+
*out++ = cp936_pua_tbl2[w - 0x413A];
11669+
continue;
11670+
} else if (w >= 0x5DD0 && w <= 0x5E20) {
11671+
unsigned int c = gb18030_2022_pua_tbl3[w - 0x5DD0];
11672+
if (c) {
11673+
*out++ = c;
11674+
continue;
11675+
}
11676+
}
11677+
}
11678+
}
11679+
11680+
if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
11681+
ZEND_ASSERT(w < cp936_ucs_table_size);
11682+
*out++ = cp936_ucs_table[w];
11683+
} else {
11684+
*out++ = MBFL_BAD_INPUT;
11685+
}
11686+
} else {
11687+
*out++ = MBFL_BAD_INPUT;
11688+
}
11689+
}
11690+
}
11691+
11692+
*in_len = e - p;
11693+
*in = p;
11694+
return out - buf;
11695+
}
11696+
11697+
/* Encode Unicode codepoints as GB18030-2022 bytes.
 *
 * in  - array of codepoints to encode
 * len - number of codepoints in `in`
 * buf - conversion buffer that receives the encoded bytes
 * end - not referenced by this encoder
 *
 * For each codepoint the encoded form is accumulated in `s`:
 * 0 means "no mapping found yet", a value below 0x80 is written as one byte,
 * a value above 0xFFFFFF as four bytes, and anything else as two bytes.
 * Codepoints with no mapping are reported through MB_CONVERT_ERROR. */
static void mb_wchar_to_gb18030_2022(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		if (w == 0) {
			/* s == 0 is the "not found" marker, so U+0000 must be emitted directly */
			out = mb_convert_buf_add(out, 0);
			continue;
		} else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max) {
			if (w == 0x1F9) {
				s = 0xA8BF;
			} else {
				s = ucs_a1_cp936_table[w - ucs_a1_cp936_table_min];
			}
		} else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max) {
			if (w == 0x20AC) { /* Euro sign */
				s = 0xA2E3;
			} else {
				s = ucs_a2_cp936_table[w - ucs_a2_cp936_table_min];
			}
		} else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max) {
			s = ucs_a3_cp936_table[w - ucs_a3_cp936_table_min];
		} else if (w >= 0x9FB4 && w <= 0x9FBB) {
			/* Newly mapped in GB18030-2022 */
			if (w == 0x9FB4) {
				s = 0xFE59;
			} else if (w == 0x9FB5) {
				s = 0xFE61;
			} else if (w == 0x9FB6) {
				s = 0xFE66;
			} else if (w == 0x9FB7) {
				s = 0xFE67;
			} else if (w == 0x9FB8) {
				s = 0xFE6D;
			} else if (w == 0x9FB9) {
				s = 0xFE7E;
			} else if (w == 0x9FBA) {
				s = 0xFE90;
			} else {
				s = 0xFEA0;
			}
		} else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max) {
			s = ucs_i_cp936_table[w - ucs_i_cp936_table_min];
		} else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max) {
			/* U+F900-U+FA2F CJK Compatibility Ideographs */
			if (w == 0xF92C) {
				s = 0xFD9C;
			} else if (w == 0xF979) {
				s = 0xFD9D;
			} else if (w == 0xF995) {
				s = 0xFD9E;
			} else if (w == 0xF9E7) {
				s = 0xFD9F;
			} else if (w == 0xF9F1) {
				s = 0xFDA0;
			} else if (w >= 0xFA0C && w <= 0xFA29) {
				s = ucs_ci_s_cp936_table[w - 0xFA0C];
			}
		} else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max) {
			/* CJK Compatibility Forms */
			s = ucs_cf_cp936_table[w - ucs_cf_cp936_table_min];
		} else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max) {
			/* U+FE50-U+FE6F Small Form Variants */
			s = ucs_sfv_cp936_table[w - ucs_sfv_cp936_table_min];
		} else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max) {
			/* U+FF00-U+FFFF HW/FW Forms */
			if (w == 0xFF04) {
				s = 0xA1E7;
			} else if (w == 0xFF5E) {
				s = 0xA1AB;
			} else if (w >= 0xFF01 && w <= 0xFF5D) {
				s = w - 0xFF01 + 0xA3A1;
			} else if (w >= 0xFFE0 && w <= 0xFFE5) {
				s = ucs_hff_s_cp936_table[w - 0xFFE0];
			}
		} else if (w >= 0xE000 && w <= 0xE864) {
			/* PUA */
			if (w < 0xE766) {
				if (w < 0xE4C6) {
					/* U+E000-U+E4C5: 94 trail bytes (from 0xA1) per lead byte; the first
					 * six lead bytes start at 0xAA, later ones continue from 0xF8 */
					unsigned int c1 = w - 0xE000;
					s = (c1 % 94) + 0xA1;
					c1 /= 94;
					s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2)) << 8;
				} else {
					/* U+E4C6-U+E765: 96 trail bytes per lead byte starting at 0x40,
					 * skipping trail byte 0x7F */
					unsigned int c1 = w - 0xE4C6;
					s = ((c1 / 96) + 0xA1) << 8;
					c1 %= 96;
					s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40);
				}
			} else {
				/* U+E766-U+E864 */
				/* Binary search over range rows; from the usage below each row reads as
				 * { first codepoint, last codepoint, code for the first codepoint } */
				unsigned int k1 = 0, k2 = mbfl_gb18030_2022_pua_tbl_max;
				while (k1 < k2) {
					unsigned int k = (k1 + k2) >> 1;
					if (w < mbfl_gb18030_2022_pua_tbl[k][0]) {
						k2 = k;
					} else if (w > mbfl_gb18030_2022_pua_tbl[k][1]) {
						k1 = k + 1;
					} else {
						s = w - mbfl_gb18030_2022_pua_tbl[k][0] + mbfl_gb18030_2022_pua_tbl[k][2];
						break;
					}
				}
			}
		} else if (w >= 0xFE10 && w <= 0xFE19) {
			/* Newly mapped codepoints in GB18030-2022 */
			if (w == 0xFE11) {
				s = 0xA6DB;
			} else if (w == 0xFE12) {
				s = 0xA6DA;
			} else if (w <= 0xFE16) {
				s = w - (0xFE10 - 0xA6D9);
			} else if (w <= 0xFE18) {
				s = w - (0xFE17 - 0xA6EC);
			} else {
				s = 0xA6F3;
			}
		} else if (w == 0x1E3F) {
			/* Newly mapped codepoint in GB18030-2022 */
			s = 0xA8BC;
		}

		/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
		 * do a binary search in a table of differing codepoints to see if we have one */
		if (!s && w >= mbfl_gb18030_c_tbl_key[0] && w <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
			int i = mbfl_bisec_srch2(w, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
			if (i >= 0) {
				s = mbfl_gb18030_c_tbl_val[i];
			}
		}

		/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
		if (!s && w >= 0x80 && w <= 0xFFFF) {
			/* BMP */
			int i = mbfl_bisec_srch(w, mbfl_uni2gb2022_tbl, mbfl_gb2022_uni_max);
			if (i >= 0) {
				/* Split the linear index into digit(10) / byte(126) / digit(10) / lead byte */
				unsigned int c1 = w - mbfl_gb2022_uni_ofst[i];
				s = (c1 % 10) + 0x30;
				c1 /= 10;
				s |= ((c1 % 126) + 0x81) << 8;
				c1 /= 126;
				s |= ((c1 % 10) + 0x30) << 16;
				c1 /= 10;
				s |= (c1 + 0x81) << 24;
			}
		} else if (w >= 0x10000 && w <= 0x10FFFF) {
			/* Code set 3: Unicode U+10000-U+10FFFF */
			unsigned int c1 = w - 0x10000;
			s = (c1 % 10) + 0x30;
			c1 /= 10;
			s |= ((c1 % 126) + 0x81) << 8;
			c1 /= 126;
			s |= ((c1 % 10) + 0x30) << 16;
			c1 /= 10;
			s |= (c1 + 0x90) << 24;
		}

		if (!s) {
			/* Pass this encoder itself, not the 2005 GB18030 encoder.
			 * NOTE(review): presumably the last macro argument is the converter used to
			 * emit the substitution for the bad codepoint -- confirm against the
			 * MB_CONVERT_ERROR definition */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030_2022);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s < 0x80) {
			out = mb_convert_buf_add(out, s);
		} else if (s > 0xFFFFFF) {
			/* Four-byte code: request room for it plus the codepoints still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
			out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF);
		} else {
			/* Two-byte code */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
11875+
1156311876
/* Step through a GB18030 string one character at a time. Find the last position at or
1156411877
* before `limit` which falls directly after the end of a (single or multi-byte) character */
1156511878
static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
@@ -11673,6 +11986,21 @@ const mbfl_encoding mbfl_encoding_cp936 = {
1167311986
NULL,
1167411987
};
1167511988

11989+
/* Encoding descriptor for GB18030-2022. It wires in the decoder
 * mb_gb18030_2022_to_wchar and the encoder mb_wchar_to_gb18030_2022, and uses
 * mb_cut_gb18030 for the last slot; several slots are left NULL.
 * NOTE(review): the initializer is positional, and the mbfl_encoding struct
 * definition is not visible here -- confirm the meaning of each slot against it. */
const mbfl_encoding mbfl_encoding_gb18030_2022 = {
11990+
mbfl_no_encoding_gb18030_2022,
11991+
"GB18030-2022",
11992+
"GB18030-2022",
11993+
NULL,
11994+
NULL,
11995+
MBFL_ENCTYPE_GL_UNSAFE,
11996+
NULL,
11997+
NULL,
11998+
mb_gb18030_2022_to_wchar,
11999+
mb_wchar_to_gb18030_2022,
12000+
NULL,
12001+
mb_cut_gb18030, /* name suggests this routine is shared with the 2005 GB18030 encoding -- confirm */
12002+
};
12003+
1167612004
/*
1167712005
* BIG5/CP950
1167812006
*/

ext/mbstring/libmbfl/filters/mbfilter_cjk.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ extern const mbfl_encoding mbfl_encoding_euc_kr;
3232
extern const mbfl_encoding mbfl_encoding_uhc;
3333

3434
extern const mbfl_encoding mbfl_encoding_gb18030;
35+
extern const mbfl_encoding mbfl_encoding_gb18030_2022;
3536
extern const mbfl_encoding mbfl_encoding_cp936;
3637
extern const mbfl_encoding mbfl_encoding_big5;
3738
extern const mbfl_encoding mbfl_encoding_cp950;

0 commit comments

Comments
 (0)