Skip to content

Commit 8f6889b

Browse files
committed
Fix mbstring support for EUC-JP text encoding
- Don't allow control characters to appear in the middle of a multi-byte character. (Allowing this was a strange feature, or perhaps misfeature, of mbstring which is not present in other libraries such as iconv.)
- When checking whether a string is valid, reject kuten codes which do not map to any character, whether converting from EUC-JP to another encoding, or converting from another encoding which uses the JIS X 0208/0212 charsets to EUC-JP.
- Truncated multi-byte characters are treated as an error.
1 parent ad7e0f1 commit 8f6889b

File tree

3 files changed

+13289
-68
lines changed

3 files changed

+13289
-68
lines changed

ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c

Lines changed: 75 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "unicode_table_jis.h"
3535

3636
int mbfl_filt_ident_eucjp(int c, mbfl_identify_filter *filter);
37+
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter);
3738

3839
const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
3940
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -79,7 +80,7 @@ const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
7980
mbfl_filt_conv_common_ctor,
8081
NULL,
8182
mbfl_filt_conv_eucjp_wchar,
82-
mbfl_filt_conv_common_flush,
83+
mbfl_filt_conv_eucjp_wchar_flush,
8384
NULL,
8485
};
8586

@@ -101,7 +102,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
101102
int
102103
mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
103104
{
104-
int c1, s, w;
105+
int c1, s, w = 0;
105106

106107
switch (filter->status) {
107108
case 0:
@@ -115,9 +116,7 @@ mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
115116
} else if (c == 0x8f) { /* X 0212 first char */
116117
filter->status = 3;
117118
} else {
118-
w = c & MBFL_WCSGROUP_MASK;
119-
w |= MBFL_WCSGROUP_THROUGH;
120-
CK((*filter->output_function)(w, filter->data));
119+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
121120
}
122121
break;
123122

@@ -128,21 +127,13 @@ mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
128127
s = (c1 - 0xa1)*94 + c - 0xa1;
129128
if (s >= 0 && s < jisx0208_ucs_table_size) {
130129
w = jisx0208_ucs_table[s];
131-
} else {
132-
w = 0;
133130
}
134131
if (w <= 0) {
135-
w = ((c1 & 0x7f) << 8) | (c & 0x7f);
136-
w &= MBFL_WCSPLANE_MASK;
137-
w |= MBFL_WCSPLANE_JIS0208;
132+
w = ((c1 & 0x7f) << 8) | (c & 0x7f) | MBFL_WCSPLANE_JIS0208;
138133
}
139134
CK((*filter->output_function)(w, filter->data));
140-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
141-
CK((*filter->output_function)(c, filter->data));
142135
} else {
143-
w = (c1 << 8) | c;
144-
w &= MBFL_WCSGROUP_MASK;
145-
w |= MBFL_WCSGROUP_THROUGH;
136+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
146137
CK((*filter->output_function)(w, filter->data));
147138
}
148139
break;
@@ -152,47 +143,31 @@ mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
152143
if (c > 0xa0 && c < 0xe0) {
153144
w = 0xfec0 + c;
154145
CK((*filter->output_function)(w, filter->data));
155-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
156-
CK((*filter->output_function)(c, filter->data));
157146
} else {
158-
w = 0x8e00 | c;
159-
w &= MBFL_WCSGROUP_MASK;
160-
w |= MBFL_WCSGROUP_THROUGH;
147+
w = 0x8e00 | c | MBFL_WCSGROUP_THROUGH;
161148
CK((*filter->output_function)(w, filter->data));
162149
}
163150
break;
164151

165-
case 3: /* got 0x8f, X 0212 first char */
166-
if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
167-
CK((*filter->output_function)(c, filter->data));
168-
filter->status = 0;
169-
} else {
170-
filter->status++;
171-
filter->cache = c;
172-
}
152+
case 3: /* got 0x8f, JIS X 0212 first byte */
153+
filter->status++;
154+
filter->cache = c;
173155
break;
174-
case 4: /* got 0x8f, X 0212 second char */
156+
157+
case 4: /* got 0x8f, JIS X 0212 second byte */
175158
filter->status = 0;
176159
c1 = filter->cache;
177-
if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) {
160+
if (c > 0xA0 && c < 0xFF && c1 > 0xA0 && c1 < 0xFF) {
178161
s = (c1 - 0xa1)*94 + c - 0xa1;
179162
if (s >= 0 && s < jisx0212_ucs_table_size) {
180163
w = jisx0212_ucs_table[s];
181-
} else {
182-
w = 0;
183164
}
184165
if (w <= 0) {
185-
w = ((c1 & 0x7f) << 8) | (c & 0x7f);
186-
w &= MBFL_WCSPLANE_MASK;
187-
w |= MBFL_WCSPLANE_JIS0212;
166+
w = ((c1 & 0x7f) << 8) | (c & 0x7f) | MBFL_WCSPLANE_JIS0212;
188167
}
189168
CK((*filter->output_function)(w, filter->data));
190-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
191-
CK((*filter->output_function)(c, filter->data));
192169
} else {
193-
w = (c1 << 8) | c | 0x8f0000;
194-
w &= MBFL_WCSGROUP_MASK;
195-
w |= MBFL_WCSGROUP_THROUGH;
170+
w = (c1 << 8) | c | 0x8f0000 | MBFL_WCSGROUP_THROUGH;
196171
CK((*filter->output_function)(w, filter->data));
197172
}
198173
break;
@@ -205,13 +180,26 @@ mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
205180
return c;
206181
}
207182

183+
/*
 * Flush handler for the EUC-JP => wchar conversion filter.
 *
 * mbfl_filt_conv_eucjp_wchar() leaves `filter->status` non-zero only while it
 * is waiting for the remaining byte(s) of a multi-byte character, so a
 * non-zero status here means the input was truncated mid-character. That is
 * reported downstream by emitting the cached byte tagged with
 * MBFL_WCSGROUP_THROUGH, the same tagging the conversion function applies to
 * bytes it cannot decode.
 *
 * Afterwards the next filter in the chain (if any) is flushed as well.
 *
 * Always returns 0; the results of the output/flush callbacks are not
 * inspected.
 */
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* NOTE(review): after a lone 0x8f lead byte the conversion function
		 * sets status without updating `cache`, so the value reported here
		 * may be left over from an earlier character -- confirm whether the
		 * exact value matters to consumers. */
		(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);

		/* The truncated character has been reported; go back to the initial
		 * state so that a reused filter does not treat its next input byte
		 * as a continuation of the character that was just discarded. */
		filter->status = 0;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
195+
208196
/*
209197
* wchar => EUC-JP
210198
*/
211199
int
212200
mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
213201
{
214-
int c1, s;
202+
int s;
215203

216204
s = 0;
217205
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
@@ -224,13 +212,7 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
224212
s = ucs_r_jis_table[c - ucs_r_jis_table_min];
225213
}
226214
if (s <= 0) {
227-
c1 = c & ~MBFL_WCSPLANE_MASK;
228-
if (c1 == MBFL_WCSPLANE_JIS0208) {
229-
s = c & MBFL_WCSPLANE_MASK;
230-
} else if (c1 == MBFL_WCSPLANE_JIS0212) {
231-
s = c & MBFL_WCSPLANE_MASK;
232-
s |= 0x8080;
233-
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
215+
if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
234216
s = 0x2140;
235217
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */
236218
s = 0x2141;
@@ -244,10 +226,9 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
244226
s = 0x2172;
245227
} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
246228
s = 0x224c;
247-
}
248-
if (c == 0) {
229+
} else if (c == 0) {
249230
s = 0;
250-
} else if (s <= 0) {
231+
} else {
251232
s = -1;
252233
}
253234
}
@@ -272,45 +253,71 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
272253
return c;
273254
}
274255

256+
/* Not all byte sequences in JIS X 0208 which would otherwise be valid are
257+
* actually mapped to a character */
258+
static inline int in_unused_jisx0208_range(int c1, int c2)
259+
{
260+
/* `c1`, `c2` are kuten codes */
261+
unsigned int s = (c1 - 0x21)*94 + c2 - 0x21;
262+
return s >= jisx0208_ucs_table_size || !jisx0208_ucs_table[s];
263+
}
264+
265+
static inline int in_unused_jisx0212_range(int c1, int c2)
266+
{
267+
unsigned int s = (c1 - 0x21)*94 + c2 - 0x21;
268+
return s >= jisx0212_ucs_table_size || !jisx0212_ucs_table[s];
269+
}
270+
275271
int mbfl_filt_ident_eucjp(int c, mbfl_identify_filter *filter)
276272
{
277-
switch (filter->status) {
278-
case 0: /* latin */
279-
if (c >= 0 && c < 0x80) { /* ok */
273+
unsigned char ku, ten;
274+
275+
switch (filter->status & 0xF) {
276+
case 0: /* latin */
277+
if (c < 0x80) { /* ok */
280278
;
281-
} else if (c > 0xa0 && c < 0xff) { /* kanji first char */
282-
filter->status = 1;
283-
} else if (c == 0x8e) { /* kana first char */
279+
} else if (c > 0xa0 && c < 0xff) {
280+
/* JIS X 0208, first byte
281+
* In EUC-JP, each such byte ranges from 0xA1-0xFE; however,
282+
* the bytes of JIS X 0208 kuten codes range from 0x21-0x7E */
283+
filter->status = ((c - 0xA1 + 0x21) << 8) | 1;
284+
} else if (c == 0x8e) { /* JIS X 0201 */
284285
filter->status = 2;
285-
} else if (c == 0x8f) { /* X 0212 first char */
286+
} else if (c == 0x8f) { /* JIS X 0212 */
286287
filter->status = 3;
287-
} else { /* bad */
288+
} else { /* bad */
288289
filter->flag = 1;
289290
}
290291
break;
291292

292-
case 1: /* got first half */
293-
if (c < 0xa1 || c > 0xfe) { /* bad */
293+
case 1: /* 2nd byte of JIS X 0208 */
294+
ku = filter->status >> 8;
295+
ten = c - 0xA1 + 0x21;
296+
if (c < 0xa1 || c > 0xfe || in_unused_jisx0208_range(ku, ten)) { /* bad */
294297
filter->flag = 1;
295298
}
296299
filter->status = 0;
297300
break;
298301

299-
case 2: /* got 0x8e */
300-
if (c < 0xa1 || c > 0xdf) { /* bad */
302+
case 2: /* JIS X 0201 */
303+
if (c < 0xa1 || c > 0xdf) { /* bad */
301304
filter->flag = 1;
302305
}
303306
filter->status = 0;
304307
break;
305308

306-
case 3: /* got 0x8f */
307-
if (c < 0xa1 || c > 0xfe) { /* bad */
309+
case 3: /* JIS X 0212 */
310+
if (c < 0xa1 || c > 0xfe) { /* bad */
308311
filter->flag = 1;
312+
} else {
313+
filter->status = ((c - 0xA1 + 0x21) << 8) | 4;
309314
}
310-
filter->status++;
311315
break;
312-
case 4: /* got 0x8f */
313-
if (c < 0xa1 || c > 0xfe) { /* bad */
316+
317+
case 4: /* JIS X 0212, final byte */
318+
ku = filter->status >> 8;
319+
ten = c - 0xA1 + 0x21;
320+
if (c < 0xa1 || c > 0xfe || in_unused_jisx0212_range(ku, ten)) { /* bad */
314321
filter->flag = 1;
315322
}
316323
filter->status = 0;

0 commit comments

Comments
 (0)