Skip to content

Commit ad7e0f1

Browse files
committed
Fix mbstring support for Shift-JIS
- Reject otherwise valid kuten codes which don't map to anything in JIS X 0208.
- Handle truncated multi-byte characters as an error.
- Convert Shift-JIS 0x7E to Unicode 0x203E (overline) as recommended by the Unicode Consortium, and as iconv does.
- Convert Shift-JIS 0x5C to Unicode 0xA5 (yen sign) as recommended by the Unicode Consortium, and as iconv does. (NOTE: This will affect PHP scripts which use an internal encoding of Shift-JIS! PHP assigns a special meaning to 0x5C, the backslash. For example, it is used for escapes in double-quoted strings. Mapping the Shift-JIS yen sign to the Unicode yen sign means the yen sign will not be usable for C escapes in double-quoted strings. Japanese PHP programmers who want to write their source code in Shift-JIS for some strange reason will have to use the JIS X 0208 backslash or 'REVERSE SOLIDUS' character for their C escapes.)
- Convert Unicode 0x5C (backslash) to Shift-JIS 0x815F (reverse solidus).
- Immediately handle error if first Shift-JIS byte is over 0xEF, rather than waiting to see the next byte. (Previously, the value used was 0xFC, which is the limit for the 2nd byte and not the 1st byte of a multi-byte character.)
- Don't allow 'control characters' to appear in the middle of a multi-byte character.

The test case for bug 47399 is now obsolete. That test assumed that a number of Shift-JIS byte sequences which don't map to any character were 'valid' (because the byte values were within the legal ranges).
1 parent cc03c54 commit ad7e0f1

File tree

5 files changed

+7234
-601
lines changed

5 files changed

+7234
-601
lines changed

Zend/tests/multibyte/multibyte_encoding_001.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ internal_encoding=SJIS
1313
<?php
1414
declare(encoding='Shift_JIS');
1515
$s = "\"; // 0x95+0x5c in script, not somewhere else "
16-
printf("%x:%x\n", ord($s[0]), ord($s[1]));
16+
printf("%x:%x", ord($s[0]), ord($s[1]));
1717
?>
1818
--EXPECT--
1919
95:5c

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

Lines changed: 75 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "unicode_table_cp932_ext.h"
3737
#include "unicode_table_jis.h"
3838

39+
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter);
3940
int mbfl_filt_ident_sjis(int c, mbfl_identify_filter *filter);
4041

4142
const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
@@ -82,8 +83,8 @@ const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
8283
mbfl_filt_conv_common_ctor,
8384
NULL,
8485
mbfl_filt_conv_sjis_wchar,
85-
mbfl_filt_conv_common_flush,
86-
NULL,
86+
mbfl_filt_conv_sjis_wchar_flush,
87+
NULL
8788
};
8889

8990
const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
@@ -93,7 +94,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
9394
NULL,
9495
mbfl_filt_conv_wchar_sjis,
9596
mbfl_filt_conv_common_flush,
96-
NULL,
97+
NULL
9798
};
9899

99100
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
@@ -141,35 +142,32 @@ const struct mbfl_convert_vtbl vtbl_wchar_sjis = {
141142
} \
142143
} while (0)
143144

144-
145-
/*
146-
* SJIS => wchar
147-
*/
148-
int
149-
mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
145+
int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
150146
{
151-
int c1, s1, s2, w;
147+
int s1, s2, w;
152148

153149
switch (filter->status) {
154150
case 0:
155-
if (c >= 0 && c < 0x80) { /* latin */
151+
if (c == 0x5C) {
152+
CK((*filter->output_function)(0xA5, filter->data));
153+
} else if (c == 0x7E) {
154+
CK((*filter->output_function)(0x203E, filter->data));
155+
} else if (c >= 0 && c < 0x80) { /* ASCII */
156156
CK((*filter->output_function)(c, filter->data));
157-
} else if (c > 0xa0 && c < 0xe0) { /* kana */
158-
CK((*filter->output_function)(0xfec0 + c, filter->data));
159-
} else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */
157+
} else if (c > 0xA0 && c < 0xE0) { /* Kana */
158+
CK((*filter->output_function)(0xFEC0 + c, filter->data));
159+
} else if (c > 0x80 && c < 0xF0 && c != 0xA0) { /* Kanji, first byte */
160160
filter->status = 1;
161161
filter->cache = c;
162162
} else {
163-
w = c & MBFL_WCSGROUP_MASK;
164-
w |= MBFL_WCSGROUP_THROUGH;
165-
CK((*filter->output_function)(w, filter->data));
163+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
166164
}
167165
break;
168166

169-
case 1: /* kanji second char */
167+
case 1: /* Kanji, second byte */
170168
filter->status = 0;
171-
c1 = filter->cache;
172-
if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
169+
int c1 = filter->cache;
170+
if (c >= 0x40 && c <= 0xFC && c != 0x7F) {
173171
SJIS_DECODE(c1, c, s1, s2);
174172
w = (s1 - 0x21)*94 + s2 - 0x21;
175173
if (w >= 0 && w < jisx0208_ucs_table_size) {
@@ -178,45 +176,45 @@ mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)
178176
w = 0;
179177
}
180178
if (w <= 0) {
181-
if (s1 < 0x7f && s2 < 0x7f) {
182-
w = (s1 << 8) | s2;
183-
w &= MBFL_WCSPLANE_MASK;
184-
w |= MBFL_WCSPLANE_JIS0208;
179+
if (s1 < 0x7F && s2 < 0x7F) {
180+
w = (s1 << 8) | s2 | MBFL_WCSPLANE_JIS0208;
185181
} else {
186-
w = (c1 << 8) | c;
187-
w &= MBFL_WCSGROUP_MASK;
188-
w |= MBFL_WCSGROUP_THROUGH;
182+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
189183
}
190184
}
191185
CK((*filter->output_function)(w, filter->data));
192-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
193-
CK((*filter->output_function)(c, filter->data));
194186
} else {
195-
w = (c1 << 8) | c;
196-
w &= MBFL_WCSGROUP_MASK;
197-
w |= MBFL_WCSGROUP_THROUGH;
187+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
198188
CK((*filter->output_function)(w, filter->data));
199189
}
200-
break;
201-
202-
default:
203-
filter->status = 0;
204-
break;
205190
}
206191

207192
return c;
208193
}
209194

210-
/*
211-
* wchar => SJIS
212-
*/
213-
int
214-
mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
195+
/*
 * Flush handler for the Shift-JIS => wchar conversion filter.
 *
 * A non-zero filter->status means the input ended right after the first byte
 * of a two-byte character. The cached lead byte is handed to
 * mbfl_filt_conv_illegal_output() so the truncated character is reported as
 * an error rather than silently dropped.
 *
 * The pending state is then cleared. Without this, a second flush would
 * report the same dangling byte again, and a reused filter would treat the
 * next input byte as the trail byte of the stale lead byte.
 *
 * Finally the downstream flush_function, if one is set, is invoked.
 * Always returns 0.
 */
static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* Truncated multi-byte character at end of input */
		mbfl_filt_conv_illegal_output(filter->cache, filter);
		filter->status = 0;
		filter->cache = 0;
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
207+
208+
int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
215209
{
216210
int c1, c2, s1, s2;
217211

218212
s1 = 0;
219-
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
213+
if (c == 0x5C) {
214+
/* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the
215+
* Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */
216+
s1 = 0x2140;
217+
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
220218
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
221219
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
222220
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
@@ -226,42 +224,39 @@ mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
226224
s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
227225
}
228226
if (s1 <= 0) {
229-
c1 = c & ~MBFL_WCSPLANE_MASK;
230-
if (c1 == MBFL_WCSPLANE_JIS0208) {
231-
s1 = c & MBFL_WCSPLANE_MASK;
232-
} else if (c == 0xa5) { /* YEN SIGN */
233-
s1 = 0x216f; /* FULLWIDTH YEN SIGN */
234-
} else if (c == 0x203e) { /* OVER LINE */
235-
s1 = 0x2131; /* FULLWIDTH MACRON */
236-
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
227+
if (c == 0xA5) { /* YEN SIGN */
228+
s1 = 0x5C;
229+
} else if (c == 0x203E) { /* OVER LINE */
230+
s1 = 0x7E;
231+
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
237232
s1 = 0x2140;
238-
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */
233+
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */
239234
s1 = 0x2141;
240-
} else if (c == 0x2225) { /* PARALLEL TO */
235+
} else if (c == 0x2225) { /* PARALLEL TO */
241236
s1 = 0x2142;
242-
} else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
243-
s1 = 0x215d;
244-
} else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
237+
} else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
238+
s1 = 0x215D;
239+
} else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */
245240
s1 = 0x2171;
246-
} else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
241+
} else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */
247242
s1 = 0x2172;
248-
} else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
249-
s1 = 0x224c;
250-
}
251-
if (c == 0) {
243+
} else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */
244+
s1 = 0x224C;
245+
} else if (c == 0) {
252246
s1 = 0;
253-
} else if (s1 <= 0) {
247+
} else {
254248
s1 = -1;
255249
}
256-
} else if (s1 >= 0x8080) {
250+
} else if (s1 >= 0x8080) { /* JIS X 0212; not supported */
257251
s1 = -1;
258252
}
253+
259254
if (s1 >= 0) {
260-
if (s1 < 0x100) { /* latin or kana */
255+
if (s1 < 0x100) { /* Latin/Kana */
261256
CK((*filter->output_function)(s1, filter->data));
262-
} else { /* kanji */
263-
c1 = (s1 >> 8) & 0xff;
264-
c2 = s1 & 0xff;
257+
} else { /* Kanji */
258+
c1 = (s1 >> 8) & 0xFF;
259+
c2 = s1 & 0xFF;
265260
SJIS_ENCODE(c1, c2, s1, s2);
266261
CK((*filter->output_function)(s1, filter->data));
267262
CK((*filter->output_function)(s2, filter->data));
@@ -275,18 +270,23 @@ mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
275270

276271
int mbfl_filt_ident_sjis(int c, mbfl_identify_filter *filter)
277272
{
278-
if (filter->status) { /* kanji second char */
279-
if (c < 0x40 || c > 0xfc || c == 0x7f) { /* bad */
273+
if (filter->status) { /* Kanji, second byte */
274+
if (c < 0x40 || c > 0xFC || c == 0x7F) {
280275
filter->flag = 1;
276+
} else {
277+
int s1, s2;
278+
SJIS_DECODE(filter->status, c, s1, s2);
279+
int w = ((s1 - 0x21) * 94) + s2 - 0x21;
280+
if (w >= jisx0208_ucs_table_size || !jisx0208_ucs_table[w]) {
281+
filter->flag = 1;
282+
}
281283
}
282284
filter->status = 0;
283-
} else if (c >= 0 && c < 0x80) { /* latin ok */
284-
;
285-
} else if (c > 0xa0 && c < 0xe0) { /* kana ok */
285+
} else if (c < 0x80 || (c > 0xA0 && c < 0xE0)) { /* Latin/Kana */
286286
;
287-
} else if (c > 0x80 && c < 0xf0 && c != 0xa0) { /* kanji first char */
288-
filter->status = 1;
289-
} else { /* bad */
287+
} else if (c > 0x80 && c < 0xF0 && c != 0xA0) { /* Kanji, first byte */
288+
filter->status = c;
289+
} else {
290290
filter->flag = 1;
291291
}
292292

0 commit comments

Comments
 (0)