Skip to content

Commit 3e8088d

Browse files
committed
Implement fast text conversion interface for EUC-JP-MS
1 parent e5af94b commit 3e8088d

File tree

1 file changed

+197
-2
lines changed

1 file changed

+197
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c

Lines changed: 197 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
#include "cp932_table.h"
3636

3737
static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter);
38+
static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
39+
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3840

3941
static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
4042
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -66,8 +68,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
6668
0,
6769
&vtbl_eucjpwin_wchar,
6870
&vtbl_wchar_eucjpwin,
69-
NULL,
70-
NULL
71+
mb_eucjpwin_to_wchar,
72+
mb_wchar_to_eucjpwin
7173
};
7274

7375
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
@@ -337,3 +339,196 @@ int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
337339

338340
return 0;
339341
}
342+
343+
static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
344+
{
345+
unsigned char *p = *in, *e = p + *in_len;
346+
uint32_t *out = buf, *limit = buf + bufsize;
347+
348+
while (p < e && out < limit) {
349+
unsigned char c = *p++;
350+
351+
if (c < 0x80) {
352+
*out++ = c;
353+
} else if (c >= 0xA1 && c <= 0xFE && p < e) {
354+
unsigned char c2 = *p++;
355+
356+
if (c2 >= 0xA1 && c2 <= 0xFE) {
357+
unsigned int s = (c - 0xA1)*94 + c2 - 0xA1, w = 0;
358+
359+
if (s <= 137) {
360+
if (s == 31) {
361+
w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
362+
} else if (s == 32) {
363+
w = 0xFF5E; /* FULLWIDTH TILDE */
364+
} else if (s == 33) {
365+
w = 0x2225; /* PARALLEL TO */
366+
} else if (s == 60) {
367+
w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
368+
} else if (s == 80) {
369+
w = 0xFFE0; /* FULLWIDTH CENT SIGN */
370+
} else if (s == 81) {
371+
w = 0xFFE1; /* FULLWIDTH POUND SIGN */
372+
} else if (s == 137) {
373+
w = 0xFFE2; /* FULLWIDTH NOT SIGN */
374+
}
375+
}
376+
377+
if (w == 0) {
378+
if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
379+
w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
380+
} else if (s < jisx0208_ucs_table_size) {
381+
w = jisx0208_ucs_table[s];
382+
} else if (s >= (84 * 94)) {
383+
w = s - (84 * 94) + 0xE000;
384+
}
385+
}
386+
387+
if (!w)
388+
w = MBFL_BAD_INPUT;
389+
*out++ = w;
390+
} else {
391+
*out++ = MBFL_BAD_INPUT;
392+
}
393+
} else if (c == 0x8E && p < e) {
394+
unsigned char c2 = *p++;
395+
if (c2 >= 0xA1 && c2 <= 0xDF) {
396+
*out++ = 0xFEC0 + c2;
397+
} else {
398+
*out++ = MBFL_BAD_INPUT;
399+
}
400+
} else if (c == 0x8F && p < e) {
401+
unsigned char c2 = *p++;
402+
if (p == e) {
403+
*out++ = MBFL_BAD_INPUT;
404+
continue;
405+
}
406+
unsigned char c3 = *p++;
407+
408+
if (c2 >= 0xA1 && c2 <= 0xFE && c3 >= 0xA1 && c3 <= 0xFE) {
409+
unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1, w = 0;
410+
411+
if (s < jisx0212_ucs_table_size) {
412+
w = jisx0212_ucs_table[s];
413+
if (w == 0x7E)
414+
w = 0xFF5E; /* FULLWIDTH TILDE */
415+
} else if (s >= (82*94) && s < (84*94)) {
416+
s = (c2 << 8) | c3;
417+
for (int i = 0; i < cp932ext3_eucjp_table_size; i++) {
418+
if (cp932ext3_eucjp_table[i] == s) {
419+
w = cp932ext3_ucs_table[i];
420+
break;
421+
}
422+
}
423+
} else if (s >= (84*94)) {
424+
w = s - (84*94) + 0xE000 + (94*10);
425+
}
426+
427+
if (w == 0xA6)
428+
w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
429+
430+
if (!w)
431+
w = MBFL_BAD_INPUT;
432+
*out++ = w;
433+
} else {
434+
*out++ = MBFL_BAD_INPUT;
435+
}
436+
} else {
437+
*out++ = MBFL_BAD_INPUT;
438+
}
439+
}
440+
441+
*in_len = e - p;
442+
*in = p;
443+
return out - buf;
444+
}
445+
446+
/*
 * Fast-path encoder: wide characters -> EUC-JP-MS bytes.
 *
 * Reads len code points from in and appends their encoded form to buf.
 * Each code point is first resolved to an intermediate code, which then
 * selects a one-, two- or three-byte output form. A code point with no
 * mapping is handed to MB_CONVERT_ERROR. The end argument is not used by
 * this function.
 *
 * Room for two bytes per remaining code point is requested up front and
 * again after every error; the three-byte form requests three more bytes
 * on top of that just before it is written.
 */
static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int code = 0;

		/* Zero doubles as the "no mapping" marker for code below, so a NUL
		 * code point is written out directly */
		if (cp == 0) {
			out = mb_convert_buf_add(out, 0);
			continue;
		}

		if (cp == 0xAF) { /* U+00AF is MACRON */
			code = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (cp == 0x203E) {
			code = 0x7E;
		} else if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			code = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			code = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			code = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			code = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		} else if (cp >= 0xE000 && cp < (0xE000 + 10*94)) {
			/* First 10*94 code points from U+E000: rows starting at 0x75,
			 * low byte starting at 0x21 (ends up in the two-byte form) */
			code = cp - 0xE000;
			code = ((code/94 + 0x75) << 8) + (code%94) + 0x21;
		} else if (cp >= (0xE000 + 10*94) && cp < (0xE000 + 20*94)) {
			/* Next 10*94 code points: rows starting at 0xF5, low byte
			 * starting at 0xA1 (ends up in the three-byte form) */
			code = cp - (0xE000 + 10*94);
			code = ((code/94 + 0xF5) << 8) + (code%94) + 0xA1;
		}

		if (code == 0xA2F1)
			code = 0x2D62; /* NUMERO SIGN */

		if (code == 0) {
			/* Nothing found above: try the fixed special cases, then search
			 * the cp932 extension tables */
			switch (cp) {
			case 0xA5: /* YEN SIGN */
				code = 0x5C;
				break;
			case 0x2014: /* EM DASH */
				code = 0x213D;
				break;
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				code = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				code = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				code = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				code = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				code = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				code = 0x224C;
				break;
			default:
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (cp932ext1_ucs_table[i] == cp) {
						/* Rebuild the row/cell pair from the table position */
						code = (((i/94) + (cp932ext1_ucs_table_min/94) + 0x21) << 8) + (i%94) + 0x21;
						break;
					}
				}

				if (!code) {
					for (int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
						if (cp932ext3_ucs_table[i] == cp) {
							code = cp932ext3_eucjp_table[i];
							break;
						}
					}
				}
				break;
			}
		}

		if (code == 0) {
			/* No mapping at all: report it, then re-request room for the rest */
			MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_eucjpwin);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
		} else if (code >= 0x8080) {
			/* Three-byte form: 0x8F prefix plus both bytes with the high bit set */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code >= 0x100) {
			/* Two-byte form: both bytes with the high bit set */
			out = mb_convert_buf_add2(out, ((code >> 8) & 0xFF) | 0x80, (code & 0xFF) | 0x80);
		} else if (code >= 0x80) {
			/* Single value in 0x80..0xFF: emitted behind a 0x8E prefix */
			out = mb_convert_buf_add2(out, 0x8E, code);
		} else {
			/* Below 0x80: one byte as-is */
			out = mb_convert_buf_add(out, code);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}

0 commit comments

Comments
 (0)