Skip to content

Commit 0b32a15

Browse files
committed
Optimize mb_str{,im}width for performance
Rather than doing a linear search of a table of fullwidth codepoint ranges for every input character: 1) short-cut the search if the codepoint is below the first such range; 2) otherwise, do a binary (rather than linear) search.
1 parent f4365d2 commit 0b32a15

File tree

3 files changed

+30
-21
lines changed

3 files changed

+30
-21
lines changed

ext/mbstring/libmbfl/mbfl/eaw_table.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
* which should be displayed as double-width.
1515
*/
1616

17+
#define FIRST_DOUBLEWIDTH_CODEPOINT 0x1100
18+
1719
static const struct {
1820
int begin;
1921
int end;

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,31 +1203,33 @@ mbfl_strcut(
12031203
return result;
12041204
}
12051205

1206-
1207-
/*
1208-
* strwidth
1209-
*/
1210-
static size_t is_fullwidth(int c)
1206+
/* Some East Asian characters, when printed at a terminal (or the like), require double
1207+
* the usual amount of horizontal space. We call these "fullwidth" characters. */
1208+
static size_t character_width(int c)
12111209
{
1212-
int i;
1213-
1214-
if (c < mbfl_eaw_table[0].begin) {
1215-
return 0;
1216-
}
1217-
1218-
for (i = 0; i < sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]); i++) {
1219-
if (mbfl_eaw_table[i].begin <= c && c <= mbfl_eaw_table[i].end) {
1220-
return 1;
1210+
if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
1211+
return 1;
1212+
}
1213+
1214+
/* Do a binary search to see if we fall in any of the fullwidth ranges */
1215+
int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
1216+
while (lo < hi) {
1217+
int probe = (lo + hi) / 2;
1218+
if (c < mbfl_eaw_table[probe].begin) {
1219+
hi = probe;
1220+
} else if (c > mbfl_eaw_table[probe].end) {
1221+
lo = probe + 1;
1222+
} else {
1223+
return 2;
12211224
}
12221225
}
12231226

1224-
return 0;
1227+
return 1;
12251228
}
12261229

1227-
static int
1228-
filter_count_width(int c, void* data)
1230+
static int filter_count_width(int c, void* data)
12291231
{
1230-
(*(size_t *)data) += (is_fullwidth(c) ? 2: 1);
1232+
(*(size_t *)data) += character_width(c);
12311233
return 0;
12321234
}
12331235

@@ -1289,7 +1291,7 @@ collector_strimwidth(int c, void* data)
12891291
break;
12901292
default:
12911293
if (pc->outchar >= pc->from) {
1292-
pc->outwidth += (is_fullwidth(c) ? 2: 1);
1294+
pc->outwidth += character_width(c);
12931295

12941296
if (pc->outwidth > pc->width) {
12951297
if (pc->status == 0) {

ext/mbstring/ucgendat/ucgendat.php

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ function generateMPH(array $map, bool $fast) {
700700
}
701701

702702
function generateEastAsianWidthData(array $wideRanges) {
703-
$result = <<<'HEADER'
703+
$result = <<<'HEADER'
704704
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
705705
*
706706
* DO NOT EDIT THIS FILE!
@@ -717,12 +717,17 @@ function generateEastAsianWidthData(array $wideRanges) {
717717
* which should be displayed as double-width.
718718
*/
719719

720+
HEADER;
721+
722+
$result .= "\n#define FIRST_DOUBLEWIDTH_CODEPOINT 0x" . dechex($wideRanges[0]->start) . "\n\n";
723+
724+
$result .= <<<'TABLESTART'
720725
static const struct {
721726
int begin;
722727
int end;
723728
} mbfl_eaw_table[] = {
724729

725-
HEADER;
730+
TABLESTART;
726731

727732
foreach ($wideRanges as $range) {
728733
$startCode = dechex($range->start);

0 commit comments

Comments
 (0)