Skip to content

Commit a76658b

Browse files
committed
Optimize out bounds check in UHC decoder
This gives a 25% speed boost for conversion operations on long strings (~10,000 codepoints). For shorter strings, the speed boost is smaller: as the input gets shorter, the gain is increasingly swamped by the overhead of entering and exiting the conversion function. Note that when benchmarking string conversion speed, we measure not only the speed of the decoder, but also the time it takes to re-encode the string in another encoding such as UTF-8 or UTF-16. The performance increase for functions which only need to decode the input string, without re-encoding it, will therefore be much more than 25%.
1 parent ffbddc4 commit a76658b

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_uhc.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,12 +196,16 @@ static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf,
196196
unsigned char *p = *in, *e = p + *in_len;
197197
uint32_t *out = buf, *limit = buf + bufsize;
198198

199+
e--; /* Stop the main loop 1 byte short of the end of the input */
200+
199201
while (p < e && out < limit) {
200202
unsigned char c = *p++;
201203

202204
if (c < 0x80) {
203205
*out++ = c;
204-
} else if (c > 0x80 && c < 0xFE && c != 0xC9 && p < e) {
206+
} else if (c > 0x80 && c < 0xFE && c != 0xC9) {
207+
/* We don't need to check p < e here; it's not possible that this pointer dereference
208+
* will be outside the input string, because of e-- above */
205209
unsigned char c2 = *p++;
206210
if (c2 < 0x41 || c2 == 0xFF) {
207211
*out++ = MBFL_BAD_INPUT;
@@ -227,7 +231,13 @@ static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf,
227231
}
228232
}
229233

230-
*in_len = e - p;
234+
/* Finish up last byte of input string if there is one */
235+
if (p == e && out < limit) {
236+
unsigned char c = *p++;
237+
*out++ = (c < 0x80) ? c : MBFL_BAD_INPUT;
238+
}
239+
240+
*in_len = e - p + 1;
231241
*in = p;
232242
return out - buf;
233243
}

0 commit comments

Comments
 (0)