Skip to content

Commit 935fef2

Browse files
authored
Optimize DOM HTML serialization for UTF-8 (#16376)
* Use a direct call for decoding the UTF-8 buffer
* Add fast path for UTF-8 HTML serialization

This patch adds a fast path to the HTML serialization for output that has to be encoded as UTF-8. Because the DOM internally represents all strings using UTF-8, we only need to validate here.

Tested on the Uncyclopedia English home page on an i7-4790:

```
Benchmark 1: ./sapi/cli/php x.php
  Time (mean ± σ): 516.0 ms ± 6.4 ms [User: 511.2 ms, System: 3.5 ms]
  Range (min … max): 506.0 ms … 527.1 ms 10 runs

Benchmark 2: ./sapi/cli/php_old x.php
  Time (mean ± σ): 682.8 ms ± 6.5 ms [User: 676.8 ms, System: 3.8 ms]
  Range (min … max): 675.8 ms … 695.6 ms 10 runs

Summary
  ./sapi/cli/php x.php ran 1.32 ± 0.02 times faster than ./sapi/cli/php_old x.php
```

(And if you're interested: it takes over a second on my machine using the old DOMDocument class.)

Future optimizations are certainly possible, but let's start here.
1 parent 6dd67bb commit 935fef2

File tree

1 file changed

+73
-5
lines changed

1 file changed

+73
-5
lines changed

ext/dom/html_document.c

Lines changed: 73 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -570,12 +570,11 @@ static bool dom_decode_encode_fast_path(
570570
const lxb_char_t *buf_ref_backup = buf_ref;
571571
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
572572
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
573-
size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
574573
if (!dom_process_parse_chunk(
575574
ctx,
576575
document,
577576
parser,
578-
buf_ref - last_output - skip,
577+
buf_ref_backup - last_output,
579578
last_output,
580579
buf_ref - last_output,
581580
tokenizer_error_offset,
@@ -1208,6 +1207,68 @@ static zend_result dom_write_output_stream(void *application_data, const char *b
12081207
return SUCCESS;
12091208
}
12101209

1210+
/* Fast path when the output encoding is UTF-8 */
1211+
static zend_result dom_saveHTML_write_string_len_utf8_output(void *application_data, const char *buf, size_t len)
1212+
{
1213+
dom_output_ctx *output = (dom_output_ctx *) application_data;
1214+
1215+
output->decode->status = LXB_STATUS_OK;
1216+
1217+
const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1218+
const lxb_char_t *last_output = buf_ref;
1219+
const lxb_char_t *buf_end = buf_ref + len;
1220+
1221+
while (buf_ref != buf_end) {
1222+
const lxb_char_t *buf_ref_backup = buf_ref;
1223+
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(output->decode, &buf_ref, buf_end);
1224+
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
1225+
if (UNEXPECTED(output->write_output(
1226+
output->output_data,
1227+
(const char *) last_output,
1228+
buf_ref_backup - last_output
1229+
) != SUCCESS)) {
1230+
return FAILURE;
1231+
}
1232+
1233+
if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
1234+
ZEND_ASSERT(buf_ref == buf_end);
1235+
/* The decoder needs more data but the entire buffer is consumed.
1236+
* All valid data is outputted, and if the remaining data for the code point
1237+
* is invalid, the next call will output the replacement bytes. */
1238+
output->decode->status = LXB_STATUS_CONTINUE;
1239+
return SUCCESS;
1240+
}
1241+
1242+
if (UNEXPECTED(output->write_output(
1243+
output->output_data,
1244+
(const char *) LXB_ENCODING_REPLACEMENT_BYTES,
1245+
LXB_ENCODING_REPLACEMENT_SIZE
1246+
) != SUCCESS)) {
1247+
return FAILURE;
1248+
}
1249+
1250+
last_output = buf_ref;
1251+
}
1252+
}
1253+
1254+
if (buf_ref != last_output) {
1255+
if (UNEXPECTED(output->write_output(
1256+
output->output_data,
1257+
(const char *) last_output,
1258+
buf_ref - last_output
1259+
) != SUCCESS)) {
1260+
return FAILURE;
1261+
}
1262+
}
1263+
1264+
return SUCCESS;
1265+
}
1266+
1267+
static zend_result dom_saveHTML_write_string_utf8_output(void *application_data, const char *buf)
1268+
{
1269+
return dom_saveHTML_write_string_len_utf8_output(application_data, buf, strlen(buf));
1270+
}
1271+
12111272
static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
12121273
{
12131274
dom_output_ctx *output = (dom_output_ctx *) application_data;
@@ -1216,7 +1277,7 @@ static zend_result dom_saveHTML_write_string_len(void *application_data, const c
12161277
const lxb_char_t *buf_end = buf_ref + len;
12171278

12181279
do {
1219-
decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1280+
decode_status = lxb_encoding_decode_utf_8(output->decode, &buf_ref, buf_end);
12201281

12211282
const lxb_codepoint_t *codepoints_ref = output->codepoints;
12221283
const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
@@ -1272,8 +1333,15 @@ static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *inter
12721333
output_ctx->encoding_output = encoding_output;
12731334

12741335
dom_html5_serialize_context ctx;
1275-
ctx.write_string_len = dom_saveHTML_write_string_len;
1276-
ctx.write_string = dom_saveHTML_write_string;
1336+
if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1337+
/* Fast path */
1338+
ctx.write_string_len = dom_saveHTML_write_string_len_utf8_output;
1339+
ctx.write_string = dom_saveHTML_write_string_utf8_output;
1340+
} else {
1341+
/* Slow path */
1342+
ctx.write_string_len = dom_saveHTML_write_string_len;
1343+
ctx.write_string = dom_saveHTML_write_string;
1344+
}
12771345
ctx.application_data = output_ctx;
12781346
ctx.private_data = php_dom_get_private_data(intern);
12791347
if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {

0 commit comments

Comments
 (0)