Skip to content

Commit 1eab596

Browse files
Yuki IzumiYuki Izumi
authored and committed
Tagfilter extension
When we encounter a tag that causes an HTML 5 parser's content model flag [1] to be changed to RCDATA, CDATA or RAWTEXT [2] [3], we escape the tag by replacing its opening "<" with "&lt;". This causes the tag to appear verbatim in the page it's placed on. We do this to prevent users from breaking the page content: without the escaping, the parser would stop interpreting the tags that follow (including those inserted by cmark) as HTML until a matching close tag was hit. (Such a closing tag could exist if a user entered it themselves, but it would cause all cmark-generated markup in between to be rendered raw, which is unlikely to be desirable behaviour.) [1] https://www.w3.org/TR/2009/WD-html5-20090423/syntax.html#tokenization [2] https://www.w3.org/TR/2009/WD-html5-20090212/serializing-html-fragments.html#parsing-html-fragments [3] https://github.com/google/gumbo-parser/blob/aa91b27b02c0c80c482e24348a457ed7c3c088e0/src/parser.c#L4023-L4053
1 parent 920f728 commit 1eab596

File tree

8 files changed

+125
-5
lines changed

8 files changed

+125
-5
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ afl:
8282
-o test/afl_results \
8383
-x test/fuzzing_dictionary \
8484
-t 100 \
85-
$(CMARK) -e table -e strikethrough -e autolink $(CMARK_OPTS)
85+
$(CMARK) -e table -e strikethrough -e autolink -e tagfilter $(CMARK_OPTS)
8686

8787
libFuzzer:
8888
@[ -n "$(LIB_FUZZER_PATH)" ] || { echo '$$LIB_FUZZER_PATH not set'; false; }
@@ -164,8 +164,8 @@ $(ALLTESTS): $(SPEC) $(EXTENSIONS_SPEC)
164164
leakcheck: $(ALLTESTS)
165165
for format in html man xml latex commonmark; do \
166166
for opts in "" "--smart"; do \
167-
echo "cmark -t $$format -e table -e strikethrough -e autolink $$opts" ; \
168-
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough -e autolink $$opts $(ALLTESTS) >/dev/null || exit 1;\
167+
echo "cmark -t $$format -e table -e strikethrough -e autolink -e tagfilter $$opts" ; \
168+
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough -e autolink -e tagfilter $$opts $(ALLTESTS) >/dev/null || exit 1;\
169169
done; \
170170
done;
171171

extensions/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set(LIBRARY_SOURCES
55
table.c
66
strikethrough.c
77
autolink.c
8+
tagfilter.c
89
ext_scanners.c
910
ext_scanners.re
1011
ext_scanners.h

extensions/core-extensions.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
#include "autolink.h"
33
#include "strikethrough.h"
44
#include "table.h"
5+
#include "tagfilter.h"
56

67
/*
 * Register the core syntax extensions with `plugin`, in this order:
 * table, strikethrough, autolink, tagfilter. Each extension object is
 * obtained from its create_*_extension() factory and handed straight to
 * cmark_plugin_register_syntax_extension(). Always returns 1.
 */
int core_extensions_registration(cmark_plugin *plugin) {
78
cmark_plugin_register_syntax_extension(plugin, create_table_extension());
89
cmark_plugin_register_syntax_extension(plugin,
910
create_strikethrough_extension());
1011
cmark_plugin_register_syntax_extension(plugin, create_autolink_extension());
12+
cmark_plugin_register_syntax_extension(plugin, create_tagfilter_extension());
1113
return 1;
1214
}

extensions/tagfilter.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#include "tagfilter.h"
2+
#include <parser.h>
3+
4+
/*
 * Names of the HTML tags that the tag filter rejects, written in lowercase.
 * The list ends with a NULL sentinel so it can be walked without a
 * separate element count.
 */
static const char *blacklist[] = {
5+
"title", "textarea", "style", "xmp", "iframe",
6+
"noembed", "noframes", "script", "plaintext", NULL,
7+
};
8+
9+
/*
 * Report whether the raw HTML tag held in tag_data[0 .. tag_size) is an
 * opening, closing or self-closing tag whose name is `tagname`.
 *
 *   tag_data  bytes of the candidate tag, beginning with '<'; it is only
 *             read up to tag_size and need not be NUL-terminated.
 *   tag_size  number of readable bytes in tag_data.
 *   tagname   NUL-terminated tag name, which must be lowercase ASCII.
 *
 * Returns 1 when the tag matches, 0 otherwise.
 *
 * The name comparison is ASCII case-insensitive. HTML tag names are
 * case-insensitive, so "<XMP>" or "</Script>" affect an HTML parser exactly
 * like their lowercase spellings and must not slip past the filter.
 */
static int is_tag(const unsigned char *tag_data, size_t tag_size,
                  const char *tagname) {
  size_t i;

  /* The shortest possible tag is "<x>"; anything shorter cannot match. */
  if (tag_size < 3 || tag_data[0] != '<')
    return 0;

  i = 1;

  /* Closing tags ("</name ...") are matched as well as opening ones. */
  if (tag_data[i] == '/') {
    i++;
  }

  for (; i < tag_size; ++i, ++tagname) {
    unsigned char c;

    if (*tagname == 0)
      break;

    /* Fold ASCII uppercase to lowercase before comparing. */
    c = tag_data[i];
    if (c >= 'A' && c <= 'Z')
      c = (unsigned char)(c - 'A' + 'a');

    if (c != (unsigned char)*tagname)
      return 0;
  }

  /* Input ended before a byte that terminates the tag name was seen. */
  if (i == tag_size)
    return 0;

  /* The name must be followed by whitespace, '>' or "/>"; this keeps
   * e.g. "<titles>" from matching "title". */
  if (cmark_isspace(tag_data[i]) || tag_data[i] == '>')
    return 1;

  if (tag_data[i] == '/' && tag_size >= i + 2 && tag_data[i + 1] == '>')
    return 1;

  return 0;
}
41+
42+
/*
 * HTML filter callback installed by create_tagfilter_extension().
 *
 * Checks `tag` (tag_len bytes) against every entry of `blacklist` using
 * is_tag(). Returns 0 as soon as one entry matches and 1 when none does.
 * `ext` is not used.
 * NOTE(review): presumably a 0 result makes the renderer escape the tag's
 * leading '<' -- confirm against the html_filter_func contract.
 */
static int filter(cmark_syntax_extension *ext, const unsigned char *tag,
                  size_t tag_len) {
  size_t idx = 0;

  /* The list is NULL-terminated; stop at the sentinel. */
  while (blacklist[idx] != NULL) {
    if (is_tag(tag, tag_len, blacklist[idx]))
      return 0;
    idx++;
  }

  return 1;
}
54+
55+
cmark_syntax_extension *create_tagfilter_extension(void) {
56+
cmark_syntax_extension *ext = cmark_syntax_extension_new("tagfilter");
57+
cmark_syntax_extension_set_html_filter_func(ext, filter);
58+
return ext;
59+
}

extensions/tagfilter.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef TAGFILTER_H
2+
#define TAGFILTER_H
3+
4+
#include "core-extensions.h"
5+
6+
cmark_syntax_extension *create_tagfilter_extension(void);
7+
8+
#endif

test/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,14 @@ IF (PYTHONINTERP_FOUND)
6363
)
6464

6565
add_test(extensions_executable
66-
${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/spec_tests.py" "--no-normalize" "--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt" "--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink"
66+
${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/spec_tests.py" "--no-normalize" "--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt" "--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink -e tagfilter"
6767
)
6868

6969
add_test(roundtrip_extensions_executable
7070
${PYTHON_EXECUTABLE}
7171
"${CMAKE_CURRENT_SOURCE_DIR}/roundtrip_tests.py"
7272
"--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt"
73-
"--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink"
73+
"--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink -e tagfilter"
7474
)
7575

7676
add_test(regressiontest_executable

test/afl_test_cases/test.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,10 @@ google ~~yahoo~~
4040

4141
google.com http://google.com [email protected]
4242

43+
and <xmp> but
44+
45+
<surewhynot>
46+
sure
47+
</surewhynot>
48+
4349
[f]: /u "t"

test/extensions.txt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,50 @@ Full stop outside parens shouldn't be included http://google.com/ok.
424424
````````````````````````````````
425425

426426

427+
## HTML tag filter
428+
429+
430+
```````````````````````````````` example
431+
This is <xmp> not okay, but **this** <strong>is</strong>.
432+
433+
<p>This is <xmp> not okay, but **this** <strong>is</strong>.</p>
434+
435+
Nope, I won't have <textarea>.
436+
437+
<p>No <textarea> here either.</p>
438+
439+
<p>This <random /> <thing> is okay</thing> though.</p>
440+
441+
Yep, <totally>okay</totally>.
442+
443+
<!-- HTML comments are okay, though. -->
444+
<!- But we're strict. ->
445+
<! No nonsense. >
446+
<!-- Leave multiline comments the heck alone, though, okay?
447+
Even with {"x":"y"} or 1 > 2 or whatever. Even **markdown**.
448+
-->
449+
<!--- Support everything CommonMark's parser does. -->
450+
<!---->
451+
<!--thistoo-->
452+
.
453+
<p>This is &lt;xmp> not okay, but <strong>this</strong> <strong>is</strong>.</p>
454+
<p>This is &lt;xmp> not okay, but **this** <strong>is</strong>.</p>
455+
<p>Nope, I won't have &lt;textarea>.</p>
456+
<p>No &lt;textarea> here either.</p>
457+
<p>This <random /> <thing> is okay</thing> though.</p>
458+
<p>Yep, <totally>okay</totally>.</p>
459+
<!-- HTML comments are okay, though. -->
460+
<p>&lt;!- But we're strict. -&gt;
461+
&lt;! No nonsense. &gt;</p>
462+
<!-- Leave multiline comments the heck alone, though, okay?
463+
Even with {"x":"y"} or 1 > 2 or whatever. Even **markdown**.
464+
-->
465+
<!--- Support everything CommonMark's parser does. -->
466+
<!---->
467+
<!--thistoo-->
468+
````````````````````````````````
469+
470+
427471
## Interop
428472

429473
Autolink and strikethrough.

0 commit comments

Comments
 (0)