Skip to content

Commit 920f728

Browse files
Yuki Izumi
authored and committed
Autolink extension
The autolinker is based on https://github.com/vmg/rinku with some additional changes and fixes. We do our best not to include punctuation, but to include matching parentheses within a link.
1 parent c55225f commit 920f728

File tree

8 files changed

+441
-5
lines changed

8 files changed

+441
-5
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ afl:
8282
-o test/afl_results \
8383
-x test/fuzzing_dictionary \
8484
-t 100 \
85-
$(CMARK) -e table -e strikethrough $(CMARK_OPTS)
85+
$(CMARK) -e table -e strikethrough -e autolink $(CMARK_OPTS)
8686

8787
libFuzzer:
8888
@[ -n "$(LIB_FUZZER_PATH)" ] || { echo '$$LIB_FUZZER_PATH not set'; false; }
@@ -164,8 +164,8 @@ $(ALLTESTS): $(SPEC) $(EXTENSIONS_SPEC)
164164
leakcheck: $(ALLTESTS)
165165
for format in html man xml latex commonmark; do \
166166
for opts in "" "--smart"; do \
167-
echo "cmark -t $$format -e table -e strikethrough $$opts" ; \
168-
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough $$opts $(ALLTESTS) >/dev/null || exit 1;\
167+
echo "cmark -t $$format -e table -e strikethrough -e autolink $$opts" ; \
168+
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough -e autolink $$opts $(ALLTESTS) >/dev/null || exit 1;\
169169
done; \
170170
done;
171171

extensions/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ set(LIBRARY_SOURCES
44
core-extensions.c
55
table.c
66
strikethrough.c
7+
autolink.c
78
ext_scanners.c
89
ext_scanners.re
910
ext_scanners.h

extensions/autolink.c

Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
#include "autolink.h"
2+
#include <parser.h>
3+
#include <string.h>
4+
5+
#if defined(_WIN32)
6+
#define strncasecmp _strnicmp
7+
#else
8+
#include <strings.h>
9+
#endif
10+
11+
/*
 * Decide whether `link` starts with an accepted prefix.
 *
 * The first `link_len` bytes of `link` are compared, ignoring case, against
 * each of "/", "http://", "https://", "ftp://" and "mailto:". A prefix only
 * counts when the link is strictly longer than the prefix and the byte that
 * immediately follows the prefix is accepted by cmark_isalnum().
 *
 * Returns 1 when some prefix qualifies, 0 otherwise.
 */
static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
  static const char *const prefixes[] = {"/", "http://", "https://", "ftp://",
                                         "mailto:"};
  const size_t prefix_count = sizeof(prefixes) / sizeof(prefixes[0]);
  size_t idx = 0;

  while (idx < prefix_count) {
    const char *prefix = prefixes[idx++];
    size_t prefix_len = strlen(prefix);

    /* Need at least one byte after the prefix before it can be inspected. */
    if (link_len <= prefix_len)
      continue;

    if (strncasecmp((const char *)link, prefix, prefix_len) != 0)
      continue;

    if (cmark_isalnum(link[prefix_len]))
      return 1;
  }

  return 0;
}
28+
29+
/*
 * Trim a candidate link so that it does not end in stray punctuation.
 *
 * `data[0 .. link_end)` is the candidate. The function returns the length of
 * the prefix that should be kept (possibly 0). It never writes through `data`.
 *
 * Steps:
 *   1. The candidate is cut at the first '<', if there is one.
 *   2. The last byte is then examined repeatedly:
 *        - one of  ? ! . , : * _ ~   -> dropped;
 *        - ';'                       -> if it closes a "&letters;" sequence
 *                                       (letters as judged by cmark_isalpha),
 *                                       the whole sequence is dropped,
 *                                       otherwise only the ';' is dropped;
 *        - '"' '\'' ')' ']' '}'      -> kept (and trimming stops) only when
 *                                       the number of matching openers in the
 *                                       candidate equals the number of
 *                                       closers; otherwise dropped;
 *        - anything else             -> trimming stops.
 */
static size_t autolink_delim(uint8_t *data, size_t link_end) {
  uint8_t cclose, copen;
  size_t i;

  for (i = 0; i < link_end; ++i)
    if (data[i] == '<') {
      link_end = i;
      break;
    }

  while (link_end > 0) {
    cclose = data[link_end - 1];

    /* Map a closing delimiter to its opener; 0 means "not a closer". */
    switch (cclose) {
    case '"':
      copen = '"';
      break;
    case '\'':
      copen = '\'';
      break;
    case ')':
      copen = '(';
      break;
    case ']':
      copen = '[';
      break;
    case '}':
      copen = '{';
      break;
    default:
      copen = 0;
    }

    /* Note: strchr() also matches the string terminator, so a trailing
     * NUL byte is trimmed here as well. */
    if (strchr("?!.,:*_~", cclose) != NULL)
      link_end--;

    else if (cclose == ';') {
      if (link_end < 2) {
        /* The candidate is a lone ';'. Computing `link_end - 2` would wrap
         * around (size_t is unsigned) and index far outside `data`, so
         * handle it here: there is no room for an entity, drop the ';'. */
        link_end--;
        continue;
      }

      size_t new_end = link_end - 2;

      /* Walk backwards over the letters that precede the ';'. */
      while (new_end > 0 && cmark_isalpha(data[new_end]))
        new_end--;

      /* At least one letter was skipped and it is preceded by '&':
       * drop the whole "&letters;" sequence. Otherwise drop only ';'. */
      if (new_end < link_end - 2 && data[new_end] == '&')
        link_end = new_end;
      else
        link_end--;
    } else if (copen != 0) {
      size_t closing = 0;
      size_t opening = 0;

      /* Try to close the final punctuation sign in this same line;
       * if we managed to close it outside of the URL, that means that it's
       * not part of the URL. If it closes inside the URL, that means it
       * is part of the URL.
       *
       * Examples:
       *
       *      foo http://www.pokemon.com/Pikachu_(Electric) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => foo http://www.pokemon.com/Pikachu_(Electric)
       *
       * For the quote characters copen == cclose, so every occurrence is
       * counted as an opener, `closing` stays 0, and a trailing quote is
       * therefore always dropped.
       */
      for (i = 0; i < link_end; i++) {
        if (data[i] == copen)
          opening++;
        else if (data[i] == cclose)
          closing++;
      }

      if (closing == opening)
        break;

      link_end--;
    } else
      break;
  }

  return link_end;
}
119+
120+
/*
 * Measure the run of domain-like bytes at the start of `data`.
 *
 * Scanning starts at index 1 (data[0] is not inspected) and stops before the
 * last byte of the `size`-byte buffer, or at the first byte that is neither
 * accepted by cmark_isalnum() nor one of '-', '_', '.'.
 *
 * Returns 0 when:
 *   - `size` is 0;
 *   - an underscore appears after the last '.' scanned, or in the segment
 *     immediately before it;
 *   - `allow_short` is zero and no '.' was scanned.
 * Otherwise returns the index at which scanning stopped.
 */
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
  size_t i, np = 0, uscore1 = 0, uscore2 = 0;

  /* `size - 1` below would wrap around for an empty buffer and let the loop
   * run off the end of `data`. */
  if (size == 0)
    return 0;

  for (i = 1; i < size - 1; i++) {
    if (data[i] == '_')
      uscore2++;
    else if (data[i] == '.') {
      /* New segment: remember the underscore count of the one just ended. */
      uscore1 = uscore2;
      uscore2 = 0;
      np++;
    } else if (!cmark_isalnum(data[i]) && data[i] != '-')
      break;
  }

  if (uscore1 > 0 || uscore2 > 0)
    return 0;

  if (allow_short) {
    /* We don't need a valid domain in the strict sense (with at
     * least one dot); so just make sure it's composed of valid
     * domain characters and return the length of the valid
     * sequence. */
    return i;
  } else {
    /* A valid domain needs to have at least one dot.
     * That's as far as we get. */
    return np ? i : 0;
  }
}
149+
150+
static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
151+
cmark_inline_parser *inline_parser) {
152+
cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
153+
size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
154+
uint8_t *data = chunk->data + max_rewind;
155+
size_t size = chunk->len - max_rewind;
156+
157+
size_t link_end;
158+
159+
if (max_rewind > 0 && strchr("*_~([", data[-1]) == NULL &&
160+
!cmark_isspace(data[-1]))
161+
return 0;
162+
163+
if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
164+
return 0;
165+
166+
link_end = check_domain(data, size, 0);
167+
168+
if (link_end == 0)
169+
return NULL;
170+
171+
while (link_end < size && !cmark_isspace(data[link_end]))
172+
link_end++;
173+
174+
link_end = autolink_delim(data, link_end);
175+
176+
if (link_end == 0)
177+
return NULL;
178+
179+
cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end);
180+
181+
cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
182+
183+
cmark_strbuf buf;
184+
cmark_strbuf_init(parser->mem, &buf, 10);
185+
cmark_strbuf_puts(&buf, "http://");
186+
cmark_strbuf_put(&buf, data, link_end);
187+
node->as.link.url = cmark_chunk_buf_detach(&buf);
188+
189+
cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
190+
text->as.literal = cmark_chunk_dup(chunk, max_rewind, link_end);
191+
cmark_node_append_child(node, text);
192+
193+
return node;
194+
}
195+
196+
static cmark_node *email_match(cmark_parser *parser, cmark_node *parent,
197+
cmark_inline_parser *inline_parser) {
198+
size_t link_end, rewind;
199+
int nb = 0, np = 0, ns = 0;
200+
201+
cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
202+
size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
203+
uint8_t *data = chunk->data + max_rewind;
204+
size_t size = chunk->len - max_rewind;
205+
206+
for (rewind = 0; rewind < max_rewind; ++rewind) {
207+
uint8_t c = data[-rewind - 1];
208+
209+
if (cmark_isalnum(c))
210+
continue;
211+
212+
if (strchr(".+-_", c) != NULL)
213+
continue;
214+
215+
if (c == '/')
216+
ns++;
217+
218+
break;
219+
}
220+
221+
if (rewind == 0 || ns > 0)
222+
return 0;
223+
224+
for (link_end = 0; link_end < size; ++link_end) {
225+
uint8_t c = data[link_end];
226+
227+
if (cmark_isalnum(c))
228+
continue;
229+
230+
if (c == '@')
231+
nb++;
232+
else if (c == '.' && link_end < size - 1)
233+
np++;
234+
else if (c != '-' && c != '_')
235+
break;
236+
}
237+
238+
if (link_end < 2 || nb != 1 || np == 0 ||
239+
(!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.'))
240+
return 0;
241+
242+
link_end = autolink_delim(data, link_end);
243+
244+
if (link_end == 0)
245+
return NULL;
246+
247+
cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end);
248+
cmark_node_unput(parent, rewind);
249+
250+
cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
251+
252+
cmark_strbuf buf;
253+
cmark_strbuf_init(parser->mem, &buf, 10);
254+
cmark_strbuf_puts(&buf, "mailto:");
255+
cmark_strbuf_put(&buf, data - rewind, link_end + rewind);
256+
node->as.link.url = cmark_chunk_buf_detach(&buf);
257+
258+
cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
259+
text->as.literal =
260+
cmark_chunk_dup(chunk, max_rewind - rewind, link_end + rewind);
261+
cmark_node_append_child(node, text);
262+
263+
return node;
264+
}
265+
266+
static cmark_node *url_match(cmark_parser *parser, cmark_node *parent,
267+
cmark_inline_parser *inline_parser) {
268+
size_t link_end, rewind = 0, domain_len;
269+
270+
cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
271+
size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
272+
uint8_t *data = chunk->data + max_rewind;
273+
size_t size = chunk->len - max_rewind;
274+
275+
if (size < 4 || data[1] != '/' || data[2] != '/')
276+
return 0;
277+
278+
while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1]))
279+
rewind++;
280+
281+
if (!sd_autolink_issafe(data - rewind, size + rewind))
282+
return 0;
283+
284+
link_end = strlen("://");
285+
286+
domain_len = check_domain(data + link_end, size - link_end, 1);
287+
288+
if (domain_len == 0)
289+
return 0;
290+
291+
link_end += domain_len;
292+
while (link_end < size && !cmark_isspace(data[link_end]))
293+
link_end++;
294+
295+
link_end = autolink_delim(data, link_end);
296+
297+
if (link_end == 0)
298+
return NULL;
299+
300+
cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end);
301+
cmark_node_unput(parent, rewind);
302+
303+
cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
304+
305+
cmark_chunk url =
306+
cmark_chunk_dup(chunk, max_rewind - rewind, link_end + rewind);
307+
node->as.link.url = url;
308+
309+
cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
310+
text->as.literal = url;
311+
cmark_node_append_child(node, text);
312+
313+
return node;
314+
}
315+
316+
static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser,
317+
cmark_node *parent, unsigned char c,
318+
cmark_inline_parser *inline_parser) {
319+
if (cmark_inline_parser_in_bracket(inline_parser, false) ||
320+
cmark_inline_parser_in_bracket(inline_parser, true))
321+
return NULL;
322+
323+
if (c == ':')
324+
return url_match(parser, parent, inline_parser);
325+
326+
if (c == '@')
327+
return email_match(parser, parent, inline_parser);
328+
329+
if (c == 'w')
330+
return www_match(parser, parent, inline_parser);
331+
332+
return NULL;
333+
334+
// note that we could end up re-consuming something already a
335+
// part of an inline, because we don't track when the last
336+
// inline was finished in inlines.c.
337+
}
338+
339+
cmark_syntax_extension *create_autolink_extension(void) {
340+
cmark_syntax_extension *ext = cmark_syntax_extension_new("autolink");
341+
cmark_llist *special_chars = NULL;
342+
343+
cmark_syntax_extension_set_match_inline_func(ext, match);
344+
345+
cmark_mem *mem = cmark_get_default_mem_allocator();
346+
special_chars = cmark_llist_append(mem, special_chars, (void *)':');
347+
special_chars = cmark_llist_append(mem, special_chars, (void *)'@');
348+
special_chars = cmark_llist_append(mem, special_chars, (void *)'w');
349+
cmark_syntax_extension_set_special_inline_chars(ext, special_chars);
350+
351+
return ext;
352+
}

extensions/autolink.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef AUTOLINK_H
2+
#define AUTOLINK_H
3+
4+
#include "core-extensions.h"
5+
6+
/*
 * Create the "autolink" syntax extension, which turns bare URLs, "www."
 * links and e-mail addresses found in inline text into link nodes.
 * Returns the newly created extension object.
 */
cmark_syntax_extension *create_autolink_extension(void);
7+
8+
#endif

0 commit comments

Comments
 (0)