|
| 1 | +#include "autolink.h" |
| 2 | +#include <parser.h> |
| 3 | +#include <string.h> |
| 4 | + |
| 5 | +#if defined(_WIN32) |
| 6 | +#define strncasecmp _strnicmp |
| 7 | +#else |
| 8 | +#include <strings.h> |
| 9 | +#endif |
| 10 | + |
/*
 * Report whether `link` begins with one of the URI prefixes that are
 * accepted for autolinking ("/", "http://", "https://", "ftp://",
 * "mailto:"), compared case-insensitively.
 *
 * The prefix alone is not enough: the byte immediately following it must
 * satisfy cmark_isalnum(), so e.g. "http://" followed by punctuation is
 * rejected.
 *
 * link     - candidate bytes (not required to be NUL-terminated; only the
 *            first link_len bytes are examined)
 * link_len - number of readable bytes at `link`
 *
 * Returns 1 when the link is considered safe, 0 otherwise.
 */
static int sd_autolink_issafe(const uint8_t *link, size_t link_len) {
  static const char *const allowed_prefixes[] = {"/", "http://", "https://",
                                                 "ftp://", "mailto:"};
  const size_t prefix_count =
      sizeof(allowed_prefixes) / sizeof(allowed_prefixes[0]);
  size_t idx;

  for (idx = 0; idx < prefix_count; ++idx) {
    const char *prefix = allowed_prefixes[idx];
    size_t prefix_len = strlen(prefix);

    /* There must be at least one byte after the prefix to inspect. */
    if (link_len <= prefix_len)
      continue;

    if (strncasecmp((const char *)link, prefix, prefix_len) != 0)
      continue;

    if (cmark_isalnum(link[prefix_len]))
      return 1;
  }

  return 0;
}
| 28 | + |
/*
 * Trim a candidate autolink so that it does not swallow trailing
 * delimiters, and return the adjusted length.
 *
 * data     - start of the candidate link
 * link_end - tentative length of the link in bytes
 *
 * Steps:
 *   1. The link is cut at the first '<' found in data[0, link_end).
 *   2. Working backwards from the end, repeatedly:
 *        - drop a trailing byte from "?!.,:*_~";
 *        - for a trailing ';', drop a whole "&letters;" sequence if one
 *          ends there, otherwise drop just the ';';
 *        - for a trailing quote or closing bracket, keep it only if the
 *          counts of opener and closer within the link are equal;
 *        - stop at anything else.
 *
 * Returns the trimmed length, which may be 0.
 */
static size_t autolink_delim(uint8_t *data, size_t link_end) {
  uint8_t cclose, copen;
  size_t i;

  for (i = 0; i < link_end; ++i) {
    if (data[i] == '<') {
      link_end = i;
      break;
    }
  }

  while (link_end > 0) {
    cclose = data[link_end - 1];

    switch (cclose) {
    case '"':
      copen = '"';
      break;
    case '\'':
      copen = '\'';
      break;
    case ')':
      copen = '(';
      break;
    case ']':
      copen = '[';
      break;
    case '}':
      copen = '{';
      break;
    default:
      copen = 0;
    }

    /* Note: strchr() also matches the string's terminating NUL, so a
     * trailing 0 byte is dropped here as well. */
    if (strchr("?!.,:*_~", cclose) != NULL) {
      link_end--;
    } else if (cclose == ';') {
      size_t new_end;

      /* A lone ';' cannot close an entity. Bail out before computing
       * link_end - 2, which would wrap around and make the scan below
       * read outside the buffer. */
      if (link_end < 2) {
        link_end--;
        continue;
      }

      /* Walk back over the letters preceding the ';'. */
      new_end = link_end - 2;

      while (new_end > 0 && cmark_isalpha(data[new_end]))
        new_end--;

      /* At least one letter preceded by '&': strip the whole entity. */
      if (new_end < link_end - 2 && data[new_end] == '&')
        link_end = new_end;
      else
        link_end--;
    } else if (copen != 0) {
      size_t closing = 0;
      size_t opening = 0;

      /* Try to close the final punctuation sign in this same line;
       * if we managed to close it outside of the URL, that means that it's
       * not part of the URL. If it closes inside the URL, that means it
       * is part of the URL.
       *
       * Examples:
       *
       *      foo http://www.pokemon.com/Pikachu_(Electric) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => http://www.pokemon.com/Pikachu_(Electric)
       *
       *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
       *              => foo http://www.pokemon.com/Pikachu_(Electric)
       *
       * For quotes the opener and closer are the same byte, so every
       * quote is tallied as an opener and the counts can never match:
       * a trailing quote is always removed.
       *
       * The prefix is rescanned on every pass, so a long run of
       * unbalanced closers costs quadratic time.
       */
      for (i = 0; i < link_end; ++i) {
        if (data[i] == copen)
          opening++;
        else if (data[i] == cclose)
          closing++;
      }

      if (closing == opening)
        break;

      link_end--;
    } else {
      break;
    }
  }

  return link_end;
}
| 119 | + |
/*
 * Measure the run of domain-like bytes at the start of `data`.
 *
 * data        - start of the candidate domain
 * size        - number of readable bytes at `data`
 * allow_short - non-zero to accept a domain that contains no '.'
 *
 * The scan covers data[1] .. data[size - 2]; the first and last bytes are
 * not inspected. It stops at the first byte that is not accepted by
 * cmark_isalnum() and is not '-', '_' or '.'.
 *
 * The candidate is rejected (0 is returned) when an underscore occurs in
 * either of the last two dot-separated segments that were scanned.
 *
 * Returns the index at which scanning stopped, or 0 when the candidate is
 * rejected, when `size` is 0, or when a dot is required but none was seen.
 */
static size_t check_domain(uint8_t *data, size_t size, int allow_short) {
  size_t i, np = 0, uscore1 = 0, uscore2 = 0;

  /* size - 1 below would wrap around for an empty buffer. */
  if (size == 0)
    return 0;

  for (i = 1; i < size - 1; i++) {
    if (data[i] == '_')
      uscore2++;
    else if (data[i] == '.') {
      /* New segment: remember the previous segment's underscore count. */
      uscore1 = uscore2;
      uscore2 = 0;
      np++;
    } else if (!cmark_isalnum(data[i]) && data[i] != '-')
      break;
  }

  if (uscore1 > 0 || uscore2 > 0)
    return 0;

  if (allow_short) {
    /* We don't need a valid domain in the strict sense (with at
     * least one dot); so just make sure it's composed of valid
     * domain characters and return the length of the valid
     * sequence. */
    return i;
  } else {
    /* A valid domain needs to have at least one dot;
     * that's as far as we get. */
    return np ? i : 0;
  }
}
| 149 | + |
| 150 | +static cmark_node *www_match(cmark_parser *parser, cmark_node *parent, |
| 151 | + cmark_inline_parser *inline_parser) { |
| 152 | + cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); |
| 153 | + size_t max_rewind = cmark_inline_parser_get_offset(inline_parser); |
| 154 | + uint8_t *data = chunk->data + max_rewind; |
| 155 | + size_t size = chunk->len - max_rewind; |
| 156 | + |
| 157 | + size_t link_end; |
| 158 | + |
| 159 | + if (max_rewind > 0 && strchr("*_~([", data[-1]) == NULL && |
| 160 | + !cmark_isspace(data[-1])) |
| 161 | + return 0; |
| 162 | + |
| 163 | + if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) |
| 164 | + return 0; |
| 165 | + |
| 166 | + link_end = check_domain(data, size, 0); |
| 167 | + |
| 168 | + if (link_end == 0) |
| 169 | + return NULL; |
| 170 | + |
| 171 | + while (link_end < size && !cmark_isspace(data[link_end])) |
| 172 | + link_end++; |
| 173 | + |
| 174 | + link_end = autolink_delim(data, link_end); |
| 175 | + |
| 176 | + if (link_end == 0) |
| 177 | + return NULL; |
| 178 | + |
| 179 | + cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end); |
| 180 | + |
| 181 | + cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); |
| 182 | + |
| 183 | + cmark_strbuf buf; |
| 184 | + cmark_strbuf_init(parser->mem, &buf, 10); |
| 185 | + cmark_strbuf_puts(&buf, "http://"); |
| 186 | + cmark_strbuf_put(&buf, data, link_end); |
| 187 | + node->as.link.url = cmark_chunk_buf_detach(&buf); |
| 188 | + |
| 189 | + cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); |
| 190 | + text->as.literal = cmark_chunk_dup(chunk, max_rewind, link_end); |
| 191 | + cmark_node_append_child(node, text); |
| 192 | + |
| 193 | + return node; |
| 194 | +} |
| 195 | + |
| 196 | +static cmark_node *email_match(cmark_parser *parser, cmark_node *parent, |
| 197 | + cmark_inline_parser *inline_parser) { |
| 198 | + size_t link_end, rewind; |
| 199 | + int nb = 0, np = 0, ns = 0; |
| 200 | + |
| 201 | + cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); |
| 202 | + size_t max_rewind = cmark_inline_parser_get_offset(inline_parser); |
| 203 | + uint8_t *data = chunk->data + max_rewind; |
| 204 | + size_t size = chunk->len - max_rewind; |
| 205 | + |
| 206 | + for (rewind = 0; rewind < max_rewind; ++rewind) { |
| 207 | + uint8_t c = data[-rewind - 1]; |
| 208 | + |
| 209 | + if (cmark_isalnum(c)) |
| 210 | + continue; |
| 211 | + |
| 212 | + if (strchr(".+-_", c) != NULL) |
| 213 | + continue; |
| 214 | + |
| 215 | + if (c == '/') |
| 216 | + ns++; |
| 217 | + |
| 218 | + break; |
| 219 | + } |
| 220 | + |
| 221 | + if (rewind == 0 || ns > 0) |
| 222 | + return 0; |
| 223 | + |
| 224 | + for (link_end = 0; link_end < size; ++link_end) { |
| 225 | + uint8_t c = data[link_end]; |
| 226 | + |
| 227 | + if (cmark_isalnum(c)) |
| 228 | + continue; |
| 229 | + |
| 230 | + if (c == '@') |
| 231 | + nb++; |
| 232 | + else if (c == '.' && link_end < size - 1) |
| 233 | + np++; |
| 234 | + else if (c != '-' && c != '_') |
| 235 | + break; |
| 236 | + } |
| 237 | + |
| 238 | + if (link_end < 2 || nb != 1 || np == 0 || |
| 239 | + (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) |
| 240 | + return 0; |
| 241 | + |
| 242 | + link_end = autolink_delim(data, link_end); |
| 243 | + |
| 244 | + if (link_end == 0) |
| 245 | + return NULL; |
| 246 | + |
| 247 | + cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end); |
| 248 | + cmark_node_unput(parent, rewind); |
| 249 | + |
| 250 | + cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); |
| 251 | + |
| 252 | + cmark_strbuf buf; |
| 253 | + cmark_strbuf_init(parser->mem, &buf, 10); |
| 254 | + cmark_strbuf_puts(&buf, "mailto:"); |
| 255 | + cmark_strbuf_put(&buf, data - rewind, link_end + rewind); |
| 256 | + node->as.link.url = cmark_chunk_buf_detach(&buf); |
| 257 | + |
| 258 | + cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); |
| 259 | + text->as.literal = |
| 260 | + cmark_chunk_dup(chunk, max_rewind - rewind, link_end + rewind); |
| 261 | + cmark_node_append_child(node, text); |
| 262 | + |
| 263 | + return node; |
| 264 | +} |
| 265 | + |
| 266 | +static cmark_node *url_match(cmark_parser *parser, cmark_node *parent, |
| 267 | + cmark_inline_parser *inline_parser) { |
| 268 | + size_t link_end, rewind = 0, domain_len; |
| 269 | + |
| 270 | + cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); |
| 271 | + size_t max_rewind = cmark_inline_parser_get_offset(inline_parser); |
| 272 | + uint8_t *data = chunk->data + max_rewind; |
| 273 | + size_t size = chunk->len - max_rewind; |
| 274 | + |
| 275 | + if (size < 4 || data[1] != '/' || data[2] != '/') |
| 276 | + return 0; |
| 277 | + |
| 278 | + while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1])) |
| 279 | + rewind++; |
| 280 | + |
| 281 | + if (!sd_autolink_issafe(data - rewind, size + rewind)) |
| 282 | + return 0; |
| 283 | + |
| 284 | + link_end = strlen("://"); |
| 285 | + |
| 286 | + domain_len = check_domain(data + link_end, size - link_end, 1); |
| 287 | + |
| 288 | + if (domain_len == 0) |
| 289 | + return 0; |
| 290 | + |
| 291 | + link_end += domain_len; |
| 292 | + while (link_end < size && !cmark_isspace(data[link_end])) |
| 293 | + link_end++; |
| 294 | + |
| 295 | + link_end = autolink_delim(data, link_end); |
| 296 | + |
| 297 | + if (link_end == 0) |
| 298 | + return NULL; |
| 299 | + |
| 300 | + cmark_inline_parser_set_offset(inline_parser, max_rewind + link_end); |
| 301 | + cmark_node_unput(parent, rewind); |
| 302 | + |
| 303 | + cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); |
| 304 | + |
| 305 | + cmark_chunk url = |
| 306 | + cmark_chunk_dup(chunk, max_rewind - rewind, link_end + rewind); |
| 307 | + node->as.link.url = url; |
| 308 | + |
| 309 | + cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); |
| 310 | + text->as.literal = url; |
| 311 | + cmark_node_append_child(node, text); |
| 312 | + |
| 313 | + return node; |
| 314 | +} |
| 315 | + |
| 316 | +static cmark_node *match(cmark_syntax_extension *ext, cmark_parser *parser, |
| 317 | + cmark_node *parent, unsigned char c, |
| 318 | + cmark_inline_parser *inline_parser) { |
| 319 | + if (cmark_inline_parser_in_bracket(inline_parser, false) || |
| 320 | + cmark_inline_parser_in_bracket(inline_parser, true)) |
| 321 | + return NULL; |
| 322 | + |
| 323 | + if (c == ':') |
| 324 | + return url_match(parser, parent, inline_parser); |
| 325 | + |
| 326 | + if (c == '@') |
| 327 | + return email_match(parser, parent, inline_parser); |
| 328 | + |
| 329 | + if (c == 'w') |
| 330 | + return www_match(parser, parent, inline_parser); |
| 331 | + |
| 332 | + return NULL; |
| 333 | + |
| 334 | + // note that we could end up re-consuming something already a |
| 335 | + // part of an inline, because we don't track when the last |
| 336 | + // inline was finished in inlines.c. |
| 337 | +} |
| 338 | + |
| 339 | +cmark_syntax_extension *create_autolink_extension(void) { |
| 340 | + cmark_syntax_extension *ext = cmark_syntax_extension_new("autolink"); |
| 341 | + cmark_llist *special_chars = NULL; |
| 342 | + |
| 343 | + cmark_syntax_extension_set_match_inline_func(ext, match); |
| 344 | + |
| 345 | + cmark_mem *mem = cmark_get_default_mem_allocator(); |
| 346 | + special_chars = cmark_llist_append(mem, special_chars, (void *)':'); |
| 347 | + special_chars = cmark_llist_append(mem, special_chars, (void *)'@'); |
| 348 | + special_chars = cmark_llist_append(mem, special_chars, (void *)'w'); |
| 349 | + cmark_syntax_extension_set_special_inline_chars(ext, special_chars); |
| 350 | + |
| 351 | + return ext; |
| 352 | +} |
0 commit comments