diff options
Diffstat (limited to 'cmark/src/inlines.c')
| -rw-r--r-- | cmark/src/inlines.c | 1503 |
1 files changed, 1503 insertions, 0 deletions
diff --git a/cmark/src/inlines.c b/cmark/src/inlines.c new file mode 100644 index 0000000000..ab82ca74e2 --- /dev/null +++ b/cmark/src/inlines.c @@ -0,0 +1,1503 @@ +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "cmark_ctype.h" +#include "node.h" +#include "parser.h" +#include "references.h" +#include "cmark.h" +#include "houdini.h" +#include "utf8.h" +#include "scanners.h" +#include "inlines.h" + +static const char *EMDASH = "\xE2\x80\x94"; +static const char *ENDASH = "\xE2\x80\x93"; +static const char *ELLIPSES = "\xE2\x80\xA6"; +static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C"; +static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; +static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; +static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; + +// Macros for creating various kinds of simple. +#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK) +#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK) +#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH) +#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG) + +#define MAXBACKTICKS 1000 + +typedef struct delimiter { + struct delimiter *previous; + struct delimiter *next; + cmark_node *inl_text; + bufsize_t position; + bufsize_t length; + unsigned char delim_char; + bool can_open; + bool can_close; +} delimiter; + +typedef struct bracket { + struct bracket *previous; + cmark_node *inl_text; + bufsize_t position; + bool image; + bool active; + bool bracket_after; +} bracket; + +#define FLAG_SKIP_HTML_CDATA (1u << 0) +#define FLAG_SKIP_HTML_DECLARATION (1u << 1) +#define FLAG_SKIP_HTML_PI (1u << 2) +#define FLAG_SKIP_HTML_COMMENT (1u << 3) + +typedef struct { + cmark_mem *mem; + cmark_chunk input; + unsigned flags; + int line; + bufsize_t pos; + int block_offset; + int column_offset; + cmark_reference_map *refmap; + delimiter *last_delim; + bracket *last_bracket; + bufsize_t backticks[MAXBACKTICKS + 1]; + bool scanned_for_backticks; + bool no_link_openers; +} subject; + +static inline bool S_is_line_end_char(char c) { + return (c == '\n' || c == '\r'); +} + +static delimiter *S_insert_emph(subject *subj, delimiter *opener, + delimiter *closer); + +static int parse_inline(subject *subj, cmark_node *parent, int options); + +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_chunk *chunk, cmark_reference_map *refmap); +static bufsize_t subject_find_special_char(subject *subj, int options); + +// Create an inline with a literal string value. +static inline cmark_node *make_literal(subject *subj, cmark_node_type t, + int start_column, int end_column) { + cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); + e->mem = subj->mem; + e->type = (uint16_t)t; + e->start_line = e->end_line = subj->line; + // columns are 1 based. + e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; + e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; + return e; +} + +// Create an inline with no value. +static inline cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { + cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); + e->mem = mem; + e->type = t; + return e; +} + +static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { + cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); + e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1); + if (s.data != NULL) { + memcpy(e->data, s.data, s.len); + } + e->data[s.len] = 0; + e->len = s.len; + return e; +} + +static cmark_node *make_str_from_buf(subject *subj, int sc, int ec, + cmark_strbuf *buf) { + cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); + e->len = buf->size; + e->data = cmark_strbuf_detach(buf); + return e; +} + +// Like make_str, but parses entities. +static cmark_node *make_str_with_entities(subject *subj, + int start_column, int end_column, + cmark_chunk *content) { + cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); + + if (houdini_unescape_html(&unescaped, content->data, content->len)) { + return make_str_from_buf(subj, start_column, end_column, &unescaped); + } else { + return make_str(subj, start_column, end_column, *content); + } +} + +// Like cmark_node_append_child but without costly sanity checks. +// Assumes that child was newly created. +static void append_child(cmark_node *node, cmark_node *child) { + cmark_node *old_last_child = node->last_child; + + child->next = NULL; + child->prev = old_last_child; + child->parent = node; + node->last_child = child; + + if (old_last_child) { + old_last_child->next = child; + } else { + // Also set first_child if node previously had no children. + node->first_child = child; + } +} + +// Duplicate a chunk by creating a copy of the buffer not by reusing the +// buffer like cmark_chunk_dup does. +static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) { + if (src == NULL) { + return NULL; + } + size_t len = strlen((char *)src); + unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1); + memcpy(data, src, len + 1); + return data; +} + +static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, + int is_email) { + cmark_strbuf buf = CMARK_BUF_INIT(mem); + + cmark_chunk_trim(url); + + if (is_email) + cmark_strbuf_puts(&buf, "mailto:"); + + houdini_unescape_html_f(&buf, url->data, url->len); + return cmark_strbuf_detach(&buf); +} + +static inline cmark_node *make_autolink(subject *subj, int start_column, + int end_column, cmark_chunk url, + int is_email) { + cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); + link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); + link->as.link.title = NULL; + link->start_line = link->end_line = subj->line; + link->start_column = start_column + 1; + link->end_column = end_column + 1; + append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); + return link; +} + +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_chunk *chunk, cmark_reference_map *refmap) { + int i; + e->mem = mem; + e->input = *chunk; + e->flags = 0; + e->line = line_number; + e->pos = 0; + e->block_offset = block_offset; + e->column_offset = 0; + e->refmap = refmap; + e->last_delim = NULL; + e->last_bracket = NULL; + for (i = 0; i <= MAXBACKTICKS; i++) { + e->backticks[i] = 0; + } + e->scanned_for_backticks = false; + e->no_link_openers = true; +} + +static inline int isbacktick(int c) { return (c == '`'); } + +static inline unsigned char peek_char(subject *subj) { + // NULL bytes should have been stripped out by now. If they're + // present, it's a programming error: + assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0)); + return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0; +} + +static inline unsigned char peek_at(subject *subj, bufsize_t pos) { + return subj->input.data[pos]; +} + +// Return true if there are more characters in the subject. +static inline int is_eof(subject *subj) { + return (subj->pos >= subj->input.len); +} + +// Advance the subject. Doesn't check for eof. +#define advance(subj) (subj)->pos += 1 + +static inline bool skip_spaces(subject *subj) { + bool skipped = false; + while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { + advance(subj); + skipped = true; + } + return skipped; +} + +static inline bool skip_line_end(subject *subj) { + bool seen_line_end_char = false; + if (peek_char(subj) == '\r') { + advance(subj); + seen_line_end_char = true; + } + if (peek_char(subj) == '\n') { + advance(subj); + seen_line_end_char = true; + } + return seen_line_end_char || is_eof(subj); +} + +// Take characters while a predicate holds, and return a string. +static inline cmark_chunk take_while(subject *subj, int (*f)(int)) { + unsigned char c; + bufsize_t startpos = subj->pos; + bufsize_t len = 0; + + while ((c = peek_char(subj)) && (*f)(c)) { + advance(subj); + len++; + } + + return cmark_chunk_dup(&subj->input, startpos, len); +} + +// Return the number of newlines in a given span of text in a subject. If +// the number is greater than zero, also return the number of characters +// between the last newline and the end of the span in `since_newline`. +static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { + int nls = 0; + int since_nl = 0; + + while (len--) { + if (subj->input.data[from++] == '\n') { + ++nls; + since_nl = 0; + } else { + ++since_nl; + } + } + + if (!nls) + return 0; + + *since_newline = since_nl; + return nls; +} + +// Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and +// `column_offset` according to the number of newlines in a just-matched span +// of text in `subj`. +static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) { + if (!(options & CMARK_OPT_SOURCEPOS)) { + return; + } + + int since_newline; + int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline); + if (newlines) { + subj->line += newlines; + node->end_line += newlines; + node->end_column = since_newline; + subj->column_offset = -subj->pos + since_newline + extra; + } +} + +// Try to process a backtick code span that began with a +// span of ticks of length openticklength length (already +// parsed). Return 0 if you don't find matching closing +// backticks, otherwise return the position in the subject +// after the closing backticks. +static bufsize_t scan_to_closing_backticks(subject *subj, + bufsize_t openticklength) { + + bool found = false; + if (openticklength > MAXBACKTICKS) { + // we limit backtick string length because of the array subj->backticks: + return 0; + } + if (subj->scanned_for_backticks && + subj->backticks[openticklength] <= subj->pos) { + // return if we already know there's no closer + return 0; + } + while (!found) { + // read non backticks + unsigned char c; + while ((c = peek_char(subj)) && c != '`') { + advance(subj); + } + if (is_eof(subj)) { + break; + } + bufsize_t numticks = 0; + while (peek_char(subj) == '`') { + advance(subj); + numticks++; + } + // store position of ender + if (numticks <= MAXBACKTICKS) { + subj->backticks[numticks] = subj->pos - numticks; + } + if (numticks == openticklength) { + return (subj->pos); + } + } + // got through whole input without finding closer + subj->scanned_for_backticks = true; + return 0; +} + +// Destructively modify string, converting newlines to +// spaces, then removing a single leading + trailing space, +// unless the code span consists entirely of space characters. +static void S_normalize_code(cmark_strbuf *s) { + bufsize_t r, w; + bool contains_nonspace = false; + + for (r = 0, w = 0; r < s->size; ++r) { + switch (s->ptr[r]) { + case '\r': + if (s->ptr[r + 1] != '\n') { + s->ptr[w++] = ' '; + } + break; + case '\n': + s->ptr[w++] = ' '; + break; + default: + s->ptr[w++] = s->ptr[r]; + } + if (s->ptr[r] != ' ') { + contains_nonspace = true; + } + } + + // begins and ends with space? + if (contains_nonspace && + s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') { + cmark_strbuf_drop(s, 1); + cmark_strbuf_truncate(s, w - 2); + } else { + cmark_strbuf_truncate(s, w); + } + +} + + +// Parse backtick code section or raw backticks, return an inline. +// Assumes that the subject has a backtick at the current position. +static cmark_node *handle_backticks(subject *subj, int options) { + bufsize_t initpos = subj->pos; + cmark_chunk openticks = take_while(subj, isbacktick); + bufsize_t startpos = subj->pos; + bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); + + if (endpos == 0) { // not found + subj->pos = startpos; // rewind + return make_str(subj, initpos, initpos + openticks.len - 1, openticks); + } else { + cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); + + cmark_strbuf_set(&buf, subj->input.data + startpos, + endpos - startpos - openticks.len); + S_normalize_code(&buf); + + cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos, + endpos - openticks.len - 1); + node->len = buf.size; + node->data = cmark_strbuf_detach(&buf); + adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); + return node; + } +} + + +// Scan ***, **, or * and return number scanned, or 0. +// Advances position. +static int scan_delims(subject *subj, unsigned char c, bool *can_open, + bool *can_close) { + int numdelims = 0; + bufsize_t before_char_pos; + int32_t after_char = 0; + int32_t before_char = 0; + int len; + bool left_flanking, right_flanking; + + if (subj->pos == 0) { + before_char = 10; + } else { + before_char_pos = subj->pos - 1; + // walk back to the beginning of the UTF_8 sequence: + while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) { + before_char_pos -= 1; + } + len = cmark_utf8proc_iterate(subj->input.data + before_char_pos, + subj->pos - before_char_pos, &before_char); + if (len == -1) { + before_char = 10; + } + } + + if (c == '\'' || c == '"') { + numdelims++; + advance(subj); // limit to 1 delim for quotes + } else { + while (peek_char(subj) == c) { + numdelims++; + advance(subj); + } + } + + len = cmark_utf8proc_iterate(subj->input.data + subj->pos, + subj->input.len - subj->pos, &after_char); + if (len == -1) { + after_char = 10; + } + left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) && + (!cmark_utf8proc_is_punctuation_or_symbol(after_char) || + cmark_utf8proc_is_space(before_char) || + cmark_utf8proc_is_punctuation_or_symbol(before_char)); + right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) && + (!cmark_utf8proc_is_punctuation_or_symbol(before_char) || + cmark_utf8proc_is_space(after_char) || + cmark_utf8proc_is_punctuation_or_symbol(after_char)); + if (c == '_') { + *can_open = left_flanking && + (!right_flanking || + cmark_utf8proc_is_punctuation_or_symbol(before_char)); + *can_close = right_flanking && + (!left_flanking || + cmark_utf8proc_is_punctuation_or_symbol(after_char)); + } else if (c == '\'' || c == '"') { + *can_open = left_flanking && + (!right_flanking || before_char == '(' || before_char == '[') && + before_char != ']' && before_char != ')'; + *can_close = right_flanking; + } else { + *can_open = left_flanking; + *can_close = right_flanking; + } + return numdelims; +} + +/* +static void print_delimiters(subject *subj) +{ + delimiter *delim; + delim = subj->last_delim; + while (delim != NULL) { + printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n", + (void*)delim, delim->delim_char, + delim->can_open, delim->can_close, + (void*)delim->next, (void*)delim->previous); + delim = delim->previous; + } +} +*/ + +static void remove_delimiter(subject *subj, delimiter *delim) { + if (delim == NULL) + return; + if (delim->next == NULL) { + // end of list: + assert(delim == subj->last_delim); + subj->last_delim = delim->previous; + } else { + delim->next->previous = delim->previous; + } + if (delim->previous != NULL) { + delim->previous->next = delim->next; + } + subj->mem->free(delim); +} + +static void pop_bracket(subject *subj) { + bracket *b; + if (subj->last_bracket == NULL) + return; + b = subj->last_bracket; + subj->last_bracket = subj->last_bracket->previous; + subj->mem->free(b); +} + +static void push_delimiter(subject *subj, unsigned char c, bool can_open, + bool can_close, cmark_node *inl_text) { + delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter)); + delim->delim_char = c; + delim->can_open = can_open; + delim->can_close = can_close; + delim->inl_text = inl_text; + delim->position = subj->pos; + delim->length = inl_text->len; + delim->previous = subj->last_delim; + delim->next = NULL; + if (delim->previous != NULL) { + delim->previous->next = delim; + } + subj->last_delim = delim; +} + +static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { + bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket)); + if (subj->last_bracket != NULL) { + subj->last_bracket->bracket_after = true; + } + b->image = image; + b->active = true; + b->inl_text = inl_text; + b->previous = subj->last_bracket; + b->position = subj->pos; + b->bracket_after = false; + subj->last_bracket = b; + if (!image) { + subj->no_link_openers = false; + } +} + +// Assumes the subject has a c at the current position. +static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { + bufsize_t numdelims; + cmark_node *inl_text; + bool can_open, can_close; + cmark_chunk contents; + + numdelims = scan_delims(subj, c, &can_open, &can_close); + + if (c == '\'' && smart) { + contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); + } else if (c == '"' && smart) { + contents = + cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); + } else { + contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); + } + + inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); + + if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { + push_delimiter(subj, c, can_open, can_close, inl_text); + } + + return inl_text; +} + +// Assumes we have a hyphen at the current position. +static cmark_node *handle_hyphen(subject *subj, bool smart) { + int startpos = subj->pos; + + advance(subj); + + if (!smart || peek_char(subj) != '-') { + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-")); + } + + while (smart && peek_char(subj) == '-') { + advance(subj); + } + + int numhyphens = subj->pos - startpos; + int en_count = 0; + int em_count = 0; + int i; + cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); + + if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes + em_count = numhyphens / 3; + } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes + en_count = numhyphens / 2; + } else if (numhyphens % 3 == 2) { // use one en dash at end + en_count = 1; + em_count = (numhyphens - 2) / 3; + } else { // use two en dashes at the end + en_count = 2; + em_count = (numhyphens - 4) / 3; + } + + for (i = em_count; i > 0; i--) { + cmark_strbuf_puts(&buf, EMDASH); + } + + for (i = en_count; i > 0; i--) { + cmark_strbuf_puts(&buf, ENDASH); + } + + return make_str_from_buf(subj, startpos, subj->pos - 1, &buf); +} + +// Assumes we have a period at the current position. +static cmark_node *handle_period(subject *subj, bool smart) { + advance(subj); + if (smart && peek_char(subj) == '.') { + advance(subj); + if (peek_char(subj) == '.') { + advance(subj); + return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); + } else { + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("..")); + } + } else { + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal(".")); + } +} + +static void process_emphasis(subject *subj, bufsize_t stack_bottom) { + delimiter *candidate; + delimiter *closer = NULL; + delimiter *opener; + delimiter *old_closer; + bool opener_found; + int openers_bottom_index = 0; + bufsize_t openers_bottom[15] = {stack_bottom, stack_bottom, stack_bottom, + stack_bottom, stack_bottom, stack_bottom, + stack_bottom, stack_bottom, stack_bottom, + stack_bottom, stack_bottom, stack_bottom, + stack_bottom, stack_bottom, stack_bottom}; + + // move back to first relevant delim. + candidate = subj->last_delim; + while (candidate != NULL && candidate->position >= stack_bottom) { + closer = candidate; + candidate = candidate->previous; + } + + // now move forward, looking for closers, and handling each + while (closer != NULL) { + if (closer->can_close) { + switch (closer->delim_char) { + case '"': + openers_bottom_index = 0; + break; + case '\'': + openers_bottom_index = 1; + break; + case '_': + openers_bottom_index = 2 + + (closer->can_open ? 3 : 0) + (closer->length % 3); + break; + case '*': + openers_bottom_index = 8 + + (closer->can_open ? 3 : 0) + (closer->length % 3); + break; + default: + assert(false); + } + + // Now look backwards for first matching opener: + opener = closer->previous; + opener_found = false; + while (opener != NULL && + opener->position >= openers_bottom[openers_bottom_index]) { + if (opener->can_open && opener->delim_char == closer->delim_char) { + // interior closer of size 2 can't match opener of size 1 + // or of size 1 can't match 2 + if (!(closer->can_open || opener->can_close) || + closer->length % 3 == 0 || + (opener->length + closer->length) % 3 != 0) { + opener_found = true; + break; + } + } + opener = opener->previous; + } + old_closer = closer; + if (closer->delim_char == '*' || closer->delim_char == '_') { + if (opener_found) { + closer = S_insert_emph(subj, opener, closer); + } else { + closer = closer->next; + } + } else if (closer->delim_char == '\'' || closer->delim_char == '"') { + if (closer->delim_char == '\'') { + cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE); + } else { + cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE); + } + closer = closer->next; + if (opener_found) { + if (old_closer->delim_char == '\'') { + cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE); + } else { + cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE); + } + remove_delimiter(subj, opener); + remove_delimiter(subj, old_closer); + } + } + if (!opener_found) { + // set lower bound for future searches for openers + openers_bottom[openers_bottom_index] = old_closer->position; + if (!old_closer->can_open) { + // we can remove a closer that can't be an + // opener, once we've seen there's no + // matching opener: + remove_delimiter(subj, old_closer); + } + } + } else { + closer = closer->next; + } + } + // free all delimiters in list until stack_bottom: + while (subj->last_delim != NULL && + subj->last_delim->position >= stack_bottom) { + remove_delimiter(subj, subj->last_delim); + } +} + +static delimiter *S_insert_emph(subject *subj, delimiter *opener, + delimiter *closer) { + delimiter *delim, *tmp_delim; + bufsize_t use_delims; + cmark_node *opener_inl = opener->inl_text; + cmark_node *closer_inl = closer->inl_text; + bufsize_t opener_num_chars = opener_inl->len; + bufsize_t closer_num_chars = closer_inl->len; + cmark_node *tmp, *tmpnext, *emph; + + // calculate the actual number of characters used from this closer + use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1; + + // remove used characters from associated inlines. + opener_num_chars -= use_delims; + closer_num_chars -= use_delims; + opener_inl->len = opener_num_chars; + opener_inl->data[opener_num_chars] = 0; + opener_inl->end_column -= use_delims; + closer_inl->len = closer_num_chars; + closer_inl->data[closer_num_chars] = 0; + closer_inl->start_column += use_delims; + + // free delimiters between opener and closer + delim = closer->previous; + while (delim != NULL && delim != opener) { + tmp_delim = delim->previous; + remove_delimiter(subj, delim); + delim = tmp_delim; + } + + // create new emph or strong, and splice it in to our inlines + // between the opener and closer + emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem); + + tmp = opener_inl->next; + if (tmp && tmp != closer_inl) { + emph->first_child = tmp; + tmp->prev = NULL; + + while (tmp && tmp != closer_inl) { + tmpnext = tmp->next; + tmp->parent = emph; + if (tmpnext == closer_inl) { + emph->last_child = tmp; + tmp->next = NULL; + } + tmp = tmpnext; + } + } + + opener_inl->next = emph; + closer_inl->prev = emph; + emph->prev = opener_inl; + emph->next = closer_inl; + emph->parent = opener_inl->parent; + + emph->start_line = opener_inl->start_line; + emph->end_line = closer_inl->end_line; + emph->start_column = opener_inl->start_column + opener_inl->len; + emph->end_column = closer_inl->end_column - closer_inl->len; + + // if opener has 0 characters, remove it and its associated inline + if (opener_num_chars == 0) { + cmark_node_free(opener_inl); + remove_delimiter(subj, opener); + } + + // if closer has 0 characters, remove it and its associated inline + if (closer_num_chars == 0) { + // remove empty closer inline + cmark_node_free(closer_inl); + // remove closer from list + tmp_delim = closer->next; + remove_delimiter(subj, closer); + closer = tmp_delim; + } + + return closer; +} + +// Parse backslash-escape or just a backslash, returning an inline. +static cmark_node *handle_backslash(subject *subj) { + advance(subj); + unsigned char nextchar = peek_char(subj); + if (cmark_ispunct( + nextchar)) { // only ascii symbols and newline can be escaped + advance(subj); + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); + } else if (!is_eof(subj) && skip_line_end(subj)) { + return make_linebreak(subj->mem); + } else { + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\")); + } +} + +// Parse an entity or a regular "&" string. +// Assumes the subject has an '&' character at the current position. +static cmark_node *handle_entity(subject *subj) { + cmark_strbuf ent = CMARK_BUF_INIT(subj->mem); + bufsize_t len; + + advance(subj); + + len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, + subj->input.len - subj->pos); + + if (len <= 0) + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); + + subj->pos += len; + return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent); +} + +// Clean a URL: remove surrounding whitespace, and remove \ that escape +// punctuation. +unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { + cmark_strbuf buf = CMARK_BUF_INIT(mem); + + cmark_chunk_trim(url); + + houdini_unescape_html_f(&buf, url->data, url->len); + + cmark_strbuf_unescape(&buf); + return cmark_strbuf_detach(&buf); +} + +unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { + cmark_strbuf buf = CMARK_BUF_INIT(mem); + unsigned char first, last; + + if (title->len == 0) { + return NULL; + } + + first = title->data[0]; + last = title->data[title->len - 1]; + + // remove surrounding quotes if any: + if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || + (first == '"' && last == '"')) { + houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); + } else { + houdini_unescape_html_f(&buf, title->data, title->len); + } + + cmark_strbuf_unescape(&buf); + return cmark_strbuf_detach(&buf); +} + +// Parse an autolink or HTML tag. +// Assumes the subject has a '<' character at the current position. +static cmark_node *handle_pointy_brace(subject *subj, int options) { + bufsize_t matchlen = 0; + cmark_chunk contents; + + advance(subj); // advance past first < + + // first try to match a URL autolink + matchlen = scan_autolink_uri(&subj->input, subj->pos); + if (matchlen > 0) { + contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); + subj->pos += matchlen; + + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); + } + + // next try to match an email autolink + matchlen = scan_autolink_email(&subj->input, subj->pos); + if (matchlen > 0) { + contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); + subj->pos += matchlen; + + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); + } + + // finally, try to match an html tag + if (subj->pos + 2 <= subj->input.len) { + int c = subj->input.data[subj->pos]; + if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) { + c = subj->input.data[subj->pos+1]; + if (c == '-' && subj->input.data[subj->pos+2] == '-') { + if (subj->input.data[subj->pos+3] == '>') { + matchlen = 4; + } else if (subj->input.data[subj->pos+3] == '-' && + subj->input.data[subj->pos+4] == '>') { + matchlen = 5; + } else { + matchlen = scan_html_comment(&subj->input, subj->pos + 1); + if (matchlen > 0) { + matchlen += 1; // prefix "<" + } else { // no match through end of input: set a flag so + // we don't reparse looking for -->: + subj->flags |= FLAG_SKIP_HTML_COMMENT; + } + } + } else if (c == '[') { + if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) { + matchlen = scan_html_cdata(&subj->input, subj->pos + 2); + if (matchlen > 0) { + // The regex doesn't require the final "]]>". But if we're not at + // the end of input, it must come after the match. Otherwise, + // disable subsequent scans to avoid quadratic behavior. + matchlen += 5; // prefix "![", suffix "]]>" + if (subj->pos + matchlen > subj->input.len) { + subj->flags |= FLAG_SKIP_HTML_CDATA; + matchlen = 0; + } + } + } + } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) { + matchlen = scan_html_declaration(&subj->input, subj->pos + 1); + if (matchlen > 0) { + matchlen += 2; // prefix "!", suffix ">" + if (subj->pos + matchlen > subj->input.len) { + subj->flags |= FLAG_SKIP_HTML_DECLARATION; + matchlen = 0; + } + } + } + } else if (c == '?') { + if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) { + // Note that we allow an empty match. + matchlen = scan_html_pi(&subj->input, subj->pos + 1); + matchlen += 3; // prefix "?", suffix "?>" + if (subj->pos + matchlen > subj->input.len) { + subj->flags |= FLAG_SKIP_HTML_PI; + matchlen = 0; + } + } + } else { + matchlen = scan_html_tag(&subj->input, subj->pos); + } + } + if (matchlen > 0) { + const unsigned char *src = subj->input.data + subj->pos - 1; + bufsize_t len = matchlen + 1; + subj->pos += matchlen; + cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE, + subj->pos - matchlen - 1, subj->pos - 1); + node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1); + memcpy(node->data, src, len); + node->data[len] = 0; + node->len = len; + adjust_subj_node_newlines(subj, node, matchlen, 1, options); + return node; + } + + // if nothing matches, just return the opening <: + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<")); +} + +// Parse a link label. Returns 1 if successful. +// Note: unescaped brackets are not allowed in labels. +// The label begins with `[` and ends with the first `]` character +// encountered. Backticks in labels do not start code spans. +static int link_label(subject *subj, cmark_chunk *raw_label) { + bufsize_t startpos = subj->pos; + int length = 0; + unsigned char c; + + // advance past [ + if (peek_char(subj) == '[') { + advance(subj); + } else { + return 0; + } + + while ((c = peek_char(subj)) && c != '[' && c != ']') { + if (c == '\\') { + advance(subj); + length++; + if (cmark_ispunct(peek_char(subj))) { + advance(subj); + length++; + } + } else { + advance(subj); + length++; + } + if (length > MAX_LINK_LABEL_LENGTH) { + goto noMatch; + } + } + + if (c == ']') { // match found + *raw_label = + cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); + cmark_chunk_trim(raw_label); + advance(subj); // advance past ] + return 1; + } + +noMatch: + subj->pos = startpos; // rewind + return 0; +} + +static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, + cmark_chunk *output) { + bufsize_t i = offset; + size_t nb_p = 0; + + while (i < input->len) { + if (input->data[i] == '\\' && + i + 1 < input-> len && + cmark_ispunct(input->data[i+1])) + i += 2; + else if (input->data[i] == '(') { + ++nb_p; + ++i; + if (nb_p > 32) + return -1; + } else if (input->data[i] == ')') { + if (nb_p == 0) + break; + --nb_p; + ++i; + } else if (cmark_isspace(input->data[i])) { + if (i == offset) { + return -1; + } + break; + } else { + ++i; + } + } + + if (i >= input->len || nb_p != 0) + return -1; + + { + cmark_chunk result = {input->data + offset, i - offset}; + *output = result; + } + return i - offset; +} + +static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, + cmark_chunk *output) { + bufsize_t i = offset; + + if (i < input->len && input->data[i] == '<') { + ++i; + while (i < input->len) { + if (input->data[i] == '>') { + ++i; + break; + } else if (input->data[i] == '\\') + i += 2; + else if (input->data[i] == '\n' || input->data[i] == '<') + return -1; + else + ++i; + } + } else { + return manual_scan_link_url_2(input, offset, output); + } + + if (i >= input->len) + return -1; + + { + cmark_chunk result = {input->data + offset + 1, i - 2 - offset}; + *output = result; + } + return i - offset; +} + +// Return a link, an image, or a literal close bracket. +static cmark_node *handle_close_bracket(subject *subj) { + bufsize_t initial_pos, after_link_text_pos; + bufsize_t endurl, starttitle, endtitle, endall; + bufsize_t sps, n; + cmark_reference *ref = NULL; + cmark_chunk url_chunk, title_chunk; + unsigned char *url, *title; + bracket *opener; + cmark_node *inl; + cmark_chunk raw_label; + int found_label; + cmark_node *tmp, *tmpnext; + bool is_image; + + advance(subj); // advance past ] + initial_pos = subj->pos; + + // get last [ or ![ + opener = subj->last_bracket; + + if (opener == NULL) { + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); + } + + // If we got here, we matched a potential link/image text. + // Now we check to see if it's a link/image. + is_image = opener->image; + + if (!is_image && subj->no_link_openers) { + // take delimiter off stack + pop_bracket(subj); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); + } + + after_link_text_pos = subj->pos; + + // First, look for an inline link. + if (peek_char(subj) == '(' && + ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && + ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, + &url_chunk)) > -1)) { + + // try to parse an explicit link: + endurl = subj->pos + 1 + sps + n; + starttitle = endurl + scan_spacechars(&subj->input, endurl); + + // ensure there are spaces btw url and title + endtitle = (starttitle == endurl) + ? starttitle + : starttitle + scan_link_title(&subj->input, starttitle); + + endall = endtitle + scan_spacechars(&subj->input, endtitle); + + if (peek_at(subj, endall) == ')') { + subj->pos = endall + 1; + + title_chunk = + cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); + url = cmark_clean_url(subj->mem, &url_chunk); + title = cmark_clean_title(subj->mem, &title_chunk); + cmark_chunk_free(&url_chunk); + cmark_chunk_free(&title_chunk); + goto match; + + } else { + // it could still be a shortcut reference link + subj->pos = after_link_text_pos; + } + } + + // Next, look for a following [link label] that matches in refmap. + // skip spaces + raw_label = cmark_chunk_literal(""); + found_label = link_label(subj, &raw_label); + if (!found_label) { + // If we have a shortcut reference link, back up + // to before the spaces we skipped. + subj->pos = initial_pos; + } + + if ((!found_label || raw_label.len == 0) && !opener->bracket_after) { + cmark_chunk_free(&raw_label); + raw_label = cmark_chunk_dup(&subj->input, opener->position, + initial_pos - opener->position - 1); + found_label = true; + } + + if (found_label) { + ref = cmark_reference_lookup(subj->refmap, &raw_label); + cmark_chunk_free(&raw_label); + } + + if (ref != NULL) { // found + url = cmark_strdup(subj->mem, ref->url); + title = cmark_strdup(subj->mem, ref->title); + goto match; + } else { + goto noMatch; + } + +noMatch: + // If we fall through to here, it means we didn't match a link: + pop_bracket(subj); // remove this opener from delimiter list + subj->pos = initial_pos; + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); + +match: + inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK); + inl->as.link.url = url; + inl->as.link.title = title; + inl->start_line = inl->end_line = subj->line; + inl->start_column = opener->inl_text->start_column; + inl->end_column = subj->pos + subj->column_offset + subj->block_offset; + cmark_node_insert_before(opener->inl_text, inl); + // Add link text: + tmp = opener->inl_text->next; + while (tmp) { + tmpnext = tmp->next; + cmark_node_unlink(tmp); + append_child(inl, tmp); + tmp = tmpnext; + } + + // Free the bracket [: + cmark_node_free(opener->inl_text); + + process_emphasis(subj, opener->position); + pop_bracket(subj); + + // Now, if we have a link, we also want to deactivate links until + // we get a new opener. (This code can be removed if we decide to allow links + // inside links.) + if (!is_image) { + subj->no_link_openers = true; + } + + return NULL; +} + +// Parse a hard or soft linebreak, returning an inline. +// Assumes the subject has a cr or newline at the current position. +static cmark_node *handle_newline(subject *subj) { + bufsize_t nlpos = subj->pos; + // skip over cr, crlf, or lf: + if (peek_at(subj, subj->pos) == '\r') { + advance(subj); + } + if (peek_at(subj, subj->pos) == '\n') { + advance(subj); + } + ++subj->line; + subj->column_offset = -subj->pos; + // skip spaces at beginning of line + skip_spaces(subj); + if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && + peek_at(subj, nlpos - 2) == ' ') { + return make_linebreak(subj->mem); + } else { + return make_softbreak(subj->mem); + } +} + +static bufsize_t subject_find_special_char(subject *subj, int options) { + // "\r\n\\`&_*[]<!" + static const int8_t SPECIAL_CHARS[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // " ' . - + static const char SMART_PUNCT_CHARS[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + bufsize_t n = subj->pos + 1; + + while (n < subj->input.len) { + if (SPECIAL_CHARS[subj->input.data[n]]) + return n; + if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]]) + return n; + n++; + } + + return subj->input.len; +} + +// Parse an inline, advancing subject, and add it as a child of parent. +// Return 0 if no inline can be parsed, 1 otherwise. +static int parse_inline(subject *subj, cmark_node *parent, int options) { + cmark_node *new_inl = NULL; + cmark_chunk contents; + unsigned char c; + bufsize_t startpos, endpos; + c = peek_char(subj); + if (c == 0) { + return 0; + } + switch (c) { + case '\r': + case '\n': + new_inl = handle_newline(subj); + break; + case '`': + new_inl = handle_backticks(subj, options); + break; + case '\\': + new_inl = handle_backslash(subj); + break; + case '&': + new_inl = handle_entity(subj); + break; + case '<': + new_inl = handle_pointy_brace(subj, options); + break; + case '*': + case '_': + case '\'': + case '"': + new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); + break; + case '-': + new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); + break; + case '.': + new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0); + break; + case '[': + advance(subj); + new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[")); + push_bracket(subj, false, new_inl); + break; + case ']': + new_inl = handle_close_bracket(subj); + break; + case '!': + advance(subj); + if (peek_char(subj) == '[') { + advance(subj); + new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("