From: Alexandre Terrasa <alexandre@moz-code.org>
Date: Tue, 29 May 2018 23:51:14 +0000 (-0400)
Subject: lib/markdown2: introduce markdown AST inline parser
X-Git-Url: http://nitlanguage.org

lib/markdown2: introduce markdown AST inline parser

Signed-off-by: Alexandre Terrasa <alexandre@moz-code.org>
---

diff --git a/lib/markdown2/markdown_inline_parsing.nit b/lib/markdown2/markdown_inline_parsing.nit
new file mode 100644
index 0000000..703f46e
--- /dev/null
+++ b/lib/markdown2/markdown_inline_parsing.nit
@@ -0,0 +1,1401 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Parser for inline markdown
+#
+# Used to create the AST representation of inline nodes like emphasis, code, links
+# images etc.
+module markdown_inline_parsing
+
+import markdown_ast
+
+# Parser for inline content (text, links, emphasis, etc)
+class MdInlineParser
+
+	# List of delimiter processors to use
+	private var delimiter_processors: Array[MdDelimiterProcessor] is lazy do
+		var delimiters = new Array[MdDelimiterProcessor]
+		delimiters.add new MdAsteriskDelimiterProcessor
+		delimiters.add new MdUnderscoreDelimiterProcessor
+		return delimiters
+	end
+
+	# Map special characters to their delimiter processor
+	private var delimiter_processors_map: Map[Char, MdDelimiterProcessor] is lazy do
+		var map = new HashMap[Char, MdDelimiterProcessor]
+		for delimiter_processor in delimiter_processors do
+			add_delimiter_processor(delimiter_processor, map)
+		end
+		special_characters.add_all map.keys
+		return map
+	end
+
+	# Register a delimiter processor
+	private fun add_delimiter_processor(delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
+		var opening = delimiter_processor.opening_delimiter
+		var closing = delimiter_processor.closing_delimiter
+		if opening == closing then
+			if map.has_key(opening) then
+				var old = map[opening]
+				if old.opening_delimiter == old.closing_delimiter then
+					var s: MdStaggeredDelimiterProcessor
+					if old isa MdStaggeredDelimiterProcessor then
+						s = old
+					else
+						s = new MdStaggeredDelimiterProcessor(opening)
+						s.add old
+					end
+					s.add delimiter_processor
+					map[opening] = s
+				else
+					add_delimiter_processor_for_char(opening, delimiter_processor, map)
+				end
+			else
+				add_delimiter_processor_for_char(opening, delimiter_processor, map)
+			end
+		else
+			add_delimiter_processor_for_char(opening, delimiter_processor, map)
+			add_delimiter_processor_for_char(closing, delimiter_processor, map)
+		end
+	end
+
+	# Register a delimiter processor for a special character
+	private fun add_delimiter_processor_for_char(delimiter_char: Char, delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
+		assert not map.has_key(delimiter_char) else
+			print "Delimiter processor conflict with delimiter char `{delimiter_char}`"
+		end
+		map[delimiter_char] = delimiter_processor
+	end
+
+	# List of characters that have a special Markdown meaning
+	private var special_characters: Array[Char] = ['\n', '`', '[', ']', '\\', '!', '<', '&']
+
+	# Link references by ID, needs to be built up using `parse_reference` before calling `parse`
+	private var reference_map = new HashMap[String, MdLink]
+
+	# Current block under parsing
+	private var block: MdNode is noinit
+
+	# Current input string
+	private var input: String is noinit
+
+	# Current index
+	private var index: Int is noinit
+
+	# Current line
+	private var line: Int is noinit
+
+	# Current column
+	private var column: Int is noinit
+
+	# Current column offset
+	private var column_offset: Int is noinit
+
+	# Top delimiter (emphasis, strong emphasis or custom emphasis)
+	# Brackets are on a separate stack, different from the algorithm described in the spec.
+	private var last_delimiter: nullable MdDelimiter = null
+
+	# Top opening bracket (`[` or `![`)
+	private var last_bracket: nullable MdBracket = null
+
+	# Parse `input` as inline and add resulting nodes as children to `block`
+	fun parse(input: String, offset: Int, block: MdNode) do
+		self.block = block
+		self.input = input.trim
+		self.index = 0
+		self.last_delimiter = null
+		self.last_bracket = null
+		self.line = block.location.line_start
+		self.column_offset = offset
+		self.column = 1 + column_offset
+
+		var more_to_parse = parse_inline
+		while more_to_parse do
+			more_to_parse = parse_inline
+		end
+
+		process_delimiters(null)
+		merge_child_text_nodes(block)
+	end
+
+	# Advance the current index of `count` characters
+	private fun advance(count: Int) do
+		index += count
+		column += count
+	end
+
+	# Attempt to parse a link reference
+	#
+	# Return how many characters were parsed as a reference.
+	# Returns 0 if none.
+	fun parse_reference(input: String): Int do
+		self.input = input
+		self.index = 0
+		self.column = 0
+		var dest
+		var title
+		var match_chars
+		var start_index = index
+
+		# label
+		match_chars = parse_link_label
+		if match_chars == 0 then return 0
+		advance match_chars
+
+		var raw_label = input.substring(0, match_chars)
+
+		# colon
+		if peek != ':' then return 0
+		advance 1
+
+		# link url
+		spnl
+
+		dest = parse_link_destination.first
+		if dest == null or dest.is_empty then return 0
+
+		var before_title = index
+		var before_column = column
+		spnl
+		title = parse_link_title
+		if title == null then
+			# rewind before spaces
+			index = before_title
+			column = before_column
+		end
+
+		var at_line_end = true
+		if index != input.length and match(re_line_end) == null then
+			if title == null then
+				at_line_end = false
+			else
+				# the potential title we found is not at the line end,
+				# but it could still be a legal link reference if we discard the title
+				title = null
+				# rewind before spaces
+				index = before_title
+				column = before_column
+				# and instead check if the link URL is at the line end
+				at_line_end = match(re_line_end) != null
+			end
+		end
+
+		if not at_line_end then return 0
+
+		var normalized_label = raw_label.normalize_reference
+		if normalized_label.is_empty then return 0
+
+		if not reference_map.has_key(normalized_label) then
+			var link = new MdLink(new MdLocation(0, 0, 0, 0), dest, title)
+			reference_map[normalized_label] = link
+		end
+
+		return index - start_index
+	end
+
+	# Line end pattern
+	private var re_line_end: Regex = "^ *(\n|$)".to_re
+
+	# Append standard text to the current block
+	#
+	# Read `text` between `begin_index` and `end_index`.
+	private fun append_text(text: String, begin_index, end_index: nullable Int): MdText do
+		var node: MdText
+		if begin_index != null and end_index != null then
+			var nb_chars = end_index - begin_index
+			var string = text.substring(begin_index, nb_chars)
+			node = new MdText(
+				new MdLocation(
+					line,
+					column,
+					line,
+					column + nb_chars - 1
+				), string)
+		else
+			node = new MdText(
+				new MdLocation(
+					line,
+					column,
+					line,
+					column + text.length
+				), text)
+		end
+		append_node(node)
+		return node
+	end
+
+	# Append `node` to the current block
+	private fun append_node(node: MdNode) do block.append_child(node)
+
+	# Parse the next inline element in subject, advancing input index
+	#
+	# On success, add the result to block's children and return true.
+	# On failure, return false.
+	private fun parse_inline: Bool do
+		var res: Bool
+		var c = peek
+		if c == '\0' then return false
+		if c == '\n' then
+			res = parse_newline
+		else if c == '\\' then
+			res = parse_backslash
+		else if c == '`' then
+			res = parse_backticks
+		else if c == '[' then
+			res = parse_open_bracket
+		else if c == '!' then
+			res = parse_bang
+		else if c == ']' then
+			res = parse_close_bracket
+		else if c == '<' then
+			res = parse_auto_link or parse_html_inline
+		else if c == '&' then
+			res = parse_entity
+		else
+			if delimiter_processors_map.has_key(c) then
+				res = parse_delimiters(delimiter_processors_map[c], c)
+			else
+				res = parse_string
+			end
+		end
+
+		if not res then
+			advance 1
+			# When we get here, it's only for a single special character that turned
+			# out to not have a special meaning.
+			# So we shouldn't have a single surrogate here, hence it should be ok
+			# to turn it into a String
+			var literal = c.to_s
+			append_text(literal)
+		end
+
+		return true
+	end
+
+	# If `re` matches at current index in the input, advance index and return the match
+	# Else return null.
+	private fun match(re: Pattern): nullable String do
+		if index >= input.length then return null
+		var match = input.search_from(re, index)
+		if match != null then
+			index = match.after
+			column = match.after
+			return match.to_s
+		end
+		return null
+	end
+
+	# Return the char at the current input index, or `\0`
+	private fun peek: Char do
+		if index < input.length then
+			return input.chars[index]
+		end
+		return '\0'
+	end
+
+	# Return the char at the current input index + 1, or `\0`
+	private fun peek_next: Char do
+		if index + 1 < input.length then
+			return input.chars[index + 1]
+		end
+		return '\0'
+	end
+
+	# Parse zero or more space characters, incuding at most one newline
+	private fun spnl: Bool do
+		var found_nl = false
+		loop
+			var c = peek
+			if c == ' ' or c == '\t' then
+				advance 1
+				continue
+			else if c == '\n' then
+				if found_nl then break
+				found_nl = true
+				advance 1
+				continue
+			end
+			break
+		end
+		return true
+	end
+
+	# Parse a new line
+	#
+	# If it was preceded by two spaces, return a hard line break,
+	# otherwise a soft line break
+	private fun parse_newline: Bool do
+		advance 1 # assume we're at a `\n`
+
+		var last_child = block.last_child
+
+		# check previous text for trailing spaces
+		# the `has_suffix` is an optimization to avoid an RE match in the common case
+		if last_child != null and last_child isa MdText and
+		   (last_child.literal.has_suffix(" ")) then
+			var text = last_child
+			var literal = text.literal
+			var match = literal.search(re_final_space)
+			var spaces = if match != null then match.length else 0
+			if spaces > 0 then
+				text.literal = literal.substring(0, literal.length - spaces)
+			end
+			last_child.location.column_end = last_child.location.column_end - spaces
+			if spaces >= 2 then
+				append_node(new MdHardLineBreak(new MdLocation(line, column - spaces - 1, line, column - 1), false))
+			else
+				append_node(new MdSoftLineBreak(new MdLocation(line, column - spaces - 1, line, column -1)))
+			end
+		else
+			append_node(new MdSoftLineBreak(new MdLocation(line, column - 1, line, column - 1)))
+		end
+		line += 1
+		column = 1 + column_offset
+
+		# gobble leading spaces in next line
+		while peek == ' ' do
+			advance 1
+		end
+		return true
+	end
+
+	# Final white spaces pattern
+	private var re_final_space: Regex = " *$".to_re
+
+	# Parse a backslash-escaped special character
+	#
+	# Add either the escaped characters, a hard line break (if the backslash is followed by
+	# a new line), or a literal backslash to the block's children.
+	private fun parse_backslash: Bool do
+		advance 1
+		if peek == '\n' then
+			append_node(new MdHardLineBreak(new MdLocation(line, column - 1, line, column), true))
+			advance 1
+			line += 1
+			column = 1 + column_offset
+		else if index < input.length and input.substring(index, 1).has(re_escapable) then
+		    append_text(input, index, index + 1)
+		    advance 1
+		else
+			append_text("\\")
+		end
+		return true
+	end
+
+	# Escapable characters pattern
+	private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"
+
+	# Escapable characters regex
+	private var re_escapable: Regex = "^{p_escapable}".to_re
+
+	# Attempt to parse backticks
+	#
+	# Adding either a backtick code span or a literal sequence of backticks.
+	private fun parse_backticks: Bool do
+		var column_before = column
+		var ticks = match(re_ticks_here)
+		if ticks == null then return false
+
+		var after_open_ticks = index
+		var matched = match(re_ticks)
+		while matched != null do
+			if matched == ticks then
+				var content = input.substring(after_open_ticks, index - after_open_ticks - ticks.length)
+				content = content.trim
+				content = content.replace(re_whitespace, " ")
+				var node = new MdCode(new MdLocation(line, column_before, line, column), matched.to_s, content.trim)
+				append_node(node)
+				column += 1
+				return true
+			end
+			matched = match(re_ticks)
+		end
+		# If we got here, we didn't match a closing backtick sequence
+		index = after_open_ticks
+		column = after_open_ticks + 1
+		append_text(ticks)
+		return true
+	end
+
+	# Backticks starting pattern
+	private var re_ticks_here: Regex = "^`+".to_re
+
+	# Backticks pattern
+	private var re_ticks: Regex = "`+".to_re
+
+	# Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters
+	private fun parse_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): Bool do
+		var res = scan_delimiters(delimiter_processor, delimiter_char)
+		if res == null then return false
+
+		var length = res.count
+		var start_index = index
+		var start_column = column
+
+		advance length
+		var column_before = column
+		column = start_column
+		var node = append_text(input, start_index, index)
+		column = column_before
+
+		# Add entry to stack for this opener
+		var last_delimiter = new MdDelimiter(node, delimiter_char, res.can_open, res.can_close, last_delimiter)
+		last_delimiter.length = length
+		last_delimiter.original_length = length
+
+		var prev = last_delimiter.prev
+		if prev != null then
+			prev.next = last_delimiter
+		end
+		self.last_delimiter = last_delimiter
+		return true
+	end
+
+	# Add open bracket to delimiter stack and add a text node to block's children
+	private fun parse_open_bracket: Bool do
+		var start_index = index
+		advance 1
+
+		var node = append_text("[")
+
+		# Add entry to stack for this opener
+		add_bracket(new MdBracket.link(node, start_index, column - 1, last_bracket, last_delimiter))
+		return true
+	end
+
+	# If next character is `[`, add `!` delimiter to delimiter stack and add a text node to
+	# block's children.
+	# Otherwise just add a text node.
+	private fun parse_bang: Bool do
+		var start_index = index
+		advance 1
+
+		if peek == '[' then
+			advance 1
+			var node = append_text("![")
+
+			# Add entry to stack for this opener
+			add_bracket(new MdBracket.image(node, start_index + 1, column - 2, last_bracket, last_delimiter))
+		else
+			append_text("!")
+		end
+		return true
+	end
+
+	# Try match close bracket against an opening delimiter stack
+	#
+	# Add either a link or image, or a plan `[` character, to block's children.
+	# If there is a matching delimiter, remove it from the delimiter stack.
+	private fun parse_close_bracket: Bool do
+		advance 1
+		var start_index = index
+		var start_column = column
+
+		# Get previous `[` or `![`
+		var opener = last_bracket
+		if opener == null then
+			# no matching opener, just return a literal
+			append_text("]")
+			return true
+		end
+
+		if not opener.allowed then
+			# matching opener but it's not allowed, juste return a literal
+			append_text("]")
+			remove_last_bracket
+			return true
+		end
+
+		# check to see if we have a link or image
+		var dest: nullable Couple[nullable String, Bool] = null
+		var title = null
+		var is_link_or_image = false
+
+		# maybe an inline link like `[foo](\uri "title")`
+		if peek == '(' then
+			advance 1
+			spnl
+			dest = parse_link_destination
+			if dest.first != null then
+				spnl
+				# title needs a whitespace before
+				if input.substring(index - 1, 1).has(re_whitespace) then
+					title = parse_link_title
+					spnl
+				end
+				if peek == ')' then
+					advance 1
+					is_link_or_image = true
+				else
+					index = start_index
+					column = start_column
+				end
+			end
+		end
+
+		# maybe a reference link like `[foo][bar]`, `[foo][]` or `[foo]`
+		if not is_link_or_image then
+			# see if there's a link label like `[bar]` or `[]`
+			var before_label = index
+			var label_length = parse_link_label
+			advance label_length
+			var ref = null
+			if label_length > 2 then
+				ref = input.substring(before_label, label_length)
+			else if not opener.bracket_after then
+				# If the second label is empty `[foo][]` or missing `[foo]`, then the first label
+				# is the reference.
+				# But it can only be a reference when there's no (unescaped) bracket in it.
+				# If there is, we don't even need to try to lookup the reference.
+				ref = input.substring(opener.index, start_index - opener.index)
+			end
+
+			if ref != null then
+				var nref = ref.normalize_reference
+				if reference_map.has_key(nref) then
+					var link = reference_map[nref]
+					dest = new Couple[nullable String, Bool](link.destination, false)
+					title = link.title
+					is_link_or_image = true
+				end
+			end
+		end
+
+		if is_link_or_image then
+			# If we got here, open is a potential opener
+			var link_or_image: MdLinkOrImage
+			if opener.is_image then
+				link_or_image = new MdImage(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
+			else
+				link_or_image = new MdLink(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
+			end
+			link_or_image.has_brackets = dest.as(not null).second
+
+			var node = opener.node.next
+			while node != null do
+				var next = node.next
+				link_or_image.append_child(node)
+				node = next
+			end
+			append_node(link_or_image)
+
+			# Process delimiters such as emphasis inside a link/image
+			process_delimiters(opener.prev_delimiter)
+			merge_child_text_nodes(link_or_image)
+			# We don't need the corresponding text node anymore, we turned it into a node
+			opener.node.unlink
+			remove_last_bracket
+
+			# Links within links are not allowed
+			# We found this link, so there can be no other link around it.
+			if not opener.is_image then
+				var bracket = last_bracket
+				while bracket != null do
+					if not bracket.is_image then
+						# disallow link opener
+						bracket.allowed = false
+					end
+					bracket = bracket.prev
+				end
+			end
+			return true
+		end
+
+		if not is_link_or_image then
+			if parse_wikilink then return true
+		end
+
+		# no link or image
+		append_text("]")
+		remove_last_bracket
+		index = start_index
+		column = start_column
+		return true
+	end
+
+	# Whitespace pattern
+	private var re_whitespace: Regex = "\\s+".to_re
+
+	# Add a bracket token on top of the `last_bracket` stack
+	private fun add_bracket(bracket: MdBracket) do
+		var last_bracket = self.last_bracket
+		if last_bracket != null then
+			last_bracket.bracket_after = true
+		end
+		self.last_bracket = bracket
+	end
+
+	# Remove the last bracket on the `last_bracket` stack
+	private fun remove_last_bracket do
+		var last_bracket = self.last_bracket
+		if last_bracket == null then return
+		self.last_bracket = last_bracket.prev
+	end
+
+	# Wikilink placeholder
+	#
+	# Will be defined in sub module.
+	private fun parse_wikilink: Bool do return false
+
+	# Attempt to parse a link destination, returning the string or null if not match
+	private fun parse_link_destination: Couple[nullable String, Bool] do
+		var buffer = new Buffer
+
+		var c = peek
+		var parens = 0
+
+		var has_bracket = c == '<'
+		if has_bracket then advance 1
+
+		loop
+			c = peek
+			if c == '\0' then
+				break # end of input
+			else if c == ' ' or c == '\t' or c == '\n' or c == '\r' then
+				break # no spaces allowed in urls
+			else if c == '\\' then
+				var next = peek_next
+				if escapable.has(next) then
+					buffer.add next
+					advance 2 # skip over the backslash
+					continue
+				end
+			else if has_bracket and c == '>' then
+				advance 1
+				break
+			else if not has_bracket and c == '(' then
+				parens += 1
+			else if not has_bracket and c == ')' then
+				if parens == 0 then break
+				parens -= 1
+			else if c == '\0' then
+				break
+			end
+			buffer.add c
+			advance 1
+		end
+		return new Couple[nullable String, Bool](buffer.to_s, has_bracket)
+	end
+
+	# Attempt to parse a link title (sans quotes), returning the string or null if no match
+	private fun parse_link_title: nullable String do
+		var c = peek
+		if c != '\'' and c != '"' and c != '(' then
+			return null
+		end
+		var opener = c
+
+		var buffer = new Buffer
+		loop
+			advance 1
+			c = peek
+			if c == opener or (opener == '(' and c == ')') then
+				advance 1
+				break
+			else if c == '\\' then
+				var next = peek_next
+				if escapable.has(next) then
+					buffer.add next
+					advance 1
+					continue
+				end
+			else if c == '\0' then
+				return null
+			end
+			buffer.add c
+		end
+		return buffer.to_s
+	end
+
+	# Escapable characters
+	private var escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\^_`\{|\}~-"
+
+	# Attempt to parse a link label returning number of characters parsed
+	private fun parse_link_label: Int do
+		var i = index
+		while i < input.length do
+			var c = input[i]
+			if i == index and c != '[' then
+				return 0
+			else if c == '[' and i != index then
+				if input[i - 1] != '\\' or (i - 2 > index and input[i - 2] == '\\') then
+					return 0
+				end
+			else if c == ']' then
+				if i > 1001 then return 0
+				if input[i - 1] != '\\' or (i - 2 > index and input[i - 2] == '\\') then
+					return (i - index) + 1
+				end
+			end
+			i += 1
+		end
+		return 0
+	end
+
+	# Attempt to parse an autolink (URL or email in pointy brackets)
+	private fun parse_auto_link: Bool do
+		var column_before = column
+		var m = match(re_autolink_email)
+		if m != null then
+			var dest = m.substring(1, m.length - 2)
+			var node = new MdLink(new MdLocation(line, column_before, line, column), "mailto:{dest}", null, true)
+			node.append_child(new MdText(new MdLocation(line, column_before + 1, line, column - 1), dest))
+			column += 1
+			append_node(node)
+			return true
+		end
+		m = match(re_autolink_url)
+		if m != null then
+			var dest = m.substring(1, m.length - 2)
+			var node = new MdLink(new MdLocation(line, column_before, line, column), dest, null, true)
+			node.append_child(new MdText(new MdLocation(line, column_before + 1, line, column - 1), dest))
+			column += 1
+			append_node(node)
+			return true
+		end
+		return false
+	end
+
+	# Autolink email pattern
+	private var re_autolink_email: Regex = "^<([a-zA-Z0-9.!#$%&'*+/=?^_`\{|\}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?(\\.[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?)*)>".to_re
+
+	# Autolink url pattern
+	private var re_autolink_url: Regex = "^<[a-zA-Z][a-zA-Z0-9.+-]\{1,31\}:[^<> ]*>".to_re
+
+	# Attempt to parse an inline HTML string
+	private fun parse_html_inline: Bool do
+		var column_before = column
+		var m = match(re_html_tag)
+		if m != null then
+			var node = new MdHtmlInline(new MdLocation(line, column_before, line, column), m)
+			column += 1
+			append_node(node)
+			return true
+		end
+		return false
+	end
+
+	private var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
+	private var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
+	private var p_uquoted_value = "[^\"'=<>` \t\n]+"
+	private var p_squoted_value = "'[^']*'"
+	private var p_dquoted_value = "\"[^\"]*\""
+	private var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
+	private var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
+	private var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
+	private var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
+	private var p_closetag = "</{p_tagname}\\s*[>]"
+	private var p_html_comment = "<!---->|<!--(-?[^>-])(-?[^-])*-->"
+	private var p_processing_instruction = "[<][?].*?[?][>]"
+	private var p_declaration = "<![A-Z]+\\s+[^>]*>"
+	private var p_cdata = "<!\\[CDATA\\[.*\\]\\]>"
+	private var p_html_tag = "({p_opentag}|{p_closetag}|{p_html_comment}|{p_processing_instruction}|{p_declaration}|{p_cdata})"
+
+	# HTML tag pattern
+	private var re_html_tag: Regex do
+		var re = "^{p_html_tag}".to_re
+		re.ignore_case = true
+		return re
+	end
+
+	# Attempt to parse an HTML entity
+	private fun parse_entity: Bool do
+		var m = match(re_entity_here)
+		if m != null then
+			append_text(m)
+			return true
+		end
+		return false
+	end
+
+	# HTML entity pattern
+	private var re_entity_here: Regex do
+		var re = "^&(#x[a-f0-9]\{1,8\}|#[0-9]\{1,8\}|[a-z][a-z0-9]\{1,31\});".to_re
+		re.ignore_case = true
+		return re
+	end
+
+	# Parse a run of ordinary characters
+	#
+	# Or a single character with a special meaning in markdown, as a plain string.
+	private fun parse_string: Bool do
+		var begin = index
+		var begin_column = column
+		var length = input.length
+		while index != length do
+			if special_characters.has(input.chars[index]) then
+				break
+			end
+			advance 1
+		end
+		if begin != index then
+			var column_before = column
+			column = begin_column
+			append_text(input, begin, index)
+			column = column_before
+			return true
+		end
+		return false
+	end
+
+	# Scan a sequence of characters with code `delimiter_char`
+	#
+	# Return information about the number of delimiters and whether they are positioned
+	# such as they can open and/or close emphasis or strong emphasis.
+	private fun scan_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): nullable MdDelimiterData do
+		var start_index = index
+		var start_column = column
+
+		var delimiter_count = 0
+		while peek == delimiter_char do
+			delimiter_count += 1
+			advance 1
+		end
+
+		if delimiter_count < delimiter_processor.min_length then
+			index = start_index
+			column = start_column
+			return null
+		end
+
+		var before = "\n"
+		if start_index > 0 then
+			before = input.substring(start_index - 1, 1)
+		end
+
+		var char_after = peek
+		var after = "\n"
+		if char_after != '\0' then
+			after = char_after.to_s
+		end
+
+		var before_is_punctuation = before.has(re_punctuation)
+		var before_is_whitespace = before.has(re_whitespace_char)
+		var after_is_punctuation = after.has(re_punctuation)
+		var after_is_whitespace = after.has(re_whitespace_char)
+
+		var left_flanking = not after_is_whitespace and
+			(not after_is_punctuation or before_is_whitespace or before_is_punctuation)
+		var right_flanking = not before_is_whitespace and
+			(not before_is_punctuation or after_is_whitespace or after_is_punctuation)
+
+		var can_open
+		var can_close
+		if delimiter_char == '_' then
+			can_open = left_flanking and (not right_flanking or before_is_punctuation)
+			can_close = right_flanking and (not left_flanking or after_is_punctuation)
+		else
+			can_open = left_flanking and delimiter_char == delimiter_processor.opening_delimiter
+			can_close = right_flanking and delimiter_char == delimiter_processor.closing_delimiter
+		end
+
+		index = start_index
+		column = start_column
+		return new MdDelimiterData(delimiter_count, can_open, can_close)
+	end
+
+	# Punctuation pattern
+	private var re_punctuation: Regex = "^[]!\"#\\$%&'()*+,.:;<=>?@^_`\{|\}~[-]".to_re
+
+	# Whitespace character start pattern
+	private var re_whitespace_char: Regex = "^[ Â \t\r\n]".to_re
+
+	# Process the stack of delimiters
+	private fun process_delimiters(stack_bottom: nullable MdDelimiter) do
+		var openers_bottom = new HashMap[Char, nullable MdDelimiter]
+
+		# find first closer above stack bottom
+		var closer = last_delimiter
+		while closer != null and closer.prev != stack_bottom do
+			closer = closer.prev
+		end
+		# move forward, looking for closers, and handling each
+		while closer != null do
+			var delimiter_char = closer.delimiter_char
+
+			if not closer.can_close then
+				closer = closer.next
+				continue
+			end
+
+			if not delimiter_processors_map.has_key(delimiter_char) then
+				closer = closer.next
+				continue
+			end
+
+			var delimiter_processor = delimiter_processors_map[delimiter_char]
+			var opening_delimiter_char = delimiter_processor.opening_delimiter
+
+			# Found delimiter closer. Now look back for first matching opener
+			var use_delims = 0
+			var opener_found = false
+			var potential_opener_found = false
+			var opener = closer.prev
+
+			while opener != null and opener != stack_bottom and (not openers_bottom.has_key(delimiter_char) or opener != openers_bottom[delimiter_char]) do
+
+				if opener.can_open and opener.delimiter_char == opening_delimiter_char then
+					potential_opener_found = true
+					use_delims = delimiter_processor.delimiter_use(opener, closer)
+					if use_delims > 0 then
+						opener_found = true
+						break
+					end
+				end
+				opener = opener.prev
+			end
+
+			if not opener_found then
+				if not potential_opener_found then
+					# Set lower bound for future searches for openers.
+					# Only do this when we didn't even have a potential opener
+					# (one that matches the character and can open).
+					# If an opener was rejected because of the number of delimiters
+					# (e.g. because of the "multiple of 3" rule),
+					# we want to consider it next time because the number of delimiter
+					# can change as we continue processing.
+					openers_bottom[delimiter_char] = closer.prev
+					if not closer.can_open then
+						# We can remove a closer that can't be an opener,
+						# once we've seen there's no matching opener.
+						remove_delimiters_keep_node(closer)
+					end
+				end
+				closer = closer.next
+				continue
+			end
+
+			var opener_node = opener.as(not null).node
+			var closer_node = closer.node
+
+			# Remove number of used delimieters from stack and inline nodes
+			opener.as(not null).length -= use_delims
+			closer.length -= use_delims
+			opener_node.literal = opener_node.literal.substring(0,
+				opener_node.literal.length - use_delims)
+			closer_node.literal = closer_node.literal.substring(0,
+				closer_node.literal.length - use_delims)
+
+			remove_delimiters_between(opener, closer)
+			# The delimieter processor can re-parent the nodes between opener and closer,
+			# so make sure they're contiguous already.
+			# Exclusive because we want to keep opener/closer themselves.
+			merge_text_nodes_between_exclusive(opener_node, closer_node)
+			delimiter_processor.process(opener_node, closer_node, use_delims)
+
+			# Node delimieter characters left to process, so we can remove
+			# delimieter and the now empty node
+			if opener.as(not null).length == 0 then
+				remove_delimiters_and_node(opener)
+			end
+
+			if closer.length == 0 then
+				var next = closer.next
+				remove_delimiters_and_node(closer)
+				closer = next
+			end
+		end
+
+		# Remove all delimiters
+		while last_delimiter != null and last_delimiter != stack_bottom do
+			remove_delimiters_keep_node(last_delimiter)
+		end
+	end
+
+	# Remove all delimiters between `opener` and `closer`
+	private fun remove_delimiters_between(opener, closer: nullable MdDelimiter) do
+		if opener == null or closer == null then return
+
+		var delimiter = closer.prev
+		while delimiter != null and delimiter != opener do
+			var previous_delimiter = delimiter.prev
+			remove_delimiters_keep_node(delimiter)
+			delimiter = previous_delimiter
+		end
+	end
+
+	# Remove the delimiter and the corresponding text node
+	#
+	# For used delimiters, e.g. `*` in `*foo*`.
+	private fun remove_delimiters_and_node(delim: nullable MdDelimiter) do
+		if delim == null then return
+
+		var node = delim.node
+		node.unlink
+		remove_delimiter(delim)
+	end
+
+	# Remove the delimiter but keep the corresponding node as text
+	#
+	# For unused delimiters such as `_` in `foo_bar`.
+	private fun remove_delimiters_keep_node(delim: nullable MdDelimiter) do
+		remove_delimiter(delim)
+	end
+
+	# Remove the delimiter `delim`
+	private fun remove_delimiter(delim: nullable MdDelimiter) do
+		if delim == null then return
+
+		var prev = delim.prev
+		if prev != null then
+			prev.next = delim.next
+		end
+		var next = delim.next
+		if next == null then
+			# top of stack
+			last_delimiter = prev
+		else
+			next.prev = prev
+		end
+	end
+
+	# Merge all nodes between `from` and `to` excluding `from` and `to`
+	private fun merge_text_nodes_between_exclusive(from, to: nullable MdNode) do
+		if from == null or to == null then return
+		# no node between them
+		if from == to or from.next == to then return
+		merge_text_nodes_inclusive(from.next, to.prev)
+	end
+
+	# Merge all child nodes of `node` into one
+	private fun merge_child_text_nodes(node: nullable MdNode) do
+		if node == null then return
+		# no children or just one child node, no need for merging
+		if node.first_child == node.last_child then return
+		merge_text_nodes_inclusive(node.first_child, node.last_child)
+	end
+
+	# Merge all nodes between `from` and `to` including `from` and `to`
+	private fun merge_text_nodes_inclusive(from, to: nullable MdNode) do
+		var first = null
+		var last = null
+
+		var node = from
+		while node != null do
+			if node isa MdText then
+				var text = node
+				if first == null then first = text
+				last = text
+			else
+				merge_if_needed(first, last)
+				first = null
+				last = null
+			end
+			if node == to then break
+			node = node.next
+		end
+		merge_if_needed(first, last)
+	end
+
+	# Merge all nodes between `first` and `last`
+	private fun merge_if_needed(first, last: nullable MdText) do
+		if first != null and last != null and first != last then
+			var buffer = new Buffer
+			buffer.append(first.literal)
+			var node = first.next
+			var stop = last.next
+			while node != null and node != stop do
+				buffer.append(node.as(MdText).literal)
+				first.location.line_end = node.location.line_end
+				first.location.column_end = node.location.column_end
+				var unlink = node
+				node = node.next
+				unlink.unlink
+			end
+			var literal = buffer.write_to_string
+			first.literal = literal
+		end
+	end
+end
+
+# Custom delimiter processor for additional delimiters besides `_` and `*`
+interface MdDelimiterProcessor
+
+	# The character that marks the beginning of a delimited node
+	#
+	# Must not clash with anu built-in special characters.
+	fun opening_delimiter: Char is abstract
+
+	# The character that marks the ending of a delimited node
+	#
+	# Must not clash with anu built-in special characters.
+	fun closing_delimiter: Char is abstract
+
+	# Minimum number of delimiters characters that are needed to active this
+	#
+	# Must be at least one.
+	fun min_length: Int is abstract
+
+	# Determine how many (if any) of the delimiter characters should be used
+	#
+	# This allows implementations to decide how many characters to use based on the
+	# properties of the delimiter runs.
+	#
+	# An implementation can also return 0 when it doesn't want to allow this particular
+	# combination of delimiter runs.
+	fun delimiter_use(opener, closer: MdDelimiter): Int is abstract
+
+	# Process the matched delimiters
+	#
+	# For example, by wrapping the nodes between `opener` and `closer` in a new node,
+	# or appending a new node after the opener.
+	#
+	# Note that removal of the delimiter from the delimiter nodes and unlinking
+	# them is done by the caller.
+	fun process(opener, closer: MdText, delimiter_use: Int) is abstract
+end
+
+# A delimiter is one or more of the same delimiter character
+#
+# Used for paired delimiters like emphasis or strong emphasis.
+class MdDelimiter
+
+	# Node containing the delimiter
+	var node: MdText
+
+	# Character used as delimiter
+	var delimiter_char: Char
+
+	# Can `self` open a delimiter?
+	var can_open: Bool
+
+	# Cant `self` close a delimiter?
+	var can_close: Bool
+
+	# Previous delimiter found
+	var prev: nullable MdDelimiter
+
+	# Next delimiter found
+	var next: nullable MdDelimiter
+
+	# The number of characters in this delimiter run that are left for processing
+	var length = 1
+
+	# The number of characters originally in this delimiter run
+	#
+	# At the start of processing, this is the same as `length`.
+	var original_length = 1
+end
+
+# Opening bracket for links and images
+class MdBracket
+
+	# Node containing the bracket
+	var node: MdText
+
+	# Index of the bracket in the original string
+	var index: Int
+
+	# COlumn of the bracket
+	var column: Int
+
+	# Is this bracket opening an image?
+	var is_image: Bool
+
+	# Previous bracket
+	var prev: nullable MdBracket
+
+	# Previous delimiter
+	var prev_delimiter: nullable MdDelimiter
+
+	# Whether this bracket is allowed to form a link/image
+	var allowed = true
+
+	# Whether there is an unescaped bracket (opening or closing) anywhere after this bracket
+	var bracket_after = false
+
+	# Create a new bracket for a link
+	init link(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
+		init(node, index, column, false, prev, prev_delimiter)
+	end
+
+	# Create a new bracket for an image
+	init image(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
+		init(node, index, column, true, prev, prev_delimiter)
+	end
+end
+
+# Data about a delimiter parsing
+private class MdDelimiterData
+
+	# Number of successive delimiters found
+	var count: Int
+
+	# Can this delimiter open an inline construct?
+	var can_open: Bool
+
+	# Can this delimiter close an inline construct?
+	var can_close: Bool
+end
+
+# An implementation of MdDelimiterProcessor that dispatches all calls to others
+#
+# The sub processors called bepends on the length of the delimiter run.
+# All child processors must have different minimum lengths.
+# A given delimiter run is dispatched to the child with the largest acceptable minimum length.
+# If not child is applicable, the one with the largest minimum length is chosen.
+class MdStaggeredDelimiterProcessor
+	super MdDelimiterProcessor
+
+	# Delimiter character
+	var delim: Char
+
+	# Sub processors to apply
+	var processors = new Array[MdDelimiterProcessor]
+
+	redef var min_length = 0
+	redef fun opening_delimiter do return delim
+	redef fun closing_delimiter do return delim
+
+	# Add a new sub delimiter processor
+	fun add(dp: MdDelimiterProcessor) do
+		var len = dp.min_length
+		var i = 0
+		while i < processors.length do
+			var p = processors[i]
+			assert len != p.min_length else
+				print "Cannot add two delimiter processor for `{delim}` " +
+					"and mininimum length `{len}`"
+			end
+			if len > p.min_length then
+				break
+			end
+			i += 1
+		end
+		processors.insert(dp, i)
+	end
+
+	# Find the corresponding processor for a length of `len` delimiter characters
+	fun find_processor(len: Int): MdDelimiterProcessor do
+		for processor in processors do
+			if processor.min_length <= len then return processor
+		end
+		return processors.first
+	end
+
+	redef fun delimiter_use(opener, closer) do
+		return find_processor(opener.length).delimiter_use(opener, closer)
+	end
+
+	redef fun process(opener, closer, delimiter_use) do
+		find_processor(delimiter_use).process(opener, closer, delimiter_use)
+	end
+end
+
+# A processor for emphasis tokens
+class MdEmphasisDelimiterProcessor
+	super MdDelimiterProcessor
+
+	# Delimiter character
+	var delimiter_char: Char
+
+	redef var min_length = 1
+	redef fun opening_delimiter do return delimiter_char
+	redef fun closing_delimiter do return delimiter_char
+
+	redef fun delimiter_use(opener, closer) do
+		# "multiple of 3" rule for internal delimiter runs
+		if (opener.can_close or closer.can_open) and
+		   ((opener.original_length + closer.original_length) % 3 == 0) then
+			return 0
+		end
+		# calculate actual number of delimiters used from this closer
+		if opener.length >= 2 and closer.length >= 2 then
+			return 2
+		end
+		return 1
+	end
+
+	redef fun process(opener, closer, delimiter_use) do
+		var single_delimiter = opening_delimiter.to_s
+		var emphasis: MdNode
+		if delimiter_use == 1 then
+			emphasis = new MdEmphasis(
+				new MdLocation(
+					opener.location.line_start,
+					opener.location.column_start,
+					closer.location.line_end,
+					closer.location.column_end),
+				single_delimiter)
+		else
+			emphasis = new MdStrongEmphasis(
+				new MdLocation(
+					opener.location.line_start,
+					opener.location.column_start + opener.literal.length,
+					closer.location.line_end,
+					closer.location.column_end - closer.literal.length),
+				"{single_delimiter}{single_delimiter}")
+		end
+		var tmp = opener.next
+		while tmp != null and tmp != closer do
+			var next = tmp.next
+			emphasis.append_child(tmp)
+			tmp = next
+		end
+		opener.insert_after(emphasis)
+	end
+end
+
+# Asterisk delimiters processor
+class MdAsteriskDelimiterProcessor
+	super MdEmphasisDelimiterProcessor
+	noautoinit
+
+	redef var delimiter_char = '*'
+end
+
+# Underscore delimters processor
+class MdUnderscoreDelimiterProcessor
+	super MdEmphasisDelimiterProcessor
+	noautoinit
+
+	redef var delimiter_char = '_'
+end
+
+# Utils
+
+redef class String
+
+	# Remove escape backslash from string
+	fun unescape_string: String do
+		if not has(re_escaped) then return self
+
+		var buffer = new Buffer
+		var match = search(re_escaped)
+		var last_end = 0
+		while match != null do
+			buffer.append substring(last_end, match.from - last_end)
+			buffer.append substring(match.from + 1, 1)
+			last_end = match.after
+			match = search_from(re_escaped, last_end)
+		end
+		if last_end < length then
+			buffer.append substring(last_end, length - last_end)
+		end
+		return buffer.to_s
+	end
+
+	# Normalize link reference names
+	private fun normalize_reference: String do
+		var stripped = self.substring(1, length - 2).trim
+		var lowercase = stripped.to_lower # TODO utf-8
+		return lowercase.replace(re_whitespace, " ")
+	end
+end
+
+redef class Sys
+	private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"
+	private var re_escaped: Regex = "\\\\{p_escapable}".to_re
+	private var re_whitespace: Regex = "\\s+".to_re
+end