Parse a markdown string and split it in blocks.

# Parse a markdown string and split it in blocks.
#
# Blocks are then emitted through the `decorator`.
#
# Usage:
#
#    var proc = new MarkdownProcessor
#    var html = proc.process("**Hello World!**")
#    assert html == "<p><strong>Hello World!</strong></p>\n"
#
# SEE: `String::md_to_html` for a shortcut.
class MarkdownProcessor

	# Work in extended mode (default).
	#
	# Behavior changes when using extended mode:
	#
	# * Lists and code blocks end a paragraph
	#
	#   In normal markdown the following:
	#
	# ~~~md
	# This is a paragraph
	# * and this is not a list
	# ~~~
	#
	#   Will produce:
	#
	# ~~~html
	# <p>This is a paragraph
	# * and this is not a list</p>
	# ~~~
	#
	#   When using extended mode this changes to:
	#
	# ~~~html
	# <p>This is a paragraph</p>
	# <ul>
	# <li>and this is not a list</li>
	# </ul>
	# ~~~
	#
	# * Fenced code blocks
	#
	#   If you don't want to indent all your code with 4 spaces,
	#   you can wrap your code in `` ``` `` or `~~~`.
	#
	#   Here's an example:
	#
	# ~~~md
	# fun test do
	#    print "Hello World!"
	# end
	# ~~~
	#
	# * Code blocks meta
	#
	#   If you want to use syntax highlighting tools, most of them need to know what kind
	#   of language they are highlighting.
	#   You can add an optional language identifier after the fence declaration to output
	#   it in the HTML render.
	#
	# ```nit
	# import markdown
	#
	# print "# Hello World!".md_to_html
	# ```
	#
	#   Becomes
	#
	# ~~~html
	# <pre class="nit"><code>import markdown
	#
	# print "# Hello World!".md_to_html
	# </code></pre>
	# ~~~
	#
	# * Underscores (Emphasis)
	#
	#   Underscores in the middle of a word like:
	#
	# ~~~md
	# Con_cat_this
	# ~~~
	#
	#   normally produces this:
	#
	# ~~~html
	# <p>Con<em>cat</em>this</p>
	# ~~~
	#
	#   With extended mode they don't result in emphasis.
	#
	# ~~~html
	# <p>Con_cat_this</p>
	# ~~~
	#
	# * Strikethrough
	#
	#   Like in [GFM](https://help.github.com/articles/github-flavored-markdown),
	#   a strikethrough span is marked with `~~`.
	#
	# ~~~md
	# ~~Mistaken text.~~
	# ~~~
	#
	#   becomes
	#
	# ~~~html
	# <del>Mistaken text.</del>
	# ~~~
	var ext_mode = true

	# Disable attaching MDLocation to Tokens
	#
	# Locations are useful for some tools but they may
	# cause a significant time and space overhead.
	#
	# Default = `false`
	var no_location = false is writable

	# Process the markdown `input` string and return the processed output.
	#
	# The processor state left by a previous run (`link_refs`, the pending
	# link ref, `current_line` and `current_block`) is reset first, so the same
	# instance can be reused for several inputs.
	fun process(input: String): Writable do
		# init processor
		link_refs.clear
		last_link_ref = null
		current_line = null
		current_block = null
		# parse markdown
		var parent = read_lines(input)
		parent.remove_surrounding_empty_lines
		recurse(parent, false)
		# output processed text
		decorator.headlines.clear
		return emit(parent.kind)
	end

	# Split `input` string into `MDLines` and create a parent `MDBlock` with it.
	#
	# While reading, `'\r'` characters are dropped and each tab is expanded
	# with spaces up to the next multiple of 4 columns.
	# Lines accepted by `check_link_ref` are not added to the returned block.
	private fun read_lines(input: String): MDBlock do
		var block = new MDBlock(new MDLocation(1, 1, 1, 1))
		var value = new FlatBuffer
		var i = 0

		# 1-based number of the line being read
		var line_pos = 0
		# Number of raw characters consumed on the current line
		# (the line terminator and dropped `'\r'` are counted too)
		var col_pos = 0

		while i < input.length do
			value.clear
			# Column reached in `value`, used to align tabs
			var pos = 0
			var eol = false
			while not eol and i < input.length do
				col_pos += 1
				var c = input[i]
				if c == '\n' then
					eol = true
				else if c == '\r' then
					# carriage returns are ignored
				else if c == '\t' then
					# next tab stop: smallest multiple of 4 greater than `pos`
					var np = pos + (4 - (pos & 3))
					while pos < np do
						value.add ' '
						pos += 1
					end
				else
					pos += 1
					value.add c
				end
				i += 1
			end
			line_pos += 1

			var loc = new MDLocation(line_pos, 1, line_pos, col_pos)
			var line = new MDLine(loc, value.write_to_string)
			var is_link_ref = check_link_ref(line)
			# Skip link refs
			if not is_link_ref then block.add_line line
			col_pos = 0
		end
		return block
	end

	# Check if line is a block link definition.
	#
	# Return `true` if line contains a valid link ref and save it into `link_refs`.
	#
	# Also return `true` when `line` is the title of the link ref defined on
	# the line right before it (see `last_link_ref`): the title is then attached
	# to that link ref.
	#
	# In both cases the caller is expected to drop the line from the output.
	private fun check_link_ref(line: MDLine): Bool do
		var md = line.value
		var is_link_ref = false
		var id = new FlatBuffer
		var link = new FlatBuffer
		var comment = new FlatBuffer
		var pos = -1
		if not line.is_empty and line.leading < 4 and line.value[line.leading] == '[' then
			pos = line.leading + 1
			pos = md.read_until(id, pos, ']')
			if not id.is_empty and pos >= 0 and pos + 2 < line.value.length then
				if line.value[pos + 1] == ':' then
					pos += 2
					pos = md.skip_spaces(pos)
					if pos >= 0 and line.value[pos] == '<' then
						pos += 1
						pos = md.read_until(link, pos, '>')
						# Step over the closing `>` only if one was found.
						# Incrementing a negative (not found) position would turn
						# it into 0 and restart the scan at the line beginning.
						if pos >= 0 then pos += 1
					else if pos >= 0 then
						pos = md.read_until(link, pos, ' ', '\n')
					end
					if not link.is_empty then
						pos = md.skip_spaces(pos)
						if pos > 0 and pos < line.value.length then
							# Something follows the link: it must be a closed title
							var c = line.value[pos]
							if c == '\"' or c == '\'' or c == '(' then
								pos += 1
								if c == '(' then
									pos = md.read_until(comment, pos, ')')
								else
									pos = md.read_until(comment, pos, c)
								end
								if pos > 0 then is_link_ref = true
							end
						else
							is_link_ref = true
						end
					end
				end
			end
		end
		if is_link_ref and not id.is_empty and not link.is_empty then
			var lr = new LinkRef.with_title(link.write_to_string, comment.write_to_string)
			add_link_ref(id.write_to_string, lr)
			# Only a definition without title can be completed by the next line.
			if comment.is_empty then
				last_link_ref = lr
			else
				last_link_ref = null
			end
			return true
		else
			comment = new FlatBuffer
			# A title is only accepted on the line right after the definition.
			# Whatever this line is, the pending definition is consumed now so
			# later unrelated lines starting with a quote or a parenthesis are
			# not swallowed as titles.
			var pending = last_link_ref
			last_link_ref = null
			if not line.is_empty and pending != null then
				pos = line.leading
				var c = line.value[pos]
				if c == '\"' or c == '\'' or c == '(' then
					pos += 1
					if c == '(' then
						pos = md.read_until(comment, pos, ')')
					else
						pos = md.read_until(comment, pos, c)
					end
				end
				if not comment.is_empty then
					pending.title = comment.write_to_string
				end
			end
			if comment.is_empty then return false
			return true
		end
	end

	# Known link refs
	# This list will be needed during output to expand links.
	var link_refs: Map[String, LinkRef] = new HashMap[String, LinkRef]

	# Last encountered link ref without title (for multiline definitions)
	#
	# Markdown allows link refs to be defined over two lines:
	#
	# ~~~md
	# [id]: http://example.com/longish/path/to/resource/here
	#	"Optional Title Here"
	# ~~~
	#
	# Set by `check_link_ref` and only valid for the line that directly
	# follows the definition.
	private var last_link_ref: nullable LinkRef = null

	# Add a link ref to the list
	#
	# The `key` is stored in lower case.
	fun add_link_ref(key: String, ref: LinkRef) do link_refs[key.to_lower] = ref

	# Recursively split a `block`.
	#
	# The block is split according to the type of lines it contains.
	# Some blocks can be split again recursively like lists.
	# The `in_list` mode is used to recurse on list and build
	# nested paragraphs or code blocks.
	#
	# The previous `in_list` mode and `current_block` are restored before returning.
	fun recurse(root: MDBlock, in_list: Bool) do
		var old_mode = self.in_list
		var old_root = self.current_block
		self.in_list = in_list

		# Skip the leading empty lines
		var line = root.first_line
		while line != null and line.is_empty do
			line = line.next
			if line == null then
				# Only empty lines: nothing to split.
				# Restore the mode so it does not leak into the caller.
				self.in_list = old_mode
				return
			end
		end

		current_line = line
		current_block = root
		# Each line kind consumes one or more lines and is expected to
		# move `current_line` forward.
		while current_line != null do
			line_kind(current_line.as(not null)).process(self)
		end
		self.in_list = old_mode
		self.current_block = old_root
	end

	# Currently processed line.
	# Used when visiting blocks with `recurse`.
	var current_line: nullable MDLine = null is writable

	# Currently processed block.
	# Used when visiting blocks with `recurse`.
	var current_block: nullable MDBlock = null is writable

	# Is the current recursion in list mode?
	# Used when visiting blocks with `recurse`
	private var in_list = false

	# The kind of the line `md`.
	#
	# Checks are done in this order and the first match wins:
	# empty line, indented code (more than 3 leading spaces), `#` headline,
	# `>` blockquote, fence (only in `ext_mode`), horizontal rule,
	# unordered list, ordered list, XML/HTML, setext headlines
	# (decided by looking at the next line), and finally `LineOther`.
	fun line_kind(md: MDLine): Line do
		var value = md.value
		var leading = md.leading
		var trailing = md.trailing
		if md.is_empty then return new LineEmpty
		if leading > 3 then return new LineCode
		if value[leading] == '#' then return new LineHeadline
		if value[leading] == '>' then return new LineBlockquote

		if ext_mode then
			# A fence needs at least 3 significant characters
			if value.length - leading - trailing > 2 then
				if value[leading] == '`' and md.count_chars_start('`') >= 3 then
					return new LineFence
				end
				if value[leading] == '~' and md.count_chars_start('~') >= 3 then
					return new LineFence
				end
			end
		end

		if value.length - leading - trailing > 2 and
			(value[leading] == '*' or value[leading] == '-' or value[leading] == '_') then
			if md.count_chars(value[leading]) >= 3 then
				return new LineHR
			end
		end

		# Bullet followed by a space
		if value.length - leading >= 2 and value[leading + 1] == ' ' then
			var c = value[leading]
			if c == '*' or c == '-' or c == '+' then return new LineUList
		end

		# Digits followed by ". "
		if value.length - leading >= 3 and value[leading].is_digit then
			var i = leading + 1
			while i < value.length and value[i].is_digit do i += 1
			if i + 1 < value.length and value[i] == '.' and value[i + 1] == ' ' then
				return new LineOList
			end
		end

		if value[leading] == '<' and md.check_html then return new LineXML

		# Setext headlines are underlined by the next line
		var next = md.next
		if next != null and not next.is_empty then
			if next.count_chars('=') > 0 then
				return new LineHeadline1
			end
			if next.count_chars('-') > 0 then
				return new LineHeadline2
			end
		end
		return new LineOther
	end

	# Get the token kind at `pos` in `text`.
	#
	# The decision is based on the character at `pos`, the one before it and
	# the two after it. Characters outside of `text` are considered as spaces.
	#
	# Unless `no_location` is set, the token gets a one-character location
	# computed from the current location and `pos`.
	fun token_at(text: Text, pos: Int): Token do
		# `c0` is the previous char, `c1` and `c2` the two next ones
		var c0: Char
		var c1: Char
		var c2: Char

		if pos > 0 then
			c0 = text[pos - 1]
		else
			c0 = ' '
		end
		var c = text[pos]

		if pos + 1 < text.length then
			c1 = text[pos + 1]
		else
			c1 = ' '
		end
		if pos + 2 < text.length then
			c2 = text[pos + 2]
		else
			c2 = ' '
		end

		var loc
		if no_location then
			loc = null
		else
			loc = new MDLocation(
				current_loc.line_start,
				current_loc.column_start + pos,
				current_loc.line_start,
				current_loc.column_start + pos)
		end

		if c == '*' then
			if c1 == '*' then
				if c0 != ' ' or c2 != ' ' then
					return new TokenStrongStar(loc, pos, c)
				else
					return new TokenEmStar(loc, pos, c)
				end
			end
			if c0 != ' ' or c1 != ' ' then
				return new TokenEmStar(loc, pos, c)
			else
				return new TokenNone(loc, pos, c)
			end
		else if c == '_' then
			if c1 == '_' then
				if c0 != ' ' or c2 != ' ' then
					return new TokenStrongUnderscore(loc, pos, c)
				else
					return new TokenEmUnderscore(loc, pos, c)
				end
			end
			if ext_mode then
				# An underscore between two alphanumeric chars is not an emphasis
				if (c0.is_letter or c0.is_digit) and c0 != '_' and
				   (c1.is_letter or c1.is_digit) then
					return new TokenNone(loc, pos, c)
				else
					return new TokenEmUnderscore(loc, pos, c)
				end
			end
			if c0 != ' ' or c1 != ' ' then
				return new TokenEmUnderscore(loc, pos, c)
			else
				return new TokenNone(loc, pos, c)
			end
		else if c == '!' then
			if c1 == '[' then return new TokenImage(loc, pos, c)
			return new TokenNone(loc, pos, c)
		else if c == '[' then
			return new TokenLink(loc, pos, c)
		else if c == ']' then
			return new TokenNone(loc, pos, c)
		else if c == '`' then
			if c1 == '`' then
				return new TokenCodeDouble(loc, pos, c)
			else
				return new TokenCodeSingle(loc, pos, c)
			end
		else if c == '\\' then
			# A backslash is an escape only before one of these characters
			if c1 == '\\' or c1 == '[' or c1 == ']' or c1 == '(' or c1 == ')' or c1 == '{' or c1 == '}' or c1 == '#' or c1 == '"' or c1 == '\'' or c1 == '.' or c1 == '<' or c1 == '>' or c1 == '*' or c1 == '+' or c1 == '-' or c1 == '_' or c1 == '!' or c1 == '`' or c1 == '~' or c1 == '^' then
				return new TokenEscape(loc, pos, c)
			else
				return new TokenNone(loc, pos, c)
			end
		else if c == '<' then
			return new TokenHTML(loc, pos, c)
		else if c == '&' then
			return new TokenEntity(loc, pos, c)
		else
			if ext_mode then
				if c == '~' and c1 == '~' then
					return new TokenStrike(loc, pos, c)
				end
			end
			return new TokenNone(loc, pos, c)
		end
	end

	# Find the position of a `token` in `text`.
	#
	# The search starts at `start` and returns the first position where
	# `token_at` gives a token of the same type as `token`.
	# Return `-1` if there is none.
	fun find_token(text: Text, start: Int, token: Token): Int do
		var pos = start
		while pos < text.length do
			if token_at(text, pos).is_same_type(token) then
				return pos
			end
			pos += 1
		end
		return -1
	end

	# Kind of decorator used for decoration.
	type DECORATOR: Decorator

	# Decorator used for output.
	# Default is `HTMLDecorator`
	var decorator: DECORATOR is writable, lazy do
		return new HTMLDecorator
	end

	# Create a new `MarkdownProcessor` using a custom `decorator`.
	init with_decorator(decorator: DECORATOR) do
		self.decorator = decorator
	end

	# Output `block` in a new buffer and return this buffer.
	#
	# The buffer is pushed on the buffer stack while `block` is emitted,
	# then popped.
	fun emit(block: Block): Text do
		var buffer = push_buffer
		block.emit(self)
		pop_buffer
		return buffer
	end

	# Output the content of `block`.
	fun emit_in(block: Block) do block.emit_in(self)

	# Transform and emit markdown text
	fun emit_text(text: Text) do emit_text_until(text, 0, null)

	# Transform and emit markdown text starting at `start` and
	# until a token with the same type as `token` is found.
	# Go until the end of `text` if `token` is null.
	#
	# Return the position of the matching token, or `-1` if the end of `text`
	# was reached without finding it.
	#
	# A strong token (`**` or `__`) also stops the search of the corresponding
	# emphasis token.
	#
	# Note: the previous `current_text` and `current_pos` are restored only
	# when `-1` is returned. When a matching token is found, `current_pos` is
	# left on it.
	fun emit_text_until(text: Text, start: Int, token: nullable Token): Int do
		var old_text = current_text
		var old_pos = current_pos
		current_text = text
		current_pos = start
		while current_pos < text.length do
			if text[current_pos] == '\n' then
				# Track the line change in the current location
				current_loc.line_start += 1
				current_loc.column_start = -current_pos
			end
			var mt = token_at(text, current_pos)
			if (token != null and not token isa TokenNone) and
			(mt.is_same_type(token) or
			(token isa TokenEmStar and mt isa TokenStrongStar) or
			(token isa TokenEmUnderscore and mt isa TokenStrongUnderscore)) then
				return current_pos
			end
			mt.emit(self)
			current_pos += 1
		end
		current_text = old_text
		current_pos = old_pos
		return -1
	end

	# Currently processed position in `current_text`.
	# Used when visiting inline production with `emit_text_until`.
	private var current_pos: Int = -1

	# Currently processed text.
	# Used when visiting inline production with `emit_text_until`.
	private var current_text: nullable Text = null

	# Stacked buffers.
	private var buffer_stack = new List[FlatBuffer]

	# Push a new buffer on the stack.
	private fun push_buffer: FlatBuffer do
		var buffer = new FlatBuffer
		buffer_stack.add buffer
		return buffer
	end

	# Pop the last buffer.
	private fun pop_buffer do buffer_stack.pop

	# Current output buffer.
	#
	# The buffer stack must not be empty.
	private fun current_buffer: FlatBuffer do
		assert not buffer_stack.is_empty
		return buffer_stack.last
	end

	# Stacked locations.
	private var loc_stack = new List[MDLocation]

	# Push a new MDLocation on the stack.
	private fun push_loc(location: MDLocation) do loc_stack.add location

	# Pop the last location.
	private fun pop_loc: MDLocation do return loc_stack.pop

	# Current location.
	#
	# The location stack must not be empty.
	private fun current_loc: MDLocation do
		assert not loc_stack.is_empty
		return loc_stack.last
	end

	# Append `e` to current buffer.
	fun add(e: Writable) do
		if e isa Text then
			current_buffer.append e
		else
			current_buffer.append e.write_to_string
		end
	end

	# Append `c` to current buffer.
	fun addc(c: Char) do
		current_buffer.add c
	end

	# Append a "\n" line break.
	fun addn do addc '\n'
end

lib/markdown/markdown.nit:20,1--611,3

markdown :: wikilinks $ MarkdownProcessor

# `MarkdownProcessor` is now able to parse wikilinks.
redef class MarkdownProcessor

	# Refine the inherited token detection to recognize wikilinks.
	#
	# When the inherited implementation finds a `TokenLink` at `pos` and the
	# next character of `text` is a `[`, a `TokenWikiLink` is returned instead.
	# It reuses the location of the link token.
	# Any other token is returned untouched.
	redef fun token_at(text, pos) do
		var base = super
		if base isa TokenLink then
			var following = pos + 1
			if following < text.length then
				var opener = text[following]
				if opener == '[' then return new TokenWikiLink(base.location, pos, opener)
			end
		end
		return base
	end
end

lib/markdown/wikilinks.nit:27,1--39,3

class MarkdownProcessor

Summary

Parse a markdown string and split it in blocks.

Introduced properties

DECORATOR

add

add_link_ref

addc

addn

current_block

current_block=

current_line

current_line=

decorator

decorator=

defaultinit

emit

emit_in

emit_text

emit_text_until

ext_mode

ext_mode=

find_token

line_kind

link_refs

link_refs=

no_location

no_location=

process

recurse

token_at

with_decorator

Redefined properties

SELF

token_at

Parse a markdown string and split it in blocks.

Introduced properties

type DECORATOR: Decorator

fun add(e: Writable)

fun add_link_ref(key: String, ref: LinkRef)

fun addc(c: Char)

fun addn

fun current_block: nullable MDBlock

fun current_block=(current_block: nullable MDBlock)

fun current_line: nullable MDLine

fun current_line=(current_line: nullable MDLine)

fun decorator: DECORATOR

fun decorator=(decorator: DECORATOR)

init defaultinit

fun emit(block: Block): Text

fun emit_in(block: Block)

fun emit_text(text: Text)

fun emit_text_until(text: Text, start: Int, token: nullable Token): Int

fun ext_mode: Bool

protected fun ext_mode=(ext_mode: Bool)

fun find_token(text: Text, start: Int, token: Token): Int

fun line_kind(md: MDLine): Line

fun link_refs: Map[String, LinkRef]

protected fun link_refs=(link_refs: Map[String, LinkRef])

fun no_location: Bool

fun no_location=(no_location: Bool)

fun process(input: String): Writable

fun recurse(root: MDBlock, in_list: Bool)

fun token_at(text: Text, pos: Int): Token

init with_decorator(decorator: DECORATOR)

Redefined properties

redef type SELF: MarkdownProcessor

redef fun token_at(text: Text, pos: Int): Token

Summary

All properties

!=

==

CLASS

DECORATOR

SELF

add

add_link_ref

addc

addn

class_factory