Markdown parser

# Markdown parser
#
# Used to create the AST representation of a Markdown document.
class MdParser

	# Inline parser used to parse block content
	private var inline_parser = new MdInlineParser is lazy

	# Block parsers factories
	private var block_parser_factories: Collection[MdBlockParserFactory] do
		var factories = new Array[MdBlockParserFactory]
		factories.add new MdBlockQuoteParserFactory
		factories.add new MdHeadingParserFactory
		factories.add new MdFencedCodeBlockParserFactory
		factories.add new MdHtmlBlockParserFactory
		factories.add new MdThematicBreakParserFactory
		factories.add new MdListBlockParserFactory
		factories.add new MdIndentedCodeBlockParserFactory
		return factories
	end

	# Active block parsers
	#
	# Used as a stack to parse nested blocks.
	private var active_block_parsers = new Array[MdBlockParser]

	# All active block parsers
	private var all_block_parsers = new HashSet[MdBlockParser]

	# Return the active block parser
	#
	# The last entry in the `active_block_parsers` stack.
	private fun active_block_parser: MdBlockParser do
		return active_block_parsers.last
	end

	# Activate a `block_parser`
	#
	# Add the `block_parser` on the top of the `active_block_parsers` stack.
	# Also register it in `all_block_parsers`.
	private fun activate_block_parser(block_parser: MdBlockParser) do
		active_block_parsers.add block_parser
		all_block_parsers.add block_parser
	end

	# Deactivate the `active_block_parser`
	private fun deactivate_block_parser do
		active_block_parsers.pop
	end

	# Deactivate and remove the `active_block_parser` from the `all_block_parsers` list
	private fun remove_active_block_parser do
		var old = active_block_parser
		deactivate_block_parser
		all_block_parsers.remove(old)
		old.block.unlink
	end

	# Post-processors applied after the parsing of a document
	var post_processors = new Array[MdPostProcessor] is writable

	# Currently parsed line
	private var line_string: String is noinit

	# Current index (offset) in input `line_string` (starts at 0)
	private var index = 0

	# Current column in input `line_string` (starts at 0)
	#
	# Tab causes column to go to next 4-space tab stop.
	private var column = 0

	# Is the current column within a tab character (partially consumed tab)
	private var column_is_in_tab: Bool is noinit

	# Current line in input string (starts at 1)
	private var line = 1

	# Index of the next non-space character starting from `index`
	private var next_non_space_index = 0

	# Next non-space column
	private var next_non_space_column = 0

	# Current indent in columns
	#
	# Either by spaces or tab stop of 4, starting from `column`.
	private var indent = 0

	# Is the current `line` blank starting from `index`?
	private var is_blank: Bool is noinit

	# Does a node end with a blank line?
	private var last_line_blank = new HashMap[MdNode, Bool]

	# Initialize parser state
	private fun initialize do
		active_block_parsers.clear
		all_block_parsers.clear
		index = 0
		column = 0
		column_is_in_tab = false
		line = 1
		next_non_space_index = 0
		next_non_space_column = 0
		indent = 0
		is_blank = false
		last_line_blank.clear
	end

	# Parse the `input` string as a MdDocument
	fun parse(input: String): MdDocument do
		initialize

		var document_block_parser = new MdDocumentBlockParser(1, 1, 0)
		activate_block_parser(document_block_parser)
		var line_start = 0
		var line_break = find_line_break(input, line_start)
		while line_break != -1 do
			var line_string = input.substring(line_start, line_break - line_start)
			incorporate_line(line_string)
			if line_break + 1 < input.length and
			   input.chars[line_break] == '\r' and
			   input.chars[line_break + 1] == '\n' then
				line_start = line_break + 2
			else
				line_start = line_break + 1
			end
			line_break = find_line_break(input, line_start)
			line += 1
			column = 0
		end

		# Finalize pending line
		if input.length > 0 and (line_start == 0 or line_start < input.length) then
			incorporate_line(input.substring(line_start, input.length - line_start))
		end
		finalize_blocks(active_block_parsers)

		# Walk through a block and its chiildren revursively
		# Parsing string content into inline content where appropriate.
		var all_block_parsers = all_block_parsers.to_a
		var i = all_block_parsers.length - 1
		while i >= 0 do
			var block_parser = all_block_parsers[i]
			block_parser.parse_inlines(inline_parser)
			i -= 1
		end
		var document = document_block_parser.block
		return document
	end

	# Post-process the `document`
	fun post_process(document: MdDocument) do
		for processor in post_processors do
			processor.post_process(self, document)
		end
	end

	# Analyze a line of text and update the document
	#
	# We parse Markdown text by calling this on each line of `input`.
	private fun incorporate_line(input: String) do
		line_string = input
		index = 0
		column = 0
		column_is_in_tab = false

		# For each containing block, try to parse the associated line start.
		var matches = 1
		for i in [1 .. active_block_parsers.length[ do
			var block_parser = active_block_parsers[i]
			find_next_non_space

			var result = block_parser.try_continue(self)
			if result isa MdBlockContinue then
				if result.is_finalize then
					block_parser.finalize(self)
					return
				else
					if result.new_index != -1 then
						set_new_index result.new_index
					else if result.new_column != -1 then
						set_new_column result.new_column
					end
				end
				matches += 1
			else
				break
			end
		end

		var unmatched_block_parsers = active_block_parsers.subarray(
			matches, active_block_parsers.length - matches)
		var last_matched_block_parser = active_block_parsers[matches - 1]
		var block_parser = last_matched_block_parser
		var all_closed = unmatched_block_parsers.is_empty

		# Unless last matched container is a code block, try new container starts,
		# adding children to the last matched container.
		var try_block_starts = block_parser.block isa MdParagraph or
			block_parser.block.is_container

		while try_block_starts do
			find_next_non_space

			# Optimize lookup
			if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
				set_new_index next_non_space_index
				break
			end

			var block_start = find_block_start(block_parser)
			if block_start == null then
				set_new_index next_non_space_index
				break
			end

			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
				all_closed = true
			end

			if block_start.new_index != -1 then
				set_new_index block_start.new_index
			else if block_start.new_column != -1 then
				set_new_column block_start.new_column
			end

			if block_start.replace_active_block_parser then
				remove_active_block_parser
			end

			for new_block_parser in block_start.block_parsers do
				add_child(new_block_parser)
				block_parser = new_block_parser
				try_block_starts = new_block_parser.block.is_container
			end
		end

		# What remains at the offset is a text line.
		# Add the text to the appropriate block.

		# First check for a lazy paragraph continuation
		if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
			add_line
		else
			# Finalize any blocks not matched
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
			end
			propagate_last_line_blank(block_parser, last_matched_block_parser)

			if not block_parser.block.is_container then
				add_line
			else if not is_blank then
				# Create a paragraph container for the line
				add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
				add_line
			end
		end
	end

	# Find what kind of block starts at `index` in `input`
	private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
		for block_parser_factory in block_parser_factories do
			var result = block_parser_factory.try_start(self, block_parser)
			if result != null then return result
		end
		return null
	end

	# Add a `block_parser` block's as child of the active block parser block
	private fun add_child(block_parser: MdBlockParser) do
		# Finalize non-parentable blocks
		while not active_block_parser.block.can_contain(block_parser.block) do
			active_block_parser.finalize(self)
		end
		# Append block block parser block to its parent
		active_block_parser.block.append_child(block_parser.block)
		activate_block_parser(block_parser)
	end

	# Add line content to the active block parser
	#
	# We assume it can accept lines.
	private fun add_line do
		var content = null
		if column_is_in_tab then
			# Out column is in a partially consumed tab.
			# Expand the remaining columns to the next tab stop to spaces.
			var after_tab = index + 1
			var rest = line_string.substring(after_tab, line_string.length - after_tab)
			var spaces = column.columns_to_next_tab_stop
			var buffer = new Buffer
			for i in [0 .. spaces[ do
				buffer.add ' '
			end
			buffer.append(rest)
			content = buffer.write_to_string
		else
			content = line_string.substring(index, line_string.length - index)
		end
		active_block_parser.add_line(content)
	end

	# Finalize blocks of previous line
	private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
		var i = block_parsers.length - 1
		while i >= 0 do
			var block_parser = block_parsers[i]
			block_parser.finalize(self)
			i -= 1
		end
	end

	# Advance the `index` position to the next character
	#
	# Also set the `column`.
	# If the next character is a tab, compute the new column accordingly.
	private fun advance do
		var c = line_string.chars[index]
		if c == '\t' then
			index += 1
			column += column.columns_to_next_tab_stop
		else
			index += 1
			column += 1
		end
	end

	# Move `index` to the next non-space character index in the `input` string
	#
	# Also set `next_non_space_index`, `next_non_space_column`, `is_blank` and `indent`.
	private fun find_next_non_space do
		var i = index
		var cols = column

		is_blank = true
		while i < line_string.length do
			var c = line_string.chars[i]
			if c == ' ' then
				i += 1
				cols += 1
				continue
			else if c == '\t' then
				i += 1
				cols += 4 - (cols % 4)
				continue
			end
			is_blank = false
			break
		end

		next_non_space_index = i
		next_non_space_column = cols
		indent = next_non_space_column - column
	end

	# Return the position of the next line break
	#
	# We consider `\r` and `\n`.
	private fun find_line_break(input: String, start_index: Int): Int do
		for i in [start_index .. input.length[ do
			var char = input.chars[i]
			if char == '\r' or char == '\n' then return i
		end
		return -1
	end

	# Set the parser `index` at `new_index`
	#
	# Also set `column` and `column_is_in_tab`.
	private fun set_new_index(new_index: Int) do
		if new_index >= next_non_space_index then
			# We can start from here, no need to calculate tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		while index < new_index and index != line_string.length do
			advance
		end
		# If we're going to an index as opposed to a column, we're never within a tab
		column_is_in_tab = false
	end

	# Set the parser `column` at `new_column`
	#
	# Also set `index` and `column_is_in_tab`.
	private fun set_new_column(new_column: Int) do
		if new_column >= next_non_space_column then
			# We can start from here, no need to calculate tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		while column < new_column and index != line_string.length do
			advance
		end
		if column > new_column then
			# Last character was a tab and we overshot our target
			index -= 1
			column = new_column
			column_is_in_tab = true
		else
			column_is_in_tab = false
		end
	end

	# Does `block` end with a blank line?
	private fun ends_with_blank_line(block: nullable MdNode): Bool do
		while block != null do
			if is_last_line_blank(block) then return true
			if block isa MdListBlock or block isa MdListItem then
				block = block.last_child
			else
				break
			end
		end
		return false
	end

	# Propagate a blank line to all block_parser blocl's parents
	private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
		var last_child = block_parser.block.last_child
		if is_blank and last_child != null then
			last_line_blank[last_child] = true
		end
		var block = block_parser.block

		# Block quotes lines are never blank as they start with `>`.
		# We don't count blanks in fenced code for purposes of thight/loose lists.
		# We also don't set `last_line_blank` on an empty list item.
		var last_line_blank = is_blank and
			not (block isa MdBlockQuote or
			     block isa MdFencedCodeBlock or
				 (block isa MdListItem and block.first_child == null and
										  block_parser != last_matched_block_parser))

		# Propagate `last_line_blank` up through parents
		var node: nullable MdNode = block_parser.block
		while node != null do
			self.last_line_blank[node] = last_line_blank
			node = node.parent
		end
	end

	# Is last line blank for `node`?
	private fun is_last_line_blank(node: MdNode): Bool do
		if not last_line_blank.has_key(node) then return false
		return last_line_blank[node]
	end
end

lib/markdown2/markdown_block_parsing.nit:23,1--474,3

markdown2 :: markdown_github $ MdParser

redef class MdParser

	# Enable Github mode
	var github_mode = false is writable

	redef var inline_parser is lazy do
		var parser = super
		parser.github_mode = github_mode
		return parser
	end
end

lib/markdown2/markdown_github.nit:28,1--38,3

markdown2 :: markdown_wikilinks $ MdParser

redef class MdParser

	# Enable wikilinks mode
	var wikilinks_mode = false is writable

	redef var inline_parser is lazy do
		var parser = super
		parser.wikilinks_mode = wikilinks_mode
		return parser
	end
end

lib/markdown2/markdown_wikilinks.nit:23,1--33,3

class MdParser

Summary

Markdown parser

Introduced properties

defaultinit

github_mode

github_mode=

parse

post_process

post_processors

post_processors=

wikilinks_mode

wikilinks_mode=

Redefined properties

SELF

Markdown parser

Introduced properties

init defaultinit

fun github_mode: Bool

fun github_mode=(github_mode: Bool)

fun parse(input: String): MdDocument

fun post_process(document: MdDocument)

fun post_processors: Array[MdPostProcessor]

fun post_processors=(post_processors: Array[MdPostProcessor])

fun wikilinks_mode: Bool

fun wikilinks_mode=(wikilinks_mode: Bool)

Redefined properties

redef type SELF: MdParser

Summary

All properties

!=

==

CLASS

SELF

class_factory

class_name

defaultinit

defaultinit

get_class

github_mode

github_mode=

hash

init

inspect

inspect_head

is_same_instance

is_same_serialized

is_same_type

object_id

output

output_class_name

parse

post_process

post_processors

post_processors=

serialization_hash

sys

to_jvalue

to_s

wikilinks_mode

wikilinks_mode=

All properties

fun !=(other: nullable Object): Bool

fun ==(other: nullable Object): Bool

type CLASS: Class[SELF]

type SELF: Object

protected fun class_factory(name: String): CLASS

fun class_name: String

init defaultinit

init defaultinit

fun get_class: CLASS

fun github_mode: Bool

fun github_mode=(github_mode: Bool)

fun hash: Int

init init

fun inspect: String

protected fun inspect_head: String

intern fun is_same_instance(other: nullable Object): Bool

fun is_same_serialized(other: nullable Object): Bool

intern fun is_same_type(other: Object): Bool