Markdown blocks parsing

# Markdown blocks parsing
#
# Introduces the parsers for the different Markdown blocks such as headings,
# lists, code blocks, etc.
module markdown_block_parsing

import markdown_inline_parsing

# Markdown parser
#
# Used to create the AST representation of a Markdown document.
class MdParser

	# Inline parser used to parse block content
	#
	# Passed to the `parse_inlines` method of each block parser at the end of `parse`.
	private var inline_parser = new MdInlineParser is lazy

	# Factories used to detect the start of a new block
	#
	# `find_block_start` tries them in this order and keeps the first non-null
	# result, so the order of the collection acts as a priority.
	private var block_parser_factories: Collection[MdBlockParserFactory] do
		var res = new Array[MdBlockParserFactory]
		res.add(new MdBlockQuoteParserFactory)
		res.add(new MdHeadingParserFactory)
		res.add(new MdFencedCodeBlockParserFactory)
		res.add(new MdHtmlBlockParserFactory)
		res.add(new MdThematicBreakParserFactory)
		res.add(new MdListBlockParserFactory)
		res.add(new MdIndentedCodeBlockParserFactory)
		return res
	end

	# Active block parsers
	#
	# Used as a stack to parse nested blocks.
	# Parsers are pushed by `activate_block_parser` and popped by
	# `deactivate_block_parser`.
	private var active_block_parsers = new Array[MdBlockParser]

	# All block parsers activated since the last call to `initialize`
	#
	# Unlike `active_block_parsers`, a parser stays registered here once it is
	# deactivated; it is only dropped by `remove_active_block_parser`.
	# `parse` walks this set to call `parse_inlines` on each parser.
	private var all_block_parsers = new HashSet[MdBlockParser]

	# Block parser currently on top of the `active_block_parsers` stack
	private fun active_block_parser: MdBlockParser do return active_block_parsers.last

	# Make `block_parser` the active block parser
	#
	# The parser is registered in `all_block_parsers` and pushed on top of the
	# `active_block_parsers` stack.
	private fun activate_block_parser(block_parser: MdBlockParser) do
		all_block_parsers.add(block_parser)
		active_block_parsers.add(block_parser)
	end

	# Pop the `active_block_parser` from the `active_block_parsers` stack
	#
	# The parser stays registered in `all_block_parsers`.
	private fun deactivate_block_parser do active_block_parsers.pop

	# Discard the `active_block_parser`
	#
	# The parser is popped from the stack, its block is unlinked and it is
	# removed from `all_block_parsers`.
	private fun remove_active_block_parser do
		var discarded = active_block_parser
		deactivate_block_parser
		discarded.block.unlink
		all_block_parsers.remove(discarded)
	end

	# Post-processors applied, in order, by `post_process`
	var post_processors = new Array[MdPostProcessor] is writable

	# Currently parsed line
	#
	# Set at the beginning of `incorporate_line`.
	private var line_string: String is noinit

	# Current index (offset) in input `line_string` (starts at 0)
	private var index = 0

	# Current column in input `line_string` (starts at 0)
	#
	# Tab causes column to go to next 4-space tab stop.
	private var column = 0

	# Is the current column within a tab character (partially consumed tab)
	#
	# Set by `set_new_column` when the requested column falls inside a tab.
	# `add_line` then expands the rest of the tab to spaces.
	private var column_is_in_tab: Bool is noinit

	# Current line in input string (starts at 1)
	private var line = 1

	# Index of the next non-space character starting from `index`
	#
	# Updated by `find_next_non_space`.
	private var next_non_space_index = 0

	# Next non-space column
	#
	# Updated by `find_next_non_space`.
	private var next_non_space_column = 0

	# Current indent in columns
	#
	# Either by spaces or tab stop of 4, starting from `column`.
	# Computed as `next_non_space_column - column` by `find_next_non_space`.
	private var indent = 0

	# Is the current `line` blank starting from `index`?
	#
	# True when only spaces and tabs remain in `line_string`.
	private var is_blank: Bool is noinit

	# Does a node end with a blank line?
	#
	# Filled by `propagate_last_line_blank` and read through `is_last_line_blank`.
	private var last_line_blank = new HashMap[MdNode, Bool]

	# Reset the parser state
	#
	# Called at the beginning of `parse` so the same parser can be used for
	# several documents.
	private fun initialize do
		# Block parsers bookkeeping
		active_block_parsers.clear
		all_block_parsers.clear
		last_line_blank.clear

		# Position in the input
		line = 1
		index = 0
		column = 0
		column_is_in_tab = false

		# Lookahead maintained by `find_next_non_space`
		next_non_space_index = 0
		next_non_space_column = 0
		indent = 0
		is_blank = false
	end

	# Parse the `input` string as a MdDocument
	#
	# The input is cut on `\r`, `\n` and `\r\n`, and each line is given to
	# `incorporate_line`. Once all the lines are consumed, the blocks still open
	# are finalized and `parse_inlines` is called on every registered block parser.
	#
	# `post_process` is not called by this method.
	fun parse(input: String): MdDocument do
		initialize

		var document_block_parser = new MdDocumentBlockParser(1, 1, 0)
		activate_block_parser(document_block_parser)

		var length = input.length
		var line_start = 0
		loop
			var line_break = find_line_break(input, line_start)
			if line_break == -1 then break
			incorporate_line(input.substring(line_start, line_break - line_start))

			# A `\r\n` pair counts as a single line break
			var next_start = line_break + 1
			if next_start < length and
			   input.chars[line_break] == '\r' and
			   input.chars[next_start] == '\n' then
				next_start += 1
			end
			line_start = next_start
			line += 1
			column = 0
		end

		# Incorporate the pending line when the input does not end with a line break
		if length > 0 and (line_start == 0 or line_start < length) then
			incorporate_line(input.substring(line_start, length - line_start))
		end
		finalize_blocks(active_block_parsers)

		# Parse the string content of each block into inline content where
		# appropriate, walking the registered block parsers from last to first.
		var parsers = all_block_parsers.to_a
		var i = parsers.length
		while i > 0 do
			i -= 1
			parsers[i].parse_inlines(inline_parser)
		end
		return document_block_parser.block
	end

	# Apply each of the `post_processors`, in order, to `document`
	fun post_process(document: MdDocument) do
		for processor in post_processors do processor.post_process(self, document)
	end

	# Analyze a line of text and update the document
	#
	# We parse Markdown text by calling this on each line of `input`.
	#
	# The line goes through three steps:
	#
	# 1. each open block parser (except the document one) is asked through
	#    `try_continue` whether its block continues on this line;
	# 2. starting from the last matched block, new block starts are looked up
	#    with `find_block_start` and the resulting parsers are added as children;
	# 3. what remains of the line is given to the active block parser with
	#    `add_line`, creating a paragraph first if the block is a container.
	private fun incorporate_line(input: String) do
		line_string = input
		index = 0
		column = 0
		column_is_in_tab = false

		# For each containing block, try to parse the associated line start.
		# The parser at index 0 is skipped and counted as matched from the start.
		var matches = 1
		for i in [1 .. active_block_parsers.length[ do
			var block_parser = active_block_parsers[i]
			find_next_non_space

			var result = block_parser.try_continue(self)
			if result isa MdBlockContinue then
				if result.is_finalize then
					# The block ends on this line: close it and ignore the rest of the line
					block_parser.finalize(self)
					return
				else
					if result.new_index != -1 then
						set_new_index result.new_index
					else if result.new_column != -1 then
						set_new_column result.new_column
					end
				end
				matches += 1
			else
				# This block and all the blocks nested in it are unmatched
				break
			end
		end

		# Copy of the parsers that did not match, they may still receive the line
		# as a lazy paragraph continuation (see below).
		var unmatched_block_parsers = active_block_parsers.subarray(
			matches, active_block_parsers.length - matches)
		var last_matched_block_parser = active_block_parsers[matches - 1]
		var block_parser = last_matched_block_parser
		var all_closed = unmatched_block_parsers.is_empty

		# Unless last matched container is a code block, try new container starts,
		# adding children to the last matched container.
		var try_block_starts = block_parser.block isa MdParagraph or
			block_parser.block.is_container

		while try_block_starts do
			find_next_non_space

			# Optimize lookup: the factories are not queried for a blank line nor
			# for a line indented by less than 4 columns that starts with a letter.
			if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
				set_new_index next_non_space_index
				break
			end

			var block_start = find_block_start(block_parser)
			if block_start == null then
				# No block starts here, the rest of the line is text
				set_new_index next_non_space_index
				break
			end

			# A new block starts: close the unmatched blocks first
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
				all_closed = true
			end

			if block_start.new_index != -1 then
				set_new_index block_start.new_index
			else if block_start.new_column != -1 then
				set_new_column block_start.new_column
			end

			# The new block takes the place of the active one
			# (requested for instance by setext headings).
			if block_start.replace_active_block_parser then
				remove_active_block_parser
			end

			# Only the last added parser decides whether nested starts are tried
			for new_block_parser in block_start.block_parsers do
				add_child(new_block_parser)
				block_parser = new_block_parser
				try_block_starts = new_block_parser.block.is_container
			end
		end

		# What remains at the offset is a text line.
		# Add the text to the appropriate block.

		# First check for a lazy paragraph continuation
		if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
			add_line
		else
			# Finalize any blocks not matched
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
			end
			propagate_last_line_blank(block_parser, last_matched_block_parser)

			if not block_parser.block.is_container then
				add_line
			else if not is_blank then
				# Create a paragraph container for the line
				add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
				add_line
			end
		end
	end

	# Find what kind of block starts at the current position of the line
	#
	# Each of the `block_parser_factories` is tried in order.
	# Return the first non-null result or null if no factory matches.
	private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
		for factory in block_parser_factories do
			var block_start = factory.try_start(self, block_parser)
			if block_start == null then continue
			return block_start
		end
		return null
	end

	# Add the block of `block_parser` as a child of the active block
	#
	# Active block parsers whose block cannot contain the new block are
	# finalized first. `block_parser` then becomes the active block parser.
	private fun add_child(block_parser: MdBlockParser) do
		# Finalize the blocks that cannot be a parent of the new one
		loop
			var candidate = active_block_parser
			if candidate.block.can_contain(block_parser.block) then break
			candidate.finalize(self)
		end
		# Append the new block to its parent
		active_block_parser.block.append_child(block_parser.block)
		activate_block_parser(block_parser)
	end

	# Give the rest of the current line to the active block parser
	#
	# We assume it can accept lines.
	private fun add_line do
		var content
		if not column_is_in_tab then
			content = line_string.substring(index, line_string.length - index)
		else
			# Our column is in a partially consumed tab:
			# replace what is left of the tab by spaces up to the next tab stop.
			var after_tab = index + 1
			var padding = " " * column.columns_to_next_tab_stop
			content = padding + line_string.substring(after_tab, line_string.length - after_tab)
		end
		active_block_parser.add_line(content)
	end

	# Finalize each of the `block_parsers`, from the last one to the first one
	private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
		var i = block_parsers.length
		while i > 0 do
			i -= 1
			block_parsers[i].finalize(self)
		end
	end

	# Advance the `index` position to the next character
	#
	# `column` is advanced by one, or to the next tab stop if the consumed
	# character is a tab.
	private fun advance do
		var is_tab = line_string.chars[index] == '\t'
		index += 1
		if is_tab then
			column += column.columns_to_next_tab_stop
		else
			column += 1
		end
	end

	# Look for the next non-space character of `line_string` starting from `index`
	#
	# Set `next_non_space_index`, `next_non_space_column`, `is_blank` and `indent`.
	# `index` and `column` are left untouched.
	private fun find_next_non_space do
		var length = line_string.length
		var pos = index
		var col = column
		while pos < length do
			var c = line_string.chars[pos]
			if c == ' ' then
				col += 1
			else if c == '\t' then
				# A tab goes to the next 4-space tab stop
				col += 4 - (col % 4)
			else
				break
			end
			pos += 1
		end

		# The line is blank if the end was reached without meeting another character
		is_blank = pos >= length
		next_non_space_index = pos
		next_non_space_column = col
		indent = col - column
	end

	# Return the index of the first line break of `input` from `start_index`
	#
	# Both `\r` and `\n` are considered as line breaks.
	# Return `-1` if there is none.
	private fun find_line_break(input: String, start_index: Int): Int do
		var pos = start_index
		while pos < input.length do
			var c = input.chars[pos]
			if c == '\n' or c == '\r' then return pos
			pos += 1
		end
		return -1
	end

	# Move the parser `index` to `new_index`
	#
	# `column` is updated accordingly and `column_is_in_tab` is reset.
	# The position never goes past the end of `line_string`.
	private fun set_new_index(new_index: Int) do
		# If we're going to an index as opposed to a column, we're never within a tab
		column_is_in_tab = false
		if new_index >= next_non_space_index then
			# Resume from the lookahead, tab stops were already computed up to there
			column = next_non_space_column
			index = next_non_space_index
		end
		var limit = line_string.length
		while index < new_index and index != limit do advance
	end

	# Move the parser `column` to `new_column`
	#
	# `index` is updated accordingly. `column_is_in_tab` is set when `new_column`
	# falls inside a tab character.
	private fun set_new_column(new_column: Int) do
		if new_column >= next_non_space_column then
			# Resume from the lookahead, tab stops were already computed up to there
			column = next_non_space_column
			index = next_non_space_index
		end
		var limit = line_string.length
		while column < new_column and index != limit do advance

		# Overshooting the target means that the last character was a tab:
		# step back on it and remember that it is only partially consumed.
		column_is_in_tab = column > new_column
		if column_is_in_tab then
			index -= 1
			column = new_column
		end
	end

	# Does `block` end with a blank line?
	#
	# For lists and list items, the last child is checked too, recursively.
	private fun ends_with_blank_line(block: nullable MdNode): Bool do
		var node = block
		while node != null do
			if is_last_line_blank(node) then return true
			if not (node isa MdListBlock or node isa MdListItem) then break
			node = node.last_child
		end
		return false
	end

	# Record whether the current line is blank for the block of `block_parser`
	# and all its parents
	private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
		var block = block_parser.block
		var last_child = block.last_child
		if is_blank and last_child != null then
			last_line_blank[last_child] = true
		end

		# Block quotes lines are never blank as they start with `>`.
		# We don't count blanks in fenced code for purposes of tight/loose lists.
		# We also don't set `last_line_blank` on an empty list item.
		var is_empty_item = block isa MdListItem and block.first_child == null and
			block_parser != last_matched_block_parser
		var ends_blank = is_blank and
			not block isa MdBlockQuote and
			not block isa MdFencedCodeBlock and
			not is_empty_item

		# Propagate the flag up through the parents
		var node: nullable MdNode = block
		while node != null do
			last_line_blank[node] = ends_blank
			node = node.parent
		end
	end

	# Is the last line of `node` blank?
	#
	# Nodes that are not recorded in `last_line_blank` are considered not blank.
	private fun is_last_line_blank(node: MdNode): Bool do
		return last_line_blank.has_key(node) and last_line_blank[node]
	end
end

# Block parsing

# Parser for a specific block node
#
# `MdParser` asks each active block parser whether its block continues on the
# current line (`try_continue`) and gives it the text of the line (`add_line`).
abstract class MdBlockParser

	# Kind of block under construction
	type BLOCK: MdBlock

	# MdBlock under construction
	fun block: BLOCK is abstract

	# Line where the block starts
	var line_start: Int

	# Column where the block starts
	var column_start: Int

	# Location of the block
	#
	# The end of the location is initialized at `-1` and is expected to be set
	# later by the parser (in `finalize` or `parse_inlines`).
	var location: MdLocation is lazy do return new MdLocation(line_start, column_start, -1, -1)

	# Column where the content starts
	var content_offset: Int

	# Initialize the current `block`
	#
	# Does nothing by default.
	fun initialize(parser: MdParser) do end

	# Can `self` continue from the current `index` in `state`?
	#
	# Return a new `MdBlockContinue` if `self` can continue parsing.
	# Return null otherwise.
	fun try_continue(state: MdParser): nullable MdBlockContinue is abstract

	# Add `line` to the current `block`
	#
	# Does nothing by default.
	fun add_line(line: String) do end

	# Finalize the current `block`
	#
	# Deactivate `self` from `parser` if it is its active block parser.
	fun finalize(parser: MdParser) do
		if parser.active_block_parser != self then return
		parser.deactivate_block_parser
	end

	# Parse `block` lines
	#
	# Does nothing by default.
	fun parse_inlines(inline_parser: MdInlineParser) do end
end

# Result object for continuing parsing of a block
#
# Returned by `MdBlockParser::try_continue`. The parser either moves to
# `new_index`, moves to `new_column`, or finalizes the block.
class MdBlockContinue

	# Index from which continue parsing
	#
	# `-1` if not set.
	var new_index: Int

	# Column from which continue parsing
	#
	# `-1` if not set. Only used when `new_index` is not set.
	var new_column: Int

	# Is the block finalized?
	#
	# If true, the parser finalizes the block and ignores the rest of the line.
	var is_finalize: Bool

	# Continue from index
	init at_index(new_index: Int) do
		init(new_index, -1, false)
	end

	# Continue from column
	init at_column(new_column: Int) do
		init(-1, new_column, false)
	end

	# Block is finished
	init finished do
		init(-1, -1, true)
	end
end

# Block parser factory for a block node for determining when a block starts
abstract class MdBlockParserFactory

	# Can the associated block parser start at the current line in `parser`?
	#
	# `matched_block_parser` is the block parser the new block would be added to.
	#
	# Return a new `MdBlockStart` if the block parser can start.
	# Return null otherwise.
	fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
		nullable MdBlockStart is abstract
end

# Result object from starting parsing of a block
#
# Returned by `MdBlockParserFactory::try_start`.
# The `at_index`, `at_column` and `replacing_active_block_parser` methods
# return `self` so they can be chained.
class MdBlockStart

	# Block parsers for this block start
	#
	# They are added, in order, as children of the active block.
	var block_parsers: Array[MdBlockParser]

	# Index where the parsing should start
	#
	# `-1` if not set.
	var new_index = -1

	# Column where the parsing should start
	#
	# `-1` if not set. Only used when `new_index` is not set.
	var new_column = -1

	# Does the block starting with `self` terminate a previous block?
	#
	# If true, the active block parser is removed before the new ones are added.
	var replace_active_block_parser = false

	# Start from `new_index`
	fun at_index(new_index: Int): MdBlockStart do
		self.new_index = new_index
		return self
	end

	# Start from `new_column`
	fun at_column(new_column: Int): MdBlockStart do
		self.new_column = new_column
		return self
	end

	# Start replacing the active block parser
	fun replacing_active_block_parser: MdBlockStart do
		self.replace_active_block_parser = true
		return self
	end
end

# Parser for the whole document
class MdDocumentBlockParser
	super MdBlockParser

	redef type BLOCK: MdDocument
	redef var block = new MdDocument(location) is lazy

	# Always continue, from the current index
	redef fun try_continue(state) do
		return new MdBlockContinue.at_index(state.index)
	end

	# Do nothing: the document parser is not deactivated when finalized
	redef fun finalize(parser) do end

	# Set the end of the document location at the end of its last child
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end

# Headings parser
#
# Created by `MdHeadingParserFactory` for both ATX and setext headings.
class MdHeadingParser
	super MdBlockParser

	redef type BLOCK: MdHeading

	redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy

	# The end of the location is given at creation (`line_end`, `column_end`)
	redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy

	# Line end
	var line_end: Int

	# Column end
	var column_end: Int

	# Heading level
	var level: Int

	# Heading content
	#
	# Raw text of the heading, parsed as inline content by `parse_inlines`.
	var content: String

	# Heading has ATX trailing
	var has_atx_trailing: Bool

	# Heading is setext format
	var is_setext: Bool

	# Never continue parsing as an heading is a one liner
	redef fun try_continue(state) do return null

	# Parse the heading content
	redef fun parse_inlines(inline_parser) do
		inline_parser.parse(content, content_offset, block)
	end
end

# Heading parser factory
#
# Recognizes ATX headings (`# title`) and setext headings (a paragraph
# followed by an underline).
class MdHeadingParserFactory
	super MdBlockParserFactory

	# Try to start a heading on the current line of `state`
	#
	# A setext heading is only recognized when `matched_block_parser` is a
	# paragraph parser with content: the heading then replaces the paragraph.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var start = state.next_non_space_index
		var line_content = line.substring(start, line.length - start)

		# ATX heading
		var atx = line_content.search(re_atx_heading)
		if atx != null then
			var opening = atx.subs.first.as(not null)
			var new_offset = start + opening.length
			var level = opening.to_s.trim.length

			# Remove the trailing `#`s from the content but count them in the location
			var after_leading = line.substring(new_offset, line.length - new_offset)
			var trailing = after_leading.search(re_atx_trailing)
			var trailing_length = 0
			if trailing != null then trailing_length = trailing.length
			var content = after_leading.replace(re_atx_trailing, "")

			var heading_parser = new MdHeadingParser(
				state.line,
				start + 1,
				new_offset + 1,
				state.line,
				new_offset + content.length + trailing_length,
				level,
				content,
				trailing != null,
				false)
			return (new MdBlockStart([heading_parser])).at_index(line.length)
		end

		# Setext heading: the content comes from the paragraph being underlined
		var paragraph = null
		if matched_block_parser isa MdParagraphParser then
			paragraph = matched_block_parser.content
		end
		if paragraph == null then return null

		var setext = line_content.search(re_setext_heading)
		if setext == null then return null

		# Level 1 if the underline starts with `=`, level 2 otherwise
		var underline = setext.subs.first.as(not null).to_s
		var level = if underline.chars.first == '=' then 1 else 2
		var heading_parser = new MdHeadingParser(
			state.line - 1,
			start + 1,
			0,
			state.line,
			state.column + setext.length,
			level,
			paragraph.to_s,
			false,
			true)
		return (new MdBlockStart([heading_parser])).at_index(line.length).replacing_active_block_parser
	end
end

# Blockquotes parser
class MdBlockQuoteParser
	super MdBlockParser

	redef type BLOCK: MdBlockQuote
	redef var block = new MdBlockQuote(location) is lazy

	# Continue if the line starts with a `>` marker indented by less than 4 columns
	redef fun try_continue(state) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var marker_index = state.next_non_space_index
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# Skip the marker and the optional following space or tab
		var new_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then new_column += 1
		return new MdBlockContinue.at_column(new_column)
	end

	# Set the end of the block quote location at the end of its last child
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end

# Blockquotes parser factory
class MdBlockQuoteParserFactory
	super MdBlockParserFactory

	# Start a block quote if the line starts with a `>` marker indented by
	# less than 4 columns
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var marker_index = state.next_non_space_index
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# Skip the marker and the optional following space or tab
		var new_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then new_column += 1

		var quote_parser = new MdBlockQuoteParser(state.line, state.column + 1, new_column)
		return (new MdBlockStart([quote_parser])).at_column(new_column)
	end
end

# Indented code blocks parser
class MdIndentedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdIndentedCodeBlock
	redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy

	# Indent is tab?
	var use_tabs: Bool

	# Block content
	#
	# Lines are separated by `\n`.
	var content = new Buffer

	# Continue on lines indented by at least 4 columns and on blank lines
	redef fun try_continue(state) do
		if state.indent >= 4 then return new MdBlockContinue.at_column(state.column + 4)
		if state.is_blank then return new MdBlockContinue.at_index(state.next_non_space_index)
		return null
	end

	redef fun add_line(line) do
		if not content.is_empty then content.add('\n')
		content.append(line)
	end

	# Set the block literal and the end of its location
	#
	# The trailing blank lines matched by `re_trailing_blank_lines` are
	# replaced by a single `\n`.
	redef fun finalize(parser) do
		super

		add_line(" ")
		var literal = content.to_s.replace_first(re_trailing_blank_lines, "\n")
		block.literal = literal

		# The last line of code is the second to last element of the split
		var lines = literal.split("\n")
		var last = lines.length - 2
		location.line_end = location.line_start + last
		location.column_end = content_offset + lines[last].length + 4
	end
end

# Indented code blocks parser factory
class MdIndentedCodeBlockParserFactory
	super MdBlockParserFactory

	# Start an indented code block on a non-blank line indented by at least
	# 4 columns, unless the active block is a paragraph
	redef fun try_start(state, matched_block_parser) do
		if state.indent < 4 or state.is_blank then return null
		if state.active_block_parser.block isa MdParagraph then return null

		var column = state.column
		var code_parser = new MdIndentedCodeBlockParser(
			state.line,
			column + 1,
			column,
			state.line_string.has_prefix("\t"))
		return (new MdBlockStart([code_parser])).at_column(column + 4)
	end
end

# Fenced code blocks parser
class MdFencedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdFencedCodeBlock
	redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy

	# Fence character
	var fence_char: Char

	# Fence length
	var fence_length: Int

	# Fence indent
	var fence_indent: Int

	# Fence first line
	#
	# Rest of the line that opens the fence, used as the info string.
	var first_line: nullable String = null

	# Fence other lines
	#
	# Each line is followed by a `\n`.
	var other_lines = new Buffer

	# Finish the block on a closing fence, continue on any other line
	redef fun try_continue(state) do
		var line = state.line_string
		var length = line.length
		var start = state.next_non_space_index

		# A closing fence uses `fence_char` and is at least as long as the opening one
		if state.indent <= 3 and start < length and line.chars[start] == fence_char then
			var closing = line.substring(start, length - start).search(re_closing_fence)
			if closing != null and closing.subs[0].as(not null).length >= fence_length then
				# closing fence - we're at end of line, so we can finalize now
				return new MdBlockContinue.finished
			end
		end

		# Skip at most `fence_indent` leading spaces
		var new_index = state.index
		var max_index = new_index + fence_indent
		while new_index < max_index and new_index < length and line.chars[new_index] == ' ' do
			new_index += 1
		end
		return new MdBlockContinue.at_index(new_index)
	end

	# The first line received is kept apart from the code lines
	redef fun add_line(line) do
		if first_line == null then
			first_line = line
			return
		end
		other_lines.append(line)
		other_lines.add('\n')
	end

	# Set the block info string, its literal and the end of its location
	redef fun finalize(parser) do
		super

		# The first line becomes the info string
		var first_line = self.first_line
		if first_line != null then
			var info = first_line.trim.unescape_string
			if not info.is_empty then block.info = info
		end

		var content = other_lines.to_s
		block.literal = content

		location.line_end = location.line_start + content.split("\n").length
		location.column_end = content_offset + fence_indent + fence_length
	end
end

# Fenced code blocks parser factory
#
# NOTE(review): this class inherits from `MdBlockQuoteParserFactory` although
# it reuses nothing from it; `MdBlockParserFactory` looks like the intended
# parent - confirm before changing.
class MdFencedCodeBlockParserFactory
	super MdBlockQuoteParserFactory

	# Start a fenced code block if the line opens with a fence indented by
	# less than 4 columns
	#
	# The line is rejected when the matched text contains the fence character
	# again after other characters: a backtick for a backtick fence, a tilde for
	# the other fences.
	redef fun try_start(state, matched_block_parser) do
		var next_non_space = state.next_non_space_index
		var line = state.line_string

		if state.indent >= 4 then return null

		var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
		if match == null then return null

		# The fence is captured by the first group or, failing that, by the third one
		var fence_length
		var fence_char
		var sub0 = match.subs[0]
		if sub0 != null then
			fence_length = sub0.length
			fence_char = sub0.to_s.chars.first
		else
			fence_length = match.subs[2].as(not null).length
			fence_char = match.subs[2].as(not null).to_s.chars.first
		end

		# Each check only applies to its own kind of fence: a tilde found after a
		# backtick fence must not prevent the block from starting.
		var matched = match.to_s
		if fence_char == '`' then
			if matched.has("[^`]+`".to_re) then return null
		else if matched.has("[^~]+~".to_re) then
			return null
		end

		return (new MdBlockStart(
			[new MdFencedCodeBlockParser(
				state.line,
				state.column + 1,
				state.column,
				fence_char,
				fence_length,
				state.indent)]
			)).at_index(next_non_space + fence_length)
	end
end

# List blocks parser
class MdListBlockParser
	super MdBlockParser

	redef type BLOCK: MdListBlock

	# Ordered or unordered list depending on `is_ordered`
	redef var block is lazy do
		if is_ordered then
			return new MdOrderedList(location, digit.as(not null), delim.as(not null))
		else
			return new MdUnorderedList(location, bullet.as(not null))
		end
	end

	# Is this list ordered
	var is_ordered: Bool

	# List bullet if unordered
	var bullet: nullable Char

	# List digit if ordered
	var digit: nullable Int

	# List delimiter if ordered
	var delim: nullable Char

	# A list always continues, its items decide where it ends
	redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)

	# Finalize the list and mark it as loose (`is_tight = false`) if needed
	#
	# The list is loose if a non-final item ends with a blank line, or if a
	# child of an item ends with a blank line while being followed by another
	# child or another item.
	redef fun finalize(parser) do
		super

		var item = block.first_child
		while item != null do
			var has_next_item = item.next != null
			# check for non-final list item ending with blank line
			if parser.ends_with_blank_line(item) and has_next_item then
				block.is_tight = false
				return
			end
			# look into children of list item to see if there are spaces between any of them
			var sub_item = item.first_child
			while sub_item != null do
				if parser.ends_with_blank_line(sub_item) and
				   (has_next_item or sub_item.next != null) then
					# The list is loose, no need to look at the remaining items
					block.is_tight = false
					return
				end
				sub_item = sub_item.next
			end
			item = item.next
		end
	end

	# Set the end of the list location at the end of its last child
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child != null then
			location.line_end = last_child.location.line_end
			location.column_end = last_child.location.column_end
		end
	end
end

# List blocks parser factory
#
# NOTE(review): this class inherits from `MdBlockQuoteParserFactory` although
# it reuses nothing from it; `MdBlockParserFactory` looks like the intended
# parent - confirm before changing.
class MdListBlockParserFactory
	super MdBlockQuoteParserFactory

	# Start a list item if the line begins with a list marker
	#
	# A list block parser is prepended to the item parser unless
	# `matched_block_parser` is already a list with a matching marker.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null

		var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
		var list_data = parse_list_marker(
			state,
			state.line_string,
			state.next_non_space_index,
			state.column + state.indent,
			in_paragraph)
		if list_data == null then return null

		var new_column = list_data.content_column
		var content_indent = new_column - state.column
		var list_item_parser = new MdListItemParser(
			state.line,
			state.column + 1,
			new_column,
			content_indent)

		# The item goes in the enclosing list when the markers are compatible
		if matched_block_parser isa MdListBlockParser and lists_match(matched_block_parser.block, list_data) then
			return (new MdBlockStart([list_item_parser])).at_column(new_column)
		end

		# Otherwise prepend a new list block
		var list_block_parser = new MdListBlockParser(
			state.line,
			state.column + 1,
			content_indent,
			list_data.is_ordered,
			list_data.bullet,
			list_data.digit,
			list_data.delim)
		list_block_parser.block.is_tight = true
		return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
	end

	# Parse the list marker found at `marker_index` in `line`
	#
	# `marker_column` is the column of the marker and `in_paragraph` tells
	# whether the item would interrupt a paragraph.
	# Return null if there is no marker or if the item cannot start here.
	private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
		var match = line.substring(marker_index, line.length - marker_index).search(re_list_marker)
		if match == null then return null

		# The first group is the bullet of an unordered item,
		# the third and fourth ones are the digits and the delimiter of an ordered item
		var bullet: nullable Char = null
		var digit: nullable Int = null
		var delim: nullable Char = null
		var bullet_match = match.subs[0]
		var is_ordered = bullet_match == null
		if bullet_match != null then
			bullet = bullet_match.to_s.chars[0]
		else
			digit = match.subs[2].as(not null).to_s.to_i
			delim = match.subs[3].as(not null).to_s.chars[0]
		end

		# The space or tab that may end the match is not part of the marker
		var matched = match.to_s
		var marker_length = match.length
		if matched.has_suffix(" ") or matched.has_suffix("\t") then marker_length -= 1
		var index_after_marker = marker_index + marker_length

		# marker doesn't include tabs, so counting them as column directly is ok
		var column_after_marker = marker_column + marker_length

		# see at which column the content starts if there is content
		var content_column = column_after_marker
		var has_content = false
		var i = index_after_marker
		while i < line.length do
			var c = line.chars[i]
			if c == ' ' then
				content_column += 1
			else if c == '\t' then
				content_column += content_column.columns_to_next_tab_stop
			else
				has_content = true
				break
			end
			i += 1
		end

		if in_paragraph then
			# if the list item is ordered, then start number must be 1 to interrupt a paragraph
			if is_ordered and digit != 1 then return null
			# empty list item can not interrupt a paragraph
			if not has_content then return null
		end

		if not has_content or (content_column - column_after_marker) > 4 then
			# if this line is blank or has a code block, default to 1 space after marker
			content_column = column_after_marker + 1
		end
		return new MdListData(is_ordered, bullet, digit, delim, content_column)
	end

	# Return true if the list `a` and the list item `b` are of the same type
	#
	# With the same delimiter or bullet character.
	# This is used in agglomerating list items into lists
	private fun lists_match(a: MdListBlock, b: MdListData): Bool do
		if b.is_ordered then
			return a isa MdOrderedList and a.delimiter == b.delim
		end
		return a isa MdUnorderedList and a.bullet_marker == b.bullet
	end
end

# Parsed list data
#
# Result of `MdListBlockParserFactory::parse_list_marker`.
private class MdListData

	# Is the marker the one of an ordered list?
	var is_ordered: Bool

	# Bullet character (null if ordered)
	var bullet: nullable Char

	# Number of the item (null if unordered)
	var digit: nullable Int

	# Delimiter following the number (null if unordered)
	var delim: nullable Char

	# Column the content start at
	var content_column: Int
end

# List items parser
class MdListItemParser
	super MdBlockParser

	redef type BLOCK: MdListItem
	redef var block = new MdListItem(location) is lazy

	# List item content indent
	#
	# Minimal indent of the lines that continue the item.
	var content_indent: Int

	# Continue on blank lines (unless the item is empty) and on lines indented
	# by at least `content_indent` columns
	redef fun try_continue(state) do
		if not state.is_blank then
			if state.indent < content_indent then return null
			return new MdBlockContinue.at_column(state.column + content_indent)
		end
		# blank line after empty list item
		if block.first_child == null then return null
		return new MdBlockContinue.at_index(state.next_non_space_index)
	end

	# Set the end of the item location at the end of its last child
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end

# Thematic breaks parser
class MdThematicBreakParser
	super MdBlockParser

	redef type BLOCK: MdThematicBreak
	redef var block = new MdThematicBreak(location, pattern) is lazy

	# Thematic break pattern
	#
	# Text of the break as found in the source.
	var pattern: String

	# Never continue: a thematic break takes a single line
	redef fun try_continue(state) do
		return null
	end

	# Set the end of the location on the starting line, after the `pattern`
	redef fun finalize(parser) do
		super

		location.column_end = column_start + pattern.length - 1
		location.line_end = line_start
	end
end

# Thematic breaks parser factory
#
# Start a `MdThematicBreakParser` on lines that, once stripped of their
# leading spaces, match `re_thematic_break`.
#
# This is a plain `MdBlockParserFactory`: it shares nothing with the block
# quotes factory and fully defines its own `try_start`.
class MdThematicBreakParserFactory
	super MdBlockParserFactory

	# Return the block start of a thematic break, or null if `state` holds none
	redef fun try_start(state, matched_block_parser) do
		# lines indented by 4 columns or more can not start a thematic break
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string

		# match against the line without its leading spaces
		var rest = line.substring(next_non_space, line.length - next_non_space)
		var tbreak = rest.search(re_thematic_break)
		if tbreak == null then return null

		var parser = new MdThematicBreakParser(
			state.line,
			state.column + 1,
			next_non_space,
			tbreak.to_s)
		# the thematic break consumes the whole line
		return (new MdBlockStart([parser])).at_index(line.length)
	end
end

# Parser for paragraphs
class MdParagraphParser
	super MdBlockParser

	redef type BLOCK: MdParagraph

	redef var block = new MdParagraph(location) is lazy

	# Raw text of the paragraph, filled line by line
	#
	# Set to null by `finalize` when the paragraph holds nothing but
	# link reference definitions.
	var content: nullable Buffer = new Buffer

	# A paragraph goes on until a blank line is met
	redef fun try_continue(state) do
		if not state.is_blank then
			return new MdBlockContinue.at_index(state.index)
		end
		return null
	end

	# Append `line` to `content`, separated from the previous line by a `\n`
	redef fun add_line(line) do
		var buffer = self.content
		if buffer == null then return
		if not buffer.is_empty then buffer.add('\n')
		buffer.append(line)
	end

	# Strip the leading link reference definitions from `content`
	#
	# The definitions are consumed with `parse_reference` from the inline
	# parser of `parser`. If only blanks remain after them, the paragraph
	# block is unlinked and `content` is set to null.
	redef fun finalize(parser) do
		super

		var references = parser.inline_parser
		var buffer = self.content
		if buffer == null then return

		var text = buffer.to_s
		var found_definition = false

		# `consumed` is the result of `parse_reference`, 0 stops the loop
		var consumed = references.parse_reference(text)
		loop
			if text.length <= 3 then break
			if text.chars[0] != '[' then break
			if consumed == 0 then break
			text = text.substring(consumed, text.length - consumed)
			found_definition = true
			consumed = references.parse_reference(text)
		end

		if found_definition and text.is_blank then
			block.unlink
			self.content = null
		else
			self.content = new Buffer.from_text(text)
		end
	end

	# Parse `content` as inlines of `block`, then end the location of the
	# paragraph where its last child ends
	redef fun parse_inlines(inline_parser) do
		var buffer = self.content
		if buffer == null then return
		inline_parser.parse(buffer.to_s, content_offset, block)

		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end

# Parser for HTML blocks
class MdHtmlBlockParser
	super MdBlockParser

	redef type BLOCK: MdHtmlBlock
	redef var block = new MdHtmlBlock(location) is lazy

	# Pattern that ends the block once found in a line
	#
	# Null if the block has no closing pattern; such a block ends at the
	# first blank line instead.
	var closing_pattern: nullable Pattern

	# Has a line containing `closing_pattern` been added?
	var finished = false

	# Raw text of the block, lines separated by `\n`
	var content = new Buffer

	redef fun try_continue(state) do
		# the closing pattern was found in a previous line
		if finished then return null

		# a blank line ends the blocks without closing pattern (types 6 and 7)
		if closing_pattern == null and state.is_blank then return null

		return new MdBlockContinue.at_index(state.index)
	end

	# Append `line` to `content` and check if it closes the block
	redef fun add_line(line) do
		if not content.is_empty then content.add('\n')
		content.append(line)

		var closer = self.closing_pattern
		if closer == null then return
		if line.has(closer) then finished = true
	end

	# Store the literal text in `block` and compute the end of its location
	redef fun finalize(parser) do
		super

		var literal = self.content.to_s
		block.literal = literal

		# the block ends with the last line of its literal text
		var rows = literal.split("\n")
		location.line_end = location.line_start + rows.length - 1
		location.column_end = rows.last.length
	end
end

# Html blocks parser factory
#
# Start a `MdHtmlBlockParser` when the line, stripped of its leading spaces,
# begins with `<` and matches one of the opening patterns of `re_html_blocks`.
class MdHtmlBlockParserFactory
	super MdBlockParserFactory

	# Return the block start of an HTML block, or null if `state` opens none
	#
	# The closing pattern paired with the matched opener is given to the
	# created parser (null for the last two kinds of blocks).
	redef fun try_start(state, matched_block_parser) do
		var next_non_space = state.next_non_space_index
		var line = state.line_string

		if state.indent >= 4 then return null
		# nothing after the indentation: do not read past the end of the line
		if next_non_space >= line.length then return null
		if line.chars[next_non_space] != '<' then return null

		# the same text is checked against every opener: compute it only once
		var rest = line.substring(next_non_space, line.length - next_non_space)

		for block_type in [0..6] do
			# type 7 can not interrupt a paragraph
			if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
			var patterns = re_html_blocks[block_type]
			var opener = patterns.first
			var closer = patterns.last
			if rest.has(opener.as(not null)) then
				var parser = new MdHtmlBlockParser(
					state.line,
					state.column + 1,
					next_non_space,
					closer)
				return (new MdBlockStart([parser])).at_index(state.index)
			end
		end
		return null
	end
end

# Post Processing

# Markdown post processor
#
# A visitor of the Markdown AST called after the parsing by a `MdParser`.
abstract class MdPostProcessor
	super MdVisitor

	# Document being processed
	#
	# Available only during a call to `post_process`, null otherwise.
	var document: nullable MdDocument = null

	# Post process the `document` parsed by `parser`
	#
	# `document` is stored in `self.document` for the duration of the visit.
	fun post_process(parser: MdParser, document: MdDocument) do
		self.document = document
		enter_visit(document)
		self.document = null
	end

	# Forward the visit of `node` to `MdNode::post_process`
	redef fun visit(node) do
		node.post_process(self)
	end
end

redef class MdNode

	# Accept the visit of the post-processor `v`
	#
	# By default, only calls `visit_all` with `v`.
	fun post_process(v: MdPostProcessor) do
		visit_all(v)
	end
end

# Utils

redef class Sys
	# Opening of an ATX heading: 1 to 6 `#` followed by blanks or the end of the line
	private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re

	# Closing sequence of an ATX heading: a run of `#` and blanks ending the line
	private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re

	# SeText heading underline: a line of `=` or of `-`, trailing blanks allowed
	private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re

	# Blank lines ending a text
	var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re

	# Opening code fence: 3 or more backticks or tildes, then the rest of the line
	var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re

	# Closing code fence: 3 or more backticks or tildes, then only spaces
	var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re

	# List marker: a bullet (`*`, `+`, `-`) or 1 to 9 digits with a `.` or `)` delimiter
	var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re

	# Thematic break: 3 or more `*`, `_` or `-`, blanks allowed in between
	var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re

	# HTML blocks patterns
	#
	# One `[opening, closing]` pair per kind of HTML block, in order.
	# The closing pattern of the last two pairs is null.
	var re_html_blocks: Array[Array[nullable Regex]] do
		var res = new Array[Array[nullable Regex]]

		# `<script`, `<pre` or `<style` up to the matching closing tag
		var raw_open = "^<(script|pre|style)(\\s|>|$)".to_re
		raw_open.ignore_case = true
		var raw_close = "</(script|pre|style)>".to_re
		raw_close.ignore_case = true
		res.add([raw_open, raw_close])

		# `<!--` up to `-->`
		res.add(["^<!--".to_re, "-->".to_re])

		# `<?` up to `?>`
		res.add(["^<[?]".to_re, "\\?>".to_re])

		# `<!` followed by an uppercase letter, up to `>`
		res.add(["^<![A-Z]".to_re, ">".to_re])

		# `<![CDATA[` up to `]]>`
		res.add(["^<!\\[CDATA\\[".to_re, "\\]\\]>".to_re])

		# opening or closing tag of one of the listed elements, no closing pattern
		var known_tag_open = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
		known_tag_open.ignore_case = true
		res.add([known_tag_open, null])

		# any complete opening or closing tag alone on its line, no closing pattern
		var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
		var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
		var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
		var p_squoted_value = "'[^']*'"
		var p_dquoted_value = "\"[^\"]*\""
		var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
		var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
		var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
		var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
		var p_closetag = "</{p_tagname}\\s*[>]"
		var any_tag_open = "^({p_opentag}|{p_closetag})\\s*$".to_re
		any_tag_open.ignore_case = true
		res.add([any_tag_open, null])

		return res
	end
end

redef class Int

	# Number of columns between the column `self` and the next tab stop
	#
	# Tab stops are set every 4 columns. A column already on a tab stop
	# is 4 columns away from the next one.
	private fun columns_to_next_tab_stop: Int do
		var past_last_stop = self % 4
		return 4 - past_last_stop
	end
end

redef class String

	# Does this string contain only spacing characters?
	#
	# Spacing characters are the space, `\t`, `\n` and `\r`.
	# The empty string is blank.
	private fun is_blank: Bool do
		for c in chars do
			if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
		end
		return true
	end

	# Is the character at `index` a space or a tab?
	#
	# Return false if `index >= self.length`.
	private fun is_space_or_tab(index: Int): Bool do
		if index >= length then return false
		var c = chars[index]
		if c == ' ' then return true
		return c == '\t'
	end
end

lib/markdown2/markdown_block_parsing.nit:15,1--1503,3

module markdown_block_parsing

Summary

Markdown blocks parsing

Introduced classes

MdBlockContinue

MdBlockParser

MdBlockParserFactory

MdBlockQuoteParser

MdBlockQuoteParserFactory

MdBlockStart

MdDocumentBlockParser

MdFencedCodeBlockParser

MdFencedCodeBlockParserFactory

MdHeadingParser

MdHeadingParserFactory

MdHtmlBlockParser

MdHtmlBlockParserFactory

MdIndentedCodeBlockParser

MdIndentedCodeBlockParserFactory

MdListBlockParser

MdListBlockParserFactory

MdListItemParser

MdParagraphParser

MdParser

MdPostProcessor

MdThematicBreakParser

MdThematicBreakParserFactory

Redefined classes

Int

MdNode

String

Sys

Markdown blocks parsing

Introduced classes

class MdBlockContinue

abstract class MdBlockParser

abstract class MdBlockParserFactory

class MdBlockQuoteParser

class MdBlockQuoteParserFactory

class MdBlockStart

class MdDocumentBlockParser

class MdFencedCodeBlockParser

class MdFencedCodeBlockParserFactory

class MdHeadingParser

class MdHeadingParserFactory

class MdHtmlBlockParser

class MdHtmlBlockParserFactory

class MdIndentedCodeBlockParser

class MdIndentedCodeBlockParserFactory

class MdListBlockParser

class MdListBlockParserFactory

class MdListItemParser

class MdParagraphParser

class MdParser

abstract class MdPostProcessor

class MdThematicBreakParser

class MdThematicBreakParserFactory

Redefined classes

redef enum Int

redef abstract class MdNode

redef abstract class String

redef class Sys

Summary

All class definitions

Int

MdBlockContinue

MdBlockParser

MdBlockParserFactory

MdBlockQuoteParser

MdBlockQuoteParserFactory

MdBlockStart

MdDocumentBlockParser

MdFencedCodeBlockParser

MdFencedCodeBlockParserFactory

MdHeadingParser

MdHeadingParserFactory

MdHtmlBlockParser

MdHtmlBlockParserFactory

MdIndentedCodeBlockParser

MdIndentedCodeBlockParserFactory