Markdown blocks parsing

Introduce the parsers for the different Markdown blocks such as headings, lists, code blocks, etc.

Introduced classes

class MdBlockContinue

markdown2 :: MdBlockContinue

Result object for continuing parsing of a block
abstract class MdBlockParser

markdown2 :: MdBlockParser

Parser for a specific block node
abstract class MdBlockParserFactory

markdown2 :: MdBlockParserFactory

Block parser factory for a block node for determining when a block starts
class MdBlockQuoteParser

markdown2 :: MdBlockQuoteParser

Blockquotes parser
class MdBlockQuoteParserFactory

markdown2 :: MdBlockQuoteParserFactory

Blockquotes parser factory
class MdBlockStart

markdown2 :: MdBlockStart

Result object from starting parsing of a block
class MdDocumentBlockParser

markdown2 :: MdDocumentBlockParser

Parser for the whole document
class MdFencedCodeBlockParser

markdown2 :: MdFencedCodeBlockParser

Fenced code blocks parser
class MdFencedCodeBlockParserFactory

markdown2 :: MdFencedCodeBlockParserFactory

Fenced code blocks parser factory
class MdHeadingParser

markdown2 :: MdHeadingParser

Headings parser
class MdHeadingParserFactory

markdown2 :: MdHeadingParserFactory

Heading parser factory
class MdHtmlBlockParser

markdown2 :: MdHtmlBlockParser

Html blocks parser
class MdHtmlBlockParserFactory

markdown2 :: MdHtmlBlockParserFactory

Html blocks parser factory
class MdIndentedCodeBlockParser

markdown2 :: MdIndentedCodeBlockParser

Indented code blocks parser
class MdIndentedCodeBlockParserFactory

markdown2 :: MdIndentedCodeBlockParserFactory

Indented code blocks parser factory
class MdListBlockParser

markdown2 :: MdListBlockParser

List blocks parser
class MdListBlockParserFactory

markdown2 :: MdListBlockParserFactory

List blocks parser factory
class MdListItemParser

markdown2 :: MdListItemParser

List items parser
class MdParagraphParser

markdown2 :: MdParagraphParser

Paragraphs parser
class MdParser

markdown2 :: MdParser

Markdown parser
abstract class MdPostProcessor

markdown2 :: MdPostProcessor

Markdown post processor
class MdThematicBreakParser

markdown2 :: MdThematicBreakParser

Thematic breaks parser
class MdThematicBreakParserFactory

markdown2 :: MdThematicBreakParserFactory

Thematic breaks parser factory

Redefined classes

redef enum Int

markdown2 :: markdown_block_parsing $ Int

Native integer numbers.
redef abstract class MdNode

markdown2 :: markdown_block_parsing $ MdNode

An abstract node
redef abstract class String

markdown2 :: markdown_block_parsing $ String

Immutable sequence of characters.
redef class Sys

markdown2 :: markdown_block_parsing $ Sys

The main class of the program.

All class definitions

redef enum Int

markdown2 :: markdown_block_parsing $ Int

Native integer numbers.
class MdBlockContinue

markdown2 $ MdBlockContinue

Result object for continuing parsing of a block
abstract class MdBlockParser

markdown2 $ MdBlockParser

Parser for a specific block node
abstract class MdBlockParserFactory

markdown2 $ MdBlockParserFactory

Block parser factory for a block node for determining when a block starts
class MdBlockQuoteParser

markdown2 $ MdBlockQuoteParser

Blockquotes parser
class MdBlockQuoteParserFactory

markdown2 $ MdBlockQuoteParserFactory

Blockquotes parser factory
class MdBlockStart

markdown2 $ MdBlockStart

Result object from starting parsing of a block
class MdDocumentBlockParser

markdown2 $ MdDocumentBlockParser

Parser for the whole document
class MdFencedCodeBlockParser

markdown2 $ MdFencedCodeBlockParser

Fenced code blocks parser
class MdFencedCodeBlockParserFactory

markdown2 $ MdFencedCodeBlockParserFactory

Fenced code blocks parser factory
class MdHeadingParser

markdown2 $ MdHeadingParser

Headings parser
class MdHeadingParserFactory

markdown2 $ MdHeadingParserFactory

Heading parser factory
class MdHtmlBlockParser

markdown2 $ MdHtmlBlockParser

Html blocks parser
class MdHtmlBlockParserFactory

markdown2 $ MdHtmlBlockParserFactory

Html blocks parser factory
class MdIndentedCodeBlockParser

markdown2 $ MdIndentedCodeBlockParser

Indented code blocks parser
class MdIndentedCodeBlockParserFactory

markdown2 $ MdIndentedCodeBlockParserFactory

Indented code blocks parser factory
class MdListBlockParser

markdown2 $ MdListBlockParser

List blocks parser
class MdListBlockParserFactory

markdown2 $ MdListBlockParserFactory

List blocks parser factory
class MdListItemParser

markdown2 $ MdListItemParser

List items parser
redef abstract class MdNode

markdown2 :: markdown_block_parsing $ MdNode

An abstract node
class MdParagraphParser

markdown2 $ MdParagraphParser

Paragraphs parser
class MdParser

markdown2 $ MdParser

Markdown parser
abstract class MdPostProcessor

markdown2 $ MdPostProcessor

Markdown post processor
class MdThematicBreakParser

markdown2 $ MdThematicBreakParser

Thematic breaks parser
class MdThematicBreakParserFactory

markdown2 $ MdThematicBreakParserFactory

Thematic breaks parser factory
redef abstract class String

markdown2 :: markdown_block_parsing $ String

Immutable sequence of characters.
redef class Sys

markdown2 :: markdown_block_parsing $ Sys

The main class of the program.
package_diagram markdown2::markdown_block_parsing markdown_block_parsing markdown2::markdown_inline_parsing markdown_inline_parsing markdown2::markdown_block_parsing->markdown2::markdown_inline_parsing markdown2::markdown_ast markdown_ast markdown2::markdown_inline_parsing->markdown2::markdown_ast ...markdown2::markdown_ast ... ...markdown2::markdown_ast->markdown2::markdown_ast markdown2::markdown_github markdown_github markdown2::markdown_github->markdown2::markdown_block_parsing markdown2::markdown_wikilinks markdown_wikilinks markdown2::markdown_wikilinks->markdown2::markdown_block_parsing markdown2::markdown_html_rendering markdown_html_rendering markdown2::markdown_html_rendering->markdown2::markdown_github markdown2::markdown_html_rendering->markdown2::markdown_wikilinks markdown2::markdown_latex_rendering markdown_latex_rendering markdown2::markdown_latex_rendering->markdown2::markdown_github markdown2::markdown_latex_rendering->markdown2::markdown_wikilinks markdown2::markdown_man_rendering markdown_man_rendering markdown2::markdown_man_rendering->markdown2::markdown_github markdown2::markdown_man_rendering->markdown2::markdown_wikilinks markdown2::markdown_md_rendering markdown_md_rendering markdown2::markdown_md_rendering->markdown2::markdown_github markdown2::markdown_md_rendering->markdown2::markdown_wikilinks markdown2::markdown_html_rendering... ... markdown2::markdown_html_rendering...->markdown2::markdown_html_rendering markdown2::markdown_latex_rendering... ... markdown2::markdown_latex_rendering...->markdown2::markdown_latex_rendering markdown2::markdown_man_rendering... ... markdown2::markdown_man_rendering...->markdown2::markdown_man_rendering markdown2::markdown_md_rendering... ... markdown2::markdown_md_rendering...->markdown2::markdown_md_rendering

Ancestors

module abstract_collection

core :: abstract_collection

Abstract collection classes and services.
module abstract_text

core :: abstract_text

Abstract class for manipulation of sequences of characters
module array

core :: array

This module introduces the standard array structure.
module bitset

core :: bitset

Services to handle BitSet
module bytes

core :: bytes

Services for byte streams and arrays
module circular_array

core :: circular_array

Efficient data structure to access both ends of the sequence.
module codec_base

core :: codec_base

Base for codecs to use with streams
module codecs

core :: codecs

Group module for all codec-related manipulations
module collection

core :: collection

This module defines several collection classes.
module core

core :: core

Standard classes and methods used by default by Nit programs and libraries.
module environ

core :: environ

Access to the environment variables of the process
module error

core :: error

Standard error-management infrastructure.
module exec

core :: exec

Invocation and management of operating system sub-processes.
module file

core :: file

File manipulations (create, read, write, etc.)
module fixed_ints

core :: fixed_ints

Basic integers of fixed-precision
module fixed_ints_text

core :: fixed_ints_text

Text services to complement fixed_ints
module flat

core :: flat

All the array-based text representations
module gc

core :: gc

Access to the Nit internal garbage collection mechanism
module hash_collection

core :: hash_collection

Introduce HashMap and HashSet.
module iso8859_1

core :: iso8859_1

Codec for ISO8859-1 I/O
module kernel

core :: kernel

Most basic classes and methods.
module list

core :: list

This module handles doubly linked lists
module markdown_ast

markdown2 :: markdown_ast

Markdown AST representation
module math

core :: math

Mathematical operations
module native

core :: native

Native structures for text and bytes
module numeric

core :: numeric

Advanced services for Numeric types
module protocol

core :: protocol

module queue

core :: queue

Queuing data structures and wrappers
module range

core :: range

Module for range of discrete objects.
module re

core :: re

Regular expression support for all services based on Pattern
module ropes

core :: ropes

Tree-based representation of a String.
module sorter

core :: sorter

This module contains classes used to compare things and sorts arrays.
module stream

core :: stream

Input and output streams of characters
module text

core :: text

All the classes and methods related to the manipulation of text entities
module time

core :: time

Management of time and dates
module union_find

core :: union_find

union–find algorithm using an efficient disjoint-set data structure
module utf8

core :: utf8

Codec for UTF-8 I/O

Parents

module markdown_inline_parsing

markdown2 :: markdown_inline_parsing

Parser for inline markdown

Children

module markdown_github

markdown2 :: markdown_github

Markdown Github mode

Descendants

module a_star-m

a_star-m

module markdown_html_rendering

markdown2 :: markdown_html_rendering

HTML rendering of Markdown documents
module markdown_latex_rendering

markdown2 :: markdown_latex_rendering

LaTeX rendering of Markdown documents
module markdown_man_rendering

markdown2 :: markdown_man_rendering

Manpages rendering of Markdown documents
module markdown_md_rendering

markdown2 :: markdown_md_rendering

Markdown rendering of Markdown documents
module nitmd

markdown2 :: nitmd

A Markdown parser for Nit.
module test_markdown

markdown2 :: test_markdown

Test suites for module markdown
# Markdown blocks parsing
#
# Introduce the parsers for the different Markdown blocks such as headings, lists,
# code blocks, etc.
module markdown_block_parsing

import markdown_inline_parsing

# Markdown parser
#
# Used to create the AST representation of a Markdown document.
class MdParser

	# Inline parser used to parse block content
	#
	# Created lazily on first access.
	# At the end of `parse`, it is handed to the `parse_inlines` method of each
	# registered block parser.
	private var inline_parser = new MdInlineParser is lazy

	# Factories used to detect the start of a new block
	#
	# `find_block_start` tries them in the order listed here and keeps the first
	# factory that accepts the current line.
	private var block_parser_factories: Collection[MdBlockParserFactory] do
		var res = new Array[MdBlockParserFactory]
		res.push(new MdBlockQuoteParserFactory)
		res.push(new MdHeadingParserFactory)
		res.push(new MdFencedCodeBlockParserFactory)
		res.push(new MdHtmlBlockParserFactory)
		res.push(new MdThematicBreakParserFactory)
		res.push(new MdListBlockParserFactory)
		res.push(new MdIndentedCodeBlockParserFactory)
		return res
	end

	# Active block parsers
	#
	# Used as a stack to parse nested blocks.
	# The last element is the innermost block currently open
	# (see `active_block_parser`).
	private var active_block_parsers = new Array[MdBlockParser]

	# All block parsers registered during the current parsing
	#
	# A block parser is added here by `activate_block_parser` and stays registered
	# after being deactivated. It is only removed by `remove_active_block_parser`.
	# `parse` walks this set to parse the inline content of every block.
	private var all_block_parsers = new HashSet[MdBlockParser]

	# Block parser currently on top of the `active_block_parsers` stack
	#
	# This is the innermost open block.
	private fun active_block_parser: MdBlockParser do return active_block_parsers.last

	# Make `block_parser` the active block parser
	#
	# The parser is registered in `all_block_parsers` and pushed on top of the
	# `active_block_parsers` stack.
	private fun activate_block_parser(block_parser: MdBlockParser) do
		all_block_parsers.add(block_parser)
		active_block_parsers.push(block_parser)
	end

	# Pop the `active_block_parser` from the `active_block_parsers` stack
	#
	# The parser stays registered in `all_block_parsers`.
	private fun deactivate_block_parser do active_block_parsers.pop

	# Discard the `active_block_parser` and its block
	#
	# The parser is popped from the stack, unregistered from `all_block_parsers`
	# and its block is unlinked from the tree.
	private fun remove_active_block_parser do
		var removed = active_block_parsers.pop
		all_block_parsers.remove(removed)
		removed.block.unlink
	end

	# Post-processors applied after the parsing of a document
	#
	# They are run by `post_process`, in the order of this array.
	var post_processors = new Array[MdPostProcessor] is writable

	# Currently parsed line
	#
	# Set at the beginning of `incorporate_line`.
	private var line_string: String is noinit

	# Current index (offset) in input `line_string` (starts at 0)
	private var index = 0

	# Current column in input `line_string` (starts at 0)
	#
	# Tab causes column to go to next 4-space tab stop.
	private var column = 0

	# Is the current column within a tab character (partially consumed tab)
	#
	# Only set to true by `set_new_column` when the requested column falls inside a tab.
	private var column_is_in_tab: Bool is noinit

	# Current line in input string (starts at 1)
	private var line = 1

	# Index of the next non-space character starting from `index`
	#
	# Computed by `find_next_non_space`.
	private var next_non_space_index = 0

	# Next non-space column
	#
	# Column matching `next_non_space_index`, computed by `find_next_non_space`.
	private var next_non_space_column = 0

	# Current indent in columns
	#
	# Either by spaces or tab stop of 4, starting from `column`.
	private var indent = 0

	# Is the current `line` blank starting from `index`?
	private var is_blank: Bool is noinit

	# Does a node end with a blank line?
	#
	# Nodes without an entry are considered as not ending with a blank line
	# (see `is_last_line_blank`).
	private var last_line_blank = new HashMap[MdNode, Bool]

	# Reset the parser state
	#
	# Called at the beginning of `parse` so the same parser can be used for
	# several documents.
	private fun initialize do
		# Block parsers bookkeeping
		active_block_parsers.clear
		all_block_parsers.clear
		last_line_blank.clear

		# Position in the input
		line = 1
		index = 0
		column = 0
		column_is_in_tab = false

		# Lookahead computed by `find_next_non_space`
		next_non_space_index = 0
		next_non_space_column = 0
		indent = 0
		is_blank = false
	end

	# Parse the `input` string as a MdDocument
	#
	# The parser state is reset first.
	# Then `input` is split on `\r`, `\n` and `\r\n` line breaks and each line is
	# passed to `incorporate_line`.
	# Finally all the blocks are finalized and their inline content is parsed.
	#
	# Post-processors are not applied here, see `post_process`.
	fun parse(input: String): MdDocument do
		initialize

		var root_parser = new MdDocumentBlockParser(1, 1, 0)
		activate_block_parser(root_parser)

		var input_length = input.length
		var line_start = 0
		loop
			var line_break = find_line_break(input, line_start)
			if line_break == -1 then break
			incorporate_line(input.substring(line_start, line_break - line_start))

			# A `\r\n` pair counts as a single line break
			var next_start = line_break + 1
			if next_start < input_length and input.chars[line_break] == '\r' and input.chars[next_start] == '\n' then
				next_start += 1
			end
			line_start = next_start
			line += 1
			column = 0
		end

		# Incorporate the last line if the input does not end with a line break
		if input_length > 0 and (line_start == 0 or line_start < input_length) then
			incorporate_line(input.substring(line_start, input_length - line_start))
		end
		finalize_blocks(active_block_parsers)

		# Parse the string content of each registered block into inline content,
		# starting from the most recently registered block parser.
		var parsers = all_block_parsers.to_a
		var pos = parsers.length
		while pos > 0 do
			pos -= 1
			parsers[pos].parse_inlines(inline_parser)
		end
		return root_parser.block
	end

	# Apply each of the `post_processors` to `document`
	#
	# Processors are applied in the order of the `post_processors` array.
	fun post_process(document: MdDocument) do
		for post_processor in post_processors do post_processor.post_process(self, document)
	end

	# Analyze a line of text and update the document
	#
	# We parse Markdown text by calling this on each line of `input`.
	#
	# The line is processed in three steps:
	#
	# 1. Each active block parser (except the document one) is asked whether it
	#    continues on this line.
	# 2. From the last matched block, the factories are tried to open new blocks.
	# 3. The remaining text is added to the appropriate block, creating a
	#    paragraph if needed.
	private fun incorporate_line(input: String) do
		line_string = input
		index = 0
		column = 0
		column_is_in_tab = false

		# For each containing block, try to parse the associated line start.
		# The parser at index 0 is skipped and counted as matched:
		# `parse` activates the document block parser first.
		var matches = 1
		for i in [1 .. active_block_parsers.length[ do
			var block_parser = active_block_parsers[i]
			find_next_non_space

			var result = block_parser.try_continue(self)
			if result isa MdBlockContinue then
				if result.is_finalize then
					# The block ends on this line, nothing else to do with the line.
					block_parser.finalize(self)
					return
				else
					# Skip the part of the line consumed by the block
					if result.new_index != -1 then
						set_new_index result.new_index
					else if result.new_column != -1 then
						set_new_column result.new_column
					end
				end
				matches += 1
			else
				# First block that does not continue: deeper blocks are unmatched too.
				break
			end
		end

		var unmatched_block_parsers = active_block_parsers.subarray(
			matches, active_block_parsers.length - matches)
		var last_matched_block_parser = active_block_parsers[matches - 1]
		var block_parser = last_matched_block_parser
		var all_closed = unmatched_block_parsers.is_empty

		# Unless last matched container is a code block, try new container starts,
		# adding children to the last matched container.
		var try_block_starts = block_parser.block isa MdParagraph or
			block_parser.block.is_container

		while try_block_starts do
			find_next_non_space

			# Optimize lookup: a blank line or a line starting with a letter
			# (with an indent lower than 4) is not given to the factories.
			if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
				set_new_index next_non_space_index
				break
			end

			var block_start = find_block_start(block_parser)
			if block_start == null then
				set_new_index next_non_space_index
				break
			end

			# A new block starts: the unmatched blocks of the previous line are over.
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
				all_closed = true
			end

			if block_start.new_index != -1 then
				set_new_index block_start.new_index
			else if block_start.new_column != -1 then
				set_new_column block_start.new_column
			end

			if block_start.replace_active_block_parser then
				remove_active_block_parser
			end

			# Keep looking for nested block starts only if the last added block
			# is a container.
			for new_block_parser in block_start.block_parsers do
				add_child(new_block_parser)
				block_parser = new_block_parser
				try_block_starts = new_block_parser.block.is_container
			end
		end

		# What remains at the offset is a text line.
		# Add the text to the appropriate block.

		# First check for a lazy paragraph continuation
		if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
			add_line
		else
			# Finalize any blocks not matched
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
			end
			propagate_last_line_blank(block_parser, last_matched_block_parser)

			if not block_parser.block.is_container then
				add_line
			else if not is_blank then
				# Create a paragraph container for the line
				add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
				add_line
			end
		end
	end

	# Find what kind of block starts at the current position of the line
	#
	# Each factory of `block_parser_factories` is tried in order.
	# Return the result of the first factory that accepts the line,
	# or `null` if none does.
	private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
		for factory in block_parser_factories do
			var block_start = factory.try_start(self, block_parser)
			if block_start != null then return block_start
		end
		return null
	end

	# Add the block of `block_parser` as a child of the active block
	#
	# Active block parsers whose block cannot contain the new block are finalized
	# first. Then `block_parser` becomes the active block parser.
	private fun add_child(block_parser: MdBlockParser) do
		var child = block_parser.block
		# Finalize the blocks that cannot be a parent of `child`
		loop
			var candidate = active_block_parser
			if candidate.block.can_contain(child) then break
			candidate.finalize(self)
		end
		# Attach the new block to its parent
		active_block_parser.block.append_child(child)
		activate_block_parser(block_parser)
	end

	# Give the rest of the current line to the active block parser
	#
	# The content starts at `index`.
	# We assume the active block parser can accept lines.
	private fun add_line do
		var content: String
		if column_is_in_tab then
			# Our column is in a partially consumed tab.
			# Replace what remains of the tab by spaces up to the next tab stop.
			var after_tab = index + 1
			var rest = line_string.substring(after_tab, line_string.length - after_tab)
			content = (" " * column.columns_to_next_tab_stop) + rest
		else
			content = line_string.substring(index, line_string.length - index)
		end
		active_block_parser.add_line(content)
	end

	# Finalize each block parser of `block_parsers`, from the last to the first
	#
	# An index-based loop is used since finalizing a parser may pop it from
	# `active_block_parsers`, which can be the sequence given as argument.
	private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
		var pos = block_parsers.length
		while pos > 0 do
			pos -= 1
			block_parsers[pos].finalize(self)
		end
	end

	# Consume the character at `index`
	#
	# `index` is moved by one.
	# `column` is moved by one too, unless the consumed character is a tab:
	# in this case `column` goes to the next tab stop.
	private fun advance do
		var width = 1
		if line_string.chars[index] == '\t' then
			width = column.columns_to_next_tab_stop
		end
		index += 1
		column += width
	end

	# Look for the next non-space character starting from `index`
	#
	# Spaces and tabs are skipped, a tab going to the next 4-column tab stop.
	# `index` and `column` are left untouched. Only `next_non_space_index`,
	# `next_non_space_column`, `is_blank` and `indent` are updated.
	private fun find_next_non_space do
		var length = line_string.length
		var pos = index
		var cols = column

		while pos < length do
			var c = line_string.chars[pos]
			if c == ' ' then
				cols += 1
			else if c == '\t' then
				cols += 4 - (cols % 4)
			else
				break
			end
			pos += 1
		end

		# The line is blank if only spaces and tabs were found up to its end
		is_blank = pos >= length
		next_non_space_index = pos
		next_non_space_column = cols
		indent = cols - column
	end

	# Index of the first line break of `input` at or after `start_index`
	#
	# Both `\r` and `\n` are considered as line breaks.
	# Return `-1` if there is no line break left.
	private fun find_line_break(input: String, start_index: Int): Int do
		var pos = start_index
		var length = input.length
		while pos < length do
			var c = input.chars[pos]
			if c == '\n' or c == '\r' then return pos
			pos += 1
		end
		return -1
	end

	# Set the parser `index` at `new_index`
	#
	# Also set `column` and `column_is_in_tab`.
	#
	# `index` is only moved forward with `advance`, so `column` follows the tab stops.
	# It never goes past the end of `line_string`.
	private fun set_new_index(new_index: Int) do
		if new_index >= next_non_space_index then
			# We can start from here, no need to calculate tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		# Stop at `new_index` or at the end of the line, whichever comes first
		while index < new_index and index != line_string.length do
			advance
		end
		# If we're going to an index as opposed to a column, we're never within a tab
		column_is_in_tab = false
	end

	# Set the parser `column` at `new_column`
	#
	# Also set `index` and `column_is_in_tab`.
	#
	# If `new_column` falls inside a tab, `index` stays on this tab and
	# `column_is_in_tab` is set to true so `add_line` can expand the rest of the
	# tab to spaces.
	private fun set_new_column(new_column: Int) do
		if new_column >= next_non_space_column then
			# We can start from here, no need to calculate tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		# Stop at `new_column` or at the end of the line, whichever comes first
		while column < new_column and index != line_string.length do
			advance
		end
		if column > new_column then
			# Last character was a tab and we overshot our target
			index -= 1
			column = new_column
			column_is_in_tab = true
		else
			column_is_in_tab = false
		end
	end

	# Does `block` end with a blank line?
	#
	# For lists and list items, the last child is checked too, recursively.
	# Return false if `block` is null.
	private fun ends_with_blank_line(block: nullable MdNode): Bool do
		var node = block
		while node != null do
			if is_last_line_blank(node) then return true
			# Only lists and list items are looked into
			if not (node isa MdListBlock or node isa MdListItem) then return false
			node = node.last_child
		end
		return false
	end

	# Propagate a blank line to all the parents of the `block_parser` block
	#
	# If the current line is blank, the last child of the block is first marked as
	# ending with a blank line.
	# Then the blank status is stored in `last_line_blank` for the block and for
	# each of its ancestors.
	private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
		var last_child = block_parser.block.last_child
		if is_blank and last_child != null then
			last_line_blank[last_child] = true
		end
		var block = block_parser.block

		# Block quotes lines are never blank as they start with `>`.
		# We don't count blanks in fenced code for purposes of tight/loose lists.
		# We also don't set `last_line_blank` on an empty list item.
		var last_line_blank = is_blank and
			not (block isa MdBlockQuote or
			     block isa MdFencedCodeBlock or
				 (block isa MdListItem and block.first_child == null and
										  block_parser != last_matched_block_parser))

		# Propagate `last_line_blank` up through parents
		# (the local variable hides the attribute, hence the explicit `self`)
		var node: nullable MdNode = block_parser.block
		while node != null do
			self.last_line_blank[node] = last_line_blank
			node = node.parent
		end
	end

	# Is `node` recorded as ending with a blank line?
	#
	# Nodes without an entry in `last_line_blank` are not.
	private fun is_last_line_blank(node: MdNode): Bool do
		return last_line_blank.has_key(node) and last_line_blank[node]
	end
end

# Block parsing

# Parser for a specific block node
#
# A block parser builds one `block` of the document.
# `MdParser` asks it whether the block continues on each new line
# (`try_continue`), gives it the lines of the block (`add_line`) and finalizes
# it when the block is over (`finalize`).
abstract class MdBlockParser

	# Kind of block under construction
	type BLOCK: MdBlock

	# MdBlock under construction
	fun block: BLOCK is abstract

	# Line Start
	var line_start: Int

	# Column start
	var column_start: Int

	# Location at start
	#
	# The location end is initialized at `-1`.
	# Subclasses are expected to set it later (for example in `finalize` or
	# `parse_inlines`).
	var location: MdLocation is lazy do return new MdLocation(line_start, column_start, -1, -1)

	# Column where the content starts
	var content_offset: Int

	# Initialize the current `block`
	#
	# Do nothing by default.
	fun initialize(parser: MdParser) do end

	# Can `self` continue from the current `index` in `state`?
	#
	# Return a new `MdBlockContinue` if `self` can continue parsing.
	# Return null otherwise.
	fun try_continue(state: MdParser): nullable MdBlockContinue is abstract

	# Add `line` to the current `block`
	#
	# Do nothing by default.
	fun add_line(line: String) do end

	# Finalize the current `block`
	#
	# Deactivate `self` from `parser` if `self` is its active block parser.
	fun finalize(parser: MdParser) do
		if parser.active_block_parser == self then
			parser.deactivate_block_parser
		end
	end

	# Parse `block` lines
	#
	# Do nothing by default.
	fun parse_inlines(inline_parser: MdInlineParser) do end
end

# Result object for continuing parsing of a block
#
# Returned by `MdBlockParser::try_continue` to tell the parser where the content
# of the block continues on the current line, or that the block is finished.
# A value of `-1` means that the index (or the column) is not set.
class MdBlockContinue

	# Index from which continue parsing (`-1` if unset)
	var new_index: Int

	# Column from which continue parsing (`-1` if unset)
	var new_column: Int

	# Is the block finalized?
	var is_finalize: Bool

	# Continue from the index `new_index`
	init at_index(new_index: Int) do init(new_index, -1, false)

	# Continue from the column `new_column`
	init at_column(new_column: Int) do init(-1, new_column, false)

	# The block is finished
	init finished do init(-1, -1, true)
end

# Block parser factory for a block node for determining when a block starts
abstract class MdBlockParserFactory

	# Can the associated block parser start at the current line in `parser`?
	#
	# `matched_block_parser` is the block parser the new block would be added to.
	#
	# Return a new `MdBlockStart` if the block parser can start.
	# Return null otherwise.
	fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
		nullable MdBlockStart is abstract
end

# Result object from starting parsing of a block
#
# The setters return `self` so they can be chained:
# `(new MdBlockStart(parsers)).at_index(i).replacing_active_block_parser`.
class MdBlockStart

	# Block parsers for this block start
	var block_parsers: Array[MdBlockParser]

	# Index where the parsing should start (`-1` if unset)
	var new_index = -1

	# Column where the parsing should start (`-1` if unset)
	var new_column = -1

	# Does the block starting with `self` terminate a previous block?
	#
	# If true, the parser removes its active block parser before adding the new blocks.
	var replace_active_block_parser = false

	# Start from `new_index`
	#
	# Return `self`.
	fun at_index(new_index: Int): MdBlockStart do
		self.new_index = new_index
		return self
	end

	# Start from `new_column`
	#
	# Return `self`.
	fun at_column(new_column: Int): MdBlockStart do
		self.new_column = new_column
		return self
	end

	# Start replacing the active block parser
	#
	# Return `self`.
	fun replacing_active_block_parser: MdBlockStart do
		self.replace_active_block_parser = true
		return self
	end
end

# Parser for the whole document
class MdDocumentBlockParser
	super MdBlockParser

	redef type BLOCK: MdDocument
	redef var block = new MdDocument(location) is lazy

	# Always continue at the current index
	redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)

	# Do nothing: the document block parser is never deactivated
	redef fun finalize(parser) do
	end

	# Set the end of the document location from its last child
	#
	# The location end is left untouched if the document is empty.
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child != null then
			location.line_end = last_child.location.line_end
			location.column_end = last_child.location.column_end
		end
	end
end

# Headings parser
#
# Instances are created by `MdHeadingParserFactory` with everything already
# extracted from the line: the heading is complete as soon as it starts.
class MdHeadingParser
	super MdBlockParser

	redef type BLOCK: MdHeading

	redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy

	# Unlike other block parsers, the end of the location is known from the start
	redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy

	# Line end
	var line_end: Int

	# Column end
	var column_end: Int

	# Heading level
	var level: Int

	# Heading content
	var content: String

	# Heading has ATX trailing
	var has_atx_trailing: Bool

	# Heading is setext format
	var is_setext: Bool

	# Never continue parsing as an heading is a one liner
	redef fun try_continue(state) do return null

	# Parse the heading content
	redef fun parse_inlines(inline_parser) do
		inline_parser.parse(content, content_offset, block)
	end
end

# Heading parser factory
#
# Two kinds of headings are recognized, unless the line is indented by 4 columns or more:
#
# * ATX headings, matched with `re_atx_heading`;
# * setext headings, matched with `re_setext_heading`, only when the matched
#   block parser is a paragraph parser with some content.
#   In this case the heading replaces the paragraph.
class MdHeadingParserFactory
	super MdBlockParserFactory

	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var start = state.next_non_space_index
		var paragraph = null
		if matched_block_parser isa MdParagraphParser then
			paragraph = matched_block_parser.content
		end
		var line_content = line.substring(start, line.length - start)

		# ATX heading
		var atx = line_content.search(re_atx_heading)
		if atx != null then
			var marker = atx.subs.first.as(not null)
			var new_offset = start + marker.length
			var level = marker.to_s.trim.length

			# Remove the trailing `#`s from the content
			var after_leading = line.substring(new_offset, line.length - new_offset)
			var trailing = after_leading.search(re_atx_trailing)
			var trailing_length = 0
			if trailing != null then trailing_length = trailing.length
			var content = after_leading.replace(re_atx_trailing, "")

			var heading_parser = new MdHeadingParser(
				state.line,
				start + 1,
				new_offset + 1,
				state.line,
				new_offset + content.length + trailing_length,
				level,
				content,
				trailing != null,
				false)
			return (new MdBlockStart([heading_parser])).at_index(line.length)
		end

		# Setext heading: the current line underlines the previous paragraph
		if paragraph == null then return null
		var setext = line_content.search(re_setext_heading)
		if setext == null then return null

		var underline = setext.subs.first.as(not null).to_s
		var level = if underline.chars.first == '=' then 1 else 2
		var heading_parser = new MdHeadingParser(
			state.line - 1,
			start + 1,
			0,
			state.line,
			state.column + setext.length,
			level,
			paragraph.to_s,
			false,
			true)
		return (new MdBlockStart([heading_parser])).at_index(line.length).replacing_active_block_parser
	end
end

# Blockquotes parser
class MdBlockQuoteParser
	super MdBlockParser

	redef type BLOCK: MdBlockQuote
	redef var block = new MdBlockQuote(location) is lazy

	# Continue if the line starts with a `>` marker indented by less than 4 columns
	#
	# The content continues after the marker and after one optional space or tab.
	redef fun try_continue(state) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var marker_index = state.next_non_space_index
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# Skip the marker and the optional following space or tab
		var new_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then new_column += 1
		return new MdBlockContinue.at_column(new_column)
	end

	# Set the end of the location from the last child of the block quote, if any
	redef fun parse_inlines(inline_parser) do
		var tail = block.last_child
		if tail == null then return
		var tail_location = tail.location
		location.line_end = tail_location.line_end
		location.column_end = tail_location.column_end
	end
end

# Blockquotes parser factory
#
# A block quote starts with a `>` marker indented by less than 4 columns.
class MdBlockQuoteParserFactory
	super MdBlockParserFactory

	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var marker_index = state.next_non_space_index
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# The content starts after the marker and after one optional space or tab
		var content_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then content_column += 1

		var quote_parser = new MdBlockQuoteParser(state.line, state.column + 1, content_column)
		return (new MdBlockStart([quote_parser])).at_column(content_column)
	end
end

# Indented code blocks parser
class MdIndentedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdIndentedCodeBlock
	redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy

	# Indent is tab?
	var use_tabs: Bool

	# Block content
	#
	# Lines are accumulated here, separated by `\n`, until `finalize`.
	var content = new Buffer

	# Continue on lines indented by at least 4 columns and on blank lines
	redef fun try_continue(state) do
		if state.indent >= 4 then
			# Skip the 4 columns of indentation
			return new MdBlockContinue.at_column(state.column + 4)
		else if state.is_blank then
			return new MdBlockContinue.at_index(state.next_non_space_index)
		end
		return null
	end

	# Append `line` to `content`, inserting a `\n` between lines
	redef fun add_line(line) do
		if not content.is_empty then
			content.add('\n')
		end
		content.append(line)
	end

	# Set the block literal and the end of its location
	redef fun finalize(parser) do
		super

		# Add a last line made of a single space, then replace the match of
		# `re_trailing_blank_lines` by a single `\n`.
		# NOTE(review): presumably this strips trailing blank lines so the literal
		# ends with exactly one `\n` -- confirm against `re_trailing_blank_lines`.
		add_line(" ")
		var content = self.content.to_s
		var literal = content.replace_first(re_trailing_blank_lines, "\n")
		block.literal = literal

		# `lines.length - 2` assumes the last element of `lines` is the empty string
		# following the final `\n` of the literal.
		var lines = literal.split("\n")
		location.line_end = location.line_start + lines.length - 2
		location.column_end = content_offset + lines[lines.length - 2].length + 4
	end
end

# Indented code blocks parser factory
#
# Opens a `MdIndentedCodeBlockParser` on non-blank lines indented by 4 columns or more.
class MdIndentedCodeBlockParserFactory
	super MdBlockParserFactory

	# Start an indented code block
	#
	# Return `null` if the line is indented by less than 4 columns, is blank,
	# or if the active block is a paragraph.
	redef fun try_start(state, matched_block_parser) do
		if state.indent < 4 or state.is_blank then return null
		if state.active_block_parser.block isa MdParagraph then return null

		var tab_indented = state.line_string.has_prefix("\t")
		var code_parser = new MdIndentedCodeBlockParser(
			state.line,
			state.column + 1,
			state.column,
			tab_indented)
		return (new MdBlockStart([code_parser])).at_column(state.column + 4)
	end
end

# Fenced code blocks parser
#
# Collects the lines found between an opening fence and its closing fence.
class MdFencedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdFencedCodeBlock
	redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy

	# Character used by the opening fence
	var fence_char: Char

	# Number of fence characters in the opening fence
	var fence_length: Int

	# Indentation of the opening fence
	var fence_indent: Int

	# First line received through `add_line`, used as info string by `finalize`
	var first_line: nullable String = null

	# Every following line, each one terminated by `\n`
	var other_lines = new Buffer

	# Finish the block on a closing fence, otherwise continue with the line
	#
	# A closing fence is indented by at most 3 columns, uses `fence_char` and is
	# at least `fence_length` long.
	# On other lines, up to `fence_indent` leading spaces are skipped.
	redef fun try_continue(state) do
		var line = state.line_string
		var fence_index = state.next_non_space_index

		if state.indent <= 3 and fence_index < line.length and line.chars[fence_index] == fence_char then
			var rest = line.substring(fence_index, line.length - fence_index)
			var closing = rest.search(re_closing_fence)
			if closing != null and closing.subs[0].as(not null).length >= fence_length then
				# closing fence - we're at end of line, so we can finalize now
				return new MdBlockContinue.finished
			end
		end

		# skip optional spaces of fence indent
		var index = state.index
		var remaining = fence_indent
		while remaining > 0 and index < line.length and line.chars[index] == ' ' do
			index += 1
			remaining -= 1
		end
		return new MdBlockContinue.at_index(index)
	end

	# Keep the first line apart, append the next ones to `other_lines`
	redef fun add_line(line) do
		if first_line == null then
			first_line = line
			return
		end
		other_lines.append(line)
		other_lines.add('\n')
	end

	# Set the block info string and literal, then compute the end of the location
	redef fun finalize(parser) do
		super

		# the first line holds the info string
		var opening = first_line
		if opening != null then
			var info = opening.trim.unescape_string
			if not info.is_empty then block.info = info
		end

		var literal = other_lines.to_s
		block.literal = literal

		var line_count = literal.split("\n").length
		location.line_end = location.line_start + line_count
		location.column_end = content_offset + fence_indent + fence_length
	end
end

# Fenced code blocks parser factory
#
# Opens a `MdFencedCodeBlockParser` on lines starting with a fence of
# at least 3 backticks or 3 tildes.
class MdFencedCodeBlockParserFactory
	super MdBlockQuoteParserFactory

	# Start a fenced code block if the line opens with a valid fence
	#
	# Return `null` when:
	#
	# * the line is indented by 4 columns or more,
	# * the line does not match `re_opening_fence`,
	# * the fence character appears again after the fence
	#   (a backtick after a backtick fence, a tilde after a tilde fence).
	#
	# A tilde found after a backtick fence does not prevent the block from starting:
	# only the character of the fence itself is forbidden in the rest of the line.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string

		var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
		if match == null then return null

		# the first group is set for backtick fences, the third one for tilde fences
		var fence_length
		var fence_char
		var sub0 = match.subs[0]
		if sub0 != null then
			fence_length = sub0.length
			fence_char = sub0.to_s.chars.first
		else
			fence_length = match.subs[2].as(not null).length
			fence_char = match.subs[2].as(not null).to_s.chars.first
		end

		# reject the fence only if its own character shows up again later in the line
		var forbidden = "[^~]+~"
		if fence_char == '`' then forbidden = "[^`]+`"
		if match.to_s.has(forbidden.to_re) then return null

		return (new MdBlockStart(
			[new MdFencedCodeBlockParser(
				state.line,
				state.column + 1,
				state.column,
				fence_char,
				fence_length,
				state.indent)]
			)).at_index(next_non_space + fence_length)
	end
end

# List blocks parser
#
# Holds a whole list (ordered or unordered); the items are handled by `MdListItemParser`.
class MdListBlockParser
	super MdBlockParser

	redef type BLOCK: MdListBlock

	redef var block is lazy do
		if not is_ordered then return new MdUnorderedList(location, bullet.as(not null))
		return new MdOrderedList(location, digit.as(not null), delim.as(not null))
	end

	# Is this list ordered
	var is_ordered: Bool

	# List bullet if unordered
	var bullet: nullable Char

	# List digit if ordered
	var digit: nullable Int

	# List delimiter if ordered
	var delim: nullable Char

	# A list always continues, without consuming anything from the line
	redef fun try_continue(state) do
		return new MdBlockContinue.at_index(state.index)
	end

	# Mark the list as not tight when blank lines separate its items or their children
	redef fun finalize(parser) do
		super

		var current = block.first_child
		while current != null do
			# a blank line ending an item that is not the last one
			if parser.ends_with_blank_line(current) and current.next != null then
				block.is_tight = false
				break
			end
			# look at the children of the item for blank lines between any of them
			var child = current.first_child
			while child != null do
				if parser.ends_with_blank_line(child) and
				   (current.next != null or child.next != null) then
					block.is_tight = false
					break
				end
				child = child.next
			end
			current = current.next
		end
	end

	# Stretch the list location up to the end of its last child, if any
	redef fun parse_inlines(inline_parser) do
		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end

# List blocks parser factory
#
# Starts a list item on lines beginning with a list marker and,
# when needed, the list block that contains it.
class MdListBlockParserFactory
	super MdBlockQuoteParserFactory

	# Start a list item, preceded by a new list block if the item does not fit the matched list
	#
	# Return `null` if the line is indented by 4 columns or more outside of a list,
	# or if `parse_list_marker` finds no acceptable marker.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null

		var marker_index = state.next_non_space_index
		var marker_column = state.column + state.indent

		# a paragraph parser still holding content restricts which markers are accepted
		var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
		var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
		if list_data == null then return null


		var new_column = list_data.content_column
		# the last argument is the item content indent, relative to the current column
		var list_item_parser = new MdListItemParser(
			state.line,
			state.column + 1,
			new_column,
			new_column - state.column)

		# prepend the list block if needed
		if not matched_block_parser isa MdListBlockParser or not lists_match(matched_block_parser.block, list_data) then
			var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
			list_block_parser.block.is_tight = true

			return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
		end
		return (new MdBlockStart([list_item_parser])).at_column(new_column)
	end

	# Parse the list marker found in `line` at `marker_index`
	#
	# `marker_column` is the column of the marker.
	# `in_paragraph` tells if the marker would interrupt a paragraph, in which case
	# an ordered item must start at 1 and the item can not be empty.
	#
	# Return the parsed data, or `null` if there is no acceptable marker.
	# `state` is not used by this method.
	private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
		var rest = line.substring(marker_index, line.length - marker_index)
		var match = rest.search(re_list_marker)
		if match == null then return null

		var is_ordered
		var bullet = null
		var digit = null
		var delim = null

		# the first group is set for bullet markers, otherwise the marker is ordered
		var bullet_match = match.subs[0]
		if bullet_match != null then
			is_ordered = false
			bullet = bullet_match.to_s.chars[0]
		else
			is_ordered = true
			digit = match.subs[2].as(not null).to_s.to_i
			delim = match.subs[3].as(not null).to_s.chars[0]
		end

		# the space or tab matched after the marker is not part of the marker
		var marker_length = match.length
		if match.to_s.has_suffix(" ") or match.to_s.has_suffix("\t") then
			marker_length -= 1
		end
		var index_after_marker = marker_index + marker_length

		# marker doesn't include tabs, so counting them as column directly is ok
		var column_after_marker = marker_column + marker_length
		# the column within the line where the content starts
		var content_column = column_after_marker

		# see at which column the content starts if there is content
		var has_content = false
		for i in [index_after_marker .. line.length[ do
			var c = line.chars[i]
			if c == '\t' then
				content_column += content_column.columns_to_next_tab_stop
			else if c == ' ' then
				content_column += 1
			else
				has_content = true
				break
			end
		end

		if in_paragraph then
			# if the list item is ordered, then start number must be 1 to interrupt a paragraph
			if is_ordered and digit != 1 then
				return null
			end
			# empty list item can not interrupt a paragraph
			if not has_content then
				return null
			end
		end

		if not has_content or (content_column - column_after_marker) > 4 then
			# if this line is blank or has a code block, default to 1 space after marker
			content_column = column_after_marker + 1
		end
		return new MdListData(is_ordered, bullet, digit, delim, content_column)
	end

	# Return true if the two list items are of the same type
	#
	# With the same delimiter and bullet character.
	# This is used in agglomerating list items into lists
	private fun lists_match(a: MdListBlock, b: MdListData): Bool do
		if a isa MdUnorderedList and not b.is_ordered then
			return a.bullet_marker == b.bullet
		else if a isa MdOrderedList and b.is_ordered then
			return a.delimiter == b.delim
		end
		return false
	end
end

# Parsed list data
#
# Result of the parsing of a list marker.
private class MdListData

	# Is the marker the one of an ordered list?
	var is_ordered: Bool

	# Bullet character of the marker, `null` for ordered markers
	var bullet: nullable Char

	# Number of the marker, `null` for bullet markers
	var digit: nullable Int

	# Delimiter following the number, `null` for bullet markers
	var delim: nullable Char

	# Column the content start at
	var content_column: Int
end

# List items parser
#
# Keeps a `MdListItem` open while the lines are indented enough to belong to it.
class MdListItemParser
	super MdBlockParser

	redef type BLOCK: MdListItem
	redef var block = new MdListItem(location) is lazy

	# List item content indent
	var content_indent: Int

	# Continue the item on sufficiently indented lines and on blank lines
	#
	# Return `null` on a blank line following an item that is still empty,
	# and on non-blank lines indented by less than `content_indent`.
	redef fun try_continue(state) do
		if not state.is_blank then
			if state.indent < content_indent then return null
			return new MdBlockContinue.at_column(state.column + content_indent)
		end
		# blank line after empty list item
		if block.first_child == null then return null
		return new MdBlockContinue.at_index(state.next_non_space_index)
	end

	# Stretch the item location up to the end of its last child, if any
	redef fun parse_inlines(inline_parser) do
		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end

# Thematic breaks parser
#
# A thematic break holds on a single line and is never continued.
class MdThematicBreakParser
	super MdBlockParser

	redef type BLOCK: MdThematicBreak
	redef var block = new MdThematicBreak(location, pattern) is lazy

	# Text that was matched as a thematic break
	var pattern: String

	# A thematic break never spans another line
	redef fun try_continue(state) do
		return null
	end

	# End the location on the starting line, after the last character of `pattern`
	redef fun finalize(parser) do
		super

		var last_column = column_start + pattern.length - 1
		location.line_end = line_start
		location.column_end = last_column
	end
end

# Thematic breaks parser factory
#
# Opens a `MdThematicBreakParser` on lines matching `re_thematic_break`.
class MdThematicBreakParserFactory
	super MdBlockQuoteParserFactory

	# Start a thematic break, consuming the whole line
	#
	# Return `null` if the line is indented by 4 columns or more,
	# or if what follows the indentation is not a thematic break.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var line = state.line_string
		var start_index = state.next_non_space_index
		var rest = line.substring(start_index, line.length - start_index)
		var found = rest.search(re_thematic_break)
		if found == null then return null

		var break_parser = new MdThematicBreakParser(
			state.line,
			state.column + 1,
			start_index,
			found.to_s)
		return (new MdBlockStart([break_parser])).at_index(line.length)
	end
end

# Paragraphs parser
#
# Gathers the lines of a paragraph until a blank line is met.
class MdParagraphParser
	super MdBlockParser

	redef type BLOCK: MdParagraph

	redef var block = new MdParagraph(location) is lazy

	# Paragraph content
	#
	# Set to `null` by `finalize` when only link reference definitions were found.
	var content: nullable Buffer = new Buffer

	# Continue on any non-blank line, without consuming anything from it
	redef fun try_continue(state) do
		if not state.is_blank then return new MdBlockContinue.at_index(state.index)
		return null
	end

	# Append `line` to `content`, inserting a newline between consecutive lines
	#
	# Do nothing once `content` has been set to `null`.
	redef fun add_line(line) do
		var buffer = content
		if buffer == null then return
		if not buffer.is_empty then buffer.add('\n')
		buffer.append(line)
	end

	# Strip the leading link reference definitions from the content
	#
	# The block is unlinked if definitions were found and nothing but blanks remains.
	redef fun finalize(parser) do
		super

		var inline_parser = parser.inline_parser
		var buffer = content
		if buffer == null then return

		var remaining = buffer.to_s
		var found_definitions = false

		# try parsing the beginning as link reference definitions
		var consumed = inline_parser.parse_reference(remaining)
		while remaining.length > 3 and remaining.chars[0] == '[' and consumed != 0 do
			remaining = remaining.substring(consumed, remaining.length - consumed)
			found_definitions = true
			consumed = inline_parser.parse_reference(remaining)
		end

		if found_definitions and remaining.is_blank then
			block.unlink
			self.content = null
		else
			self.content = new Buffer.from_text(remaining)
		end
	end

	# Parse the content as inlines, then stretch the location up to the last child
	redef fun parse_inlines(inline_parser) do
		var buffer = content
		if buffer == null then return
		inline_parser.parse(buffer.to_s, content_offset, block)

		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end

# Html blocks parser
#
# Gathers the raw lines of an HTML block until its end condition is met.
class MdHtmlBlockParser
	super MdBlockParser

	redef type BLOCK: MdHtmlBlock
	redef var block = new MdHtmlBlock(location) is lazy

	# Closing tag pattern
	#
	# Or null if the block is not closed
	var closing_pattern: nullable Pattern

	# Is the current block finished?
	var finished = false

	# Raw lines collected so far, separated by `\n`
	var content = new Buffer

	# Continue until the block is finished
	#
	# Blocks without a `closing_pattern` also end on a blank line.
	redef fun try_continue(state) do
		if finished then return null
		if closing_pattern == null and state.is_blank then return null
		return new MdBlockContinue.at_index(state.index)
	end

	# Append `line` to `content` and mark the block as finished if `line` closes it
	redef fun add_line(line) do
		if not content.is_empty then content.add('\n')
		content.append(line)

		var closer = closing_pattern
		if closer == null then return
		if line.has(closer) then finished = true
	end

	# Set the block literal and compute the end of the block location
	redef fun finalize(parser) do
		super

		var literal = content.to_s
		block.literal = literal

		var lines = literal.split("\n")
		location.line_end = location.line_start + lines.length - 1
		location.column_end = lines.last.length
	end
end

# Html blocks parser factory
#
# Opens a `MdHtmlBlockParser` on lines starting with one of the openers of `re_html_blocks`.
class MdHtmlBlockParserFactory
	super MdBlockParserFactory

	# Start an HTML block if the line begins with a known HTML opener
	#
	# The entries of `re_html_blocks` are tried in order and the first matching
	# opener wins. Its closing pattern (possibly `null`) is given to the parser.
	#
	# Return `null` if the line is indented by 4 columns or more, if there is no
	# character at the first non-space index, if this character is not `<`,
	# or if no opener matches.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string

		# same bounds check as the block quote factory: never index past the end of the line
		if next_non_space >= line.length then return null
		if line.chars[next_non_space] != '<' then return null

		# the text to match is the same for every block type
		var rest = line.substring(next_non_space, line.length - next_non_space)

		for block_type in [0..6] do
			# type 7 can not interrupt a paragraph
			if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
			var patterns = re_html_blocks[block_type]
			var opener = patterns.first
			var closer = patterns.last
			if rest.has(opener.as(not null)) then
				return (new MdBlockStart(
					[new MdHtmlBlockParser(
						state.line,
						state.column + 1,
						next_non_space,
						closer)])
					).at_index(state.index)
			end
		end
		return null
	end
end

# Post Processing

# Markdown post processor
#
# A visitor of the Markdown AST, meant to be run on a document once it is parsed.
abstract class MdPostProcessor
	super MdVisitor

	# Document being processed
	#
	# Available only during a call to `post_process`.
	var document: nullable MdDocument = null

	# Post process the `document` parsed by `parser`
	#
	# `self.document` is set for the duration of the visit, then reset to `null`.
	fun post_process(parser: MdParser, document: MdDocument) do
		self.document = document
		enter_visit(document)
		self.document = null
	end

	# Call `MdNode::post_process`
	redef fun visit(node) do
		node.post_process(self)
	end
end

redef class MdNode

	# Accept the visit of a `MdPostProcessor`
	#
	# By default, only forward the visit with `visit_all`.
	fun post_process(v: MdPostProcessor) do
		visit_all(v)
	end
end

# Utils

redef class Sys
	# ATX headings matching
	#
	# 1 to 6 `#` at the start of the text, followed by spaces, tabs or the end of the text.
	private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re

	# ATX trailings matching
	#
	# A run of `#` ending the text, optionally followed by spaces or tabs.
	# The run starts the text or is preceded by spaces or tabs.
	private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re

	# SeText headings matching
	#
	# A run of `=` or a run of `-`, optionally followed by spaces or tabs.
	private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re

	# Blank lines matching
	#
	# One or more trailing newlines, each optionally followed by spaces or tabs.
	var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re

	# Opening fence matching
	#
	# 3 or more backticks (first group) or 3 or more tildes (third group),
	# each followed by the rest of the text (second and fourth groups).
	var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re

	# Closing fence matching
	#
	# 3 or more backticks or tildes, followed only by spaces.
	var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re

	# List marker matching
	#
	# Either a bullet (`*`, `+` or `-`), or 1 to 9 digits followed by `.` or `)`.
	# The marker must be followed by a space, a tab or the end of the text.
	var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re

	# Thematic break pattern
	#
	# 3 or more `*`, `_` or `-` (all the same), optionally separated by spaces or tabs.
	var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re

	# HTML blocks patterns
	#
	# One `[opening, closing]` pair per kind of HTML block, 7 pairs in total.
	# The closing pattern is `null` for the last two pairs.
	var re_html_blocks: Array[Array[nullable Regex]] do
		var blocks = new Array[Array[nullable Regex]]

		# index 0: `script`, `pre` and `style` elements
		var re0_opening = "^<(script|pre|style)(\\s|>|$)".to_re
		re0_opening.ignore_case = true
		var re0_closing = "</(script|pre|style)>".to_re
		re0_closing.ignore_case = true
		blocks.add([re0_opening, re0_closing])

		# index 1: comments
		blocks.add([
			"^<!--".to_re,
			"-->".to_re
		])

		# index 2: `<?` ... `?>`
		blocks.add([
			"^<[?]".to_re,
			"\\?>".to_re
		])

		# index 3: `<!` followed by an uppercase letter, closed by `>`
		blocks.add([
			"^<![A-Z]".to_re,
			">".to_re
		])

		# index 4: CDATA sections
		blocks.add([
			"^<!\\[CDATA\\[".to_re,
			"\\]\\]>".to_re
		])

		# index 5: opening or closing tag of one of the listed elements, no closing pattern
		var re5_opening = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
		re5_opening.ignore_case = true
		blocks.add([re5_opening, null])

		# index 6: any complete open or close tag alone on its line, no closing pattern
		var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
		var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
		var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
		var p_squoted_value = "'[^']*'"
		var p_dquoted_value = "\"[^\"]*\""
		var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
		var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
		var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
		var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
		var p_closetag = "</{p_tagname}\\s*[>]"
		var re6_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
		re6_opening.ignore_case = true
		blocks.add([re6_opening, null])

		return blocks
	end
end

redef class Int

	# Number of columns between `self` and the next tab stop
	#
	# Tab stops are set every 4 columns.
	private fun columns_to_next_tab_stop: Int do
		var offset = self % 4
		return 4 - offset
	end
end

redef class String

	# Is this string blank?
	#
	# i.e. made only of spaces, tabs, `\n` and `\r`. The empty string is blank.
	private fun is_blank: Bool do
		for c in chars do
			if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
		end
		return true
	end

	# Is the character at `index` a space or a tab
	#
	# Return false if `index >= self.length`.
	private fun is_space_or_tab(index: Int): Bool do
		if index < length then
			var c = chars[index]
			return c == ' ' or c == '\t'
		end
		return false
	end
end
lib/markdown2/markdown_block_parsing.nit:15,1--1503,3