--- /dev/null
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Markdown blocks parsing
+#
+# Introduce the parsers for the different Markdown blocks such as headings, lists,
+# code blocks, etc.
+module markdown_block_parsing
+
+import markdown_inline_parsing
+
+# Markdown parser
+#
+# Used to create the AST representation of a Markdown document.
+class MdParser
+
+ # Inline parser used to parse the textual content of blocks
+ #
+ # Lazily instantiated, then handed by `parse` to the `parse_inlines` method
+ # of each registered block parser.
+ private var inline_parser = new MdInlineParser is lazy
+
+ # Factories used to detect the start of a new block
+ #
+ # `find_block_start` tries them in the order they are listed here and keeps
+ # the first one that accepts the line.
+ private var block_parser_factories: Collection[MdBlockParserFactory] do
+ 	var res = new Array[MdBlockParserFactory]
+ 	res.add(new MdBlockQuoteParserFactory)
+ 	res.add(new MdHeadingParserFactory)
+ 	res.add(new MdFencedCodeBlockParserFactory)
+ 	res.add(new MdHtmlBlockParserFactory)
+ 	res.add(new MdThematicBreakParserFactory)
+ 	res.add(new MdListBlockParserFactory)
+ 	res.add(new MdIndentedCodeBlockParserFactory)
+ 	return res
+ end
+
+ # Active block parsers
+ #
+ # Used as a stack to parse nested blocks: the last entry is the innermost
+ # block currently being parsed.
+ private var active_block_parsers = new Array[MdBlockParser]
+
+ # Every block parser activated during the current parse
+ #
+ # Unlike `active_block_parsers`, a parser stays registered here once
+ # deactivated (unless dropped by `remove_active_block_parser`), so that
+ # `parse` can call `parse_inlines` on each of them.
+ private var all_block_parsers = new HashSet[MdBlockParser]
+
+ # Innermost block parser currently open
+ #
+ # This is the top of the `active_block_parsers` stack.
+ private fun active_block_parser: MdBlockParser do return active_block_parsers.last
+
+ # Push `block_parser` on top of the `active_block_parsers` stack
+ #
+ # The parser is also recorded in `all_block_parsers`.
+ private fun activate_block_parser(block_parser: MdBlockParser) do
+ 	all_block_parsers.add(block_parser)
+ 	active_block_parsers.add(block_parser)
+ end
+
+ # Pop the `active_block_parser` from the `active_block_parsers` stack
+ #
+ # The parser stays registered in `all_block_parsers`.
+ private fun deactivate_block_parser do active_block_parsers.pop
+
+ # Discard the `active_block_parser`
+ #
+ # The parser is popped from the stack, unregistered from `all_block_parsers`
+ # and its block is unlinked.
+ private fun remove_active_block_parser do
+ 	var removed = active_block_parsers.pop
+ 	all_block_parsers.remove(removed)
+ 	removed.block.unlink
+ end
+
+ # Post-processors run, in order, by `post_process`
+ var post_processors = new Array[MdPostProcessor] is writable
+
+ # Currently parsed line
+ private var line_string: String is noinit
+
+ # Current index (offset) in input `line_string` (starts at 0)
+ private var index = 0
+
+ # Current column in input `line_string` (starts at 0)
+ #
+ # A tab moves the column to the next 4-space tab stop.
+ private var column = 0
+
+ # Is the current column within a tab character (partially consumed tab)?
+ private var column_is_in_tab: Bool is noinit
+
+ # Current line in input string (starts at 1)
+ private var line = 1
+
+ # Index of the next non-space character starting from `index`
+ private var next_non_space_index = 0
+
+ # Column of the next non-space character starting from `column`
+ private var next_non_space_column = 0
+
+ # Current indent in columns
+ #
+ # Number of columns between `column` and `next_non_space_column`,
+ # made of spaces or of tabs expanded to tab stops of 4.
+ private var indent = 0
+
+ # Is the current `line` blank starting from `index`?
+ private var is_blank: Bool is noinit
+
+ # Does a node end with a blank line?
+ #
+ # Nodes without an entry are considered as not ending with a blank line.
+ private var last_line_blank = new HashMap[MdNode, Bool]
+
+ # Reset the parser state before parsing a new input
+ #
+ # Positions go back to the start of the first line and every parser or node
+ # registered by a previous parse is forgotten.
+ private fun initialize do
+ 	# Position in the input
+ 	line = 1
+ 	index = 0
+ 	column = 0
+ 	column_is_in_tab = false
+
+ 	# Look-ahead computed by `find_next_non_space`
+ 	next_non_space_index = 0
+ 	next_non_space_column = 0
+ 	indent = 0
+ 	is_blank = false
+
+ 	# Parsers and nodes bookkeeping
+ 	active_block_parsers.clear
+ 	all_block_parsers.clear
+ 	last_line_blank.clear
+ end
+
+ # Parse the `input` string as a MdDocument
+ #
+ # The input is split on `\n`, `\r` and `\r\n` and each line is handed to
+ # `incorporate_line`. Once all the blocks are finalized, the inline content
+ # of every registered block parser is parsed.
+ fun parse(input: String): MdDocument do
+ 	initialize
+
+ 	var root_parser = new MdDocumentBlockParser(1, 1, 0)
+ 	activate_block_parser(root_parser)
+
+ 	var input_length = input.length
+ 	var line_start = 0
+ 	var line_break = find_line_break(input, line_start)
+ 	while line_break != -1 do
+ 		incorporate_line(input.substring(line_start, line_break - line_start))
+
+ 		# A `\r\n` pair counts as a single line break
+ 		var is_crlf = line_break + 1 < input_length and
+ 			input.chars[line_break] == '\r' and
+ 			input.chars[line_break + 1] == '\n'
+ 		var break_length = if is_crlf then 2 else 1
+ 		line_start = line_break + break_length
+
+ 		line += 1
+ 		column = 0
+ 		line_break = find_line_break(input, line_start)
+ 	end
+
+ 	# Incorporate the pending last line when the input does not end with a line break
+ 	if input_length > 0 and (line_start == 0 or line_start < input_length) then
+ 		incorporate_line(input.substring(line_start, input_length - line_start))
+ 	end
+ 	finalize_blocks(active_block_parsers)
+
+ 	# Parse the string content of each registered block into inline content,
+ 	# walking the registered parsers from the last one to the first one.
+ 	var parsers = all_block_parsers.to_a
+ 	var i = parsers.length
+ 	while i > 0 do
+ 		i -= 1
+ 		parsers[i].parse_inlines(inline_parser)
+ 	end
+ 	return root_parser.block
+ end
+
+ # Apply each of the `post_processors`, in order, to `document`
+ fun post_process(document: MdDocument) do
+ 	for processor in post_processors do processor.post_process(self, document)
+ end
+
+ # Analyze a line of text and update the document
+ #
+ # We parse Markdown text by calling this on each line of `input`.
+ #
+ # The line goes through three steps:
+ #
+ # 1. the active block parsers are asked, from the outermost to the innermost,
+ #    whether they continue on this line;
+ # 2. new block starts are looked for after the last matched parser;
+ # 3. the remaining text is added to the appropriate block.
+ private fun incorporate_line(input: String) do
+ line_string = input
+ index = 0
+ column = 0
+ column_is_in_tab = false
+
+ # For each containing block, try to parse the associated line start.
+ # The parser at index 0 is skipped by the loop and counted as matched.
+ var matches = 1
+ for i in [1 .. active_block_parsers.length[ do
+ var block_parser = active_block_parsers[i]
+ find_next_non_space
+
+ var result = block_parser.try_continue(self)
+ if result isa MdBlockContinue then
+ if result.is_finalize then
+ # The parser reports that this line ends its block:
+ # finalize it, nothing else is done with the line.
+ block_parser.finalize(self)
+ return
+ else
+ # Move after what the parser consumed
+ if result.new_index != -1 then
+ set_new_index result.new_index
+ else if result.new_column != -1 then
+ set_new_column result.new_column
+ end
+ end
+ matches += 1
+ else
+ # First parser that does not continue: inner parsers are not tried
+ break
+ end
+ end
+
+ # Parsers that did not match this line (copied, so finalizing them
+ # does not disturb the iteration).
+ var unmatched_block_parsers = active_block_parsers.subarray(
+ matches, active_block_parsers.length - matches)
+ var last_matched_block_parser = active_block_parsers[matches - 1]
+ var block_parser = last_matched_block_parser
+ var all_closed = unmatched_block_parsers.is_empty
+
+ # Unless last matched container is a code block, try new container starts,
+ # adding children to the last matched container.
+ var try_block_starts = block_parser.block isa MdParagraph or
+ block_parser.block.is_container
+
+ while try_block_starts do
+ find_next_non_space
+
+ # Optimize lookup: skip the factories for blank lines and for lines
+ # indented by less than 4 columns that start with a letter
+ # (presumably because no block start begins with a letter).
+ if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
+ set_new_index next_non_space_index
+ break
+ end
+
+ var block_start = find_block_start(block_parser)
+ if block_start == null then
+ set_new_index next_non_space_index
+ break
+ end
+
+ # A new block starts here: the unmatched blocks are closed first
+ if not all_closed then
+ finalize_blocks(unmatched_block_parsers)
+ all_closed = true
+ end
+
+ if block_start.new_index != -1 then
+ set_new_index block_start.new_index
+ else if block_start.new_column != -1 then
+ set_new_column block_start.new_column
+ end
+
+ # The new block takes the place of the active one
+ if block_start.replace_active_block_parser then
+ remove_active_block_parser
+ end
+
+ # Keep looking for nested starts only if the last added block is a container
+ for new_block_parser in block_start.block_parsers do
+ add_child(new_block_parser)
+ block_parser = new_block_parser
+ try_block_starts = new_block_parser.block.is_container
+ end
+ end
+
+ # What remains at the offset is a text line.
+ # Add the text to the appropriate block.
+
+ # First check for a lazy paragraph continuation
+ if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
+ add_line
+ else
+ # Finalize any blocks not matched
+ if not all_closed then
+ finalize_blocks(unmatched_block_parsers)
+ end
+ propagate_last_line_blank(block_parser, last_matched_block_parser)
+
+ if not block_parser.block.is_container then
+ # The block accepts lines directly
+ add_line
+ else if not is_blank then
+ # Create a paragraph container for the line
+ add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
+ add_line
+ end
+ end
+ end
+
+ # Look for a block starting at the current position of the parsed line
+ #
+ # The `block_parser_factories` are tried in order; the first non-null
+ # `MdBlockStart` is returned. Return null when no factory accepts the line.
+ private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
+ 	for factory in block_parser_factories do
+ 		var block_start = factory.try_start(self, block_parser)
+ 		if block_start == null then continue
+ 		return block_start
+ 	end
+ 	return null
+ end
+
+ # Add the block of `block_parser` as a child of the active block
+ #
+ # Active parsers whose block cannot contain the new block are finalized first.
+ # `block_parser` then becomes the active block parser.
+ private fun add_child(block_parser: MdBlockParser) do
+ 	# Finalize the active parsers until one can be the parent
+ 	loop
+ 		var candidate = active_block_parser
+ 		if candidate.block.can_contain(block_parser.block) then break
+ 		candidate.finalize(self)
+ 	end
+ 	# Attach the new block to its parent
+ 	active_block_parser.block.append_child(block_parser.block)
+ 	activate_block_parser(block_parser)
+ end
+
+ # Give the rest of the current line to the active block parser
+ #
+ # We assume the active block parser accepts lines.
+ private fun add_line do
+ 	var content: String
+ 	if not column_is_in_tab then
+ 		content = line_string.substring(index, line_string.length - index)
+ 	else
+ 		# The column is inside a partially consumed tab:
+ 		# replace what remains of the tab by spaces up to the next tab stop.
+ 		var buffer = new Buffer
+ 		var padding = column.columns_to_next_tab_stop
+ 		var added = 0
+ 		while added < padding do
+ 			buffer.add(' ')
+ 			added += 1
+ 		end
+ 		var after_tab = index + 1
+ 		buffer.append(line_string.substring(after_tab, line_string.length - after_tab))
+ 		content = buffer.write_to_string
+ 	end
+ 	active_block_parser.add_line(content)
+ end
+
+ # Finalize each of the `block_parsers`, from the last one to the first one
+ #
+ # The sequence is accessed by index, so it may shrink while parsers are
+ # finalized (as when it is `active_block_parsers` itself).
+ private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
+ 	var remaining = block_parsers.length
+ 	while remaining > 0 do
+ 		remaining -= 1
+ 		block_parsers[remaining].finalize(self)
+ 	end
+ end
+
+ # Move `index` one character forward and update `column` accordingly
+ #
+ # A tab moves the column to the next tab stop, any other character
+ # moves it by one.
+ private fun advance do
+ 	var width = 1
+ 	if line_string.chars[index] == '\t' then width = column.columns_to_next_tab_stop
+ 	index += 1
+ 	column += width
+ end
+
+ # Locate the next non-space character of `line_string` starting from `index`
+ #
+ # Neither `index` nor `column` is moved: only `next_non_space_index`,
+ # `next_non_space_column`, `is_blank` and `indent` are updated.
+ # Spaces count for one column, tabs go to the next tab stop of 4.
+ private fun find_next_non_space do
+ 	var length = line_string.length
+ 	var i = index
+ 	var cols = column
+ 	var blank = true
+
+ 	while i < length do
+ 		var c = line_string.chars[i]
+ 		if c == ' ' then
+ 			cols += 1
+ 		else if c == '\t' then
+ 			cols += 4 - (cols % 4)
+ 		else
+ 			# Found a non-space character
+ 			blank = false
+ 			break
+ 		end
+ 		i += 1
+ 	end
+
+ 	is_blank = blank
+ 	next_non_space_index = i
+ 	next_non_space_column = cols
+ 	indent = cols - column
+ end
+
+ # Index of the first line break of `input` at or after `start_index`
+ #
+ # Both `\r` and `\n` are line breaks.
+ # Return -1 if there is none.
+ private fun find_line_break(input: String, start_index: Int): Int do
+ 	var length = input.length
+ 	var i = start_index
+ 	while i < length do
+ 		var c = input.chars[i]
+ 		if c == '\n' or c == '\r' then return i
+ 		i += 1
+ 	end
+ 	return -1
+ end
+
+ # Move the parser forward to `new_index`
+ #
+ # `column` follows the characters that are passed and `column_is_in_tab`
+ # is reset. The parser never goes past the end of `line_string`.
+ private fun set_new_index(new_index: Int) do
+ 	if new_index >= next_non_space_index then
+ 		# Jump to the known non-space position: no tab stop to compute again
+ 		index = next_non_space_index
+ 		column = next_non_space_column
+ 	end
+ 	var line_length = line_string.length
+ 	loop
+ 		if index >= new_index or index == line_length then break
+ 		advance
+ 	end
+ 	# An index (as opposed to a column) never points inside a tab
+ 	column_is_in_tab = false
+ end
+
+ # Move the parser forward to `new_column`
+ #
+ # `index` follows the characters that are passed. When `new_column` falls
+ # inside a tab, `index` stays on that tab and `column_is_in_tab` is set.
+ private fun set_new_column(new_column: Int) do
+ 	if new_column >= next_non_space_column then
+ 		# Jump to the known non-space position: no tab stop to compute again
+ 		index = next_non_space_index
+ 		column = next_non_space_column
+ 	end
+ 	var line_length = line_string.length
+ 	loop
+ 		if column >= new_column or index == line_length then break
+ 		advance
+ 	end
+ 	# Going past the target means the last character was a tab
+ 	var overshot = column > new_column
+ 	if overshot then
+ 		index -= 1
+ 		column = new_column
+ 	end
+ 	column_is_in_tab = overshot
+ end
+
+ # Does `block` end with a blank line?
+ #
+ # For lists and list items, the last child is checked too, recursively.
+ private fun ends_with_blank_line(block: nullable MdNode): Bool do
+ 	var node = block
+ 	while node != null do
+ 		if is_last_line_blank(node) then return true
+ 		# Only lists and list items are searched deeper
+ 		if not (node isa MdListBlock or node isa MdListItem) then return false
+ 		node = node.last_child
+ 	end
+ 	return false
+ end
+
+ # Record whether the current line is blank for the `block_parser` block and all its parents
+ private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
+ 	var block = block_parser.block
+ 	var blank = is_blank
+
+ 	# A blank line is first recorded on the last child of the block
+ 	var last_child = block.last_child
+ 	if blank and last_child != null then last_line_blank[last_child] = true
+
+ 	# Block quotes lines are never blank as they start with `>`.
+ 	# We don't count blanks in fenced code for purposes of tight/loose lists.
+ 	# We also don't set `last_line_blank` on an empty list item.
+ 	var is_empty_item = block isa MdListItem and block.first_child == null and
+ 		block_parser != last_matched_block_parser
+ 	var ends_blank = blank and not (block isa MdBlockQuote) and
+ 		not (block isa MdFencedCodeBlock) and not is_empty_item
+
+ 	# Set the flag on the block and on each of its ancestors
+ 	var node: nullable MdNode = block
+ 	while node != null do
+ 		last_line_blank[node] = ends_blank
+ 		node = node.parent
+ 	end
+ end
+
+ # Is the last line of `node` recorded as blank?
+ #
+ # Nodes that were never recorded are not blank.
+ private fun is_last_line_blank(node: MdNode): Bool do
+ 	if last_line_blank.has_key(node) then return last_line_blank[node]
+ 	return false
+ end
+end
+
+# Block parsing
+
+# Parser in charge of building one specific block node
+abstract class MdBlockParser
+
+ # Kind of block under construction
+ type BLOCK: MdBlock
+
+ # Block node under construction
+ fun block: BLOCK is abstract
+
+ # Line where the block starts
+ var line_start: Int
+
+ # Column where the block starts
+ var column_start: Int
+
+ # Location of the block
+ #
+ # Only the start is known at creation: the end line and column are
+ # initialized at `-1` and are expected to be set later by the subclasses.
+ var location: MdLocation is lazy do
+ 	return new MdLocation(line_start, column_start, -1, -1)
+ end
+
+ # Column where the content starts
+ var content_offset: Int
+
+ # Initialize the current `block`
+ #
+ # Does nothing by default.
+ fun initialize(parser: MdParser) do end
+
+ # Can `self` continue from the current `index` of the parser `state`?
+ #
+ # Return a new `MdBlockContinue` if `self` can continue parsing.
+ # Return null otherwise.
+ fun try_continue(state: MdParser): nullable MdBlockContinue is abstract
+
+ # Add `line` to the current `block`
+ #
+ # Does nothing by default.
+ fun add_line(line: String) do end
+
+ # Finalize the current `block`
+ #
+ # By default, only deactivate `self` if it is the active block parser of `parser`.
+ fun finalize(parser: MdParser) do
+ 	if parser.active_block_parser != self then return
+ 	parser.deactivate_block_parser
+ end
+
+ # Parse the inline content of `block` with `inline_parser`
+ #
+ # Does nothing by default.
+ fun parse_inlines(inline_parser: MdInlineParser) do end
+end
+
+# Answer of a block parser that continues on the current line
+class MdBlockContinue
+
+ # Index from which to continue parsing, `-1` if unset
+ var new_index: Int
+
+ # Column from which to continue parsing, `-1` if unset
+ var new_column: Int
+
+ # Is the block finalized by the current line?
+ var is_finalize: Bool
+
+ # Continue parsing from the index `new_index`
+ init at_index(new_index: Int) do init(new_index, -1, false)
+
+ # Continue parsing from the column `new_column`
+ init at_column(new_column: Int) do init(-1, new_column, false)
+
+ # The block is finished: neither an index nor a column is set
+ init finished do init(-1, -1, true)
+end
+
+# Block parser factory for a block node, used to determine when a block starts
+abstract class MdBlockParserFactory
+
+ # Can the associated block parser start at the current line in `parser`?
+ #
+ # `matched_block_parser` is the block parser the new block would be added to.
+ #
+ # Return a new `MdBlockStart` if the block parser can start.
+ # Return null otherwise.
+ fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
+ nullable MdBlockStart is abstract
+end
+
+# Result object from starting parsing of a block
+#
+# The setters return `self` so they can be chained.
+class MdBlockStart
+
+ # Block parsers for this block start
+ #
+ # They are added as nested children, in order.
+ var block_parsers: Array[MdBlockParser]
+
+ # Index where the parsing should start, `-1` if unset
+ var new_index = -1
+
+ # Column where the parsing should start, `-1` if unset
+ var new_column = -1
+
+ # Does the block starting with `self` replace the active block parser?
+ var replace_active_block_parser = false
+
+ # Start from `new_index`
+ #
+ # Return `self`.
+ fun at_index(new_index: Int): MdBlockStart do
+ self.new_index = new_index
+ return self
+ end
+
+ # Start from `new_column`
+ #
+ # Return `self`.
+ fun at_column(new_column: Int): MdBlockStart do
+ self.new_column = new_column
+ return self
+ end
+
+ # Start replacing the active block parser
+ #
+ # Return `self`.
+ fun replacing_active_block_parser: MdBlockStart do
+ self.replace_active_block_parser = true
+ return self
+ end
+end
+
+# Parser for the whole document
+class MdDocumentBlockParser
+ super MdBlockParser
+
+ redef type BLOCK: MdDocument
+ redef var block = new MdDocument(location) is lazy
+
+ # The document always continues, at the current index
+ redef fun try_continue(state) do
+ 	return new MdBlockContinue.at_index(state.index)
+ end
+
+ # Do nothing: unlike other block parsers, the document parser is not deactivated
+ redef fun finalize(parser) do end
+
+ # Set the end of the document location to the end of its last child, if any
+ redef fun parse_inlines(inline_parser) do
+ 	var last_child = block.last_child
+ 	if last_child == null then return
+ 	var child_location = last_child.location
+ 	location.line_end = child_location.line_end
+ 	location.column_end = child_location.column_end
+ end
+end
+
+# Headings parser
+#
+# A heading is entirely known when its parser is created: the location end,
+# level and raw content are given to the constructor.
+class MdHeadingParser
+ super MdBlockParser
+
+ redef type BLOCK: MdHeading
+
+ redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy
+
+ # Complete location, as the end is already known
+ redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy
+
+ # Line end
+ var line_end: Int
+
+ # Column end
+ var column_end: Int
+
+ # Heading level
+ var level: Int
+
+ # Heading content (raw text, parsed by `parse_inlines`)
+ var content: String
+
+ # Heading has ATX trailing
+ var has_atx_trailing: Bool
+
+ # Heading is setext format
+ var is_setext: Bool
+
+ # Never continue parsing as a heading is a one liner
+ redef fun try_continue(state) do return null
+
+ # Parse the heading content
+ redef fun parse_inlines(inline_parser) do
+ inline_parser.parse(content, content_offset, block)
+ end
+end
+
+# Heading parser factory
+#
+# Recognize both ATX headings and setext headings.
+class MdHeadingParserFactory
+ super MdBlockParserFactory
+
+ redef fun try_start(state, matched_block_parser) do
+ 	if state.indent >= 4 then return null
+
+ 	var line = state.line_string
+ 	var start = state.next_non_space_index
+ 	var line_content = line.substring(start, line.length - start)
+
+ 	# ATX heading
+ 	var atx = line_content.search(re_atx_heading)
+ 	if atx != null then
+ 		var marker = atx.subs.first.as(not null)
+ 		var new_offset = start + marker.length
+ 		var atx_level = marker.to_s.trim.length
+
+ 		# Remove trailing ###s
+ 		var after_leading = line.substring(new_offset, line.length - new_offset)
+ 		var trailing = after_leading.search(re_atx_trailing)
+ 		var trailing_length = 0
+ 		if trailing != null then trailing_length = trailing.length
+ 		var atx_content = after_leading.replace(re_atx_trailing, "")
+
+ 		var atx_parser = new MdHeadingParser(
+ 			state.line,
+ 			start + 1,
+ 			new_offset + 1,
+ 			state.line,
+ 			new_offset + atx_content.length + trailing_length,
+ 			atx_level,
+ 			atx_content,
+ 			trailing != null,
+ 			false)
+ 		return (new MdBlockStart([atx_parser])).at_index(line.length)
+ 	end
+
+ 	# Setext heading: the line underlines the content of the matched paragraph
+ 	var paragraph = null
+ 	if matched_block_parser isa MdParagraphParser then
+ 		paragraph = matched_block_parser.content
+ 	end
+ 	if paragraph == null then return null
+
+ 	var setext = line_content.search(re_setext_heading)
+ 	if setext == null then return null
+
+ 	# `=` underlines a level 1 heading, anything else a level 2 heading
+ 	var underline = setext.subs.first.as(not null).to_s
+ 	var setext_level = if underline.chars.first == '=' then 1 else 2
+
+ 	# The heading replaces the paragraph started on the previous line
+ 	var setext_parser = new MdHeadingParser(
+ 		state.line - 1,
+ 		start + 1,
+ 		0,
+ 		state.line,
+ 		state.column + setext.length,
+ 		setext_level,
+ 		paragraph.to_s,
+ 		false,
+ 		true)
+ 	return (new MdBlockStart([setext_parser])).at_index(line.length).replacing_active_block_parser
+ end
+end
+
+# Blockquotes parser
+class MdBlockQuoteParser
+ super MdBlockParser
+
+ redef type BLOCK: MdBlockQuote
+ redef var block = new MdBlockQuote(location) is lazy
+
+ # Continue if the line starts with `>` indented by less than 4 columns
+ redef fun try_continue(state) do
+ 	if state.indent >= 4 then return null
+
+ 	var line = state.line_string
+ 	var marker_index = state.next_non_space_index
+ 	if marker_index >= line.length then return null
+ 	if line.chars[marker_index] != '>' then return null
+
+ 	# Skip the marker and one optional following space or tab
+ 	var new_column = state.column + state.indent + 1
+ 	if line.is_space_or_tab(marker_index + 1) then new_column += 1
+ 	return new MdBlockContinue.at_column(new_column)
+ end
+
+ # Set the end of the block location to the end of its last child, if any
+ redef fun parse_inlines(inline_parser) do
+ 	var last_child = block.last_child
+ 	if last_child == null then return
+ 	var child_location = last_child.location
+ 	location.line_end = child_location.line_end
+ 	location.column_end = child_location.column_end
+ end
+end
+
+# Blockquotes parser factory
+class MdBlockQuoteParserFactory
+ super MdBlockParserFactory
+
+ # Start a block quote if the line starts with `>` indented by less than 4 columns
+ redef fun try_start(state, matched_block_parser) do
+ 	if state.indent >= 4 then return null
+
+ 	var line = state.line_string
+ 	var marker_index = state.next_non_space_index
+ 	if marker_index >= line.length then return null
+ 	if line.chars[marker_index] != '>' then return null
+
+ 	# Skip the marker and one optional following space or tab
+ 	var new_column = state.column + state.indent + 1
+ 	if line.is_space_or_tab(marker_index + 1) then new_column += 1
+
+ 	var quote_parser = new MdBlockQuoteParser(state.line, state.column + 1, new_column)
+ 	return (new MdBlockStart([quote_parser])).at_column(new_column)
+ end
+end
+
+# Indented code blocks parser
+class MdIndentedCodeBlockParser
+ super MdBlockParser
+
+ redef type BLOCK: MdIndentedCodeBlock
+ redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy
+
+ # Indent is tab?
+ var use_tabs: Bool
+
+ # Block content, lines being separated by `\n`
+ var content = new Buffer
+
+ # Continue on lines indented by at least 4 columns and on blank lines
+ redef fun try_continue(state) do
+ 	if state.indent >= 4 then return new MdBlockContinue.at_column(state.column + 4)
+ 	if state.is_blank then return new MdBlockContinue.at_index(state.next_non_space_index)
+ 	return null
+ end
+
+ redef fun add_line(line) do
+ 	if not content.is_empty then content.add('\n')
+ 	content.append(line)
+ end
+
+ # Set the block literal and the end of its location
+ redef fun finalize(parser) do
+ 	super
+
+ 	# Add a last blank line so the trailing blank lines are replaced by a single `\n`
+ 	add_line(" ")
+ 	var raw = content.to_s
+ 	var literal = raw.replace_first(re_trailing_blank_lines, "\n")
+ 	block.literal = literal
+
+ 	# NOTE(review): the entry before the last one is presumably the last code
+ 	# line, the last entry being what follows the final `\n` -- confirm.
+ 	var lines = literal.split("\n")
+ 	var last = lines.length - 2
+ 	location.line_end = location.line_start + last
+ 	location.column_end = content_offset + lines[last].length + 4
+ end
+end
+
+# Indented code blocks parser factory
+class MdIndentedCodeBlockParserFactory
+ super MdBlockParserFactory
+
+ # Start a code block on a non-blank line indented by at least 4 columns
+ #
+ # The active block must not be a paragraph.
+ redef fun try_start(state, matched_block_parser) do
+ 	if state.indent < 4 or state.is_blank then return null
+ 	if state.active_block_parser.block isa MdParagraph then return null
+
+ 	var code_parser = new MdIndentedCodeBlockParser(
+ 		state.line,
+ 		state.column + 1,
+ 		state.column,
+ 		state.line_string.has_prefix("\t"))
+ 	return (new MdBlockStart([code_parser])).at_column(state.column + 4)
+ end
+end
+
+# Fenced code blocks parser
+class MdFencedCodeBlockParser
+ super MdBlockParser
+
+ redef type BLOCK: MdFencedCodeBlock
+ redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy
+
+ # Fence character
+ var fence_char: Char
+
+ # Fence length
+ var fence_length: Int
+
+ # Fence indent
+ var fence_indent: Int
+
+ # First line added to the block, used as the info string
+ var first_line: nullable String = null
+
+ # Lines added after the first one, each followed by `\n`
+ var other_lines = new Buffer
+
+ # Finish the block on a closing fence, continue on any other line
+ redef fun try_continue(state) do
+ 	var line = state.line_string
+ 	var fence_start = state.next_non_space_index
+
+ 	if state.indent <= 3 and fence_start < line.length and
+ 		line.chars[fence_start] == fence_char then
+ 		var rest = line.substring(fence_start, line.length - fence_start)
+ 		var closing = rest.search(re_closing_fence)
+ 		if closing != null and closing.subs[0].as(not null).length >= fence_length then
+ 			# Closing fence: we're at end of line, so we can finalize now
+ 			return new MdBlockContinue.finished
+ 		end
+ 	end
+
+ 	# Skip at most `fence_indent` leading spaces
+ 	var new_index = state.index
+ 	var remaining = fence_indent
+ 	while remaining > 0 and new_index < line.length and line.chars[new_index] == ' ' do
+ 		new_index += 1
+ 		remaining -= 1
+ 	end
+ 	return new MdBlockContinue.at_index(new_index)
+ end
+
+ redef fun add_line(line) do
+ 	if first_line == null then
+ 		first_line = line
+ 		return
+ 	end
+ 	other_lines.append(line)
+ 	other_lines.add('\n')
+ end
+
+ # Set the block info string, its literal and the end of its location
+ redef fun finalize(parser) do
+ 	super
+
+ 	# The first line becomes the info string, unless it is empty
+ 	var info_line = first_line
+ 	if info_line != null then
+ 		var info = info_line.trim.unescape_string
+ 		if not info.is_empty then block.info = info
+ 	end
+
+ 	var literal = other_lines.to_s
+ 	block.literal = literal
+
+ 	var line_count = literal.split("\n").length
+ 	location.line_end = location.line_start + line_count
+ 	location.column_end = content_offset + fence_indent + fence_length
+ end
+end
+
+# Fenced code blocks parser factory
+#
+# Specializes the abstract `MdBlockParserFactory` like the other factories:
+# a fenced code block factory is not a kind of block quote factory.
+class MdFencedCodeBlockParserFactory
+ super MdBlockParserFactory
+
+ # Start a fenced code block if the line opens a fence indented by less than 4 columns
+ redef fun try_start(state, matched_block_parser) do
+ 	if state.indent >= 4 then return null
+
+ 	var next_non_space = state.next_non_space_index
+ 	var line = state.line_string
+
+ 	var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
+ 	if match == null then return null
+
+ 	# The fence is captured either by the group 0 or by the group 2
+ 	# of `re_opening_fence`.
+ 	var fence_length
+ 	var fence_char
+ 	var sub0 = match.subs[0]
+ 	if sub0 != null then
+ 		fence_length = sub0.length
+ 		fence_char = sub0.to_s.chars.first
+ 	else
+ 		var sub2 = match.subs[2].as(not null)
+ 		fence_length = sub2.length
+ 		fence_char = sub2.to_s.chars.first
+ 	end
+
+ 	# Reject the fence if the matched text has another fence character
+ 	# after a character that is not one.
+ 	# NOTE(review): the tilde test also runs for backtick fences whose
+ 	# matched text passed the backtick test -- confirm this is intended.
+ 	var matched = match.to_s
+ 	if fence_char == '`' and matched.has("[^`]+`".to_re) then
+ 		return null
+ 	else if matched.has("[^~]+~".to_re) then
+ 		return null
+ 	end
+
+ 	return (new MdBlockStart(
+ 		[new MdFencedCodeBlockParser(
+ 			state.line,
+ 			state.column + 1,
+ 			state.column,
+ 			fence_char,
+ 			fence_length,
+ 			state.indent)]
+ 	)).at_index(next_non_space + fence_length)
+ end
+end
+
+# List blocks parser
+class MdListBlockParser
+ super MdBlockParser
+
+ redef type BLOCK: MdListBlock
+
+ # An ordered or an unordered list, depending on `is_ordered`
+ redef var block is lazy do
+ 	if not is_ordered then return new MdUnorderedList(location, bullet.as(not null))
+ 	return new MdOrderedList(location, digit.as(not null), delim.as(not null))
+ end
+
+ # Is this list ordered
+ var is_ordered: Bool
+
+ # List bullet if unordered
+ var bullet: nullable Char
+
+ # List digit if ordered
+ var digit: nullable Int
+
+ # List delimiter if ordered
+ var delim: nullable Char
+
+ # A list always continues, at the current index
+ redef fun try_continue(state) do
+ 	return new MdBlockContinue.at_index(state.index)
+ end
+
+ # Mark the list as not tight if blank lines separate its items or their children
+ redef fun finalize(parser) do
+ 	super
+
+ 	var item = block.first_child
+ 	while item != null do
+ 		var has_next_item = item.next != null
+ 		# Non-final list item ending with a blank line
+ 		if has_next_item and parser.ends_with_blank_line(item) then
+ 			block.is_tight = false
+ 			break
+ 		end
+ 		# Look at the children of the item for blank lines between any of them
+ 		var sub_item = item.first_child
+ 		while sub_item != null do
+ 			if parser.ends_with_blank_line(sub_item) and
+ 				(has_next_item or sub_item.next != null) then
+ 				block.is_tight = false
+ 				break
+ 			end
+ 			sub_item = sub_item.next
+ 		end
+ 		item = item.next
+ 	end
+ end
+
+ # Set the end of the list location to the end of its last child, if any
+ redef fun parse_inlines(inline_parser) do
+ 	var last_child = block.last_child
+ 	if last_child == null then return
+ 	var child_location = last_child.location
+ 	location.line_end = child_location.line_end
+ 	location.column_end = child_location.column_end
+ end
+end
+
+# List blocks parser factory
+#
+# Specializes the abstract `MdBlockParserFactory` like the other factories:
+# a list factory is not a kind of block quote factory.
+class MdListBlockParserFactory
+ super MdBlockParserFactory
+
+ # Start a list item, and its list if needed, when the line starts with a list marker
+ redef fun try_start(state, matched_block_parser) do
+ 	if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null
+
+ 	var marker_index = state.next_non_space_index
+ 	var marker_column = state.column + state.indent
+
+ 	var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
+ 	var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
+ 	if list_data == null then return null
+
+ 	var new_column = list_data.content_column
+ 	var list_item_parser = new MdListItemParser(
+ 		state.line,
+ 		state.column + 1,
+ 		new_column,
+ 		new_column - state.column)
+
+ 	# The item goes in the matched list when their markers are of the same kind
+ 	if matched_block_parser isa MdListBlockParser and lists_match(matched_block_parser.block, list_data) then
+ 		return (new MdBlockStart([list_item_parser])).at_column(new_column)
+ 	end
+
+ 	# Otherwise prepend a new list block
+ 	var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
+ 	list_block_parser.block.is_tight = true
+ 	return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
+ end
+
+ # Parse the list marker starting at `marker_index` in `line`
+ #
+ # `marker_column` is the column of the marker and `in_paragraph` tells if
+ # the marker would interrupt a paragraph.
+ #
+ # Return null if there is no marker or if it cannot start a list item here.
+ private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
+ 	var rest = line.substring(marker_index, line.length - marker_index)
+ 	var match = rest.search(re_list_marker)
+ 	if match == null then return null
+
+ 	var is_ordered
+ 	var bullet = null
+ 	var digit = null
+ 	var delim = null
+
+ 	# The group 0 captures a bullet, the groups 2 and 3 the number and
+ 	# delimiter of an ordered marker.
+ 	var bullet_match = match.subs[0]
+ 	if bullet_match != null then
+ 		is_ordered = false
+ 		bullet = bullet_match.to_s.chars[0]
+ 	else
+ 		is_ordered = true
+ 		digit = match.subs[2].as(not null).to_s.to_i
+ 		delim = match.subs[3].as(not null).to_s.chars[0]
+ 	end
+
+ 	# A space or tab matched after the marker is not part of the marker
+ 	var matched = match.to_s
+ 	var marker_length = match.length
+ 	if matched.has_suffix(" ") or matched.has_suffix("\t") then
+ 		marker_length -= 1
+ 	end
+ 	var index_after_marker = marker_index + marker_length
+
+ 	# The marker doesn't include tabs, so counting them as column directly is ok
+ 	var column_after_marker = marker_column + marker_length
+ 	# The column within the line where the content starts
+ 	var content_column = column_after_marker
+
+ 	# See at which column the content starts if there is content
+ 	var has_content = false
+ 	for i in [index_after_marker .. line.length[ do
+ 		var c = line.chars[i]
+ 		if c == '\t' then
+ 			content_column += content_column.columns_to_next_tab_stop
+ 		else if c == ' ' then
+ 			content_column += 1
+ 		else
+ 			has_content = true
+ 			break
+ 		end
+ 	end
+
+ 	if in_paragraph then
+ 		# If the list item is ordered, then start number must be 1 to interrupt a paragraph
+ 		if is_ordered and digit != 1 then
+ 			return null
+ 		end
+ 		# An empty list item cannot interrupt a paragraph
+ 		if not has_content then
+ 			return null
+ 		end
+ 	end
+
+ 	if not has_content or (content_column - column_after_marker) > 4 then
+ 		# If this line is blank or has a code block, default to 1 space after marker
+ 		content_column = column_after_marker + 1
+ 	end
+ 	return new MdListData(is_ordered, bullet, digit, delim, content_column)
+ end
+
+ # Return true if the list `a` and the marker `b` are of the same type
+ #
+ # That is, both unordered with the same bullet, or both ordered with the
+ # same delimiter.
+ # This is used in agglomerating list items into lists.
+ private fun lists_match(a: MdListBlock, b: MdListData): Bool do
+ 	if a isa MdUnorderedList and not b.is_ordered then
+ 		return a.bullet_marker == b.bullet
+ 	else if a isa MdOrderedList and b.is_ordered then
+ 		return a.delimiter == b.delim
+ 	end
+ 	return false
+ end
+end
+
+# Parsed list data
+#
+# Result of the parsing of a list marker.
+private class MdListData
+
+ # Is the marker the one of an ordered list?
+ var is_ordered: Bool
+
+ # Bullet character, if unordered
+ var bullet: nullable Char
+
+ # Number of the marker, if ordered
+ var digit: nullable Int
+
+ # Delimiter following the number, if ordered
+ var delim: nullable Char
+
+ # Column the content starts at
+ var content_column: Int
+end
+
+# List items parser
+class MdListItemParser
+ super MdBlockParser
+
+ redef type BLOCK: MdListItem
+ redef var block = new MdListItem(location) is lazy
+
+ # List item content indent
+ var content_indent: Int
+
+ # Continue on blank lines (unless the item is empty) and on lines
+ # indented by at least `content_indent` columns
+ redef fun try_continue(state) do
+ 	if state.is_blank then
+ 		# A blank line after an empty list item ends the item
+ 		if block.first_child == null then return null
+ 		return new MdBlockContinue.at_index(state.next_non_space_index)
+ 	end
+ 	if state.indent < content_indent then return null
+ 	return new MdBlockContinue.at_column(state.column + content_indent)
+ end
+
+ # Set the end of the item location to the end of its last child, if any
+ redef fun parse_inlines(inline_parser) do
+ 	var last_child = block.last_child
+ 	if last_child == null then return
+ 	var child_location = last_child.location
+ 	location.line_end = child_location.line_end
+ 	location.column_end = child_location.column_end
+ end
+end
+
+# Parser for thematic breaks
+class MdThematicBreakParser
+	super MdBlockParser
+
+	redef type BLOCK: MdThematicBreak
+	redef var block = new MdThematicBreak(location, pattern) is lazy
+
+	# Thematic break pattern
+	#
+	# Its length gives the end column of the block, see `finalize`.
+	var pattern: String
+
+	# A thematic break is never continued on the next line.
+	redef fun try_continue(state) do
+		return null
+	end
+
+	# Close the location on the starting line, `pattern.length` columns wide.
+	redef fun finalize(parser) do
+		super
+
+		var last_column = column_start + pattern.length - 1
+		location.line_end = line_start
+		location.column_end = last_column
+	end
+end
+
+# Thematic breaks parser factory
+#
+# Starts a `MdThematicBreakParser` on the lines matching `re_thematic_break`.
+class MdThematicBreakParserFactory
+	# A thematic break factory is a plain block parser factory,
+	# it shares nothing with the block quote factory.
+	super MdBlockParserFactory
+
+	# Try to start a thematic break at the first non-space character of the line
+	#
+	# Return null if the line is indented by 4 or more, or if the text after
+	# the indentation does not match `re_thematic_break`.
+	# On success, the block start is placed at the end of the line.
+	redef fun try_start(state, matched_block_parser) do
+		if state.indent >= 4 then return null
+
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+		# only the text after the indentation is matched
+		var rest = line.substring(next_non_space, line.length - next_non_space)
+		var tbreak = rest.search(re_thematic_break)
+		if tbreak == null then return null
+
+		var parser = new MdThematicBreakParser(
+			state.line,
+			state.column + 1,
+			next_non_space,
+			tbreak.to_s)
+		return (new MdBlockStart([parser])).at_index(line.length)
+	end
+end
+
+# Parser for paragraphs
+class MdParagraphParser
+	super MdBlockParser
+
+	redef type BLOCK: MdParagraph
+
+	redef var block = new MdParagraph(location) is lazy
+
+	# Paragraph content
+	#
+	# Set to null by `finalize` when the paragraph holds nothing but
+	# link reference definitions.
+	var content: nullable Buffer = new Buffer
+
+	# A paragraph goes on until a blank line.
+	redef fun try_continue(state) do
+		if state.is_blank then
+			return null
+		else
+			return new MdBlockContinue.at_index(state.index)
+		end
+	end
+
+	# Append `line` to `content`, the lines being separated by a `\n`.
+	#
+	# Do nothing once `content` is null.
+	redef fun add_line(line) do
+		var buffer = self.content
+		if buffer == null then return
+		if not buffer.is_empty then buffer.add('\n')
+		buffer.append(line)
+	end
+
+	# Remove the leading link reference definitions from `content`
+	#
+	# If at least one definition was found and only blank text remains,
+	# `block` is unlinked and `content` is set to null.
+	redef fun finalize(parser) do
+		super
+
+		var inline_parser = parser.inline_parser
+		var buffer = self.content
+		if buffer == null then return
+
+		var text = buffer.to_s
+		var found_reference = false
+
+		# try parsing the beginning as link reference definitions,
+		# `pos` is used as the index where the remaining text starts
+		var pos = inline_parser.parse_reference(text)
+		loop
+			if text.length <= 3 then break
+			if text.chars[0] != '[' then break
+			if pos == 0 then break
+			text = text.substring(pos, text.length - pos)
+			found_reference = true
+			pos = inline_parser.parse_reference(text)
+		end
+
+		if not found_reference or not text.is_blank then
+			# some paragraph text remains after the definitions
+			self.content = new Buffer.from_text(text)
+			return
+		end
+		# only definitions were found: drop the block
+		block.unlink
+		self.content = null
+	end
+
+	# Parse `content` as the inlines of `block`, then align the end of
+	# the location on the end of the last child, if any.
+	redef fun parse_inlines(inline_parser) do
+		var buffer = self.content
+		if buffer == null then return
+		inline_parser.parse(buffer.to_s, content_offset, block)
+
+		var tail = block.last_child
+		if tail == null then return
+		location.line_end = tail.location.line_end
+		location.column_end = tail.location.column_end
+	end
+end
+
+# Parser for HTML blocks
+class MdHtmlBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdHtmlBlock
+	redef var block = new MdHtmlBlock(location) is lazy
+
+	# Closing tag pattern
+	#
+	# Null for the blocks that are ended by a blank line instead of a pattern.
+	var closing_pattern: nullable Pattern
+
+	# Was `closing_pattern` found in one of the added lines?
+	var finished = false
+
+	# Raw text of the block
+	var content = new Buffer
+
+	# Stop once the block is finished, or on a blank line when there is
+	# no closing pattern to wait for.
+	redef fun try_continue(state) do
+		# a blank line ends type 6 and 7 blocks
+		if finished or (state.is_blank and closing_pattern == null) then return null
+		return new MdBlockContinue.at_index(state.index)
+	end
+
+	# Append `line` to `content` and mark the block as finished if `line`
+	# contains the closing pattern.
+	redef fun add_line(line) do
+		if not content.is_empty then content.add('\n')
+		content.append(line)
+
+		var closer = closing_pattern
+		if closer == null then return
+		if line.has(closer) then finished = true
+	end
+
+	# Set the literal of the block and compute the end of its location
+	# from the number of lines and the length of the last line.
+	redef fun finalize(parser) do
+		super
+
+		var literal = content.to_s
+		block.literal = literal
+
+		var lines = literal.split("\n")
+		var last_line = location.line_start + lines.length - 1
+		location.line_end = last_line
+		location.column_end = lines.last.length
+	end
+end
+
+# Html blocks parser factory
+#
+# Starts a `MdHtmlBlockParser` on the lines matching one of the opening
+# patterns of `re_html_blocks`.
+class MdHtmlBlockParserFactory
+	super MdBlockParserFactory
+
+	# Try to start a HTML block at the first non-space character of the line
+	#
+	# Return null if the line is indented by 4 or more, if nothing but spaces
+	# remains on the line, if the first non-space character is not `<`,
+	# or if no opening pattern matches.
+	# The parser is created with the closing pattern paired with the opening
+	# pattern that matched.
+	redef fun try_start(state, matched_block_parser) do
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+
+		if state.indent >= 4 then return null
+		# nothing left on the line: do not read past its end
+		if next_non_space >= line.length then return null
+		if line.chars[next_non_space] != '<' then return null
+
+		# the text to match is the same for every block type, build it once
+		var rest = line.substring(next_non_space, line.length - next_non_space)
+
+		for block_type in [0..6] do
+			# type 7 can not interrupt a paragraph
+			if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
+			var patterns = re_html_blocks[block_type]
+			var opener = patterns.first
+			if not rest.has(opener.as(not null)) then continue
+
+			var parser = new MdHtmlBlockParser(
+				state.line,
+				state.column + 1,
+				next_non_space,
+				patterns.last)
+			return (new MdBlockStart([parser])).at_index(state.index)
+		end
+		return null
+	end
+end
+
+# Post Processing
+
+# Markdown post processor
+#
+# A Markdown AST visitor meant to be run on a document once it was
+# parsed by a `MdParser`.
+abstract class MdPostProcessor
+	super MdVisitor
+
+	# Document being processed
+	#
+	# Available only during a call to `post_process`, null otherwise.
+	var document: nullable MdDocument = null
+
+	# Post process the `document` parsed by `parser`
+	#
+	# `self.document` is set for the duration of the visit then reset to null.
+	fun post_process(parser: MdParser, document: MdDocument) do
+		self.document = document
+		enter_visit(document)
+		self.document = null
+	end
+
+	# Delegate the visit to `MdNode::post_process`
+	redef fun visit(node) do
+		node.post_process(self)
+	end
+end
+
+redef class MdNode
+
+	# Accept the visit of the `MdPostProcessor` `v`
+	#
+	# By default, only call `visit_all` with `v`.
+	fun post_process(v: MdPostProcessor) do
+		visit_all(v)
+	end
+end
+
+# Utils
+
+redef class Sys
+ # ATX headings matching
+ #
+ # One to six `#` at the start of the text, followed by spaces or tabs,
+ # or by the end of the text.
+ private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re
+
+ # ATX trailings matching
+ #
+ # A run of `#` at the end of the text, optionally followed by spaces or tabs.
+ # The run must start the text or be preceded by spaces or tabs.
+ private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re
+
+ # SeText headings matching
+ #
+ # A text made only of `=` or only of `-`, optionally followed by spaces or tabs.
+ private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re
+
+ # Blank lines matching
+ #
+ # One or more newlines, each optionally followed by spaces or tabs,
+ # at the end of the text.
+ var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re
+
+ # Opening fence matching
+ #
+ # Three or more backticks or three or more tildes at the start of the text.
+ # What follows the fence is captured in its own group.
+ var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re
+
+ # Closing fence matching
+ #
+ # Three or more backticks or tildes followed only by spaces.
+ var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re
+
+ # List marker matching
+ #
+ # Either a bullet (`*`, `+` or `-`), or one to nine digits followed by
+ # `.` or `)`. The marker must be followed by a space, a tab or the end of the text.
+ var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re
+
+ # Thematic break pattern
+ #
+ # Three or more `*`, `_` or `-` (all of the same kind), each optionally
+ # followed by spaces or tabs, and nothing else.
+ var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re
+
+ # HTML blocks patterns
+ #
+ # Each entry is a pair: the pattern opening a HTML block first, the pattern
+ # closing it last. The closing pattern is null for the two last entries.
+ # `MdHtmlBlockParserFactory` calls the entry at index 6 the "type 7",
+ # so the entry at index `i` presumably stands for the HTML block type `i + 1`.
+ var re_html_blocks: Array[Array[nullable Regex]] do
+ var blocks = new Array[Array[nullable Regex]]
+
+ # index 0: `script`, `pre` or `style` opening tag (any case),
+ # closed by an end tag of one of these elements
+ var re0_opening = "^<(script|pre|style)(\\s|>|$)".to_re
+ re0_opening.ignore_case = true
+ var re0_closing = "</(script|pre|style)>".to_re
+ re0_closing.ignore_case = true
+ blocks.add([re0_opening, re0_closing])
+
+ # index 1: `<!--` closed by `-->`
+ blocks.add([
+ "^<!--".to_re,
+ "-->".to_re
+ ])
+
+ # index 2: `<?` closed by `?>`
+ blocks.add([
+ "^<[?]".to_re,
+ "\\?>".to_re
+ ])
+
+ # index 3: `<!` followed by an uppercase letter, closed by `>`
+ blocks.add([
+ "^<![A-Z]".to_re,
+ ">".to_re
+ ])
+
+ # index 4: `<![CDATA[` closed by `]]>`
+ blocks.add([
+ "^<!\\[CDATA\\[".to_re,
+ "\\]\\]>".to_re
+ ])
+
+ # index 5: opening or closing tag of one of the listed elements (any case),
+ # no closing pattern
+ var re5_opening = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
+ re5_opening.ignore_case = true
+ blocks.add([re5_opening, null])
+
+ # index 6: a complete open tag (with optional attributes) or a complete
+ # closing tag, followed only by spacing; no closing pattern.
+ # The pattern is assembled from the sub-patterns below.
+ var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
+ var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
+ var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
+ var p_squoted_value = "'[^']*'"
+ var p_dquoted_value = "\"[^\"]*\""
+ var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
+ var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
+ var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
+ var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
+ var p_closetag = "</{p_tagname}\\s*[>]"
+ var re6_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
+ re6_opening.ignore_case = true
+ blocks.add([re6_opening, null])
+
+ return blocks
+ end
+end
+
+redef class Int
+
+	# Number of columns between `self` and the next tab stop
+	#
+	# Tab stops are every 4 columns. A column already on a tab stop
+	# is 4 columns away from the next one.
+	private fun columns_to_next_tab_stop: Int do
+		var past_last_stop = self % 4
+		return 4 - past_last_stop
+	end
+end
+
+redef class String
+
+	# Is this string blank?
+	#
+	# i.e. made only of spaces, tabs, `\n` and `\r`.
+	# The empty string is blank.
+	private fun is_blank: Bool do
+		for c in chars do
+			if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
+		end
+		return true
+	end
+
+	# Is the character at `index` a space or a tab?
+	#
+	# Return false if `index >= self.length`.
+	private fun is_space_or_tab(index: Int): Bool do
+		if index < length then
+			var c = chars[index]
+			if c == ' ' then return true
+			return c == '\t'
+		end
+		return false
+	end
+end