lib/markdown2: introduce markdown block parser

author Alexandre Terrasa <alexandre@moz-code.org>

Tue, 29 May 2018 23:51:24 +0000 (19:51 -0400)

committer Alexandre Terrasa <alexandre@moz-code.org>

Wed, 20 Jun 2018 23:11:18 +0000 (19:11 -0400)
author Alexandre Terrasa <alexandre@moz-code.org>
Tue, 29 May 2018 23:51:24 +0000 (19:51 -0400)
committer Alexandre Terrasa <alexandre@moz-code.org>
Wed, 20 Jun 2018 23:11:18 +0000 (19:11 -0400)
diff --git a/lib/markdown2/markdown_block_parsing.nit b/lib/markdown2/markdown_block_parsing.nit

new file mode 100644 (file)

index 0000000..cfa07b5
--- /dev/null
+++ b/lib/markdown2/markdown_block_parsing.nit
@@ -0,0 +1,1503 @@
+# This file is part of NIT ( http://www.nitlanguage.org ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Markdown blocks parsing
+#
+# Introduces the parsers for the different Markdown blocks, such as headings,
+# lists, code blocks, etc.
+module markdown_block_parsing
+
+import markdown_inline_parsing
+
+# Markdown parser
+#
+# Used to create the AST representation of a Markdown document.
+class MdParser
+
+       # Inline parser used to parse block content
+       private var inline_parser = new MdInlineParser is lazy
+
+       # Factories used to detect the start of a new block
+       #
+       # They are queried in this order by `find_block_start`, the first factory
+       # returning a result wins.
+       private var block_parser_factories: Collection[MdBlockParserFactory] do
+               var res = new Array[MdBlockParserFactory]
+               res.add(new MdBlockQuoteParserFactory)
+               res.add(new MdHeadingParserFactory)
+               res.add(new MdFencedCodeBlockParserFactory)
+               res.add(new MdHtmlBlockParserFactory)
+               res.add(new MdThematicBreakParserFactory)
+               res.add(new MdListBlockParserFactory)
+               res.add(new MdIndentedCodeBlockParserFactory)
+               return res
+       end
+
+       # Active block parsers
+       #
+       # Used as a stack to parse nested blocks.
+       private var active_block_parsers = new Array[MdBlockParser]
+
+       # All active block parsers
+       private var all_block_parsers = new HashSet[MdBlockParser]
+
+       # Block parser currently on top of the `active_block_parsers` stack
+       private fun active_block_parser: MdBlockParser do return active_block_parsers.last
+
+       # Make `block_parser` the active block parser
+       #
+       # The parser is registered in `all_block_parsers` and pushed on top of
+       # the `active_block_parsers` stack.
+       private fun activate_block_parser(block_parser: MdBlockParser) do
+               all_block_parsers.add(block_parser)
+               active_block_parsers.add(block_parser)
+       end
+
+       # Pop the `active_block_parser` from the `active_block_parsers` stack
+       #
+       # The parser stays registered in `all_block_parsers`.
+       private fun deactivate_block_parser do active_block_parsers.pop
+
+       # Discard the `active_block_parser`
+       #
+       # The parser is popped from the stack, forgotten from `all_block_parsers`
+       # and its block is unlinked.
+       private fun remove_active_block_parser do
+               var removed = active_block_parsers.pop
+               all_block_parsers.remove(removed)
+               removed.block.unlink
+       end
+
+       # Post-processors applied after the parsing of a document
+       var post_processors = new Array[MdPostProcessor] is writable
+
+       # Currently parsed line
+       private var line_string: String is noinit
+
+       # Current index (offset) in input `line_string` (starts at 0)
+       private var index = 0
+
+       # Current column in input `line_string` (starts at 0)
+       #
+       # Tab causes column to go to next 4-space tab stop.
+       private var column = 0
+
+       # Is the current column within a tab character (partially consumed tab)
+       private var column_is_in_tab: Bool is noinit
+
+       # Current line in input string (starts at 1)
+       private var line = 1
+
+       # Index of the next non-space character starting from `index`
+       private var next_non_space_index = 0
+
+       # Next non-space column
+       private var next_non_space_column = 0
+
+       # Current indent in columns
+       #
+       # Either by spaces or tab stop of 4, starting from `column`.
+       private var indent = 0
+
+       # Is the current `line` blank starting from `index`?
+       private var is_blank: Bool is noinit
+
+       # Does a node end with a blank line?
+       private var last_line_blank = new HashMap[MdNode, Bool]
+
+       # Reset the parser state so `self` can parse a new input
+       private fun initialize do
+               # Parsers and per-node bookkeeping
+               active_block_parsers.clear
+               all_block_parsers.clear
+               last_line_blank.clear
+
+               # Position in the input
+               line = 1
+               index = 0
+               column = 0
+               column_is_in_tab = false
+
+               # Look-ahead state
+               next_non_space_index = 0
+               next_non_space_column = 0
+               indent = 0
+               is_blank = false
+       end
+
+       # Parse the `input` string as a MdDocument
+       #
+       # The input is cut on `\n`, `\r` and `\r\n`, each line is given to
+       # `incorporate_line`, then the inline content of the blocks is parsed.
+       fun parse(input: String): MdDocument do
+               initialize
+
+               var root = new MdDocumentBlockParser(1, 1, 0)
+               activate_block_parser(root)
+
+               var start = 0
+               loop
+                       var eol = find_line_break(input, start)
+                       if eol == -1 then break
+                       incorporate_line(input.substring(start, eol - start))
+                       start = eol + 1
+                       # A `\r\n` pair counts as a single line break
+                       if start < input.length and input.chars[eol] == '\r' and input.chars[start] == '\n' then
+                               start += 1
+                       end
+                       line += 1
+                       column = 0
+               end
+
+               # Incorporate the last line if the input does not end with a line break
+               if start < input.length then
+                       incorporate_line(input.substring(start, input.length - start))
+               end
+               finalize_blocks(active_block_parsers)
+
+               # Visit all the registered block parsers, last one first,
+               # parsing string content into inline content where appropriate.
+               var parsers = all_block_parsers.to_a
+               var pos = parsers.length
+               while pos > 0 do
+                       pos -= 1
+                       parsers[pos].parse_inlines(inline_parser)
+               end
+               return root.block
+       end
+
+       # Apply each of the `post_processors` to `document`, in order
+       fun post_process(document: MdDocument) do
+               for post_processor in post_processors do post_processor.post_process(self, document)
+       end
+
+       # Analyze a line of text and update the document
+       #
+       # We parse Markdown text by calling this on each line of `input`.
+       #
+       # The line is processed in three steps:
+       #
+       # 1. ask each already active block parser if it continues on this line,
+       # 2. try to start new blocks at the current position,
+       # 3. add the remaining text to the appropriate block.
+       private fun incorporate_line(input: String) do
+               line_string = input
+               index = 0
+               column = 0
+               column_is_in_tab = false
+
+               # For each containing block, try to parse the associated line start.
+               # The parser at position 0 is skipped and counted as matched
+               # (`parse` activates the document parser first).
+               var matches = 1
+               for i in [1 .. active_block_parsers.length[ do
+                       var block_parser = active_block_parsers[i]
+                       find_next_non_space
+
+                       var result = block_parser.try_continue(self)
+                       if result isa MdBlockContinue then
+                               if result.is_finalize then
+                                       # The block ends on this line, nothing else to do with it
+                                       block_parser.finalize(self)
+                                       return
+                               else
+                                       # Move after the part of the line consumed by the block
+                                       if result.new_index != -1 then
+                                               set_new_index result.new_index
+                                       else if result.new_column != -1 then
+                                               set_new_column result.new_column
+                                       end
+                               end
+                               matches += 1
+                       else
+                               # First parser that does not continue: stop matching here
+                               break
+                       end
+               end
+
+               # Copy of the parsers that did not match this line
+               var unmatched_block_parsers = active_block_parsers.subarray(
+                       matches, active_block_parsers.length - matches)
+               var last_matched_block_parser = active_block_parsers[matches - 1]
+               var block_parser = last_matched_block_parser
+               var all_closed = unmatched_block_parsers.is_empty
+
+               # Unless last matched container is a code block, try new container starts,
+               # adding children to the last matched container.
+               var try_block_starts = block_parser.block isa MdParagraph or
+                       block_parser.block.is_container
+
+               while try_block_starts do
+                       find_next_non_space
+
+                       # Optimize lookup: the factories are not queried for a blank line
+                       # or a line indented by less than 4 that starts with a letter
+                       if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
+                               set_new_index next_non_space_index
+                               break
+                       end
+
+                       var block_start = find_block_start(block_parser)
+                       if block_start == null then
+                               set_new_index next_non_space_index
+                               break
+                       end
+
+                       # A new block starts: the unmatched blocks are finalized (only once)
+                       if not all_closed then
+                               finalize_blocks(unmatched_block_parsers)
+                               all_closed = true
+                       end
+
+                       if block_start.new_index != -1 then
+                               set_new_index block_start.new_index
+                       else if block_start.new_column != -1 then
+                               set_new_column block_start.new_column
+                       end
+
+                       if block_start.replace_active_block_parser then
+                               remove_active_block_parser
+                       end
+
+                       # Keep looking for nested block starts only if the last block
+                       # added is a container
+                       for new_block_parser in block_start.block_parsers do
+                               add_child(new_block_parser)
+                               block_parser = new_block_parser
+                               try_block_starts = new_block_parser.block.is_container
+                       end
+               end
+
+               # What remains at the offset is a text line.
+               # Add the text to the appropriate block.
+
+               # First check for a lazy paragraph continuation
+               if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
+                       add_line
+               else
+                       # Finalize any blocks not matched
+                       if not all_closed then
+                               finalize_blocks(unmatched_block_parsers)
+                       end
+                       propagate_last_line_blank(block_parser, last_matched_block_parser)
+
+                       if not block_parser.block.is_container then
+                               add_line
+                       else if not is_blank then
+                               # Create a paragraph container for the line
+                               add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
+                               add_line
+                       end
+               end
+       end
+
+       # Find what kind of block starts at the current position of the line
+       #
+       # Each factory of `block_parser_factories` is tried in order.
+       # Return the first block start found or `null` if no factory matches.
+       private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
+               for factory in block_parser_factories do
+                       var block_start = factory.try_start(self, block_parser)
+                       if block_start == null then continue
+                       return block_start
+               end
+               return null
+       end
+
+       # Add the block of `block_parser` as a child of the active block
+       #
+       # Active block parsers whose block cannot contain the new block are
+       # finalized first. Then `block_parser` becomes the active block parser.
+       private fun add_child(block_parser: MdBlockParser) do
+               loop
+                       var parent = active_block_parser
+                       if parent.block.can_contain(block_parser.block) then break
+                       # Not a valid parent, close it and try the next one on the stack
+                       parent.finalize(self)
+               end
+               active_block_parser.block.append_child(block_parser.block)
+               activate_block_parser(block_parser)
+       end
+
+       # Give the rest of the current line to the active block parser
+       #
+       # We assume it can accept lines.
+       private fun add_line do
+               var content: String
+               if not column_is_in_tab then
+                       content = line_string.substring(index, line_string.length - index)
+               else
+                       # Our column is in a partially consumed tab.
+                       # Replace what remains of the tab by spaces up to the next tab stop.
+                       var padded = new Buffer
+                       for i in [0 .. column.columns_to_next_tab_stop[ do padded.add(' ')
+                       var rest_start = index + 1
+                       padded.append(line_string.substring(rest_start, line_string.length - rest_start))
+                       content = padded.write_to_string
+               end
+               active_block_parser.add_line(content)
+       end
+
+       # Finalize all the `block_parsers`, from the last one to the first one
+       private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
+               # The length is read once: finalizing may shrink the sequence
+               var count = block_parsers.length
+               for k in [1 .. count] do
+                       block_parsers[count - k].finalize(self)
+               end
+       end
+
+       # Consume the character at `index`
+       #
+       # `index` moves by one and `column` by the width of the character:
+       # the distance to the next tab stop for a tab, one column otherwise.
+       private fun advance do
+               var width = 1
+               if line_string.chars[index] == '\t' then width = column.columns_to_next_tab_stop
+               index += 1
+               column += width
+       end
+
+       # Locate the next non-space character of `line_string` starting from `index`
+       #
+       # `index` and `column` are left untouched, only the look-ahead state is set:
+       #
+       # * `next_non_space_index`: index of the first character that is neither a
+       #   space nor a tab (`line_string.length` if there is none),
+       # * `next_non_space_column`: column of that character,
+       # * `is_blank`: true if only spaces and tabs remain on the line,
+       # * `indent`: number of columns between `column` and `next_non_space_column`.
+       private fun find_next_non_space do
+               var i = index
+               var cols = column
+
+               is_blank = true
+               while i < line_string.length do
+                       var c = line_string.chars[i]
+                       if c == ' ' then
+                               cols += 1
+                       else if c == '\t' then
+                               # Use the same tab stop computation as `advance`:
+                               # `set_new_index` and `set_new_column` reuse the columns computed
+                               # here instead of advancing again, so both must agree.
+                               cols += cols.columns_to_next_tab_stop
+                       else
+                               is_blank = false
+                               break
+                       end
+                       i += 1
+               end
+
+               next_non_space_index = i
+               next_non_space_column = cols
+               indent = next_non_space_column - column
+       end
+
+       # Index of the first line break in `input` at or after `start_index`
+       #
+       # Both `\r` and `\n` are line breaks.
+       # Return `-1` if there is no line break left.
+       private fun find_line_break(input: String, start_index: Int): Int do
+               var pos = start_index
+               while pos < input.length do
+                       var c = input.chars[pos]
+                       if c == '\r' or c == '\n' then return pos
+                       pos += 1
+               end
+               return -1
+       end
+
+       # Move the parser to `new_index` in `line_string`
+       #
+       # `column` follows and `column_is_in_tab` is cleared.
+       private fun set_new_index(new_index: Int) do
+               if new_index >= next_non_space_index then
+                       # Jump over the leading spaces, their columns are already known
+                       index = next_non_space_index
+                       column = next_non_space_column
+               end
+               var line_length = line_string.length
+               loop
+                       if index >= new_index or index == line_length then break
+                       advance
+               end
+               # Going to an index, as opposed to a column, never stops within a tab
+               column_is_in_tab = false
+       end
+
+       # Move the parser to `new_column` in `line_string`
+       #
+       # `index` follows. `column_is_in_tab` is set when `new_column` falls
+       # within a tab character.
+       private fun set_new_column(new_column: Int) do
+               if new_column >= next_non_space_column then
+                       # Jump over the leading spaces, their columns are already known
+                       index = next_non_space_index
+                       column = next_non_space_column
+               end
+               var line_length = line_string.length
+               loop
+                       if column >= new_column or index == line_length then break
+                       advance
+               end
+               # Going past the target means the last character consumed was a tab:
+               # step back on it and remember that it is partially consumed.
+               var overshot = column > new_column
+               if overshot then
+                       index -= 1
+                       column = new_column
+               end
+               column_is_in_tab = overshot
+       end
+
+       # Does `block` end with a blank line?
+       #
+       # For lists and list items, the last children are also checked.
+       private fun ends_with_blank_line(block: nullable MdNode): Bool do
+               var node = block
+               while node != null do
+                       if is_last_line_blank(node) then return true
+                       # Only lists and list items are explored further
+                       if not (node isa MdListBlock or node isa MdListItem) then return false
+                       node = node.last_child
+               end
+               return false
+       end
+
+       # Record in `last_line_blank` if the block of `block_parser` and all its
+       # parents end with a blank line
+       private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
+               var block = block_parser.block
+
+               var last_child = block.last_child
+               if is_blank and last_child != null then last_line_blank[last_child] = true
+
+               # Block quotes lines are never blank as they start with `>`.
+               # We don't count blanks in fenced code for purposes of tight/loose lists.
+               # We also don't set `last_line_blank` on an empty list item.
+               var empty_item = block isa MdListItem and block.first_child == null and
+                       block_parser != last_matched_block_parser
+               var blank = is_blank and
+                       not (block isa MdBlockQuote) and
+                       not (block isa MdFencedCodeBlock) and
+                       not empty_item
+
+               # Propagate the value up through parents
+               var node: nullable MdNode = block
+               while node != null do
+                       last_line_blank[node] = blank
+                       node = node.parent
+               end
+       end
+
+       # Is the last line of `node` blank?
+       #
+       # Nodes that were never recorded in `last_line_blank` are not blank.
+       private fun is_last_line_blank(node: MdNode): Bool do
+               return last_line_blank.has_key(node) and last_line_blank[node]
+       end
+end
+
+# Block parsing
+
+# Parser for a specific block node
+abstract class MdBlockParser
+
+       # Kind of block under construction
+       type BLOCK: MdBlock
+
+       # MdBlock under construction
+       fun block: BLOCK is abstract
+
+       # Line where the block starts
+       var line_start: Int
+
+       # Column where the block starts
+       var column_start: Int
+
+       # Location of the block
+       #
+       # Only the start is known at first: the end line and the end column are
+       # initialized at `-1` and are expected to be set later by the parser.
+       var location: MdLocation = new MdLocation(line_start, column_start, -1, -1) is lazy
+
+       # Column where the content starts
+       var content_offset: Int
+
+       # Initialize the current `block`
+       #
+       # Do nothing by default.
+       fun initialize(parser: MdParser) do end
+
+       # Can `self` continue from the current `index` in `state`?
+       #
+       # Return a new `MdBlockContinue` if `self` can continue parsing.
+       # Return null otherwise.
+       fun try_continue(state: MdParser): nullable MdBlockContinue is abstract
+
+       # Add `line` to the current `block`
+       #
+       # Do nothing by default.
+       fun add_line(line: String) do end
+
+       # Finalize the current `block`
+       #
+       # Deactivate `self` from `parser` if `self` is its active block parser.
+       fun finalize(parser: MdParser) do
+               if parser.active_block_parser == self then parser.deactivate_block_parser
+       end
+
+       # Parse the inline content of `block`
+       #
+       # Do nothing by default.
+       fun parse_inlines(inline_parser: MdInlineParser) do end
+end
+
+# Result object for continuing parsing of a block
+#
+# Returned by `MdBlockParser::try_continue` when the block continues on the
+# current line.
+class MdBlockContinue
+
+       # Index from which to continue parsing, `-1` if unused
+       var new_index: Int
+
+       # Column from which to continue parsing, `-1` if unused
+       var new_column: Int
+
+       # Is the block finalized?
+       var is_finalize: Bool
+
+       # Continue from `new_index`
+       init at_index(new_index: Int) do init(new_index, -1, false)
+
+       # Continue from `new_column`
+       init at_column(new_column: Int) do init(-1, new_column, false)
+
+       # The block is finished
+       init finished do init(-1, -1, true)
+end
+
+# Factory of block parsers, used to determine when a block starts
+abstract class MdBlockParserFactory
+
+       # Can the associated block parser start at the current line in `parser`?
+       #
+       # `matched_block_parser` is the block parser the new block would be added to.
+       #
+       # Return a new `MdBlockStart` if the block parser can start.
+       # Return null otherwise.
+       fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
+               nullable MdBlockStart is abstract
+end
+
+# Result object from starting parsing of a block
+#
+# The `at_index`, `at_column` and `replacing_active_block_parser` services
+# return `self` so they can be chained.
+class MdBlockStart
+
+       # Block parsers for this block start
+       var block_parsers: Array[MdBlockParser]
+
+       # Index where the parsing should start, `-1` if unset
+       var new_index = -1
+
+       # Column where the parsing should start, `-1` if unset
+       var new_column = -1
+
+       # Does the block starting with `self` replace the active block parser?
+       var replace_active_block_parser = false
+
+       # Set the index where the parsing should start and return `self`
+       fun at_index(new_index: Int): MdBlockStart do
+               var res = self
+               res.new_index = new_index
+               return res
+       end
+
+       # Set the column where the parsing should start and return `self`
+       fun at_column(new_column: Int): MdBlockStart do
+               var res = self
+               res.new_column = new_column
+               return res
+       end
+
+       # Ask for the replacement of the active block parser and return `self`
+       fun replacing_active_block_parser: MdBlockStart do
+               replace_active_block_parser = true
+               return self
+       end
+end
+
+# Parser for the whole document
+class MdDocumentBlockParser
+       super MdBlockParser
+
+       redef type BLOCK: MdDocument
+       redef var block = new MdDocument(location) is lazy
+
+       # The document always continues, from the current index
+       redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
+
+       # Do nothing: the document parser is never deactivated
+       redef fun finalize(parser) do end
+
+       # Set the end of the document location from its last child, if any
+       redef fun parse_inlines(inline_parser) do
+               var last_child = block.last_child
+               if last_child == null then return
+               var last_location = last_child.location
+               location.line_end = last_location.line_end
+               location.column_end = last_location.column_end
+       end
+end
+
+# Headings parser
+#
+# All the information about the heading is given at construction.
+# NOTE(review): the declaration order of the attributes below presumably
+# defines the order of the constructor arguments used by
+# `MdHeadingParserFactory`, do not reorder them.
+class MdHeadingParser
+       super MdBlockParser
+
+       redef type BLOCK: MdHeading
+
+       redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy
+
+       # Unlike other blocks, the end of the location is known from the start
+       redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy
+
+       # Line end
+       var line_end: Int
+
+       # Column end
+       var column_end: Int
+
+       # Heading level
+       var level: Int
+
+       # Heading content
+       #
+       # Raw text given to the inline parser by `parse_inlines`.
+       var content: String
+
+       # Heading has ATX trailing
+       var has_atx_trailing: Bool
+
+       # Heading is setext format
+       var is_setext: Bool
+
+       # Never continue parsing as a heading is a one liner
+       redef fun try_continue(state) do return null
+
+       # Parse the heading content
+       redef fun parse_inlines(inline_parser) do
+               inline_parser.parse(content, content_offset, block)
+       end
+end
+
+# Heading parser factory
+#
+# Recognize both ATX headings (matched by `re_atx_heading`) and setext
+# headings (matched by `re_setext_heading`).
+class MdHeadingParserFactory
+       super MdBlockParserFactory
+
+       # Try to start a heading at the next non-space character of the line
+       #
+       # Return null if the line is indented by 4 columns or more, or if it matches
+       # neither heading format.
+       # A setext heading is only considered when `matched_block_parser` is a
+       # paragraph parser; the heading then replaces the active block parser.
+       redef fun try_start(state, matched_block_parser) do
+               if state.indent >= 4 then return null
+
+               var next_non_space = state.next_non_space_index
+               var line = state.line_string
+               # Content of the matched paragraph, if any (used for setext headings)
+               var paragraph = null
+               if matched_block_parser isa MdParagraphParser then
+                       paragraph = matched_block_parser.content
+               end
+
+               var line_content = line.substring(next_non_space, line.length - next_non_space)
+               var match = line_content.search(re_atx_heading)
+               if match != null then
+                       # ATX heading
+                       # NOTE(review): assumes the first group of `re_atx_heading` is the
+                       # heading marker with its surrounding spaces, confirm against the regex
+                       var new_offset = next_non_space + match.subs.first.as(not null).length
+                       var level = match.subs.first.as(not null).to_s.trim.length
+                       # remove trailing ###s
+                       var after_leading = line.substring(new_offset, line.length - new_offset)
+                       var trailing = after_leading.search(re_atx_trailing)
+                       var has_trailing = trailing != null
+                       var trailing_length = if trailing != null then trailing.length else 0
+                       var content = after_leading.replace(re_atx_trailing, "")
+                       # The whole line is consumed by the heading
+                       return (new MdBlockStart(
+                               [new MdHeadingParser(
+                                       state.line,
+                                       next_non_space + 1,
+                                       new_offset + 1,
+                                       state.line,
+                                       new_offset + content.length + trailing_length,
+                                       level,
+                                       content,
+                                       has_trailing, false)])
+                               ).at_index(line.length)
+               end
+
+               # Setext headings need a paragraph to turn into a heading
+               if paragraph ==  null then return null
+
+               match = line_content.search(re_setext_heading)
+               if match == null then return null
+               # Underline starting with `=` is level 1, level 2 otherwise
+               var level = 2
+               if match.subs.first.as(not null).to_s.chars.first == '=' then level = 1
+               var content = paragraph.to_s
+               # The heading starts on the previous line and replaces the paragraph
+               return (new MdBlockStart(
+                       [new MdHeadingParser(
+                               state.line - 1,
+                               next_non_space + 1,
+                               0,
+                               state.line,
+                               state.column + match.length,
+                               level,
+                               content,
+                               false, true)])
+                       ).at_index(line.length).replacing_active_block_parser
+       end
+end
+
+# Blockquotes parser
+class MdBlockQuoteParser
+       super MdBlockParser
+
+       redef type BLOCK: MdBlockQuote
+       redef var block = new MdBlockQuote(location) is lazy
+
+       # Continue if the line starts with a `>` marker indented by less than 4 columns
+       redef fun try_continue(state) do
+               var line = state.line_string
+               var marker = state.next_non_space_index
+
+               if state.indent >= 4 or marker >= line.length or line.chars[marker] != '>' then
+                       return null
+               end
+
+               # Continue after the marker and its optional following space or tab
+               var new_column = state.column + state.indent + 1
+               if line.is_space_or_tab(marker + 1) then new_column += 1
+               return new MdBlockContinue.at_column(new_column)
+       end
+
+       # Set the end of the block quote location from its last child, if any
+       redef fun parse_inlines(inline_parser) do
+               var last_child = block.last_child
+               if last_child == null then return
+               var last_location = last_child.location
+               location.line_end = last_location.line_end
+               location.column_end = last_location.column_end
+       end
+end
+
+# Blockquotes parser factory
+class MdBlockQuoteParserFactory
+       super MdBlockParserFactory
+
+       # Start a block quote if the line starts with a `>` marker indented by
+       # less than 4 columns
+       redef fun try_start(state, matched_block_parser) do
+               var line = state.line_string
+               var marker = state.next_non_space_index
+
+               if state.indent >= 4 or marker >= line.length or line.chars[marker] != '>' then
+                       return null
+               end
+
+               # The content starts after the marker and its optional following space or tab
+               var new_column = state.column + state.indent + 1
+               if line.is_space_or_tab(marker + 1) then new_column += 1
+
+               var block_parser = new MdBlockQuoteParser(state.line, state.column + 1, new_column)
+               return (new MdBlockStart([block_parser])).at_column(new_column)
+       end
+end
+
+# Indented code blocks parser
+class MdIndentedCodeBlockParser
+       super MdBlockParser
+
+       redef type BLOCK: MdIndentedCodeBlock
+       redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy
+
+       # Indent is tab?
+       var use_tabs: Bool
+
+       # Block content
+       #
+       # Lines are separated by `\n`.
+       var content = new Buffer
+
+       # Continue on lines indented by 4 columns or more and on blank lines
+       redef fun try_continue(state) do
+               if state.indent >= 4 then return new MdBlockContinue.at_column(state.column + 4)
+               if state.is_blank then return new MdBlockContinue.at_index(state.next_non_space_index)
+               return null
+       end
+
+       redef fun add_line(line) do
+               if not content.is_empty then content.add('\n')
+               content.append(line)
+       end
+
+       # Set the block literal and the end of its location
+       redef fun finalize(parser) do
+               super
+
+               # Add a last line made of a space before replacing the first match
+               # of `re_trailing_blank_lines` by a single line break
+               add_line(" ")
+               var text = content.to_s
+               var literal = text.replace_first(re_trailing_blank_lines, "\n")
+               block.literal = literal
+
+               # The location end is computed from the second to last piece of the
+               # literal split on line breaks
+               var lines = literal.split("\n")
+               var last_line = lines[lines.length - 2]
+               location.line_end = location.line_start + lines.length - 2
+               location.column_end = content_offset + last_line.length + 4
+       end
+end
+
+# Indented code blocks parser factory
+#
+# Starts a `MdIndentedCodeBlockParser` on a non-blank line indented by 4 or
+# more, unless the active block is a paragraph.
+class MdIndentedCodeBlockParserFactory
+       super MdBlockParserFactory
+
+       redef fun try_start(state, matched_block_parser) do
+               if state.indent < 4 or state.is_blank then return null
+               # an indented line does not start a code block while a paragraph is active
+               if state.active_block_parser.block isa MdParagraph then return null
+
+               var starts_with_tab = state.line_string.has_prefix("\t")
+               var code_parser = new MdIndentedCodeBlockParser(
+                       state.line,
+                       state.column + 1,
+                       state.column,
+                       starts_with_tab)
+               return (new MdBlockStart([code_parser])).at_column(state.column + 4)
+       end
+end
+
+# Fenced code blocks parser
+#
+# The first line given to `add_line` is kept apart to become the info string,
+# the following ones make the literal content of the block.
+class MdFencedCodeBlockParser
+       super MdBlockParser
+
+       redef type BLOCK: MdFencedCodeBlock
+       redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy
+
+       # Character used by the opening fence
+       var fence_char: Char
+
+       # Length of the opening fence
+       var fence_length: Int
+
+       # Indent of the opening fence
+       var fence_indent: Int
+
+       # First line added to the block, used as info string
+       var first_line: nullable String = null
+
+       # Lines following the first one, each terminated by `\n`
+       var other_lines = new Buffer
+
+       redef fun try_continue(state) do
+               var line = state.line_string
+               var fence_start = state.next_non_space_index
+
+               # does the line look like a closing fence?
+               if state.indent <= 3 and fence_start < line.length and
+                  line.chars[fence_start] == fence_char then
+                       var rest = line.substring(fence_start, line.length - fence_start)
+                       var closing = rest.search(re_closing_fence)
+                       # the closing fence must be at least as long as the opening one
+                       if closing != null and closing.subs[0].as(not null).length >= fence_length then
+                               return new MdBlockContinue.finished
+                       end
+               end
+
+               # skip at most `fence_indent` leading spaces
+               var index = state.index
+               var remaining = fence_indent
+               while remaining > 0 and index < line.length and line.chars[index] == ' ' do
+                       index += 1
+                       remaining -= 1
+               end
+               return new MdBlockContinue.at_index(index)
+       end
+
+       redef fun add_line(line) do
+               if first_line != null then
+                       other_lines.append(line)
+                       other_lines.add '\n'
+               else
+                       first_line = line
+               end
+       end
+
+       redef fun finalize(parser) do
+               super
+
+               # the first line, trimmed and unescaped, is the info string
+               var info_line = first_line
+               if info_line != null then
+                       var info = info_line.trim.unescape_string
+                       if not info.is_empty then block.info = info
+               end
+
+               var literal = other_lines.to_s
+               block.literal = literal
+
+               var line_count = literal.split("\n").length
+               location.line_end = location.line_start + line_count
+               location.column_end = content_offset + fence_indent + fence_length
+       end
+end
+
+# Fenced code blocks parser factory
+#
+# Starts a `MdFencedCodeBlockParser` when the line, indented by less than 4,
+# begins with an opening fence of backticks or tildes.
+class MdFencedCodeBlockParserFactory
+       super MdBlockParserFactory
+
+       redef fun try_start(state, matched_block_parser) do
+               var next_non_space = state.next_non_space_index
+               var line = state.line_string
+
+               if state.indent >= 4 then return null
+
+               var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
+               if match == null then return null
+
+               # `re_opening_fence` captures a backtick fence in its first group and
+               # a tilde fence in its third one
+               var fence_length
+               var fence_char
+               var sub0 = match.subs[0]
+               if sub0 != null then
+                       fence_length = sub0.length
+                       fence_char = sub0.to_s.chars.first
+               else
+                       var sub2 = match.subs[2].as(not null)
+                       fence_length = sub2.length
+                       fence_char = sub2.to_s.chars.first
+               end
+
+               # The text following the fence can not contain the fence character.
+               # Each check only applies to its own kind of fence: a `~` after a
+               # backtick fence is a valid info string character.
+               var opening = match.to_s
+               if fence_char == '`' and opening.has("[^`]+`".to_re) then
+                       return null
+               else if fence_char == '~' and opening.has("[^~]+~".to_re) then
+                       return null
+               end
+               return (new MdBlockStart(
+                       [new MdFencedCodeBlockParser(
+                               state.line,
+                               state.column + 1,
+                               state.column,
+                               fence_char,
+                               fence_length,
+                               state.indent)]
+                       )).at_index(next_non_space + fence_length)
+       end
+end
+
+# List blocks parser
+#
+# Parser of a whole list, ordered or not.
+# The list items are handled by `MdListItemParser`.
+class MdListBlockParser
+       super MdBlockParser
+
+       redef type BLOCK: MdListBlock
+
+       redef var block is lazy do
+               if not is_ordered then return new MdUnorderedList(location, bullet.as(not null))
+               return new MdOrderedList(location, digit.as(not null), delim.as(not null))
+       end
+
+       # Is this list ordered
+       var is_ordered: Bool
+
+       # List bullet if unordered
+       var bullet: nullable Char
+
+       # List digit if ordered
+       var digit: nullable Int
+
+       # List delimiter if ordered
+       var delim: nullable Char
+
+       # A list always continues, its items decide when they end
+       redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
+
+       redef fun finalize(parser) do
+               super
+
+               var list_item = block.first_child
+               while list_item != null do
+                       # a blank line after an item that is not the last one makes the list loose
+                       if parser.ends_with_blank_line(list_item) and list_item.next != null then
+                               block.is_tight = false
+                               break
+                       end
+                       # look for a blank line after one of the children of the item
+                       var child = list_item.first_child
+                       while child != null do
+                               if parser.ends_with_blank_line(child) and
+                                  (list_item.next != null or child.next != null) then
+                                       block.is_tight = false
+                                       # only leaves the inner loop, the following items are still visited
+                                       break
+                               end
+                               child = child.next
+                       end
+                       list_item = list_item.next
+               end
+       end
+
+       # The list ends where its last child ends
+       redef fun parse_inlines(inline_parser) do
+               var last = block.last_child
+               if last == null then return
+               location.line_end = last.location.line_end
+               location.column_end = last.location.column_end
+       end
+end
+
+# List blocks parser factory
+#
+# Starts a `MdListItemParser` when the line begins with a list marker.
+# A `MdListBlockParser` is started too when the matched block parser is not
+# a list, or is a list of another kind.
+class MdListBlockParserFactory
+       super MdBlockParserFactory
+
+       redef fun try_start(state, matched_block_parser) do
+               # an indent of 4 or more is only accepted inside a list
+               if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null
+
+               var marker_index = state.next_non_space_index
+               var marker_column = state.column + state.indent
+
+               var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
+               var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
+               if list_data == null then return null
+
+               var new_column = list_data.content_column
+               var list_item_parser = new MdListItemParser(
+                       state.line,
+                       state.column + 1,
+                       new_column,
+                       new_column - state.column)
+
+               # prepend the list block if needed
+               if not matched_block_parser isa MdListBlockParser or not lists_match(matched_block_parser.block, list_data) then
+                       var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
+                       list_block_parser.block.is_tight = true
+
+                       return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
+               end
+               return (new MdBlockStart([list_item_parser])).at_column(new_column)
+       end
+
+       # Parse the list marker found at `marker_index` in `line`
+       #
+       # `marker_column` is the column of the marker and `in_paragraph` tells if
+       # the list item would interrupt a paragraph.
+       #
+       # Return null if there is no list marker, or if the marker found can not
+       # interrupt the current paragraph.
+       private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
+               var rest = line.substring(marker_index, line.length - marker_index)
+               var match = rest.search(re_list_marker)
+               if match == null then return null
+
+               var is_ordered
+               var bullet = null
+               var digit = null
+               var delim = null
+
+               # first group: bullet; third and fourth groups: digits and delimiter
+               var bullet_match = match.subs[0]
+               if bullet_match != null then
+                       is_ordered = false
+                       bullet = bullet_match.to_s.chars[0]
+               else
+                       is_ordered = true
+                       digit = match.subs[2].as(not null).to_s.to_i
+                       delim = match.subs[3].as(not null).to_s.chars[0]
+               end
+
+               # the space or tab matched after the marker is not part of it
+               var marker_length = match.length
+               if match.to_s.has_suffix(" ") or match.to_s.has_suffix("\t") then
+                       marker_length -= 1
+               end
+               var index_after_marker = marker_index + marker_length
+
+               # marker doesn't include tabs, so counting them as column directly is ok
+               var column_after_marker = marker_column + marker_length
+               # the column within the line where the content starts
+               var content_column = column_after_marker
+
+               # see at which column the content starts if there is content
+               var has_content = false
+               for i in [index_after_marker .. line.length[ do
+                       var c = line.chars[i]
+                       if c == '\t' then
+                               content_column += content_column.columns_to_next_tab_stop
+                       else if c == ' ' then
+                               content_column += 1
+                       else
+                               has_content = true
+                               break
+                       end
+               end
+
+               if in_paragraph then
+                       # if the list item is ordered, then start number must be 1 to interrupt a paragraph
+                       if is_ordered and digit != 1 then
+                               return null
+                       end
+                       # empty list item can not interrupt a paragraph
+                       if not has_content then
+                               return null
+                       end
+               end
+
+               if not has_content or (content_column - column_after_marker) > 4 then
+                       # if this line is blank or has a code block, default to 1 space after marker
+                       content_column = column_after_marker + 1
+               end
+               return new MdListData(is_ordered, bullet, digit, delim, content_column)
+       end
+
+       # Return true if the two list items are of the same type
+       #
+       # With the same delimiter and bullet character.
+       # This is used in agglomerating list items into lists
+       private fun lists_match(a: MdListBlock, b: MdListData): Bool do
+               if a isa MdUnorderedList and not b.is_ordered then
+                       return a.bullet_marker == b.bullet
+               else if a isa MdOrderedList and b.is_ordered then
+                       return a.delimiter == b.delim
+               end
+               return false
+       end
+end
+
+# Parsed list data
+#
+# Result of `MdListBlockParserFactory::parse_list_marker`.
+private class MdListData
+
+       # Is the marker the one of an ordered list?
+       var is_ordered: Bool
+
+       # Bullet character, null if the list is ordered
+       var bullet: nullable Char
+
+       # Number found in the marker, null if the list is unordered
+       var digit: nullable Int
+
+       # Delimiter following the number, null if the list is unordered
+       var delim: nullable Char
+
+       # Column the content start at
+       var content_column: Int
+end
+
+# List items parser
+#
+# A list item continues on blank lines (unless it is still empty) and on
+# lines indented by at least `content_indent`.
+class MdListItemParser
+       super MdBlockParser
+
+       redef type BLOCK: MdListItem
+       redef var block = new MdListItem(location) is lazy
+
+       # List item content indent
+       var content_indent: Int
+
+       redef fun try_continue(state) do
+               if state.is_blank then
+                       # a blank line right after an empty list item ends it
+                       if block.first_child == null then return null
+                       return new MdBlockContinue.at_index(state.next_non_space_index)
+               end
+               # not indented enough to belong to the item
+               if state.indent < content_indent then return null
+               return new MdBlockContinue.at_column(state.column + content_indent)
+       end
+
+       # The item ends where its last child ends
+       redef fun parse_inlines(inline_parser) do
+               var last = block.last_child
+               if last == null then return
+               location.line_end = last.location.line_end
+               location.column_end = last.location.column_end
+       end
+end
+
+# Thematic breaks parser
+#
+# A thematic break holds on a single line and is never continued.
+class MdThematicBreakParser
+       super MdBlockParser
+
+       redef type BLOCK: MdThematicBreak
+       redef var block = new MdThematicBreak(location, pattern) is lazy
+
+       # Text matched as the thematic break
+       var pattern: String
+
+       redef fun try_continue(state) do
+               return null
+       end
+
+       redef fun finalize(parser) do
+               super
+
+               # the block ends on its first line, after the matched text
+               var last_column = column_start + pattern.length - 1
+               location.line_end = line_start
+               location.column_end = last_column
+       end
+end
+
+# Thematic breaks parser factory
+#
+# Starts a `MdThematicBreakParser` when the line, indented by less than 4,
+# matches `re_thematic_break` from its first non-space character.
+class MdThematicBreakParserFactory
+       super MdBlockParserFactory
+
+       redef fun try_start(state, matched_block_parser) do
+               if state.indent >= 4 then return null
+
+               var next_non_space = state.next_non_space_index
+               var line = state.line_string
+               var tbreak = line.substring(next_non_space, line.length - next_non_space).search(re_thematic_break)
+               if tbreak == null then return null
+
+               # the whole line is consumed by the thematic break
+               return (new MdBlockStart(
+                       [new MdThematicBreakParser(
+                               state.line,
+                               state.column + 1,
+                               next_non_space,
+                               tbreak.to_s)]
+                       )).at_index(line.length)
+       end
+end
+
+# Paragraphs parser
+#
+# Accumulates the lines of a paragraph. When the paragraph is finalized, the
+# leading link reference definitions are extracted from its content.
+class MdParagraphParser
+       super MdBlockParser
+
+       redef type BLOCK: MdParagraph
+
+       redef var block = new MdParagraph(location) is lazy
+
+       # Paragraph content
+       #
+       # Set to null by `finalize` when the paragraph only contains reference definitions.
+       var content: nullable Buffer = new Buffer
+
+       redef fun try_continue(state) do
+               # a blank line ends the paragraph
+               if state.is_blank then return null
+               return new MdBlockContinue.at_index(state.index)
+       end
+
+       redef fun add_line(line) do
+               var buffer = self.content
+               if buffer == null then return
+               # lines are separated (not terminated) by a new line
+               if not buffer.is_empty then buffer.add('\n')
+               buffer.append(line)
+       end
+
+       redef fun finalize(parser) do
+               super
+
+               var inline_parser = parser.inline_parser
+               var buffer = self.content
+               if buffer == null then return
+
+               var text = buffer.to_s
+               var found_reference = false
+
+               # try parsing the beginning as link reference definitions,
+               # `parse_reference` returning 0 means that no definition was read
+               var consumed = inline_parser.parse_reference(text)
+               while text.length > 3 and text.chars[0] == '[' and consumed != 0 do
+                       text = text.substring(consumed, text.length - consumed)
+                       found_reference = true
+                       consumed = inline_parser.parse_reference(text)
+               end
+
+               if found_reference and text.is_blank then
+                       # nothing left but definitions: the paragraph is removed
+                       block.unlink
+                       self.content = null
+               else
+                       self.content = new Buffer.from_text(text)
+               end
+       end
+
+       redef fun parse_inlines(inline_parser) do
+               var buffer = self.content
+               if buffer == null then return
+               inline_parser.parse(buffer.to_s, content_offset, block)
+
+               # the paragraph ends where its last inline ends
+               var last = block.last_child
+               if last == null then return
+               location.line_end = last.location.line_end
+               location.column_end = last.location.column_end
+       end
+end
+
+# Html blocks parser
+#
+# Accumulates the lines of a HTML block until a line contains
+# `closing_pattern` or, without closing pattern, until a blank line.
+class MdHtmlBlockParser
+       super MdBlockParser
+
+       redef type BLOCK: MdHtmlBlock
+       redef var block = new MdHtmlBlock(location) is lazy
+
+       # Closing tag pattern
+       #
+       # Or null if the block is not closed
+       var closing_pattern: nullable Pattern
+
+       # Is the current block finished?
+       var finished = false
+
+       # Block content
+       var content = new Buffer
+
+       redef fun try_continue(state) do
+               if finished then return null
+
+               # blank line ends type 6 and 7 blocks
+               if state.is_blank and closing_pattern == null then return null
+
+               return new MdBlockContinue.at_index(state.index)
+       end
+
+       redef fun add_line(line) do
+               # lines are separated (not terminated) by a new line
+               if not content.is_empty then content.add('\n')
+               content.append(line)
+
+               # the line holding the closing pattern is the last one of the block
+               var closer = self.closing_pattern
+               if closer == null then return
+               if line.has(closer) then finished = true
+       end
+
+       redef fun finalize(parser) do
+               super
+
+               var literal = self.content.to_s
+               block.literal = literal
+
+               var lines = literal.split("\n")
+               location.line_end = location.line_start + lines.length - 1
+               location.column_end = lines.last.length
+       end
+end
+
+# Html blocks parser factory
+#
+# Starts a `MdHtmlBlockParser` when the line, indented by less than 4, begins
+# with `<` and matches one of the opening patterns of `re_html_blocks`.
+class MdHtmlBlockParserFactory
+       super MdBlockParserFactory
+
+       redef fun try_start(state, matched_block_parser) do
+               var next_non_space = state.next_non_space_index
+               var line = state.line_string
+
+               if state.indent >= 4 then return null
+               # only spacing on the line: there is no character to look at
+               if next_non_space >= line.length then return null
+               if line.chars[next_non_space] != '<' then return null
+
+               # the tested text is the same for all the block types
+               var rest = line.substring(next_non_space, line.length - next_non_space)
+               for block_type in [0..6] do
+                       # type 7 can not interrupt a paragraph
+                       if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
+                       # each entry of `re_html_blocks` is an opening then a closing pattern
+                       var patterns = re_html_blocks[block_type]
+                       var opener = patterns.first
+                       var closer = patterns.last
+                       if rest.has(opener.as(not null)) then
+                               return (new MdBlockStart(
+                                       [new MdHtmlBlockParser(
+                                               state.line,
+                                               state.column + 1,
+                                               next_non_space,
+                                               closer)])
+                                       ).at_index(state.index)
+                       end
+               end
+               return null
+       end
+end
+
+# Post Processing
+
+# Markdown post processor
+#
+# A Markdown AST visitor called after parsing from a MdParser
+abstract class MdPostProcessor
+       super MdVisitor
+
+       # Document being processed
+       #
+       # Available only during a call to `post_process`.
+       var document: nullable MdDocument = null
+
+       # Post process the `document` parsed by `parser`
+       #
+       # `document` is visited with `self`. The `parser` is not used by this
+       # default implementation.
+       fun post_process(parser: MdParser, document: MdDocument) do
+               self.document = document
+               enter_visit(document)
+               # the document is only available during the visit
+               self.document = null
+       end
+
+       # Call `MdNode::post_process`
+       redef fun visit(node) do node.post_process(self)
+end
+
+redef class MdNode
+
+       # Accept the visit of a `MdPostProcessor`
+       #
+       # By default, only forward the visit with `visit_all`.
+       fun post_process(v: MdPostProcessor) do visit_all(v)
+end
+
+# Utils
+
+redef class Sys
+       # ATX headings matching
+       private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re
+
+       # ATX trailings matching
+       private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re
+
+       # SeText headings matching
+       private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re
+
+       # Blank lines matching
+       #
+       # Matches the new lines ending a text, with the spaces and tabs following them.
+       var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re
+
+       # Opening fence matching
+       #
+       # A backtick fence is captured by the first group and a tilde fence by
+       # the third one. The second and fourth groups capture the rest of the line.
+       var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re
+
+       # Closing fence matching
+       #
+       # The fence is captured by the first group.
+       var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re
+
+       # List marker matching
+       #
+       # The bullet of an unordered list is captured by the first group.
+       # The number and the delimiter of an ordered list are captured by the
+       # third and fourth groups.
+       var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re
+
+       # Thematic break pattern
+       var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re
+
+       # HTML blocks patterns
+       #
+       # One entry per type of HTML block, from type 1 (index 0) to type 7 (index 6).
+       # Each entry is made of the opening pattern then the closing pattern,
+       # the latter is null for the types 6 and 7.
+       var re_html_blocks: Array[Array[nullable Regex]] do
+               var blocks = new Array[Array[nullable Regex]]
+
+               # type 1: `script`, `pre` and `style` tags
+               var re0_opening = "^<(script|pre|style)(\\s|>|$)".to_re
+               re0_opening.ignore_case = true
+               var re0_closing = "</(script|pre|style)>".to_re
+               re0_closing.ignore_case = true
+               blocks.add([re0_opening, re0_closing])
+
+               # type 2: comments
+               blocks.add([
+                       "^<!--".to_re,
+                       "-->".to_re
+               ])
+
+               # type 3: processing instructions
+               blocks.add([
+                       "^<[?]".to_re,
+                       "\\?>".to_re
+               ])
+
+               # type 4: declarations
+               blocks.add([
+                       "^<![A-Z]".to_re,
+                       ">".to_re
+               ])
+
+               # type 5: CDATA sections
+               blocks.add([
+                       "^<!\\[CDATA\\[".to_re,
+                       "\\]\\]>".to_re
+               ])
+
+               # type 6: opening or closing tag of one of the listed elements
+               var re5_opening = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
+               re5_opening.ignore_case = true
+               blocks.add([re5_opening, null])
+
+               # type 7: any other opening or closing tag, alone on its line
+               var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
+               var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
+               var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
+               var p_squoted_value = "'[^']*'"
+               var p_dquoted_value = "\"[^\"]*\""
+               var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
+               var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
+               var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
+               var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
+               var p_closetag = "</{p_tagname}\\s*[>]"
+               var re6_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
+               re6_opening.ignore_case = true
+               blocks.add([re6_opening, null])
+
+               return blocks
+       end
+end
+
+redef class Int
+
+       # Number of columns between `self` and the next tab stop
+       #
+       # Tab stops are every 4 columns.
+       # A column that is already on a tab stop is 4 columns away from the next one.
+       private fun columns_to_next_tab_stop: Int do
+               var past_last_stop = self % 4
+               return 4 - past_last_stop
+       end
+end
+
+redef class String
+
+       # Is this string blank?
+       #
+       # i.e. contains only spaces, tabs, new lines and carriage returns.
+       # The empty string is blank.
+       private fun is_blank: Bool do
+               for c in chars do
+                       if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
+               end
+               return true
+       end
+
+       # Is the character at `index` a space or a tab
+       #
+       # Return false if `index >= self.length`.
+       private fun is_space_or_tab(index: Int): Bool do
+               if index >= length then return false
+               var c = chars[index]
+               if c == ' ' then return true
+               return c == '\t'
+       end
+end
author	Alexandre Terrasa <alexandre@moz-code.org>
	Tue, 29 May 2018 23:51:24 +0000 (19:51 -0400)
committer	Alexandre Terrasa <alexandre@moz-code.org>
	Wed, 20 Jun 2018 23:11:18 +0000 (19:11 -0400)