From fffcf9bbebc894acccfd575d781bd6985b344e47 Mon Sep 17 00:00:00 2001 From: Alexandre Terrasa Date: Tue, 29 May 2018 19:51:24 -0400 Subject: [PATCH] lib/markdown2: introduce markdown block parser Signed-off-by: Alexandre Terrasa --- lib/markdown2/markdown_block_parsing.nit | 1503 ++++++++++++++++++++++++++++++ 1 file changed, 1503 insertions(+) create mode 100644 lib/markdown2/markdown_block_parsing.nit diff --git a/lib/markdown2/markdown_block_parsing.nit b/lib/markdown2/markdown_block_parsing.nit new file mode 100644 index 0000000..cfa07b5 --- /dev/null +++ b/lib/markdown2/markdown_block_parsing.nit @@ -0,0 +1,1503 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Markdown blocks parsing +# +# Introduce the parsers for the different Markdown blocks such as headings, lists +# code blocks etc. +module markdown_block_parsing + +import markdown_inline_parsing + +# Markdown parser +# +# Used to create the AST representation of a Markdown document. 
+class MdParser
+
+	# Parser applied to the textual content of the blocks (created lazily)
+	private var inline_parser = new MdInlineParser is lazy
+
+	# Factories used to detect the start of a new block
+	#
+	# `find_block_start` tries them in this exact order.
+	private var block_parser_factories: Collection[MdBlockParserFactory] do
+		return [new MdBlockQuoteParserFactory,
+			new MdHeadingParserFactory,
+			new MdFencedCodeBlockParserFactory,
+			new MdHtmlBlockParserFactory,
+			new MdThematicBreakParserFactory,
+			new MdListBlockParserFactory,
+			new MdIndentedCodeBlockParserFactory: MdBlockParserFactory]
+	end
+
+	# Stack of the block parsers currently open
+	#
+	# Nested blocks are pushed on top of their parent.
+	private var active_block_parsers = new Array[MdBlockParser]
+
+	# Every block parser activated so far
+	private var all_block_parsers = new HashSet[MdBlockParser]
+
+	# Block parser on top of the `active_block_parsers` stack
+	private fun active_block_parser: MdBlockParser do return active_block_parsers.last
+
+	# Activate a `block_parser`
+	#
+	# Add the `block_parser` on the top of the `active_block_parsers` stack.
+	# Also register it in `all_block_parsers`.
+	private fun activate_block_parser(block_parser: MdBlockParser) do
+		# Remember the parser for the inline pass, then push it on the stack
+		all_block_parsers.add(block_parser)
+		active_block_parsers.add(block_parser)
+	end
+
+	# Pop the `active_block_parser` from the `active_block_parsers` stack
+	#
+	# The parser stays registered in `all_block_parsers`.
+	private fun deactivate_block_parser do
+		active_block_parsers.pop
+	end
+
+	# Discard the `active_block_parser`
+	#
+	# The parser is popped from the stack, removed from `all_block_parsers`
+	# and its block is unlinked.
+	private fun remove_active_block_parser do
+		var removed = active_block_parser
+		deactivate_block_parser
+		all_block_parsers.remove removed
+		removed.block.unlink
+	end
+
+	# Post-processors run by `post_process` on a parsed document
+	var post_processors = new Array[MdPostProcessor] is writable
+
+	# Line being parsed
+	private var line_string: String is noinit
+
+	# Offset of the cursor in `line_string` (0-based)
+	private var index = 0
+
+	# Column of the cursor in `line_string` (0-based)
+	#
+	# A tab moves the column to the next tab stop (every 4 columns).
+	private var column = 0
+
+	# Is `column` located inside a tab character that is only partially consumed?
+	private var column_is_in_tab: Bool is noinit
+
+	# Number of the line being parsed (1-based)
+	private var line = 1
+
+	# Offset of the first non-space character at or after `index`
+	private var next_non_space_index = 0
+
+	# Column of the first non-space character at or after `index`
+	private var next_non_space_column = 0
+
+	# Number of columns between `column` and `next_non_space_column`
+	#
+	# Spaces count for one column, tabs go to the next tab stop of 4.
+	private var indent = 0
+
+	# Does `line_string` contain only spaces and tabs from `index`?
+	private var is_blank: Bool is noinit
+
+	# Does a node end with a blank line?
+	private var last_line_blank = new HashMap[MdNode, Bool]
+
+	# Reset the parsing state
+	#
+	# Called first by `parse`: empties the parser collections and puts the
+	# cursor back at the beginning of line 1.
+	private fun initialize do
+		# Cursor inside the current line
+		index = 0
+		column = 0
+		column_is_in_tab = false
+		next_non_space_index = 0
+		next_non_space_column = 0
+		indent = 0
+		is_blank = false
+
+		# Position inside the document
+		line = 1
+
+		# Parsers and per-node data
+		active_block_parsers.clear
+		all_block_parsers.clear
+		last_line_blank.clear
+	end
+
+	# Build the `MdDocument` corresponding to the `input` string
+	#
+	# The input is cut on `\r`, `\n` and `\r\n`, each line is given to
+	# `incorporate_line`, then the inline content of the blocks is parsed.
+	fun parse(input: String): MdDocument do
+		initialize
+
+		var root_parser = new MdDocumentBlockParser(1, 1, 0)
+		activate_block_parser(root_parser)
+
+		var input_length = input.length
+		var start = 0
+		var eol = find_line_break(input, start)
+		while eol != -1 do
+			incorporate_line(input.substring(start, eol - start))
+
+			# A `\r\n` pair counts as a single line break
+			var is_crlf = eol + 1 < input_length and
+				input.chars[eol] == '\r' and
+				input.chars[eol + 1] == '\n'
+			if is_crlf then
+				start = eol + 2
+			else
+				start = eol + 1
+			end
+
+			eol = find_line_break(input, start)
+			line += 1
+			column = 0
+		end
+
+		# Last line, when the input does not end with a line break
+		if input_length > 0 and (start == 0 or start < input_length) then
+			incorporate_line(input.substring(start, input_length - start))
+		end
+		finalize_blocks(active_block_parsers)
+
+		# Parse the inline content of every registered block parser,
+		# going from the last element of the array down to the first one.
+		var parsers = all_block_parsers.to_a
+		var i = parsers.length
+		while i > 0 do
+			i -= 1
+			parsers[i].parse_inlines(inline_parser)
+		end
+		return root_parser.block
+	end
+
+	# Apply each of the `post_processors`, in order, to the `document`
+	fun post_process(document: MdDocument) do
+		for post_processor in post_processors do
+			post_processor.post_process(self, document)
+		end
+	end
+
+	# Analyze a line of text and update the document
+	#
+	# We parse Markdown text by calling this on each line of `input`.
+	private fun incorporate_line(input: String) do
+		# Start at the beginning of the new line
+		line_string = input
+		index = 0
+		column = 0
+		column_is_in_tab = false
+
+		# For each containing block, try to parse the associated line start.
+		#
+		# `matches` counts the active parsers that accept this line.
+		# It starts at 1 because the parser at index 0 is never asked.
+		var matches = 1
+		for i in [1 .. active_block_parsers.length[ do
+			var block_parser = active_block_parsers[i]
+			find_next_non_space
+
+			var result = block_parser.try_continue(self)
+			if result isa MdBlockContinue then
+				if result.is_finalize then
+					# The line closes the block: nothing else to do with it
+					block_parser.finalize(self)
+					return
+				else
+					# Move the cursor where the block parser asked
+					if result.new_index != -1 then
+						set_new_index result.new_index
+					else if result.new_column != -1 then
+						set_new_column result.new_column
+					end
+				end
+				matches += 1
+			else
+				# This parser and all its descendants are unmatched
+				break
+			end
+		end
+
+		# Copy of the parsers that did not match (they may be finalized below)
+		var unmatched_block_parsers = active_block_parsers.subarray(
+			matches, active_block_parsers.length - matches)
+		var last_matched_block_parser = active_block_parsers[matches - 1]
+		var block_parser = last_matched_block_parser
+		var all_closed = unmatched_block_parsers.is_empty
+
+		# Unless last matched container is a code block, try new container starts,
+		# adding children to the last matched container.
+		var try_block_starts = block_parser.block isa MdParagraph or
+			block_parser.block.is_container
+
+		while try_block_starts do
+			find_next_non_space
+
+			# Optimize lookup
+			#
+			# No factory is tried on a blank line or on a line indented by
+			# less than 4 columns whose first non-space character is a letter.
+			if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
+				set_new_index next_non_space_index
+				break
+			end
+
+			var block_start = find_block_start(block_parser)
+			if block_start == null then
+				set_new_index next_non_space_index
+				break
+			end
+
+			# A new block starts: the unmatched blocks can be finalized (once)
+			if not all_closed then
+				finalize_blocks(unmatched_block_parsers)
+				all_closed = true
+			end
+
+			if block_start.new_index != -1 then
+				set_new_index block_start.new_index
+			else if block_start.new_column != -1 then
+				set_new_column block_start.new_column
+			end
+
+			if block_start.replace_active_block_parser then
+				remove_active_block_parser
+			end
+
+			# Keep looking for nested block starts only if the last
+			# added block is a container
+			for new_block_parser in block_start.block_parsers do
+				add_child(new_block_parser)
+				block_parser = new_block_parser
+				try_block_starts = new_block_parser.block.is_container
+			end
+		end
+
+		# What remains at the offset is a text line.
+		# Add the text to the appropriate block.
+
+		# First check for a lazy paragraph continuation
+		if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
+			add_line
+		else
+			# Finalize any blocks not matched
+			if not all_closed then
+				finalize_blocks(unmatched_block_parsers)
+			end
+			propagate_last_line_blank(block_parser, last_matched_block_parser)
+
+			if not block_parser.block.is_container then
+				add_line
+			else if not is_blank then
+				# Create a paragraph container for the line
+				add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
+				add_line
+			end
+		end
+	end
+
+	# Find what kind of block starts at the current position of `line_string`
+	#
+	# Return the result of the first factory from `block_parser_factories`
+	# that accepts the line, or `null` if none does.
+	private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
+		for block_parser_factory in block_parser_factories do
+			var result = block_parser_factory.try_start(self, block_parser)
+			if result != null then return result
+		end
+		return null
+	end
+
+	# Add the block of `block_parser` as a child of the active block parser's block
+	#
+	# Then activate `block_parser`.
+	private fun add_child(block_parser: MdBlockParser) do
+		# Finalize non-parentable blocks
+		while not active_block_parser.block.can_contain(block_parser.block) do
+			active_block_parser.finalize(self)
+		end
+		# Append the block of the block parser to its parent
+		active_block_parser.block.append_child(block_parser.block)
+		activate_block_parser(block_parser)
+	end
+
+	# Add line content to the active block parser
+	#
+	# The content is the rest of `line_string` starting from `index`.
+	# We assume it can accept lines.
+	private fun add_line do
+		var content = null
+		if column_is_in_tab then
+			# Our column is in a partially consumed tab.
+			# Expand the remaining columns to the next tab stop to spaces.
+			var after_tab = index + 1
+			var rest = line_string.substring(after_tab, line_string.length - after_tab)
+			var spaces = column.columns_to_next_tab_stop
+			var buffer = new Buffer
+			for i in [0 .. spaces[ do
+				buffer.add ' '
+			end
+			buffer.append(rest)
+			content = buffer.write_to_string
+		else
+			content = line_string.substring(index, line_string.length - index)
+		end
+		active_block_parser.add_line(content)
+	end
+
+	# Finalize blocks of previous line
+	#
+	# The `block_parsers` are finalized from the last one to the first one.
+	private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
+		var i = block_parsers.length - 1
+		while i >= 0 do
+			var block_parser = block_parsers[i]
+			block_parser.finalize(self)
+			i -= 1
+		end
+	end
+
+	# Advance the `index` position to the next character
+	#
+	# Also set the `column`.
+	# If the current character is a tab, the column goes to the next tab stop.
+	private fun advance do
+		var c = line_string.chars[index]
+		if c == '\t' then
+			index += 1
+			column += column.columns_to_next_tab_stop
+		else
+			index += 1
+			column += 1
+		end
+	end
+
+	# Look for the next non-space character of `line_string` starting from `index`
+	#
+	# Neither `index` nor `column` are moved.
+	# Only `next_non_space_index`, `next_non_space_column`, `is_blank` and
+	# `indent` are set.
+	private fun find_next_non_space do
+		var i = index
+		var cols = column
+
+		# The line is blank unless something else than a space or a tab is found
+		is_blank = true
+		while i < line_string.length do
+			var c = line_string.chars[i]
+			if c == ' ' then
+				i += 1
+				cols += 1
+				continue
+			else if c == '\t' then
+				i += 1
+				# Go to the next tab stop (every 4 columns)
+				cols += 4 - (cols % 4)
+				continue
+			end
+			is_blank = false
+			break
+		end
+
+		next_non_space_index = i
+		next_non_space_column = cols
+		indent = next_non_space_column - column
+	end
+
+	# Return the position of the next line break
+	#
+	# We consider `\r` and `\n`.
+	# Return `-1` if there is no line break from `start_index`.
+	private fun find_line_break(input: String, start_index: Int): Int do
+		for i in [start_index .. input.length[ do
+			var char = input.chars[i]
+			if char == '\r' or char == '\n' then return i
+		end
+		return -1
+	end
+
+	# Set the parser `index` at `new_index`
+	#
+	# Also set `column` and `column_is_in_tab`.
+	private fun set_new_index(new_index: Int) do
+		if new_index >= next_non_space_index then
+			# We can start from here, no need to calculate tab stops again
+			index = next_non_space_index
+			column = next_non_space_column
+		end
+		# Never go past the end of the line
+		while index < new_index and index != line_string.length do
+			advance
+		end
+		# If we're going to an index as opposed to a column, we're never within a tab
+		column_is_in_tab = false
+	end
+
+	# Set the parser `column` at `new_column`
+	#
+	# Also set `index` and `column_is_in_tab`.
+	private fun set_new_column(new_column: Int) do
+		if new_column >= next_non_space_column then
+			# We can start from here, no need to calculate tab stops again
+			index = next_non_space_index
+			column = next_non_space_column
+		end
+		# Never go past the end of the line
+		while column < new_column and index != line_string.length do
+			advance
+		end
+		if column > new_column then
+			# Last character was a tab and we overshot our target
+			index -= 1
+			column = new_column
+			column_is_in_tab = true
+		else
+			column_is_in_tab = false
+		end
+	end
+
+	# Does `block` end with a blank line?
+	#
+	# For lists and list items, the last children are checked too.
+	private fun ends_with_blank_line(block: nullable MdNode): Bool do
+		while block != null do
+			if is_last_line_blank(block) then return true
+			if block isa MdListBlock or block isa MdListItem then
+				block = block.last_child
+			else
+				break
+			end
+		end
+		return false
+	end
+
+	# Propagate a blank line to the block of `block_parser` and all its parents
+	private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
+		var last_child = block_parser.block.last_child
+		if is_blank and last_child != null then
+			last_line_blank[last_child] = true
+		end
+		var block = block_parser.block
+
+		# Block quotes lines are never blank as they start with `>`.
+		# We don't count blanks in fenced code for purposes of tight/loose lists.
+		# We also don't set `last_line_blank` on an empty list item.
+		var last_line_blank = is_blank and
+			not (block isa MdBlockQuote or
+			block isa MdFencedCodeBlock or
+			(block isa MdListItem and block.first_child == null and
+			block_parser != last_matched_block_parser))
+
+		# Propagate `last_line_blank` up through parents
+		#
+		# The local `last_line_blank` is the flag, `self.last_line_blank` the map.
+		var node: nullable MdNode = block_parser.block
+		while node != null do
+			self.last_line_blank[node] = last_line_blank
+			node = node.parent
+		end
+	end
+
+	# Is last line blank for `node`?
+	#
+	# Nodes that were never recorded are not considered blank.
+	private fun is_last_line_blank(node: MdNode): Bool do
+		if not last_line_blank.has_key(node) then return false
+		return last_line_blank[node]
+	end
+end
+
+# Block parsing
+
+# Parser for a specific block node
+abstract class MdBlockParser
+
+	# Kind of block under construction
+	type BLOCK: MdBlock
+
+	# MdBlock under construction
+	fun block: BLOCK is abstract
+
+	# Line where the block starts
+	var line_start: Int
+
+	# Column where the block starts
+	var column_start: Int
+
+	# Location at start
+	#
+	# The location end is initialized at `-1` and is expected to be set later
+	# by the subclasses.
+	var location: MdLocation is lazy do return new MdLocation(line_start, column_start, -1, -1)
+
+	# Column where the content starts
+	var content_offset: Int
+
+	# Initialize the current `block`
+	#
+	# Do nothing by default.
+	fun initialize(parser: MdParser) do end
+
+	# Can `self` continue from the current `index` in `state`?
+	#
+	# Return a new `MdBlockContinue` if `self` can continue parsing.
+	# Return null otherwise.
+	fun try_continue(state: MdParser): nullable MdBlockContinue is abstract
+
+	# Add `line` to the current `block`
+	#
+	# Do nothing by default.
+	fun add_line(line: String) do end
+
+	# Finalize the current `block`
+	#
+	# Deactivate `self` from `parser` if `self` is its active block parser.
+	fun finalize(parser: MdParser) do
+		# Only the parser on top of the stack can be popped
+		if parser.active_block_parser != self then return
+		parser.deactivate_block_parser
+	end
+
+	# Parse the inline content of `block`
+	#
+	# Do nothing by default.
+	fun parse_inlines(inline_parser: MdInlineParser) do end
+end
+
+# Answer of a block parser that accepts to continue on the current line
+class MdBlockContinue
+
+	# Index where the parsing goes on, or `-1` if not set
+	var new_index: Int
+
+	# Column where the parsing goes on, or `-1` if not set
+	var new_column: Int
+
+	# Does the line close the block?
+	var is_finalize: Bool
+
+	# Go on with the parsing at `new_index`
+	init at_index(new_index: Int) do
+		init(new_index, -1, false)
+	end
+
+	# Go on with the parsing at `new_column`
+	init at_column(new_column: Int) do
+		init(-1, new_column, false)
+	end
+
+	# The block ends with the current line
+	init finished do
+		init(-1, -1, true)
+	end
+end
+
+# Factory deciding if a kind of block starts on the current line
+abstract class MdBlockParserFactory
+
+	# Can the associated block parser start at the current line in `parser`?
+	#
+	# Return a new `MdBlockStart` if the block parser can start.
+	# Return null otherwise.
+	fun try_start(parser: MdParser, matched_block_parser: MdBlockParser): nullable MdBlockStart is abstract
+end
+
+# Answer of a factory that accepts to start one or more blocks
+class MdBlockStart
+
+	# Parsers of the blocks to start, outermost first
+	var block_parsers: Array[MdBlockParser]
+
+	# Index where the parsing goes on, or `-1` if not set
+	var new_index = -1
+
+	# Column where the parsing goes on, or `-1` if not set
+	var new_column = -1
+
+	# Does the block starting with `self` terminate a previous block?
+	var replace_active_block_parser = false
+
+	# Start from `new_index`
+	#
+	# Return `self` so calls can be chained.
+	fun at_index(new_index: Int): MdBlockStart do
+		self.new_index = new_index
+		return self
+	end
+
+	# Start from `new_column`
+	#
+	# Return `self` so calls can be chained.
+	fun at_column(new_column: Int): MdBlockStart do
+		self.new_column = new_column
+		return self
+	end
+
+	# Start replacing the active block parser
+	#
+	# Return `self` so calls can be chained.
+	fun replacing_active_block_parser: MdBlockStart do
+		self.replace_active_block_parser = true
+		return self
+	end
+end
+
+# Parser for the whole document
+class MdDocumentBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdDocument
+	redef var block = new MdDocument(location) is lazy
+
+	# Always continue at current index
+	redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
+
+	# Do nothing: unlike the default implementation, the document parser
+	# is not deactivated from `parser`.
+	redef fun finalize(parser) do
+	end
+
+	# Set the end of the document location from its last child (if any)
+	redef fun parse_inlines(inline_parser) do
+		var last_child = block.last_child
+		if last_child != null then
+			location.line_end = last_child.location.line_end
+			location.column_end = last_child.location.column_end
+		end
+	end
+end
+
+# Headings parser
+class MdHeadingParser
+	super MdBlockParser
+
+	redef type BLOCK: MdHeading
+
+	redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy
+
+	# Headings know their end position from the start
+	redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy
+
+	# Line end
+	var line_end: Int
+
+	# Column end
+	var column_end: Int
+
+	# Heading level
+	var level: Int
+
+	# Heading content
+	var content: String
+
+	# Heading has ATX trailing
+	var has_atx_trailing: Bool
+
+	# Heading is setext format
+	var is_setext: Bool
+
+	# Never continue parsing: a heading parser accepts no further line
+	redef fun try_continue(state) do return null
+
+	# Parse the heading content
+	redef fun parse_inlines(inline_parser) do
+		inline_parser.parse(content, content_offset, block)
+	end
+end
+
+# Heading parser factory
+class MdHeadingParserFactory
+	super MdBlockParserFactory
+
+	# Try to start an ATX heading, then a setext heading
+	#
+	# A setext heading is only tried if `matched_block_parser` is a paragraph
+	# parser with some content: the paragraph content becomes the heading
+	# content and the paragraph parser is replaced.
+	redef fun try_start(state, matched_block_parser) do
+		if state.indent >= 4 then return null
+
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+		var paragraph = null
+		if matched_block_parser isa MdParagraphParser then
+			paragraph = matched_block_parser.content
+		end
+
+		var line_content = line.substring(next_non_space, line.length - next_non_space)
+		var match = line_content.search(re_atx_heading)
+		if match != null then
+			# ATX heading
+			var new_offset = next_non_space + match.subs.first.as(not null).length
+			var level = match.subs.first.as(not null).to_s.trim.length
+			# remove trailing ###s
+			var after_leading = line.substring(new_offset, line.length - new_offset)
+			var trailing = after_leading.search(re_atx_trailing)
+			var has_trailing = trailing != null
+			var trailing_length = if trailing != null then trailing.length else 0
+			var content = after_leading.replace(re_atx_trailing, "")
+			# The whole line is consumed by the heading
+			return (new MdBlockStart(
+				[new MdHeadingParser(
+					state.line,
+					next_non_space + 1,
+					new_offset + 1,
+					state.line,
+					new_offset + content.length + trailing_length,
+					level,
+					content,
+					has_trailing, false)])
+			).at_index(line.length)
+		end
+
+		if paragraph == null then return null
+
+		# Setext heading: the current line underlines the paragraph
+		match = line_content.search(re_setext_heading)
+		if match == null then return null
+		# Level 1 if the underline starts with `=`, level 2 otherwise
+		var level = 2
+		if match.subs.first.as(not null).to_s.chars.first == '=' then level = 1
+		var content = paragraph.to_s
+		# The heading is located from the previous line (`state.line - 1`)
+		return (new MdBlockStart(
+			[new MdHeadingParser(
+				state.line - 1,
+				next_non_space + 1,
+				0,
+				state.line,
+				state.column + match.length,
+				level,
+				content,
+				false, true)])
+		).at_index(line.length).replacing_active_block_parser
+	end
+end
+
+# Blockquotes parser
+class MdBlockQuoteParser
+	super MdBlockParser
+
+	redef type BLOCK: MdBlockQuote
+	redef var block = new MdBlockQuote(location) is lazy
+
+	# Continue if the line starts with `>` indented by less than 4 columns
+	redef fun try_continue(state) do
+		var next_non_space = state.next_non_space_index
+		var indent = state.indent
+		var line = state.line_string
+
+		if indent >= 4 then return null
+		if next_non_space >= line.length then return null
+		if line.chars[next_non_space] != '>' then return null
+
+		# Column right after the `>`
+		var new_column = state.column + state.indent + 1
+		# optional following space or tab
+		if state.line_string.is_space_or_tab(next_non_space + 1) then
+			new_column += 1
+		end
+		return new MdBlockContinue.at_column(new_column)
+	end
+
+	# Set the end of the block quote location from its last child (if any)
+	redef fun parse_inlines(inline_parser) do
+		var last_child = block.last_child
+		if last_child != null then
+			location.line_end = last_child.location.line_end
+			location.column_end = last_child.location.column_end
+		end
+	end
+end
+
+# Blockquotes parser factory
+class MdBlockQuoteParserFactory
+	super MdBlockParserFactory
+
+	# Start a block quote if the line starts with `>` indented by less than 4 columns
+	redef fun try_start(state, matched_block_parser) do
+		var next_non_space = state.next_non_space_index
+		var indent = state.indent
+		var line = state.line_string
+
+		if indent >= 4 then return null
+		if next_non_space >= line.length then return null
+		if line.chars[next_non_space] != '>' then return null
+
+		# Column right after the `>`
+		var new_column = state.column + state.indent + 1
+		# optional following space or tab
+		if state.line_string.is_space_or_tab(next_non_space + 1) then
+			new_column += 1
+		end
+		return (new MdBlockStart(
+			[new MdBlockQuoteParser(
+				state.line,
+				state.column + 1,
+				new_column)])
+		).at_column(new_column)
+	end
+end
+
+# Indented code blocks parser
+class MdIndentedCodeBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdIndentedCodeBlock
+	redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy
+
+	# Indent is tab?
+	var use_tabs: Bool
+
+	# Block content
+	var content = new Buffer
+
+	# Continue on lines indented by at least 4 columns and on blank lines
+	redef fun try_continue(state) do
+		if state.indent >= 4 then
+			return new MdBlockContinue.at_column(state.column + 4)
+		else if state.is_blank then
+			return new MdBlockContinue.at_index(state.next_non_space_index)
+		end
+		return null
+	end
+
+	# Append `line` to `content`, lines are separated by `\n`
+	redef fun add_line(line) do
+		if not content.is_empty then
+			content.add('\n')
+		end
+		content.append(line)
+	end
+
+	# Set the block literal and the end of its location
+	redef fun finalize(parser) do
+		super
+
+		# Add a last line made of a single space before replacing what
+		# `re_trailing_blank_lines` matches by a single `\n`
+		add_line(" ")
+		var content = self.content.to_s
+		var literal = content.replace_first(re_trailing_blank_lines, "\n")
+		block.literal = literal
+
+		# NOTE(review): assumes `literal` ends with `\n` so that the last
+		# element of `lines` is empty and the last code line is at `length - 2`.
+		var lines = literal.split("\n")
+		location.line_end = location.line_start + lines.length - 2
+		location.column_end = content_offset + lines[lines.length - 2].length + 4
+	end
+end
+
+# Indented code blocks parser factory
+class MdIndentedCodeBlockParserFactory
+	super MdBlockParserFactory
+
+	# Start an indented code block on a non-blank line indented by 4 columns or more
+	#
+	# An indented code block can not start while the active block is a paragraph.
+	redef fun try_start(state, matched_block_parser) do
+		if state.indent < 4 then return null
+		if state.is_blank then return null
+		if state.active_block_parser.block isa MdParagraph then return null
+
+		var use_tabs = state.line_string.has_prefix("\t")
+		return (new MdBlockStart(
+			[new MdIndentedCodeBlockParser(
+				state.line,
+				state.column + 1,
+				state.column,
+				use_tabs)])
+		).at_column(state.column + 4)
+	end
+end
+
+# Fenced code blocks parser
+class MdFencedCodeBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdFencedCodeBlock
+	redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy
+
+	# Fence character
+	var fence_char: Char
+
+	# Fence length
+	var fence_length: Int
+
+	# Fence indent
+	var fence_indent: Int
+
+	# Fence first line (what follows the opening fence)
+	var first_line: nullable String = null
+
+	# Fence other lines (the code itself)
+	var other_lines = new Buffer
+
+	# Finish on a closing fence, continue on any other line
+	redef fun try_continue(state) do
+		var next_non_space = state.next_non_space_index
+		var new_index = state.index
+		var line = state.line_string
+
+		if state.indent <= 3 and next_non_space < line.length and
+			line.chars[next_non_space] == fence_char then
+
+			var match = line.substring(next_non_space, line.length - next_non_space).
+				search(re_closing_fence)
+			# The closing fence must be at least as long as the opening one
+			if match != null and match.subs[0].as(not null).length >= fence_length then
+				# closing fence - we're at end of line, so we can finalize now
+				return new MdBlockContinue.finished
+			end
+		end
+
+		# skip optional spaces of fence indent
+		var i = fence_indent
+		while i > 0 and new_index < line.length and line.chars[new_index] == ' ' do
+			new_index += 1
+			i -= 1
+		end
+
+		return new MdBlockContinue.at_index(new_index)
+	end
+
+	# Keep the first line apart, append the others (each followed by `\n`)
+	redef fun add_line(line) do
+		if first_line == null then
+			first_line = line
+		else
+			other_lines.append(line)
+			other_lines.add '\n'
+		end
+	end
+
+	# Set the block info string, literal and the end of its location
+	redef fun finalize(parser) do
+		super
+
+		# first line become info string
+		var first_line = self.first_line
+		if first_line != null then
+			var info = first_line.trim.unescape_string
+			if not info.is_empty then block.info = info
+		end
+
+		var content = other_lines.to_s
+		block.literal = content
+
+		var lines = content.split("\n")
+		location.line_end = location.line_start + lines.length
+		location.column_end = content_offset + fence_indent + fence_length
+	end
+end
+
+# Fenced code blocks parser factory
+class MdFencedCodeBlockParserFactory
+	super MdBlockParserFactory
+
+	# Start a fenced code block on an opening fence indented by less than 4 columns
+	redef fun try_start(state, matched_block_parser) do
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+
+		if state.indent >= 4 then return null
+
+		var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
+		if match == null then return null
+
+		# The fence is captured either by the first or by the third group
+		var fence_length
+		var fence_char
+		var sub0 = match.subs[0]
+		if sub0 != null then
+			fence_length = sub0.length
+			fence_char = sub0.to_s.chars.first
+		else
+			fence_length = match.subs[2].as(not null).length
+			fence_char = match.subs[2].as(not null).to_s.chars.first
+		end
+
+		# What follows a backtick fence can not contain a backtick and what
+		# follows a tilde fence can not contain a tilde.
+		# Each check only applies to its own kind of fence so a backtick
+		# fence is not rejected because of a `~` in its info string.
+		var opening = match.to_s
+		if fence_char == '`' and opening.has("[^`]+`".to_re) then
+			return null
+		else if fence_char == '~' and opening.has("[^~]+~".to_re) then
+			return null
+		end
+		return (new MdBlockStart(
+			[new MdFencedCodeBlockParser(
+				state.line,
+				state.column + 1,
+				state.column,
+				fence_char,
+				fence_length,
+				state.indent)]
+		)).at_index(next_non_space + fence_length)
+	end
+end
+
+# List blocks parser
+class MdListBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdListBlock
+
+	# An ordered or unordered list depending on `is_ordered`
+	redef var block is lazy do
+		if is_ordered then
+			return new MdOrderedList(location, digit.as(not null), delim.as(not null))
+		else
+			return new MdUnorderedList(location, bullet.as(not null))
+		end
+	end
+
+	# Is this list ordered
+	var is_ordered: Bool
+
+	# List bullet if unordered
+	var bullet: nullable Char
+
+	# List digit if ordered
+	var digit: nullable Int
+
+	# List delimiter if ordered
+	var delim: nullable Char
+
+	# Always continue at current index
+	redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
+
+	# Mark the list as not tight if blank lines separate its items or their children
+	redef fun finalize(parser) do
+		super
+
+		var item = block.first_child
+		while item != null do
+			# check for non-final list item ending with blank line
+			if parser.ends_with_blank_line(item) and item.next != null then
+				block.is_tight = false
+				break
+			end
+			# recurse into children of list item to see if there are spaces between any of them
+			var sub_item = item.first_child
+			while sub_item != null do
+				if parser.ends_with_blank_line(sub_item) and
+					(item.next != null or sub_item.next != null) then
+					block.is_tight = false
+					break
+				end
+				sub_item = sub_item.next
+			end
+			item = item.next
+		end
+	end
+
+	# Set the end of the list location from its last child (if any)
+	redef fun parse_inlines(inline_parser) do
+		var last_child = block.last_child
+		if last_child != null then
+			location.line_end = last_child.location.line_end
+			location.column_end = last_child.location.column_end
+		end
+	end
+end
+
+# List blocks parser factory
+class MdListBlockParserFactory
+	super MdBlockParserFactory
+
+	# Start a list item, and a list if the item does not fit the matched one
+	redef fun try_start(state, matched_block_parser) do
+		# Inside a list, markers may be indented by 4 columns or more
+		if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null
+
+		var marker_index = state.next_non_space_index
+		var marker_column = state.column + state.indent
+
+		var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
+		var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
+		if list_data == null then return null
+
+
+		var new_column = list_data.content_column
+		var list_item_parser = new MdListItemParser(
+			state.line,
+			state.column + 1,
+			new_column,
+			new_column - state.column)
+
+		# prepend the list block if needed
+		if not matched_block_parser isa MdListBlockParser or not lists_match(matched_block_parser.block, list_data) then
+			var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
+			list_block_parser.block.is_tight = true
+
+			return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
+		end
+		return (new MdBlockStart([list_item_parser])).at_column(new_column)
+	end
+
+	# Parse the list marker located at `marker_index` in `line`
+	#
+	# Return `null` if there is no list marker or if the marker can not
+	# interrupt the current paragraph (see `in_paragraph`).
+	# `state` is not used by the current implementation.
+	private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
+		var rest = line.substring(marker_index, line.length - marker_index)
+		var match = rest.search(re_list_marker)
+		if match == null then return null
+
+		var is_ordered
+		var bullet = null
+		var digit = null
+		var delim = null
+
+		# The first group is set for bullets only
+		var bullet_match = match.subs[0]
+		if bullet_match != null then
+			is_ordered = false
+			bullet = bullet_match.to_s.chars[0]
+		else
+			is_ordered = true
+			digit = match.subs[2].as(not null).to_s.to_i
+			delim = match.subs[3].as(not null).to_s.chars[0]
+		end
+
+		# The space or tab following the marker is not part of it
+		var marker_length = match.length
+		if match.to_s.has_suffix(" ") or match.to_s.has_suffix("\t") then
+			marker_length -= 1
+		end
+		var index_after_marker = marker_index + marker_length
+
+		# marker doesn't include tabs, so counting them as column directly is ok
+		var column_after_marker = marker_column + marker_length
+		# the column within the line where the content starts
+		var content_column = column_after_marker
+
+		# see at which column the content starts if there is content
+		var has_content = false
+		for i in [index_after_marker .. line.length[ do
+			var c = line.chars[i]
+			if c == '\t' then
+				content_column += content_column.columns_to_next_tab_stop
+			else if c == ' ' then
+				content_column += 1
+			else
+				has_content = true
+				break
+			end
+		end
+
+		if in_paragraph then
+			# if the list item is ordered, then start number must be 1 to interrupt a paragraph
+			if is_ordered and digit != 1 then
+				return null
+			end
+			# empty list item can not interrupt a paragraph
+			if not has_content then
+				return null
+			end
+		end
+
+		if not has_content or (content_column - column_after_marker) > 4 then
+			# if this line is blank or has a code block, default to 1 space after marker
+			content_column = column_after_marker + 1
+		end
+		return new MdListData(is_ordered, bullet, digit, delim, content_column)
+	end
+
+	# Return true if the two list items are of the same type
+	#
+	# With the same delimiter and bullet character.
+	# This is used in agglomerating list items into lists
+	private fun lists_match(a: MdListBlock, b: MdListData): Bool do
+		if a isa MdUnorderedList and not b.is_ordered then
+			return a.bullet_marker == b.bullet
+		else if a isa MdOrderedList and b.is_ordered then
+			return a.delimiter == b.delim
+		end
+		return false
+	end
+end
+
+# Parsed list data
+private class MdListData
+
+	# Is the marker an ordered list marker?
+	var is_ordered: Bool
+
+	# Bullet character if unordered
+	var bullet: nullable Char
+
+	# Start number if ordered
+	var digit: nullable Int
+
+	# Delimiter character if ordered
+	var delim: nullable Char
+
+	# Column the content starts at
+	var content_column: Int
+end
+
+# List items parser
+class MdListItemParser
+	super MdBlockParser
+
+	redef type BLOCK: MdListItem
+	redef var block = new MdListItem(location) is lazy
+
+	# List item content indent
+	var content_indent: Int
+
+	# Continue on blank lines (unless the item is empty) and on lines
+	# indented by at least `content_indent` columns
+	redef fun try_continue(state) do
+		if state.is_blank then
+			if block.first_child == null then
+				# blank line after empty list item
+				return null
+			end
+			return new MdBlockContinue.at_index(state.next_non_space_index)
+		end
+		if state.indent >= content_indent then
+			return new MdBlockContinue.at_column(state.column + content_indent)
+		end
+		return null
+	end
+
+	# Set the end of the item location from its last child (if any)
+	redef fun parse_inlines(inline_parser) do
+		var last_child = block.last_child
+		if last_child != null then
+			location.line_end = last_child.location.line_end
+			location.column_end = last_child.location.column_end
+		end
+	end
+end
+
+# Thematic breaks parser
+class MdThematicBreakParser
+	super MdBlockParser
+
+	redef type BLOCK: MdThematicBreak
+	redef var block = new MdThematicBreak(location, pattern) is lazy
+
+	# Thematic break pattern (the text matched on the line)
+	var pattern: String
+
+	# Never continue parsing: a thematic break parser accepts no further line
+	redef fun try_continue(state) do return null
+
+	# Set the end of the location on the starting line, after the `pattern`
+	redef fun finalize(parser) do
+		super
+
+		location.line_end = line_start
+		location.column_end = column_start + pattern.length - 1
+	end
+end
+
+# Thematic breaks parser factory
+class MdThematicBreakParserFactory
+	super MdBlockParserFactory
+
+	# Start a thematic break if the line, indented by less than 4 columns,
+	# matches `re_thematic_break`
+	redef fun try_start(state, matched_block_parser) do
+		if state.indent >= 4 then return null
+
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+		var tbreak = line.substring(next_non_space, line.length - next_non_space).search(re_thematic_break)
+		if tbreak != null then
+			# The whole line is consumed by the thematic break
+			return (new MdBlockStart(
+				[new MdThematicBreakParser(
+					state.line,
+					state.column + 1,
+					next_non_space,
+					tbreak.to_s)]
+			)).at_index(line.length)
+		end
+		return null
+	end
+end
+
+# Paragraphs parser
+class MdParagraphParser
+	super MdBlockParser
+
+	redef type BLOCK: MdParagraph
+
+	redef var block = new MdParagraph(location) is lazy
+
+	# Paragraph content
+	#
+	# Set to `null` by `finalize` if the paragraph only contains link
+	# reference definitions.
+	var content: nullable Buffer = new Buffer
+
+	# Continue on any non-blank line
+	redef fun try_continue(state) do
+		if state.is_blank then return null
+		return new MdBlockContinue.at_index(state.index)
+	end
+
+	# Append `line` to `content`, lines are separated by `\n`
+	redef fun add_line(line) do
+		var content = self.content
+		if content == null then return
+		if not content.is_empty then
+			content.add('\n')
+		end
+		content.append(line)
+	end
+
+	# Extract the leading link reference definitions from `content`
+	#
+	# If nothing else remains, the paragraph block is unlinked.
+	redef fun finalize(parser) do
+		super
+
+		var inline_parser = parser.inline_parser
+		var content = self.content
+		if content == null then return
+
+		var content_string = content.to_s
+		var has_reference_defs = false
+
+		var pos = inline_parser.parse_reference(content_string)
+		# try parsing the beginning as link reference definitions
+		while content_string.length > 3 and content_string.chars[0] == '[' and pos != 0 do
+			content_string = content_string.substring(pos, content_string.length - pos)
+			has_reference_defs = true
+			pos = inline_parser.parse_reference(content_string)
+		end
+
+		if has_reference_defs and content_string.is_blank then
+			block.unlink
+			self.content = null
+		else
+			self.content = new Buffer.from_text(content_string)
+		end
+	end
+
+	# Parse `content` as inlines then set the end of the paragraph location
+	redef fun parse_inlines(inline_parser) do
+		var content = self.content
+		if content == null then return
+		inline_parser.parse(content.to_s, content_offset, block)
+
+		var last_child = block.last_child
+		if last_child != null then
+			location.line_end = last_child.location.line_end
+			location.column_end = last_child.location.column_end
+		end
+	end
+end
+
+# Html blocks parser
+class MdHtmlBlockParser
+	super MdBlockParser
+
+	redef type BLOCK: MdHtmlBlock
+	redef var block = new MdHtmlBlock(location) is lazy
+
+	# Closing tag pattern
+	#
+	# Or null if the block is closed by a blank line
+	var closing_pattern: nullable Pattern
+
+	# Is the current block finished?
+	var finished = false
+
+	# Block content
+	var content = new Buffer
+
+	# Continue until the block is `finished` or, for blocks without
+	# `closing_pattern`, until a blank line
+	redef fun try_continue(state) do
+		if finished then return null
+
+		# blank line ends type 6 and 7 blocks
+		if state.is_blank and closing_pattern == null then return null
+
+		return new MdBlockContinue.at_index(state.index)
+	end
+
+	# Append `line` to `content` and look for the `closing_pattern` in it
+	redef fun add_line(line) do
+		if not content.is_empty then
+			content.add('\n')
+		end
+		content.append(line)
+		var closing_pattern = self.closing_pattern
+		if closing_pattern != null and line.has(closing_pattern) then
+			finished = true
+		end
+	end
+
+	# Set the block literal and the end of its location
+	redef fun finalize(parser) do
+		super
+
+		var content = self.content.to_s
+		block.literal = content
+
+		var lines = content.split("\n")
+		location.line_end = location.line_start + lines.length - 1
+		location.column_end = lines.last.length
+	end
+end
+
+# Html blocks parser factory
+class MdHtmlBlockParserFactory
+	super MdBlockParserFactory
+
+	# Start an HTML block if the line, indented by less than 4 columns,
+	# starts with `<` and matches one of the openers from `re_html_blocks`
+	redef fun try_start(state, matched_block_parser) do
+		var next_non_space = state.next_non_space_index
+		var line = state.line_string
+
+		if state.indent >= 4 then return null
+		# Nothing can start past the end of the line (blank line)
+		if next_non_space >= line.length then return null
+		if line.chars[next_non_space] != '<' then return null
+
+		# The text to match is the same for each kind of block
+		var rest = line.substring(next_non_space, line.length - next_non_space)
+		for block_type in [0..6] do
+			# type 7 can not interrupt a paragraph
+			if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
+			var opener = re_html_blocks[block_type].first
+			var closer = re_html_blocks[block_type].last
+			if rest.has(opener.as(not null)) then
+				return (new MdBlockStart(
+					[new MdHtmlBlockParser(
+						state.line,
+						state.column + 1,
+						next_non_space,
+						closer)])
+				).at_index(state.index)
+			end
+		end
+		return null
+	end
+end
+
+# Post Processing
+
+# Markdown post processor
+#
+# A Markdown AST visitor called after parsing from a MdParser
+abstract class MdPostProcessor
+	super MdVisitor
+
+	# Document being processed
+	#
+	# Available only during a call to `post_process`.
+	var document: nullable MdDocument = null
+
+	# Post process the `document` parsed by `parser`
+	#
+	# `self.document` is set for the duration of the visit then reset to `null`.
+	fun post_process(parser: MdParser, document: MdDocument) do
+		self.document = document
+		enter_visit(document)
+		self.document = null
+	end
+
+	# Call `MdNode::post_process`
+	redef fun visit(node) do node.post_process(self)
+end
+
+redef class MdNode
+
+	# Accept the visit of a `MdPostProcessor`
+	#
+	# By default, only visit the children.
+	fun post_process(v: MdPostProcessor) do visit_all(v)
+end
+
+# Utils
+
+redef class Sys
+	# ATX headings matching
+	private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re
+
+	# ATX trailings matching
+	private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re
+
+	# SeText headings matching
+	private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re
+
+	# Blank lines matching
+	var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re
+
+	# Opening fence matching
+	var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re
+
+	# Closing fence matching
+	var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re
+
+	# List marker matching
+	var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re
+
+	# Thematic break pattern
+	var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re
+
+	# HTML blocks patterns
+	#
+	# One entry per kind of HTML block (types 1 to 7, stored at indexes 0 to 6).
+	# Each entry is an `[opening, closing]` pair, the closing pattern is `null`
+	# for the blocks that are ended by a blank line.
+	#
+	# NOTE(review): the literals starting with `</` or `<!` were missing from
+	# this copy of the file (stripped up to the next `>`). They were rebuilt
+	# from the remaining fragments and from the CommonMark HTML blocks start and
+	# end conditions, check them against the upstream file.
+	var re_html_blocks: Array[Array[nullable Regex]] do
+		var blocks = new Array[Array[nullable Regex]]
+
+		# Type 1: `<script`, `<pre` or `<style` up to the matching closing tag
+		var re0_opening = "^<(script|pre|style)(\\s|>|$)".to_re
+		re0_opening.ignore_case = true
+		var re0_closing = "</(script|pre|style)>".to_re
+		re0_closing.ignore_case = true
+		blocks.add([re0_opening, re0_closing])
+
+		# Type 2: comments
+		blocks.add([
+			"^<!--".to_re,
+			"-->".to_re
+		])
+
+		# Type 3: processing instructions
+		blocks.add([
+			"^<[?]".to_re,
+			"\\?>".to_re
+		])
+
+		# Type 4: declarations
+		blocks.add([
+			"^<![A-Z]".to_re,
+			">".to_re
+		])
+
+		# Type 5: CDATA sections
+		blocks.add([
+			"^<!\\[CDATA\\[".to_re,
+			"\\]\\]>".to_re
+		])
+
+		# Type 6: opening or closing tag of a known block level element
+		var p_block_tags = "address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul"
+		var re5_opening = "^</?({p_block_tags})(\\s|[/]?[>]|$)".to_re
+		re5_opening.ignore_case = true
+		blocks.add([re5_opening, null])
+
+		# Type 7: any other complete opening or closing tag alone on its line
+		var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
+		var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
+		var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
+		var p_squoted_value = "'[^']*'"
+		var p_dquoted_value = "\"[^\"]*\""
+		var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
+		var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
+		var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
+		var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
+		var p_closetag = "</{p_tagname}\\s*[>]"
+		var re6_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
+		re6_opening.ignore_case = true
+		blocks.add([re6_opening, null])
+
+		return blocks
+	end
+end
+
+redef class Int
+
+	# Number of columns from `self` to the next tab stop
+	#
+	# Tab stop is 4
+	private fun columns_to_next_tab_stop: Int do return 4 - (self % 4)
+end
+
+redef class String
+
+	# Is this string blank?
+	#
+	# i.e. contains only spacing characters (space, tab, `\n` or `\r`).
+	# The empty string is blank.
+	private fun is_blank: Bool do
+		for i in [0 .. length[ do
+			var c = chars[i]
+			if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
+		end
+		return true
+	end
+
+	# Is the character at `index` a space or a tab
+	#
+	# Return false if `index` is out of bounds (`index < 0` or `index >= self.length`).
+	private fun is_space_or_tab(index: Int): Bool do
+		if index < 0 or index >= length then return false
+		var c = chars[index]
+		return c == ' ' or c == '\t'
+	end
+end
-- 
1.7.9.5