1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Markdown blocks parsing
17 # Introduce the parsers for the different Markdown blocks such as headings, lists
19 module markdown_block_parsing
21 import markdown_inline_parsing
25 # Used to create the AST representation of a Markdown document.
28 # Inline parser used to parse block content
29 private var inline_parser
= new MdInlineParser is lazy
31 # Block parsers factories
32 private var block_parser_factories
: Collection[MdBlockParserFactory] do
33 var factories
= new Array[MdBlockParserFactory]
34 factories
.add
new MdBlockQuoteParserFactory
35 factories
.add
new MdHeadingParserFactory
36 factories
.add
new MdFencedCodeBlockParserFactory
37 factories
.add
new MdHtmlBlockParserFactory
38 factories
.add
new MdThematicBreakParserFactory
39 factories
.add
new MdListBlockParserFactory
40 factories
.add
new MdIndentedCodeBlockParserFactory
44 # Active block parsers
46 # Used as a stack to parse nested blocks.
47 private var active_block_parsers
= new Array[MdBlockParser]
49 # All active block parsers
50 private var all_block_parsers
= new HashSet[MdBlockParser]
# Block parser currently on top of the stack
#
# That is the most recently added entry of `active_block_parsers`.
private fun active_block_parser: MdBlockParser do return active_block_parsers.last
# Make `block_parser` the new active block parser
#
# The parser is recorded in `all_block_parsers` and pushed on top of the
# `active_block_parsers` stack.
private fun activate_block_parser(block_parser: MdBlockParser) do
	all_block_parsers.add(block_parser)
	active_block_parsers.add(block_parser)
end
# Pop the `active_block_parser` from the `active_block_parsers` stack
#
# `all_block_parsers` is left untouched.
private fun deactivate_block_parser do active_block_parsers.pop
# Drop the `active_block_parser` completely
#
# The parser is popped from the `active_block_parsers` stack and then
# removed from `all_block_parsers`.
private fun remove_active_block_parser do
	# Popping the stack yields exactly the parser that was active
	var removed = active_block_parsers.pop
	all_block_parsers.remove(removed)
end
81 # Post-processors applied after the parsing of a document
82 var post_processors
= new Array[MdPostProcessor] is writable
84 # Currently parsed line
85 private var line_string
: String is noinit
87 # Current index (offset) in input `line_string` (starts at 0)
90 # Current column in input `line_string` (starts at 0)
92 # Tab causes column to go to next 4-space tab stop.
93 private var column
= 0
95 # Is the current column within a tab character (partially consumed tab)
96 private var column_is_in_tab
: Bool is noinit
98 # Current line in input string (starts at 1)
101 # Index of the next non-space character starting from `index`
102 private var next_non_space_index
= 0
104 # Next non-space column
105 private var next_non_space_column
= 0
107 # Current indent in columns
109 # Either by spaces or tab stop of 4, starting from `column`.
110 private var indent
= 0
112 # Is the current `line` blank starting from `index`?
113 private var is_blank
: Bool is noinit
115 # Does a node end with a blank line?
116 private var last_line_blank
= new HashMap[MdNode, Bool]
118 # Initialize parser state
119 private fun initialize
do
120 active_block_parsers
.clear
121 all_block_parsers
.clear
124 column_is_in_tab
= false
126 next_non_space_index
= 0
127 next_non_space_column
= 0
130 last_line_blank
.clear
133 # Parse the `input` string as a MdDocument
134 fun parse
(input
: String): MdDocument do
137 var document_block_parser
= new MdDocumentBlockParser(1, 1, 0)
138 activate_block_parser
(document_block_parser
)
140 var line_break
= find_line_break
(input
, line_start
)
141 while line_break
!= -1 do
142 var line_string
= input
.substring
(line_start
, line_break
- line_start
)
143 incorporate_line
(line_string
)
144 if line_break
+ 1 < input
.length
and
145 input
.chars
[line_break
] == '\r' and
146 input
.chars
[line_break
+ 1] == '\n' then
147 line_start
= line_break
+ 2
149 line_start
= line_break
+ 1
151 line_break
= find_line_break
(input
, line_start
)
156 # Finalize pending line
157 if input
.length
> 0 and (line_start
== 0 or line_start
< input
.length
) then
158 incorporate_line
(input
.substring
(line_start
, input
.length
- line_start
))
160 finalize_blocks
(active_block_parsers
)
162 # Walk through a block and its children recursively
163 # Parsing string content into inline content where appropriate.
164 var all_block_parsers
= all_block_parsers
.to_a
165 var i
= all_block_parsers
.length
- 1
167 var block_parser
= all_block_parsers
[i
]
168 block_parser
.parse_inlines
(inline_parser
)
171 var document
= document_block_parser
.block
# Apply each of the `post_processors` to `document`
#
# Processors are run in the order of the `post_processors` collection and
# receive `self` along with the `document`.
fun post_process(document: MdDocument) do
	for post_processor in post_processors do post_processor.post_process(self, document)
end
182 # Analyze a line of text and update the document
184 # We parse Markdown text by calling this on each line of `input`.
185 private fun incorporate_line
(input
: String) do
189 column_is_in_tab
= false
191 # For each containing block, try to parse the associated line start.
193 for i
in [1 .. active_block_parsers
.length
[ do
194 var block_parser
= active_block_parsers
[i
]
197 var result
= block_parser
.try_continue
(self)
198 if result
isa MdBlockContinue then
199 if result
.is_finalize
then
200 block_parser
.finalize
(self)
203 if result
.new_index
!= -1 then
204 set_new_index result
.new_index
205 else if result
.new_column
!= -1 then
206 set_new_column result
.new_column
215 var unmatched_block_parsers
= active_block_parsers
.subarray
(
216 matches
, active_block_parsers
.length
- matches
)
217 var last_matched_block_parser
= active_block_parsers
[matches
- 1]
218 var block_parser
= last_matched_block_parser
219 var all_closed
= unmatched_block_parsers
.is_empty
221 # Unless last matched container is a code block, try new container starts,
222 # adding children to the last matched container.
223 var try_block_starts
= block_parser
.block
isa MdParagraph or
224 block_parser
.block
.is_container
226 while try_block_starts
do
230 if is_blank
or (indent
< 4 and line_string
.chars
[next_non_space_index
].is_letter
) then
231 set_new_index next_non_space_index
235 var block_start
= find_block_start
(block_parser
)
236 if block_start
== null then
237 set_new_index next_non_space_index
241 if not all_closed
then
242 finalize_blocks
(unmatched_block_parsers
)
246 if block_start
.new_index
!= -1 then
247 set_new_index block_start
.new_index
248 else if block_start
.new_column
!= -1 then
249 set_new_column block_start
.new_column
252 if block_start
.replace_active_block_parser
then
253 remove_active_block_parser
256 for new_block_parser
in block_start
.block_parsers
do
257 add_child
(new_block_parser
)
258 block_parser
= new_block_parser
259 try_block_starts
= new_block_parser
.block
.is_container
263 # What remains at the offset is a text line.
264 # Add the text to the appropriate block.
266 # First check for a lazy paragraph continuation
267 if not all_closed
and not is_blank
and active_block_parser
isa MdParagraphParser then
270 # Finalize any blocks not matched
271 if not all_closed
then
272 finalize_blocks
(unmatched_block_parsers
)
274 propagate_last_line_blank
(block_parser
, last_matched_block_parser
)
276 if not block_parser
.block
.is_container
then
278 else if not is_blank
then
279 # Create a paragraph container for the line
280 add_child
(new MdParagraphParser(line
, column
+ 1, block_parser
.content_offset
))
286 # Find what kind of block starts at `index` in `input`
287 private fun find_block_start
(block_parser
: MdBlockParser): nullable MdBlockStart do
288 for block_parser_factory
in block_parser_factories
do
289 var result
= block_parser_factory
.try_start
(self, block_parser
)
290 if result
!= null then return result
# Attach the block of `block_parser` to the block of the active block parser
#
# Active block parsers whose block can not contain the new block are
# finalized first.
# Once the block is appended to its parent, `block_parser` is activated.
private fun add_child(block_parser: MdBlockParser) do
	# Finalize the active parsers until one accepts the new block
	loop
		var candidate = active_block_parser
		if candidate.block.can_contain(block_parser.block) then break
		candidate.finalize(self)
	end

	# Link the new block under its parent, then make its parser the active one
	var parent_block = active_block_parser.block
	parent_block.append_child(block_parser.block)
	activate_block_parser(block_parser)
end
306 # Add line content to the active block parser
308 # We assume it can accept lines.
309 private fun add_line
do
311 if column_is_in_tab
then
312 # Our column is in a partially consumed tab.
313 # Expand the remaining columns to the next tab stop to spaces.
314 var after_tab
= index
+ 1
315 var rest
= line_string
.substring
(after_tab
, line_string
.length
- after_tab
)
316 var spaces
= column
.columns_to_next_tab_stop
317 var buffer
= new Buffer
318 for i
in [0 .. spaces
[ do
322 content
= buffer
.write_to_string
324 content
= line_string
.substring
(index
, line_string
.length
- index
)
326 active_block_parser
.add_line
(content
)
329 # Finalize blocks of previous line
330 private fun finalize_blocks
(block_parsers
: Sequence[MdBlockParser]) do
331 var i
= block_parsers
.length
- 1
333 var block_parser
= block_parsers
[i
]
334 block_parser
.finalize
(self)
339 # Advance the `index` position to the next character
341 # Also set the `column`.
342 # If the next character is a tab, compute the new column accordingly.
343 private fun advance
do
344 var c
= line_string
.chars
[index
]
347 column
+= column
.columns_to_next_tab_stop
354 # Move `index` to the next non-space character index in the `input` string
356 # Also set `next_non_space_index`, `next_non_space_column`, `is_blank` and `indent`.
357 private fun find_next_non_space
do
362 while i
< line_string
.length
do
363 var c
= line_string
.chars
[i
]
368 else if c
== '\t' then
370 cols
+= 4 - (cols
% 4)
377 next_non_space_index
= i
378 next_non_space_column
= cols
379 indent
= next_non_space_column
- column
382 # Return the index of the first line break found in `input` from `start_index`
384 # Both `\r` and `\n` are considered as line breaks.
# NOTE(review): the statement returned when no line break is found is not
# visible here; `parse` compares the result against `-1` -- confirm.
385 private fun find_line_break
(input
: String, start_index
: Int): Int do
# Scan from `start_index` (included) to the end of `input` (excluded)
386 for i
in [start_index
.. input
.length
[ do
387 var char
= input
.chars
[i
]
# Stop at the first `\r` or `\n` and answer its index
388 if char
== '\r' or char
== '\n' then return i
393 # Set the parser `index` at `new_index`
395 # Also set `column` and `column_is_in_tab`.
396 private fun set_new_index
(new_index
: Int) do
397 if new_index
>= next_non_space_index
then
398 # We can start from here, no need to calculate tab stops again
399 index
= next_non_space_index
400 column
= next_non_space_column
402 while index
< new_index
and index
!= line_string
.length
do
405 # If we're going to an index as opposed to a column, we're never within a tab
406 column_is_in_tab
= false
409 # Set the parser `column` at `new_column`
411 # Also set `index` and `column_is_in_tab`.
412 private fun set_new_column
(new_column
: Int) do
413 if new_column
>= next_non_space_column
then
414 # We can start from here, no need to calculate tab stops again
415 index
= next_non_space_index
416 column
= next_non_space_column
418 while column
< new_column
and index
!= line_string
.length
do
421 if column
> new_column
then
422 # Last character was a tab and we overshot our target
425 column_is_in_tab
= true
427 column_is_in_tab
= false
431 # Does `block` end with a blank line?
432 private fun ends_with_blank_line
(block
: nullable MdNode): Bool do
433 while block
!= null do
434 if is_last_line_blank
(block
) then return true
435 if block
isa MdListBlock or block
isa MdListItem then
436 block
= block
.last_child
444 # Propagate a blank line to all the parents of the `block_parser` block
445 private fun propagate_last_line_blank
(block_parser
: MdBlockParser, last_matched_block_parser
: MdBlockParser) do
446 var last_child
= block_parser
.block
.last_child
447 if is_blank
and last_child
!= null then
448 last_line_blank
[last_child
] = true
450 var block
= block_parser
.block
452 # Block quotes lines are never blank as they start with `>`.
453 # We don't count blanks in fenced code for purposes of tight/loose lists.
454 # We also don't set `last_line_blank` on an empty list item.
455 var last_line_blank
= is_blank
and
456 not (block
isa MdBlockQuote or
457 block
isa MdFencedCodeBlock or
458 (block
isa MdListItem and block
.first_child
== null and
459 block_parser
!= last_matched_block_parser
))
461 # Propagate `last_line_blank` up through parents
462 var node
: nullable MdNode = block_parser
.block
463 while node
!= null do
464 self.last_line_blank
[node
] = last_line_blank
# Has `node` been recorded as ending with a blank line?
#
# Nodes without an entry in `last_line_blank` are considered as not ending
# with a blank line.
private fun is_last_line_blank(node: MdNode): Bool do
	return last_line_blank.has_key(node) and last_line_blank[node]
end
478 # Parser for a specific block node
479 abstract class MdBlockParser
481 # Kind of block under construction
484 # MdBlock under construction
485 fun block
: BLOCK is abstract
491 var column_start
: Int
495 # The location end is initialized at `-1` and will be set later in the
497 var location
: MdLocation is lazy
do return new MdLocation(line_start
, column_start
, -1, -1)
499 # Column where the content starts
500 var content_offset
: Int
502 # Initialize the current `block`
503 fun initialize
(parser
: MdParser) do end
505 # Can `self` continue from the current `index` in `parser`?
507 # Return a new `MdBlockContinue` if `self` can continue parsing.
508 # Return null otherwise.
509 fun try_continue
(state
: MdParser): nullable MdBlockContinue is abstract
511 # Add `line` to the current `block`
512 fun add_line
(line
: String) do end
# Finalize the current `block`
#
# If `self` is the active block parser of `parser`, deactivate it.
# Nothing is done otherwise.
fun finalize(parser: MdParser) do
	if parser.active_block_parser != self then return
	parser.deactivate_block_parser
end
523 # Parse `block` lines
524 fun parse_inlines
(inline_parser
: MdInlineParser) do end
527 # Result object for continuing parsing of a block
528 class MdBlockContinue
530 # Index from which continue parsing
533 # Column from which continue parsing
536 # Is the block finalized?
537 var is_finalize
: Bool
# Continue parsing from the index `new_index`
#
# The column is set to `-1` and the block is not finalized.
init at_index(new_index: Int) do init(new_index, -1, false)
# Continue parsing from the column `new_column`
#
# The index is set to `-1` and the block is not finalized.
init at_column(new_column: Int) do init(-1, new_column, false)
555 # Block parser factory for a block node for determining when a block starts
556 abstract class MdBlockParserFactory
558 # Can the associated block parser start at the current line in `parser`?
560 # Return a new `MdBlockStart` if the block parser can start.
561 # Return null otherwise.
562 fun try_start
(parser
: MdParser, matched_block_parser
: MdBlockParser):
563 nullable MdBlockStart is abstract
566 # Result object from starting parsing of a block
569 # Block parsers for this block start
570 var block_parsers
: Array[MdBlockParser]
572 # Index where the parsing should start
575 # Column where the parsing should start
578 # Does the block starting with `self` terminate a previous block?
579 var replace_active_block_parser
= false
581 # Start from `new_index`
582 fun at_index
(new_index
: Int): MdBlockStart do
583 self.new_index
= new_index
587 # Start from `new_column`
588 fun at_column
(new_column
: Int): MdBlockStart do
589 self.new_column
= new_column
593 # Start replacing the active block parser
594 fun replacing_active_block_parser
: MdBlockStart do
595 self.replace_active_block_parser
= true
600 # Parser for the whole document
601 class MdDocumentBlockParser
604 redef type BLOCK: MdDocument
605 redef var block
= new MdDocument(location
) is lazy
# The document always continues, from the current `index` of `state`
redef fun try_continue(state) do
	var current_index = state.index
	return new MdBlockContinue.at_index(current_index)
end
610 redef fun finalize
(parser
) do
613 # redef fun finalize(state) do
# Set the end of the document `location` from its last child
#
# The `location` is left unchanged if the document has no child.
redef fun parse_inlines(inline_parser) do
	var tail = block.last_child
	if tail == null then return

	var tail_location = tail.location
	location.line_end = tail_location.line_end
	location.column_end = tail_location.column_end
end
624 class MdHeadingParser
627 redef type BLOCK: MdHeading
629 redef var block
= new MdHeading(location
, level
, is_setext
, has_atx_trailing
) is lazy
631 redef var location
= new MdLocation(line_start
, column_start
, line_end
, column_end
) is lazy
645 # Heading has ATX trailing
646 var has_atx_trailing
: Bool
648 # Heading is setext format
# A heading is a one-liner: it never continues on the next line
redef fun try_continue(state) do
	return null
end
# Parse the heading `content` as inline nodes of `block`
#
# The `content_offset` is passed to the `inline_parser` along with the content.
redef fun parse_inlines(inline_parser) do inline_parser.parse(content, content_offset, block)
660 # Heading parser factory
661 class MdHeadingParserFactory
662 super MdBlockParserFactory
664 redef fun try_start
(state
, matched_block_parser
) do
665 if state
.indent
>= 4 then return null
667 var next_non_space
= state
.next_non_space_index
668 var line
= state
.line_string
670 if matched_block_parser
isa MdParagraphParser then
671 paragraph
= matched_block_parser
.content
674 var line_content
= line
.substring
(next_non_space
, line
.length
- next_non_space
)
675 var match
= line_content
.search
(re_atx_heading
)
676 if match
!= null then
678 var new_offset
= next_non_space
+ match
.subs
.first
.as(not null).length
679 var level
= match
.subs
.first
.as(not null).to_s
.trim
.length
680 # remove trailing ###s
681 var after_leading
= line
.substring
(new_offset
, line
.length
- new_offset
)
682 var trailing
= after_leading
.search
(re_atx_trailing
)
683 var has_trailing
= trailing
!= null
684 var trailing_length
= if trailing
!= null then trailing
.length
else 0
685 var content
= after_leading
.replace
(re_atx_trailing
, "")
686 return (new MdBlockStart(
687 [new MdHeadingParser(
692 new_offset
+ content
.length
+ trailing_length
,
695 has_trailing
, false)])
696 ).at_index
(line
.length
)
699 if paragraph
== null then return null
701 match
= line_content
.search
(re_setext_heading
)
702 if match
== null then return null
704 if match
.subs
.first
.as(not null).to_s
.chars
.first
== '=' then level
= 1
705 var content
= paragraph
.to_s
706 return (new MdBlockStart(
707 [new MdHeadingParser(
712 state
.column
+ match
.length
,
716 ).at_index
(line
.length
).replacing_active_block_parser
721 class MdBlockQuoteParser
724 redef type BLOCK: MdBlockQuote
725 redef var block
= new MdBlockQuote(location
) is lazy
727 redef fun try_continue
(state
) do
728 var next_non_space
= state
.next_non_space_index
729 var indent
= state
.indent
730 var line
= state
.line_string
732 if indent
>= 4 then return null
733 if next_non_space
>= line
.length
then return null
734 if line
.chars
[next_non_space
] != '>' then return null
736 var new_column
= state
.column
+ state
.indent
+ 1
737 # optional following space or tab
738 if state
.line_string
.is_space_or_tab
(next_non_space
+ 1) then
741 return new MdBlockContinue.at_column
(new_column
)
# Set the end of the block quote `location` from its last child
#
# The `location` is left unchanged if the block quote has no child.
redef fun parse_inlines(inline_parser) do
	var tail = block.last_child
	if tail == null then return

	var tail_location = tail.location
	location.line_end = tail_location.line_end
	location.column_end = tail_location.column_end
end
753 # Blockquotes parser factory
754 class MdBlockQuoteParserFactory
755 super MdBlockParserFactory
757 redef fun try_start
(state
, matched_block_parser
) do
758 var next_non_space
= state
.next_non_space_index
759 var indent
= state
.indent
760 var line
= state
.line_string
762 if indent
>= 4 then return null
763 if next_non_space
>= line
.length
then return null
764 if line
.chars
[next_non_space
] != '>' then return null
766 var new_column
= state
.column
+ state
.indent
+ 1
767 # optional following space or tab
768 if state
.line_string
.is_space_or_tab
(next_non_space
+ 1) then
771 return (new MdBlockStart(
772 [new MdBlockQuoteParser(
776 ).at_column
(new_column
)
780 # Indented code blocks parser
781 class MdIndentedCodeBlockParser
784 redef type BLOCK: MdIndentedCodeBlock
785 redef var block
= new MdIndentedCodeBlock(location
, use_tabs
) is lazy
791 var content
= new Buffer
793 redef fun try_continue
(state
) do
794 if state
.indent
>= 4 then
795 return new MdBlockContinue.at_column
(state
.column
+ 4)
796 else if state
.is_blank
then
797 return new MdBlockContinue.at_index
(state
.next_non_space_index
)
802 redef fun add_line
(line
) do
803 if not content
.is_empty
then
809 redef fun finalize
(parser
) do
813 var content
= self.content
.to_s
814 var literal
= content
.replace_first
(re_trailing_blank_lines
, "\n")
815 block
.literal
= literal
817 var lines
= literal
.split
("\n")
818 location
.line_end
= location
.line_start
+ lines
.length
- 2
819 location
.column_end
= content_offset
+ lines
[lines
.length
- 2].length
+ 4
823 # Indented code blocks parser factory
824 class MdIndentedCodeBlockParserFactory
825 super MdBlockParserFactory
827 redef fun try_start
(state
, matched_block_parser
) do
828 if state
.indent
< 4 then return null
829 if state
.is_blank
then return null
830 if state
.active_block_parser
.block
isa MdParagraph then return null
832 var use_tabs
= state
.line_string
.has_prefix
("\t")
833 return (new MdBlockStart(
834 [new MdIndentedCodeBlockParser(
839 ).at_column
(state
.column
+ 4)
843 # Fenced code blocks parser
844 class MdFencedCodeBlockParser
847 redef type BLOCK: MdFencedCodeBlock
848 redef var block
= new MdFencedCodeBlock(location
, fence_char
, fence_length
, fence_indent
) is lazy
854 var fence_length
: Int
857 var fence_indent
: Int
860 var first_line
: nullable String = null
863 var other_lines
= new Buffer
865 redef fun try_continue
(state
) do
866 var next_non_space
= state
.next_non_space_index
867 var new_index
= state
.index
868 var line
= state
.line_string
870 if state
.indent
<= 3 and next_non_space
< line
.length
and
871 line
.chars
[next_non_space
] == fence_char
then
873 var match
= line
.substring
(next_non_space
, line
.length
- next_non_space
).
874 search
(re_closing_fence
)
875 if match
!= null and match
.subs
[0].as(not null).length
>= fence_length
then
876 # closing fence - we're at end of line, so we can finalize now
877 return new MdBlockContinue.finished
881 # skip optional spaces of fence indent
883 while i
> 0 and new_index
< line
.length
and line
.chars
[new_index
] == ' ' do
888 return new MdBlockContinue.at_index
(new_index
)
891 redef fun add_line
(line
) do
892 if first_line
== null then
895 other_lines
.append
(line
)
900 redef fun finalize
(parser
) do
903 # first line become info string
904 var first_line
= self.first_line
905 if first_line
!= null then
906 var info
= first_line
.trim
.unescape_string
907 if not info
.is_empty
then block
.info
= info
910 var content
= other_lines
.to_s
911 block
.literal
= content
913 var lines
= content
.split
("\n")
914 location
.line_end
= location
.line_start
+ lines
.length
915 location
.column_end
= content_offset
+ fence_indent
+ fence_length
919 # Fenced code blocks parser factory
920 class MdFencedCodeBlockParserFactory
921 super MdBlockQuoteParserFactory
923 redef fun try_start
(state
, matched_block_parser
) do
924 var next_non_space
= state
.next_non_space_index
925 var line
= state
.line_string
927 if state
.indent
>= 4 then return null
929 var match
= line
.substring
(next_non_space
, line
.length
- next_non_space
).search
(re_opening_fence
)
930 if match
== null then return null
934 var sub0
= match
.subs
[0]
936 fence_length
= sub0
.length
937 fence_char
= sub0
.to_s
.chars
.first
939 fence_length
= match
.subs
[2].as(not null).length
940 fence_char
= match
.subs
[2].as(not null).to_s
.chars
.first
942 if fence_char
== '`' and match
.to_s
.has
("[^`]+`".to_re
) then
944 else if match
.to_s
.has
("[^~]+~".to_re
) then
947 return (new MdBlockStart(
948 [new MdFencedCodeBlockParser(
955 )).at_index
(next_non_space
+ fence_length
)
960 class MdListBlockParser
963 redef type BLOCK: MdListBlock
965 redef var block
is lazy
do
967 return new MdOrderedList(location
, digit
.as(not null), delim
.as(not null))
969 return new MdUnorderedList(location
, bullet
.as(not null))
973 # Is this list ordered
976 # List bullet if unordered
977 var bullet
: nullable Char
979 # List digit if ordered
980 var digit
: nullable Int
982 # List delimiter if ordered
983 var delim
: nullable Char
# A list block always continues, from the current `index` of `state`
redef fun try_continue(state) do
	var current_index = state.index
	return new MdBlockContinue.at_index(current_index)
end
987 redef fun finalize
(parser
) do
990 var item
= block
.first_child
991 while item
!= null do
992 # check for non-final list item ending with blank line
993 if parser
.ends_with_blank_line
(item
) and item
.next
!= null then
994 block
.is_tight
= false
997 # recurse into children of list item to see if there are spaces between any of them
998 var sub_item
= item
.first_child
999 while sub_item
!= null do
1000 if parser
.ends_with_blank_line
(sub_item
) and
1001 (item
.next
!= null or sub_item
.next
!= null) then
1002 block
.is_tight
= false
1005 sub_item
= sub_item
.next
# Set the end of the list `location` from its last child
#
# The `location` is left unchanged if the list has no child.
redef fun parse_inlines(inline_parser) do
	var tail = block.last_child
	if tail == null then return

	var tail_location = tail.location
	location.line_end = tail_location.line_end
	location.column_end = tail_location.column_end
end
1020 # List blocks parser factory
1021 class MdListBlockParserFactory
1022 super MdBlockQuoteParserFactory
1024 redef fun try_start
(state
, matched_block_parser
) do
1025 if state
.indent
>= 4 and not matched_block_parser
isa MdListBlockParser then return null
1027 var marker_index
= state
.next_non_space_index
1028 var marker_column
= state
.column
+ state
.indent
1030 var in_paragraph
= matched_block_parser
isa MdParagraphParser and matched_block_parser
.content
!= null
1031 var list_data
= parse_list_marker
(state
, state
.line_string
, marker_index
, marker_column
, in_paragraph
)
1032 if list_data
== null then return null
1035 var new_column
= list_data
.content_column
1036 var list_item_parser
= new MdListItemParser(
1040 new_column
- state
.column
)
1042 # prepend the list block if needed
1043 if not matched_block_parser
isa MdListBlockParser or not lists_match
(matched_block_parser
.block
, list_data
) then
1044 var list_block_parser
= new MdListBlockParser(state
.line
, state
.column
+ 1, new_column
- state
.column
, list_data
.is_ordered
, list_data
.bullet
, list_data
.digit
, list_data
.delim
)
1045 list_block_parser
.block
.is_tight
= true
1047 return (new MdBlockStart([list_block_parser
, list_item_parser
: MdBlockParser])).at_column
(new_column
)
1049 return (new MdBlockStart([list_item_parser
])).at_column
(new_column
)
1052 private fun parse_list_marker
(state
: MdParser, line
: String, marker_index
, marker_column
: Int, in_paragraph
: Bool): nullable MdListData do
1053 var rest
= line
.substring
(marker_index
, line
.length
- marker_index
)
1054 var match
= rest
.search
(re_list_marker
)
1055 if match
== null then return null
1062 var bullet_match
= match
.subs
[0]
1063 if bullet_match
!= null then
1065 bullet
= bullet_match
.to_s
.chars
[0]
1068 digit
= match
.subs
[2].as(not null).to_s
.to_i
1069 delim
= match
.subs
[3].as(not null).to_s
.chars
[0]
1072 var marker_length
= match
.length
1073 if match
.to_s
.has_suffix
(" ") or match
.to_s
.has_suffix
("\t") then
1076 var index_after_marker
= marker_index
+ marker_length
1078 # marker doesn't include tabs, so counting them as column directly is ok
1079 var column_after_marker
= marker_column
+ marker_length
1080 # the column within the line where the content starts
1081 var content_column
= column_after_marker
1083 # see at which column the content starts if there is content
1084 var has_content
= false
1085 for i
in [index_after_marker
.. line
.length
[ do
1086 var c
= line
.chars
[i
]
1088 content_column
+= content_column
.columns_to_next_tab_stop
1089 else if c
== ' ' then
1097 if in_paragraph
then
1098 # if the list item is ordered, then start number must be 1 to interrupt a paragraph
1099 if is_ordered
and digit
!= 1 then
1102 # empty list item can not interrupt a paragraph
1103 if not has_content
then
1108 if not has_content
or (content_column
- column_after_marker
) > 4 then
1109 # if this line is blank or has a code block, default to 1 space after marker
1110 content_column
= column_after_marker
+ 1
1112 return new MdListData(is_ordered
, bullet
, digit
, delim
, content_column
)
1115 # Return true if the two list items are of the same type
1117 # With the same delimiter and bullet character.
1118 # This is used in agglomerating list items into lists
1119 private fun lists_match
(a
: MdListBlock, b
: MdListData): Bool do
1120 if a
isa MdUnorderedList and not b
.is_ordered
then
1121 return a
.bullet_marker
== b
.bullet
1122 else if a
isa MdOrderedList and b
.is_ordered
then
1123 return a
.delimiter
== b
.delim
# Data describing a list marker
#
# Instances are returned by `parse_list_marker` and compared with an existing
# list block by `lists_match`.
1130 private class MdListData
# Is the marker an ordered list marker?
1132 var is_ordered
: Bool
# Bullet character of the marker, if any
# NOTE(review): presumably null for ordered markers -- confirm.
1134 var bullet
: nullable Char
# Number of the marker, if any
# NOTE(review): presumably null for unordered markers -- confirm.
1136 var digit
: nullable Int
# Delimiter character of the marker, if any
# NOTE(review): presumably null for unordered markers -- confirm.
1138 var delim
: nullable Char
1140 # Column at which the content starts
1141 var content_column
: Int
1145 class MdListItemParser
1148 redef type BLOCK: MdListItem
1149 redef var block
= new MdListItem(location
) is lazy
1151 # List item content indent
1152 var content_indent
: Int
1154 redef fun try_continue
(state
) do
1155 if state
.is_blank
then
1156 if block
.first_child
== null then
1157 # blank line after empty list item
1160 return new MdBlockContinue.at_index
(state
.next_non_space_index
)
1162 if state
.indent
>= content_indent
then
1163 return new MdBlockContinue.at_column
(state
.column
+ content_indent
)
# Set the end of the list item `location` from its last child
#
# The `location` is left unchanged if the list item has no child.
redef fun parse_inlines(inline_parser) do
	var tail = block.last_child
	if tail == null then return

	var tail_location = tail.location
	location.line_end = tail_location.line_end
	location.column_end = tail_location.column_end
end
1177 # Thematic breaks parser
1178 class MdThematicBreakParser
1181 redef type BLOCK: MdThematicBreak
1182 redef var block
= new MdThematicBreak(location
, pattern
) is lazy
1184 # Thematic break pattern
# A thematic break never continues on the next line
redef fun try_continue(state) do
	return null
end
1189 redef fun finalize
(parser
) do
1192 location
.line_end
= line_start
1193 location
.column_end
= column_start
+ pattern
.length
- 1
1197 # Thematic breaks parser factory
1198 class MdThematicBreakParserFactory
1199 super MdBlockQuoteParserFactory
1201 redef fun try_start
(state
, matched_block_parser
) do
1202 if state
.indent
>= 4 then return null
1204 var next_non_space
= state
.next_non_space_index
1205 var line
= state
.line_string
1206 var tbreak
= line
.substring
(next_non_space
, line
.length
- next_non_space
).search
(re_thematic_break
)
1207 if tbreak
!= null then
1208 return (new MdBlockStart(
1209 [new MdThematicBreakParser(
1214 )).at_index
(line
.length
)
1221 class MdParagraphParser
1224 redef type BLOCK: MdParagraph
1226 redef var block
= new MdParagraph(location
) is lazy
1229 var content
: nullable Buffer = new Buffer
# A paragraph continues from the current `index` unless the line is blank
#
# Return null on a blank line.
redef fun try_continue(state) do
	if not state.is_blank then
		return new MdBlockContinue.at_index(state.index)
	end
	return null
end
1236 redef fun add_line
(line
) do
1237 var content
= self.content
1238 if content
== null then return
1239 if not content
.is_empty
then
1242 content
.append
(line
)
1245 redef fun finalize
(parser
) do
1248 var inline_parser
= parser
.inline_parser
1249 var content
= self.content
1250 if content
== null then return
1252 var content_string
= content
.to_s
1253 var has_reference_defs
= false
1255 var pos
= inline_parser
.parse_reference
(content_string
)
1256 # try parsing the beginning as link reference definitions
1257 while content_string
.length
> 3 and content_string
.chars
[0] == '[' and pos
!= 0 do
1258 content_string
= content_string
.substring
(pos
, content_string
.length
- pos
)
1259 has_reference_defs
= true
1260 pos
= inline_parser
.parse_reference
(content_string
)
1263 if has_reference_defs
and content_string
.is_blank
then
1267 self.content
= new Buffer.from_text
(content_string
)
1271 redef fun parse_inlines
(inline_parser
) do
1272 var content
= self.content
1273 if content
== null then return
1274 inline_parser
.parse
(content
.to_s
, content_offset
, block
)
1276 var last_child
= block
.last_child
1277 if last_child
!= null then
1278 location
.line_end
= last_child
.location
.line_end
1279 location
.column_end
= last_child
.location
.column_end
1284 # Html blocks parser
#
# Accumulates raw lines in `content` and stores them in `block.literal` on `finalize`.
1285 class MdHtmlBlockParser
1288 redef type BLOCK: MdHtmlBlock
1289 redef var block
= new MdHtmlBlock(location
) is lazy
1291 # Closing tag pattern
1293 # Or null if the block has no closing pattern (it then ends on a blank line, see `try_continue`)
1294 var closing_pattern
: nullable Pattern
1296 # Is the current block finished?
1297 var finished
= false
# Raw text of the block, filled by `add_line`.
1300 var content
= new Buffer
# `try_continue`: returns null once `finished`, or on a blank line when there
# is no `closing_pattern`; otherwise continues at `state.index`.
1302 redef fun try_continue
(state
) do
1303 if finished
then return null
1305 # a blank line ends type 6 and 7 blocks
1306 if state
.is_blank
and closing_pattern
== null then return null
1308 return new MdBlockContinue.at_index
(state
.index
)
# `add_line`: appends `line` to `content`, then checks `line` against `closing_pattern`.
1311 redef fun add_line
(line
) do
# NOTE(review): the body of this branch is not visible in this listing;
# presumably it inserts a line separator before the new line. Confirm.
1312 if not content
.is_empty
then
1315 content
.append
(line
)
1316 var closing_pattern
= self.closing_pattern
# NOTE(review): the body of this branch is not visible in this listing;
# presumably it sets `finished` when the closing pattern is found. Confirm.
1317 if closing_pattern
!= null and line
.has
(closing_pattern
) then
# `finalize`: stores the accumulated text in `block.literal` and sets the end of `location`.
1322 redef fun finalize
(parser
) do
1325 var content
= self.content
.to_s
1326 block
.literal
= content
# `line_end` is `line_start` plus the number of "\n"-separated lines, minus one;
# `column_end` is the length of the last line.
1328 var lines
= content
.split
("\n")
1329 location
.line_end
= location
.line_start
+ lines
.length
- 1
1330 location
.column_end
= lines
.last
.length
1334 # Html blocks parser factory
1335 class MdHtmlBlockParserFactory
1336 super MdBlockParserFactory
# `try_start`: try to open an HTML block on the current line.
#
# Returns null when `state.indent >= 4` or when the first non-space character
# is not `<`. Otherwise tries the opening patterns of `re_html_blocks` in
# order (indexes 0 to 6) against the line, starting at `next_non_space`.
# The first match returns a `MdBlockStart` holding a new `MdHtmlBlockParser`,
# positioned at `state.index`.
1338 redef fun try_start
(state
, matched_block_parser
) do
1339 var next_non_space
= state
.next_non_space_index
1340 var line
= state
.line_string
1342 if state
.indent
>= 4 or line
.chars
[next_non_space
] != '<' then return null
# `block_type` is a zero-based index into `re_html_blocks`, so index 6 is
# the block called "type 7" in the comment below.
1344 for block_type
in [0..6] do
1345 # type 7 can not interrupt a paragraph
1346 if block_type
== 6 and matched_block_parser
.block
isa MdParagraph then continue
# Each entry is a pair: `first` is the opening pattern, `last` the closing one.
1347 var opener
= re_html_blocks
[block_type
].first
1348 var closer
= re_html_blocks
[block_type
].last
1349 if line
.substring
(next_non_space
, line
.length
- next_non_space
).has
(opener
.as(not null)) then
1350 return (new MdBlockStart(
1351 [new MdHtmlBlockParser(
1356 ).at_index
(state
.index
)
1365 # Markdown post processor
1367 # A Markdown AST visitor called after parsing from a MdParser
1368 abstract class MdPostProcessor
1371 # Document being processed
1373 # Available only during a call to `post_process`.
1374 var document
: nullable MdDocument = null
1376 # Post process the `document` parsed by `parser`
# Stores `document` in `self.document`, runs `enter_visit(document)`, then
# resets `self.document` to null. `parser` is not used by this implementation.
1377 fun post_process
(parser
: MdParser, document
: MdDocument) do
1378 self.document
= document
1379 enter_visit
(document
)
1380 self.document
= null
1383 # Call `MdNode::post_process`
# Each visited `node` receives this processor as argument.
1384 redef fun visit
(node
) do node
.post_process
(self)
1389 # Accept the visit of a `MdPostProcessor`
# This implementation only forwards `v` to `visit_all`.
1390 fun post_process
(v
: MdPostProcessor) do visit_all
(v
)
1396 # ATX headings matching
# Group 1: one to six `#` at line start. Group 2: spaces/tabs, or end of line.
1397 private var re_atx_heading
: Regex = "^(#\{1,6\})([ \t]+|$)".to_re
1399 # ATX trailings matching
# A run of `#` at the end of the line, optionally followed by spaces/tabs,
# and preceded by spaces/tabs or the line start.
1400 private var re_atx_trailing
: Regex = "(^|[ \t]+)#+[ \t]*$".to_re
1402 # SeText headings matching
# A line made only of `=` or only of `-`, with optional trailing spaces/tabs.
1403 private var re_setext_heading
: Regex = "^(=+|-+)[ \t]*$".to_re
1405 # Blank lines matching
# One or more newlines, each followed by optional spaces/tabs, at the end of the text.
1406 var re_trailing_blank_lines
: Regex = "(\n[ \t]*)+$".to_re
1408 # Opening fence matching
# At least three backticks, or at least three `~`, at line start; the rest of
# the line is captured in the following group.
1409 var re_opening_fence
: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re
1411 # Closing fence matching
# At least three backticks or `~` at line start, followed only by spaces.
1412 var re_closing_fence
: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re
1414 # List marker matching
# Either a bullet (`*`, `+` or `-`), or one to nine digits followed by `.` or `)`;
# in both cases followed by a space, a tab or the end of line.
1415 var re_list_marker
: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re
1417 # Thematic break pattern
# Three or more `*`, or three or more `_`, or three or more `-`, each optionally
# followed by spaces/tabs, with nothing else on the line.
1418 var re_thematic_break
: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re
1420 # HTML blocks patterns
#
# Each entry added to the result is a pair `[opening, closing]`.
# The closing pattern is null for the last two entries added below.
# NOTE(review): several entries between the first and the last two are not
# fully visible in this listing.
1421 var re_html_blocks
: Array[Array[nullable Regex]] do
1422 var blocks
= new Array[Array[nullable Regex]]
# First entry: `script`, `pre` or `style` opening tag, closed by the matching
# end tag pattern. Both patterns ignore case.
1424 var re0_opening
= "^<(script|pre|style)(\\s|>|$)".to_re
1425 re0_opening
.ignore_case
= true
1426 var re0_closing
= "</(script|pre|style)>".to_re
1427 re0_closing
.ignore_case
= true
1428 blocks
.add
([re0_opening
, re0_closing
])
1446 "^<!\\[CDATA\\[".to_re
,
# Opening or closing tag whose name is in the list below; case is ignored
# and there is no closing pattern.
1450 var re5_opening
= "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
1451 re5_opening
.ignore_case
= true
1452 blocks
.add
([re5_opening
, null])
# Building bricks, combined by string interpolation into `re6_opening`:
# a complete open tag or close tag alone on its line (only spacing after it).
1454 var p_tagname
= "[A-Za-z][A-Za-z0-9-]*"
1455 var p_attribute_name
= "[a-zA-Z_:][a-zA-Z0-9:._-]*"
1456 var p_uquoted_value
= "[^\"'=<>`\\x00-\\x20]+"
1457 var p_squoted_value = "'[^
']*'"
1458 var p_dquoted_value = "\
"[^\"]*\
""
1459 var p_attribute_value
= "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
1460 var p_attribute_value_spec
= "(\\s*=\\s*{p_attribute_value})"
1461 var p_attribute
= "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
1462 var p_opentag
= "<{p_tagname}{p_attribute}*\\s*/?>"
1463 var p_closetag
= "</{p_tagname}\\s*[>]"
1464 var re6_opening
= "^({p_opentag}|{p_closetag})\\s*$".to_re
1465 re6_opening
.ignore_case
= true
1466 blocks
.add
([re6_opening
, null])
# Number of columns between `self` and the next multiple of 4.
#
# Computed as `4 - (self % 4)`, so the result is 4 (not 0) when `self` is
# already a multiple of 4.
1475 private fun columns_to_next_tab_stop
: Int do return 4 - (self % 4)
1480 # Is this string blank?
1482 # i.e. contains only spacing characters.
# Walks the indexes `0` to `length - 1` and tests a character `c` against
# space, tab, newline and carriage return.
# NOTE(review): the definition of `c` and both branches of the test are not
# visible in this listing.
1483 private fun is_blank
: Bool do
1484 for i
in [0 .. length
[ do
1486 if c
== ' ' or c
== '\t' or c
== '\n' or c
== '\r' then
1495 # Is the character at `index` a space or a tab
1497 # Return false if `index >= self.length`.
1498 private fun is_space_or_tab
(index
: Int): Bool do
# Guard against reading past the end of the string.
1499 if index
>= length
then return false
1500 var c
= chars
[index
]
1501 return c
== ' ' or c
== '\t'