Merge: Functional api
[nit.git] / lib / markdown2 / markdown_block_parsing.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Markdown blocks parsing
16 #
# Introduce the parsers for the different Markdown blocks such as headings,
# lists, code blocks, etc.
19 module markdown_block_parsing
20
21 import markdown_inline_parsing
22
# Markdown parser
#
# Used to create the AST representation of a Markdown document.
#
# The input is consumed line by line: each line is first offered to the stack
# of currently open block parsers, then to the block parser factories, and what
# remains is added as text to the innermost open block.
class MdParser

	# Inline parser handed to every block parser once all the lines were read
	private var inline_parser = new MdInlineParser is lazy

	# Block parser factories, in the order `find_block_start` tries them
	private var block_parser_factories: Collection[MdBlockParserFactory] do
		var res = new Array[MdBlockParserFactory]
		res.add(new MdBlockQuoteParserFactory)
		res.add(new MdHeadingParserFactory)
		res.add(new MdFencedCodeBlockParserFactory)
		res.add(new MdHtmlBlockParserFactory)
		res.add(new MdThematicBreakParserFactory)
		res.add(new MdListBlockParserFactory)
		res.add(new MdIndentedCodeBlockParserFactory)
		return res
	end

	# Block parsers that are currently open
	#
	# Used as a stack to parse nested blocks: the last entry is the innermost one.
	private var active_block_parsers = new Array[MdBlockParser]

	# Every block parser activated during the current `parse`
	#
	# Unlike `active_block_parsers`, entries stay here once they are finalized.
	private var all_block_parsers = new HashSet[MdBlockParser]

	# The innermost open block parser
	#
	# This is the top of the `active_block_parsers` stack.
	private fun active_block_parser: MdBlockParser do return active_block_parsers.last

	# Open `block_parser`
	#
	# Push it on top of the `active_block_parsers` stack and record it in
	# `all_block_parsers`.
	private fun activate_block_parser(block_parser: MdBlockParser) do
		active_block_parsers.add(block_parser)
		all_block_parsers.add(block_parser)
	end

	# Pop the `active_block_parser` from the stack
	private fun deactivate_block_parser do
		active_block_parsers.pop
	end

	# Discard the `active_block_parser`
	#
	# The parser is popped from the stack, forgotten from `all_block_parsers`
	# and its block is unlinked.
	private fun remove_active_block_parser do
		var removed = active_block_parser
		deactivate_block_parser
		all_block_parsers.remove(removed)
		removed.block.unlink
	end

	# Post-processors applied by `post_process`
	var post_processors = new Array[MdPostProcessor] is writable

	# Line currently being parsed
	private var line_string: String is noinit

	# Current index (offset) in `line_string` (starts at 0)
	private var index = 0

	# Current column in `line_string` (starts at 0)
	#
	# A tab moves the column to the next 4-space tab stop.
	private var column = 0

	# Is the current column inside a partially consumed tab character?
	private var column_is_in_tab: Bool is noinit

	# Number of the current line in the input (starts at 1)
	private var line = 1

	# Index of the first character after `index` that is neither a space nor a tab
	private var next_non_space_index = 0

	# Column matching `next_non_space_index`
	private var next_non_space_column = 0

	# Current indent in columns
	#
	# Distance between `column` and `next_non_space_column`.
	private var indent = 0

	# Is the current line blank starting from `index`?
	private var is_blank: Bool is noinit

	# Nodes flagged as ending (or not) with a blank line
	private var last_line_blank = new HashMap[MdNode, Bool]

	# Reset the whole parser state before a new `parse`
	private fun initialize do
		active_block_parsers.clear
		all_block_parsers.clear
		last_line_blank.clear

		line = 1
		index = 0
		column = 0
		column_is_in_tab = false

		next_non_space_index = 0
		next_non_space_column = 0
		indent = 0
		is_blank = false
	end

	# Parse the `input` string as a MdDocument
	#
	# Post-processors are not applied here, see `post_process`.
	fun parse(input: String): MdDocument do
		initialize

		var document_parser = new MdDocumentBlockParser(1, 1, 0)
		activate_block_parser(document_parser)

		# Feed each terminated line to the block parsers
		var line_start = 0
		loop
			var line_break = find_line_break(input, line_start)
			if line_break == -1 then break

			incorporate_line(input.substring(line_start, line_break - line_start))

			# A `\r\n` pair counts as a single line break
			var after_break = line_break + 1
			if after_break < input.length and
			   input.chars[line_break] == '\r' and
			   input.chars[after_break] == '\n' then
				line_start = line_break + 2
			else
				line_start = after_break
			end
			line += 1
			column = 0
		end

		# Feed the last line if it is not terminated by a line break
		if input.length > 0 and (line_start == 0 or line_start < input.length) then
			incorporate_line(input.substring(line_start, input.length - line_start))
		end
		finalize_blocks(active_block_parsers)

		# Visit every block parser, from the last one to the first one,
		# parsing string content into inline content where appropriate.
		var parsers = all_block_parsers.to_a
		var i = parsers.length
		while i > 0 do
			i -= 1
			parsers[i].parse_inlines(inline_parser)
		end
		return document_parser.block
	end

	# Apply each of the `post_processors`, in order, to `document`
	fun post_process(document: MdDocument) do
		for processor in post_processors do processor.post_process(self, document)
	end

	# Analyze a line of text and update the document
	#
	# `parse` calls this on each line of its input.
	private fun incorporate_line(input: String) do
		line_string = input
		index = 0
		column = 0
		column_is_in_tab = false

		# Ask each open block whether it continues on this line.
		# The parser at position 0 is skipped and counted as matched.
		var matches = 1
		for i in [1 .. active_block_parsers.length[ do
			var candidate = active_block_parsers[i]
			find_next_non_space

			var result = candidate.try_continue(self)
			if result isa MdBlockContinue then
				if result.is_finalize then
					# The block ends on this line: nothing else to incorporate
					candidate.finalize(self)
					return
				end
				if result.new_index != -1 then
					set_new_index(result.new_index)
				else if result.new_column != -1 then
					set_new_column(result.new_column)
				end
				matches += 1
			else
				break
			end
		end

		var unmatched_block_parsers = active_block_parsers.subarray(
			matches, active_block_parsers.length - matches)
		var last_matched_block_parser = active_block_parsers[matches - 1]
		var block_parser = last_matched_block_parser
		var all_closed = unmatched_block_parsers.is_empty

		# Only paragraphs and containers may see new blocks start,
		# the new blocks become children of the last matched container.
		var try_block_starts = block_parser.block isa MdParagraph or
			block_parser.block.is_container

		while try_block_starts do
			find_next_non_space

			# Shortcut: a blank line, or a letter indented by less than 4 columns,
			# is handled as plain text without asking the factories.
			if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
				set_new_index(next_non_space_index)
				break
			end

			var block_start = find_block_start(block_parser)
			if block_start == null then
				set_new_index(next_non_space_index)
				break
			end

			# A new block starts: blocks that did not continue are over
			if not all_closed then
				finalize_blocks(unmatched_block_parsers)
				all_closed = true
			end

			if block_start.new_index != -1 then
				set_new_index(block_start.new_index)
			else if block_start.new_column != -1 then
				set_new_column(block_start.new_column)
			end

			if block_start.replace_active_block_parser then remove_active_block_parser

			for new_block_parser in block_start.block_parsers do
				add_child(new_block_parser)
				block_parser = new_block_parser
				try_block_starts = new_block_parser.block.is_container
			end
		end

		# What remains at the offset is a text line.
		# Add the text to the appropriate block.

		if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
			# Lazy paragraph continuation
			add_line
		else
			# Finalize any blocks not matched
			if not all_closed then finalize_blocks(unmatched_block_parsers)
			propagate_last_line_blank(block_parser, last_matched_block_parser)

			if not block_parser.block.is_container then
				add_line
			else if not is_blank then
				# Text in a container goes into a new paragraph
				add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
				add_line
			end
		end
	end

	# Find what kind of block starts at the current position of the line
	#
	# Return the result of the first factory accepting to start, or null.
	private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
		for factory in block_parser_factories do
			var start = factory.try_start(self, block_parser)
			if start != null then return start
		end
		return null
	end

	# Add the block of `block_parser` as a child of the active block
	#
	# Active blocks that cannot contain the new block are finalized first.
	# `block_parser` is then activated.
	private fun add_child(block_parser: MdBlockParser) do
		var child = block_parser.block
		while not active_block_parser.block.can_contain(child) do
			active_block_parser.finalize(self)
		end
		active_block_parser.block.append_child(child)
		activate_block_parser(block_parser)
	end

	# Add the rest of the current line to the active block parser
	#
	# We assume it can accept lines.
	private fun add_line do
		if not column_is_in_tab then
			active_block_parser.add_line(line_string.substring(index, line_string.length - index))
			return
		end

		# Our column is in a partially consumed tab.
		# Replace the tab by the spaces needed to reach the next tab stop.
		var after_tab = index + 1
		var rest = line_string.substring(after_tab, line_string.length - after_tab)
		var padding = column.columns_to_next_tab_stop
		var buffer = new Buffer
		for i in [0 .. padding[ do buffer.add(' ')
		buffer.append(rest)
		active_block_parser.add_line(buffer.write_to_string)
	end

	# Finalize `block_parsers`, from the last one to the first one
	private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
		var i = block_parsers.length
		while i > 0 do
			i -= 1
			block_parsers[i].finalize(self)
		end
	end

	# Advance the `index` position to the next character
	#
	# Also update `column`: a tab moves it to the next tab stop, any other
	# character moves it by one.
	private fun advance do
		var c = line_string.chars[index]
		index += 1
		if c == '\t' then
			column += column.columns_to_next_tab_stop
		else
			column += 1
		end
	end

	# Look for the next character that is neither a space nor a tab
	#
	# `index` and `column` are left untouched, the result is stored in
	# `next_non_space_index`, `next_non_space_column`, `is_blank` and `indent`.
	private fun find_next_non_space do
		var i = index
		var cols = column

		is_blank = true
		while i < line_string.length do
			var c = line_string.chars[i]
			if c == ' ' then
				cols += 1
			else if c == '\t' then
				# Jump to the next 4-columns tab stop
				cols += 4 - (cols % 4)
			else
				is_blank = false
				break
			end
			i += 1
		end

		next_non_space_index = i
		next_non_space_column = cols
		indent = cols - column
	end

	# Return the position of the next line break at or after `start_index`
	#
	# Both `\r` and `\n` are line breaks.
	# Return -1 if there is none.
	private fun find_line_break(input: String, start_index: Int): Int do
		var i = start_index
		while i < input.length do
			var c = input.chars[i]
			if c == '\r' or c == '\n' then return i
			i += 1
		end
		return -1
	end

	# Set the parser `index` at `new_index`
	#
	# Also set `column` and `column_is_in_tab`.
	private fun set_new_index(new_index: Int) do
		if new_index >= next_non_space_index then
			# Start from the known position, no need to compute tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		while index < new_index and index != line_string.length do advance
		# Moving to an index, as opposed to a column, never stops within a tab
		column_is_in_tab = false
	end

	# Set the parser `column` at `new_column`
	#
	# Also set `index` and `column_is_in_tab`.
	private fun set_new_column(new_column: Int) do
		if new_column >= next_non_space_column then
			# Start from the known position, no need to compute tab stops again
			index = next_non_space_index
			column = next_non_space_column
		end
		while column < new_column and index != line_string.length do advance

		# Overshooting the target means the last character was a tab:
		# step back on it and remember it is only partially consumed.
		column_is_in_tab = column > new_column
		if column_is_in_tab then
			index -= 1
			column = new_column
		end
	end

	# Does `block` end with a blank line?
	#
	# Lists and list items are followed down through their last child.
	private fun ends_with_blank_line(block: nullable MdNode): Bool do
		var node = block
		while node != null do
			if is_last_line_blank(node) then return true
			if not (node isa MdListBlock or node isa MdListItem) then break
			node = node.last_child
		end
		return false
	end

	# Record whether the current line is blank for the block of `block_parser`
	# and all its parents
	private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
		var block = block_parser.block
		var last_child = block.last_child
		if is_blank and last_child != null then last_line_blank[last_child] = true

		# Block quote lines are never blank as they start with `>`.
		# We don't count blanks in fenced code for purposes of tight/loose lists.
		# We also don't set `last_line_blank` on an empty list item.
		var blank = is_blank
		if blank then
			if block isa MdBlockQuote or block isa MdFencedCodeBlock then
				blank = false
			else if block isa MdListItem and block.first_child == null and
			   block_parser != last_matched_block_parser then
				blank = false
			end
		end

		# Propagate the flag up through the parents
		var node: nullable MdNode = block
		while node != null do
			last_line_blank[node] = blank
			node = node.parent
		end
	end

	# Is the last line blank for `node`?
	#
	# Nodes that were never flagged are not blank.
	private fun is_last_line_blank(node: MdNode): Bool do
		return last_line_blank.has_key(node) and last_line_blank[node]
	end
end
475
476 # Block parsing
477
# Parser for a specific block node
#
# Concrete parsers redefine `block` and `try_continue`; the other services
# do nothing by default, except `finalize`.
abstract class MdBlockParser

	# Kind of block under construction
	type BLOCK: MdBlock

	# MdBlock under construction
	fun block: BLOCK is abstract

	# Line where the block starts
	var line_start: Int

	# Column where the block starts
	var column_start: Int

	# Location of the block
	#
	# Only the start is known at creation: the end is initialized at `-1`
	# and is filled in later by the concrete parsers.
	var location: MdLocation is lazy do
		return new MdLocation(line_start, column_start, -1, -1)
	end

	# Column where the content starts
	var content_offset: Int

	# Initialize the current `block`
	#
	# Do nothing by default.
	fun initialize(parser: MdParser) do end

	# Can `self` continue from the current `index` in `state`?
	#
	# Return a new `MdBlockContinue` if `self` can continue parsing.
	# Return null otherwise.
	fun try_continue(state: MdParser): nullable MdBlockContinue is abstract

	# Add `line` to the current `block`
	#
	# Do nothing by default.
	fun add_line(line: String) do end

	# Finalize the current `block`
	#
	# Deactivate `self` from `parser` if it is the active block parser.
	fun finalize(parser: MdParser) do
		if parser.active_block_parser == self then parser.deactivate_block_parser
	end

	# Parse `block` lines
	#
	# Do nothing by default.
	fun parse_inlines(inline_parser: MdInlineParser) do end
end
526
# Result object for continuing parsing of a block
#
# At most one of `new_index` and `new_column` is expected to be set,
# the other one being `-1`.
class MdBlockContinue

	# Index from which to continue parsing, `-1` if unset
	var new_index: Int

	# Column from which to continue parsing, `-1` if unset
	var new_column: Int

	# Does the block end on the current line?
	var is_finalize: Bool

	# The block continues at `new_index`
	init at_index(new_index: Int) do init(new_index, -1, false)

	# The block continues at `new_column`
	init at_column(new_column: Int) do init(-1, new_column, false)

	# The block is over, neither an index nor a column is set
	init finished do init(-1, -1, true)
end
554
# Block parser factory for a block node, used to determine when a block starts
#
# `MdParser` asks each of its factories in turn and keeps the first
# non-null answer.
abstract class MdBlockParserFactory

	# Can the associated block parser start at the current line in `parser`?
	#
	# `matched_block_parser` is the block parser the new block would be
	# attached to: the last one matching the line, or the one most recently
	# started on it.
	#
	# Return a new `MdBlockStart` if the block parser can start.
	# Return null otherwise.
	fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
		nullable MdBlockStart is abstract
end
565
# Result object from starting parsing of a block
#
# The setters return `self` so they can be chained right after the
# construction.
class MdBlockStart

	# Block parsers for this block start
	#
	# `MdParser` adds them as children in order, each one becoming the
	# active block parser in turn.
	var block_parsers: Array[MdBlockParser]

	# Index where the parsing should start
	#
	# `-1` means unset. When set, `MdParser` uses it rather than `new_column`.
	var new_index = -1

	# Column where the parsing should start
	#
	# `-1` means unset. Only looked at by `MdParser` when `new_index` is unset.
	var new_column = -1

	# Does the block starting with `self` terminate a previous block?
	#
	# When true, `MdParser` removes the active block parser, and unlinks its
	# block, before adding `block_parsers`.
	var replace_active_block_parser = false

	# Start from `new_index`
	#
	# Return `self`.
	fun at_index(new_index: Int): MdBlockStart do
		self.new_index = new_index
		return self
	end

	# Start from `new_column`
	#
	# Return `self`.
	fun at_column(new_column: Int): MdBlockStart do
		self.new_column = new_column
		return self
	end

	# Start replacing the active block parser
	#
	# Return `self`.
	fun replacing_active_block_parser: MdBlockStart do
		self.replace_active_block_parser = true
		return self
	end
end
599
# Parser for the whole document
class MdDocumentBlockParser
	super MdBlockParser

	redef type BLOCK: MdDocument
	redef var block = new MdDocument(location) is lazy

	# The document always continues, from the current index of `state`
	redef fun try_continue(state) do
		return new MdBlockContinue.at_index(state.index)
	end

	# Do nothing
	#
	# `super` is deliberately not called, so the document parser is never
	# deactivated by its own finalization.
	redef fun finalize(parser) do end

	# Close the location of the document on the end of its last child
	#
	# The location end is left untouched if the document has no child.
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end
622
# Headings parser
#
# A heading is entirely known when it starts: its content, level and
# location are all given at construction.
# The constructor takes the attributes in declaration order, after the
# `line_start`, `column_start` and `content_offset` inherited from
# `MdBlockParser`.
class MdHeadingParser
	super MdBlockParser

	redef type BLOCK: MdHeading

	redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy

	# Unlike the default location, the end is already known here
	redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy

	# Line end
	var line_end: Int

	# Column end
	var column_end: Int

	# Heading level
	var level: Int

	# Heading content, parsed as inline content by `parse_inlines`
	var content: String

	# Heading has ATX trailing
	var has_atx_trailing: Bool

	# Heading is setext format
	var is_setext: Bool

	# Never continue parsing, as a heading is a one-liner
	redef fun try_continue(state) do return null

	# Parse the heading content
	redef fun parse_inlines(inline_parser) do
		inline_parser.parse(content, content_offset, block)
	end
end
659
# Heading parser factory
#
# Recognize both ATX headings (matching `re_atx_heading`) and setext headings
# (a line matching `re_setext_heading` right under a paragraph).
class MdHeadingParserFactory
	super MdBlockParserFactory

	redef fun try_start(state, matched_block_parser) do
		# Indented by 4 columns or more: not a heading
		if state.indent >= 4 then return null

		var first = state.next_non_space_index
		var line = state.line_string
		var text = line.substring(first, line.length - first)

		var atx = text.search(re_atx_heading)
		if atx != null then
			# ATX heading: the first group holds the opening marker
			var marker = atx.subs.first.as(not null)
			var content_start = first + marker.length
			var level = marker.to_s.trim.length

			# Strip what matches `re_atx_trailing`, but keep its length
			# to compute the end column
			var after_marker = line.substring(content_start, line.length - content_start)
			var trailing = after_marker.search(re_atx_trailing)
			var trailing_length = 0
			if trailing != null then trailing_length = trailing.length
			var content = after_marker.replace(re_atx_trailing, "")

			var heading = new MdHeadingParser(
				state.line,
				first + 1,
				content_start + 1,
				state.line,
				content_start + content.length + trailing_length,
				level,
				content,
				trailing != null,
				false)
			# The whole line is consumed
			return (new MdBlockStart([heading])).at_index(line.length)
		end

		# A setext heading is an underline below the content of a paragraph
		var paragraph = null
		if matched_block_parser isa MdParagraphParser then
			paragraph = matched_block_parser.content
		end
		if paragraph == null then return null

		var setext = text.search(re_setext_heading)
		if setext == null then return null

		# `=` underlines a level 1 heading, anything else a level 2
		var level = 2
		if setext.subs.first.as(not null).to_s.chars.first == '=' then level = 1

		var heading = new MdHeadingParser(
			state.line - 1,
			first + 1,
			0,
			state.line,
			state.column + setext.length,
			level,
			paragraph.to_s,
			false,
			true)
		# The heading takes the place of the paragraph
		return (new MdBlockStart([heading])).at_index(line.length).replacing_active_block_parser
	end
end
719
# Blockquotes parser
class MdBlockQuoteParser
	super MdBlockParser

	redef type BLOCK: MdBlockQuote
	redef var block = new MdBlockQuote(location) is lazy

	# Continue if the line starts with a `>` indented by less than 4 columns
	#
	# Parsing resumes after the `>` and the optional space or tab following it.
	redef fun try_continue(state) do
		if state.indent >= 4 then return null

		var marker_index = state.next_non_space_index
		var line = state.line_string
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# Skip the marker, then the optional space or tab
		var new_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then new_column += 1
		return new MdBlockContinue.at_column(new_column)
	end

	# Close the location of the quote on the end of its last child
	#
	# The location end is left untouched if the quote has no child.
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end
752
# Blockquotes parser factory
class MdBlockQuoteParserFactory
	super MdBlockParserFactory

	# Start a block quote if the line starts with a `>` indented by less
	# than 4 columns
	#
	# The content starts after the `>` and the optional space or tab
	# following it.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var marker_index = state.next_non_space_index
		var line = state.line_string
		if marker_index >= line.length or line.chars[marker_index] != '>' then return null

		# Skip the marker, then the optional space or tab
		var new_column = state.column + state.indent + 1
		if line.is_space_or_tab(marker_index + 1) then new_column += 1

		var quote = new MdBlockQuoteParser(state.line, state.column + 1, new_column)
		return (new MdBlockStart([quote])).at_column(new_column)
	end
end
779
# Indented code blocks parser
class MdIndentedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdIndentedCodeBlock
	redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy

	# Indent is tab?
	var use_tabs: Bool

	# Lines of the block, joined by `\n`
	var content = new Buffer

	# Continue on lines indented by at least 4 columns, and on blank lines
	redef fun try_continue(state) do
		if state.indent >= 4 then return new MdBlockContinue.at_column(state.column + 4)
		if state.is_blank then return new MdBlockContinue.at_index(state.next_non_space_index)
		return null
	end

	# Append `line` to `content`, on its own line
	redef fun add_line(line) do
		if not content.is_empty then content.add('\n')
		content.append(line)
	end

	# Set the literal and the location end of the block
	redef fun finalize(parser) do
		super

		# Add a last line of a single space, then replace the first match of
		# `re_trailing_blank_lines` by a single line break
		add_line(" ")
		var literal = content.to_s.replace_first(re_trailing_blank_lines, "\n")
		block.literal = literal

		# The location ends on the second to last element of the split
		var lines = literal.split("\n")
		var last = lines.length - 2
		location.line_end = location.line_start + last
		location.column_end = content_offset + lines[last].length + 4
	end
end
822
# Indented code blocks parser factory
class MdIndentedCodeBlockParserFactory
	super MdBlockParserFactory

	# Start a code block on a non-blank line indented by at least 4 columns
	#
	# Refuse to start while the active block is a paragraph.
	redef fun try_start(state, matched_block_parser) do
		if state.indent < 4 or state.is_blank then return null
		if state.active_block_parser.block isa MdParagraph then return null

		var code = new MdIndentedCodeBlockParser(
			state.line,
			state.column + 1,
			state.column,
			state.line_string.has_prefix("\t"))
		# The content starts after the 4 columns of indentation
		return (new MdBlockStart([code])).at_column(state.column + 4)
	end
end
842
# Fenced code blocks parser
class MdFencedCodeBlockParser
	super MdBlockParser

	redef type BLOCK: MdFencedCodeBlock
	redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy

	# Character of the opening fence
	var fence_char: Char

	# Length of the opening fence
	var fence_length: Int

	# Indent of the opening fence
	var fence_indent: Int

	# First line received, null until then
	#
	# Becomes the info string of the block.
	var first_line: nullable String = null

	# Lines received after the first one, each followed by `\n`
	var other_lines = new Buffer

	# Continue until a closing fence is found
	#
	# A closing fence is indented by at most 3 columns, starts with
	# `fence_char`, matches `re_closing_fence`, and its first group is at
	# least as long as `fence_length`.
	redef fun try_continue(state) do
		var line = state.line_string
		var first = state.next_non_space_index

		if state.indent <= 3 and first < line.length and line.chars[first] == fence_char then
			var closing = line.substring(first, line.length - first).search(re_closing_fence)
			if closing != null and closing.subs[0].as(not null).length >= fence_length then
				# Closing fence: we're at end of line, so we can finalize now
				return new MdBlockContinue.finished
			end
		end

		# Content line: skip at most `fence_indent` leading spaces
		var new_index = state.index
		var remaining = fence_indent
		while remaining > 0 and new_index < line.length and line.chars[new_index] == ' ' do
			new_index += 1
			remaining -= 1
		end
		return new MdBlockContinue.at_index(new_index)
	end

	# Keep the first `line` apart, append the next ones to `other_lines`
	redef fun add_line(line) do
		if first_line == null then
			first_line = line
			return
		end
		other_lines.append(line)
		other_lines.add('\n')
	end

	# Set the info string, the literal and the location end of the block
	redef fun finalize(parser) do
		super

		# The first line, trimmed and unescaped, is the info string
		var info_line = first_line
		if info_line != null then
			var info = info_line.trim.unescape_string
			if not info.is_empty then block.info = info
		end

		var literal = other_lines.to_s
		block.literal = literal

		location.line_end = location.line_start + literal.split("\n").length
		location.column_end = content_offset + fence_indent + fence_length
	end
end
918
# Fenced code blocks parser factory
class MdFencedCodeBlockParserFactory
	super MdBlockParserFactory

	# Start a fenced code block if the line opens with a fence indented by
	# less than 4 columns
	#
	# The opening fence is what matches `re_opening_fence`: the first group
	# for a backtick fence, the third one otherwise.
	# The text matched must not contain the fence character again after the
	# fence; the other fence character is accepted.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string
		var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
		if match == null then return null

		var fence_length
		var fence_char
		var sub0 = match.subs[0]
		if sub0 != null then
			fence_length = sub0.length
			fence_char = sub0.to_s.chars.first
		else
			var sub2 = match.subs[2].as(not null)
			fence_length = sub2.length
			fence_char = sub2.to_s.chars.first
		end

		# Only the character of the fence itself is forbidden after it.
		# NOTE(review): presumably `match` spans the text following the fence
		# (info string) — confirm against `re_opening_fence`.
		var forbidden
		if fence_char == '`' then
			forbidden = "[^`]+`".to_re
		else
			forbidden = "[^~]+~".to_re
		end
		if match.to_s.has(forbidden) then return null

		var code = new MdFencedCodeBlockParser(
			state.line,
			state.column + 1,
			state.column,
			fence_char,
			fence_length,
			state.indent)
		# Parsing resumes right after the fence
		return (new MdBlockStart([code])).at_index(next_non_space + fence_length)
	end
end
958
# List blocks parser
class MdListBlockParser
	super MdBlockParser

	redef type BLOCK: MdListBlock

	# An ordered or unordered list, depending on `is_ordered`
	#
	# An ordered list requires `digit` and `delim`, an unordered one
	# requires `bullet`.
	redef var block is lazy do
		if is_ordered then
			return new MdOrderedList(location, digit.as(not null), delim.as(not null))
		else
			return new MdUnorderedList(location, bullet.as(not null))
		end
	end

	# Is this list ordered
	var is_ordered: Bool

	# List bullet if unordered
	var bullet: nullable Char

	# List digit if ordered
	var digit: nullable Int

	# List delimiter if ordered
	var delim: nullable Char

	# A list always continues, from the current index of `state`
	redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)

	# Mark the list as loose if a blank line separates two of its parts
	#
	# The scan stops at the first blank line found, as nothing can make the
	# list tight again.
	redef fun finalize(parser) do
		super

		var item = block.first_child
		while item != null do
			# Non-final list item ending with a blank line
			if parser.ends_with_blank_line(item) and item.next != null then
				block.is_tight = false
				return
			end
			# Look for a blank line between the children of the item,
			# or after the last child of a non-final item
			var sub_item = item.first_child
			while sub_item != null do
				if parser.ends_with_blank_line(sub_item) and
				   (item.next != null or sub_item.next != null) then
					block.is_tight = false
					return
				end
				sub_item = sub_item.next
			end
			item = item.next
		end
	end

	# Close the location of the list on the end of its last child
	#
	# The location end is left untouched if the list has no child.
	redef fun parse_inlines(inline_parser) do
		var last_child = block.last_child
		if last_child == null then return
		location.line_end = last_child.location.line_end
		location.column_end = last_child.location.column_end
	end
end
1019
1020 # List blocks parser factory
1021 class MdListBlockParserFactory
1022 super MdBlockQuoteParserFactory
1023
1024 redef fun try_start(state, matched_block_parser) do
1025 if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null
1026
1027 var marker_index = state.next_non_space_index
1028 var marker_column = state.column + state.indent
1029
1030 var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
1031 var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
1032 if list_data == null then return null
1033
1034
1035 var new_column = list_data.content_column
1036 var list_item_parser = new MdListItemParser(
1037 state.line,
1038 state.column + 1,
1039 new_column,
1040 new_column - state.column)
1041
1042 # prepend the list block if needed
1043 if not matched_block_parser isa MdListBlockParser or not lists_match(matched_block_parser.block, list_data) then
1044 var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
1045 list_block_parser.block.is_tight = true
1046
1047 return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
1048 end
1049 return (new MdBlockStart([list_item_parser])).at_column(new_column)
1050 end
1051
# Parse the list marker found in `line` at `marker_index`
#
# `marker_column` is the column of the marker in the line and `in_paragraph`
# tells if the marker is met while a paragraph is being parsed.
#
# Returns `null` if `re_list_marker` does not match at `marker_index`, or if
# the marker is not allowed to interrupt the current paragraph.
private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
	var tail = line.substring(marker_index, line.length - marker_index)
	var found = tail.search(re_list_marker)
	if found == null then return null

	var bullet: nullable Char = null
	var digit: nullable Int = null
	var delim: nullable Char = null

	# the first group is only set for bullet markers
	var bullet_group = found.subs[0]
	var is_ordered = bullet_group == null
	if bullet_group != null then
		bullet = bullet_group.to_s.chars[0]
	else
		digit = found.subs[2].as(not null).to_s.to_i
		delim = found.subs[3].as(not null).to_s.chars[0]
	end

	# the spacing character matched after the marker is not part of the marker
	var marker_text = found.to_s
	var marker_length = found.length
	if marker_text.has_suffix(" ") or marker_text.has_suffix("\t") then marker_length -= 1

	# marker doesn't include tabs, so counting them as column directly is ok
	var column_after_marker = marker_column + marker_length

	# walk the spacing after the marker to find the column where the content starts
	var content_column = column_after_marker
	var has_content = false
	var i = marker_index + marker_length
	while i < line.length do
		var c = line.chars[i]
		if c == ' ' then
			content_column += 1
		else if c == '\t' then
			content_column += content_column.columns_to_next_tab_stop
		else
			has_content = true
			break
		end
		i += 1
	end

	if in_paragraph then
		# an ordered list item must start at 1 to interrupt a paragraph
		if is_ordered and digit != 1 then return null
		# an empty list item can not interrupt a paragraph
		if not has_content then return null
	end

	# if this line is blank or has a code block, default to 1 space after marker
	if not has_content or (content_column - column_after_marker) > 4 then
		content_column = column_after_marker + 1
	end
	return new MdListData(is_ordered, bullet, digit, delim, content_column)
end
1114
# Is the list `a` of the same kind as the parsed marker `b`?
#
# Unordered lists match on their bullet character,
# ordered lists match on their delimiter.
# Used to decide if a new item extends the current list.
private fun lists_match(a: MdListBlock, b: MdListData): Bool do
	if b.is_ordered then
		if a isa MdOrderedList then return a.delimiter == b.delim
		return false
	end
	if a isa MdUnorderedList then return a.bullet_marker == b.bullet
	return false
end
1127 end
1128
# Data extracted from a list marker
#
# Produced by `parse_list_marker`.
private class MdListData

	# Is the marker the one of an ordered list?
	var is_ordered: Bool

	# Bullet character, `null` for ordered markers
	var bullet: nullable Char

	# Number read in the marker, `null` for bullet markers
	var digit: nullable Int

	# Character following the number, `null` for bullet markers
	var delim: nullable Char

	# Column at which the content of the item starts
	var content_column: Int
end
1143
# Parser for list items
class MdListItemParser
	super MdBlockParser

	redef type BLOCK: MdListItem
	redef var block = new MdListItem(location) is lazy

	# Indentation required for a line to belong to the content of this item
	var content_indent: Int

	# Continue the item on blank lines (unless the item is still empty)
	# and on lines indented by at least `content_indent`.
	redef fun try_continue(state) do
		if not state.is_blank then
			if state.indent < content_indent then return null
			return new MdBlockContinue.at_column(state.column + content_indent)
		end
		# a blank line right after an empty list item ends the item
		if block.first_child == null then return null
		return new MdBlockContinue.at_index(state.next_non_space_index)
	end

	# Only extend the location of the item up to the end of its last child
	redef fun parse_inlines(inline_parser) do
		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end
1176
# Parser for thematic breaks
class MdThematicBreakParser
	super MdBlockParser

	redef type BLOCK: MdThematicBreak
	redef var block = new MdThematicBreak(location, pattern) is lazy

	# Text of the thematic break as found in the source
	var pattern: String

	# A thematic break never spans more than its own line
	redef fun try_continue(state) do
		return null
	end

	# Close the location on the starting line, right after the pattern
	redef fun finalize(parser) do
		super

		var pattern_width = pattern.length
		location.line_end = line_start
		location.column_end = column_start + pattern_width - 1
	end
end
1196
# Thematic breaks parser factory
#
# Specializes the generic `MdBlockParserFactory`: a thematic break factory is
# not a kind of block quote factory and `try_start` is fully redefined here.
class MdThematicBreakParserFactory
	super MdBlockParserFactory

	# Start a thematic break if the rest of the line matches `re_thematic_break`
	#
	# Returns `null` if the line is indented by 4 columns or more, or if the
	# text starting at the first non-space character is not a thematic break.
	# On success the whole line is consumed.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string
		var tbreak = line.substring(next_non_space, line.length - next_non_space).search(re_thematic_break)
		if tbreak == null then return null

		var break_parser = new MdThematicBreakParser(
			state.line,
			state.column + 1,
			next_non_space,
			tbreak.to_s)
		return (new MdBlockStart([break_parser])).at_index(line.length)
	end
end
1219
# Parser for paragraphs
class MdParagraphParser
	super MdBlockParser

	redef type BLOCK: MdParagraph

	redef var block = new MdParagraph(location) is lazy

	# Raw text of the paragraph
	#
	# Set to `null` by `finalize` when the paragraph only holds link reference definitions.
	var content: nullable Buffer = new Buffer

	# A paragraph goes on until a blank line is met
	redef fun try_continue(state) do
		if not state.is_blank then
			return new MdBlockContinue.at_index(state.index)
		end
		return null
	end

	# Append `line` to the content, lines are separated by a `\n`
	redef fun add_line(line) do
		var buffer = self.content
		if buffer == null then return
		if not buffer.is_empty then buffer.add('\n')
		buffer.append(line)
	end

	# Strip the leading link reference definitions from the content
	#
	# If nothing but blanks remain once the definitions are removed,
	# the block is unlinked and the content set to `null`.
	redef fun finalize(parser) do
		super

		var references_parser = parser.inline_parser
		var buffer = self.content
		if buffer == null then return

		var remaining = buffer.to_s
		var stripped_defs = false

		# try parsing the beginning as link reference definitions
		var consumed = references_parser.parse_reference(remaining)
		while remaining.length > 3 and remaining.chars[0] == '[' and consumed != 0 do
			remaining = remaining.substring(consumed, remaining.length - consumed)
			stripped_defs = true
			consumed = references_parser.parse_reference(remaining)
		end

		if not stripped_defs or not remaining.is_blank then
			self.content = new Buffer.from_text(remaining)
			return
		end
		block.unlink
		self.content = null
	end

	# Parse the content as inlines then close the location on the last child
	redef fun parse_inlines(inline_parser) do
		var buffer = self.content
		if buffer == null then return
		inline_parser.parse(buffer.to_s, content_offset, block)

		var tail = block.last_child
		if tail == null then return
		location.line_end = tail.location.line_end
		location.column_end = tail.location.column_end
	end
end
1283
# Parser for HTML blocks
class MdHtmlBlockParser
	super MdBlockParser

	redef type BLOCK: MdHtmlBlock
	redef var block = new MdHtmlBlock(location) is lazy

	# Pattern closing the block
	#
	# `null` for the blocks that are ended by a blank line instead.
	var closing_pattern: nullable Pattern

	# Has the closing pattern been met?
	var finished = false

	# Raw text of the block
	var content = new Buffer

	# Stop once the block is finished, or on a blank line when there is no closing pattern
	redef fun try_continue(state) do
		if finished then return null

		# a blank line ends type 6 and 7 blocks
		if state.is_blank and closing_pattern == null then return null

		return new MdBlockContinue.at_index(state.index)
	end

	# Append `line` to the content and look for the closing pattern in it
	redef fun add_line(line) do
		if not content.is_empty then content.add('\n')
		content.append(line)

		var closer = self.closing_pattern
		if closer == null then return
		if line.has(closer) then finished = true
	end

	# Store the content as the block literal and compute the end of the location
	redef fun finalize(parser) do
		super

		var literal = self.content.to_s
		block.literal = literal

		var rows = literal.split("\n")
		location.line_end = location.line_start + rows.length - 1
		location.column_end = rows.last.length
	end
end
1333
# Html blocks parser factory
class MdHtmlBlockParserFactory
	super MdBlockParserFactory

	# Start a HTML block if the line opens one of the `re_html_blocks` types
	#
	# Returns `null` if the line is indented by 4 columns or more, if it has
	# no non-space character, or if its first non-space character is not `<`.
	# The block types are tried in the order of `re_html_blocks`, the first
	# opening pattern that matches wins.
	redef fun try_start(state, matched_block_parser) do
		if state.indent >= 4 then return null

		var next_non_space = state.next_non_space_index
		var line = state.line_string

		# guard the character access below against a line without any content
		if next_non_space >= line.length then return null
		if line.chars[next_non_space] != '<' then return null

		# the text to match is the same for every block type
		var rest = line.substring(next_non_space, line.length - next_non_space)

		for block_type in [0..6] do
			# type 7 can not interrupt a paragraph
			if block_type == 6 and matched_block_parser.block isa MdParagraph then continue

			var patterns = re_html_blocks[block_type]
			var opener = patterns.first
			if not rest.has(opener.as(not null)) then continue

			var html_parser = new MdHtmlBlockParser(
				state.line,
				state.column + 1,
				next_non_space,
				patterns.last)
			return (new MdBlockStart([html_parser])).at_index(state.index)
		end
		return null
	end
end
1362
1363 # Post Processing
1364
# Markdown post processor
#
# A Markdown AST visitor called on the document parsed by a `MdParser`.
abstract class MdPostProcessor
	super MdVisitor

	# Document being processed
	#
	# Available only during a call to `post_process`, `null` otherwise.
	var document: nullable MdDocument = null

	# Post process the `document` parsed by `parser`
	#
	# `self.document` is set for the duration of the visit then reset to `null`.
	fun post_process(parser: MdParser, document: MdDocument) do
		self.document = document
		enter_visit(document)
		self.document = null
	end

	# Delegate the visit of `node` to `MdNode::post_process`
	redef fun visit(node) do
		node.post_process(self)
	end
end
1386
redef class MdNode

	# Accept the visit of the post processor `v`
	#
	# By default, only forward `v` to `visit_all`.
	fun post_process(v: MdPostProcessor) do
		visit_all(v)
	end
end
1392
1393 # Utils
1394
redef class Sys
	# Pattern matching the opening of an ATX heading (1 to 6 `#`)
	private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re

	# Pattern matching the trailing `#` sequence of an ATX heading
	private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re

	# Pattern matching the underline of a SeText heading
	private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re

	# Pattern matching the blank lines at the end of a text
	var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re

	# Pattern matching the opening fence of a code block
	var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re

	# Pattern matching the closing fence of a code block
	var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re

	# Pattern matching a bullet or an ordered list marker
	var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re

	# Pattern matching a thematic break
	var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re

	# Patterns of the HTML blocks
	#
	# One entry per block type, each entry holds the opening pattern
	# followed by the closing pattern.
	# The closing pattern is `null` for the last two types.
	var re_html_blocks: Array[Array[nullable Regex]] do
		var patterns = new Array[Array[nullable Regex]]

		# script, pre and style elements
		var raw_opening = "^<(script|pre|style)(\\s|>|$)".to_re
		var raw_closing = "</(script|pre|style)>".to_re
		raw_opening.ignore_case = true
		raw_closing.ignore_case = true
		patterns.add([raw_opening, raw_closing])

		# comments
		patterns.add(["^<!--".to_re, "-->".to_re])

		# processing instructions
		patterns.add(["^<[?]".to_re, "\\?>".to_re])

		# declarations
		patterns.add(["^<![A-Z]".to_re, ">".to_re])

		# CDATA sections
		patterns.add(["^<!\\[CDATA\\[".to_re, "\\]\\]>".to_re])

		# known block level tags, no closing pattern
		var known_opening = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
		known_opening.ignore_case = true
		patterns.add([known_opening, null])

		# any other open or close tag alone on its line, no closing pattern
		var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
		var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
		var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
		var p_squoted_value = "'[^']*'"
		var p_dquoted_value = "\"[^\"]*\""
		var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
		var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
		var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
		var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
		var p_closetag = "</{p_tagname}\\s*[>]"
		var any_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
		any_opening.ignore_case = true
		patterns.add([any_opening, null])

		return patterns
	end
end
1471
redef class Int

	# Number of columns between `self` and the next tab stop
	#
	# Tab stops are set every 4 columns.
	private fun columns_to_next_tab_stop: Int do
		var past_last_stop = self % 4
		return 4 - past_last_stop
	end
end
1477
redef class String

	# Does this string only contain spacing characters?
	#
	# Spaces, tabs, `\n` and `\r` are considered as spacing.
	# The empty string is blank.
	private fun is_blank: Bool do
		for c in chars do
			if c != ' ' and c != '\t' and c != '\n' and c != '\r' then return false
		end
		return true
	end

	# Is the character at `index` a space or a tab?
	#
	# Return false if `index >= self.length`.
	private fun is_space_or_tab(index: Int): Bool do
		if index < length then
			var c = chars[index]
			return c == ' ' or c == '\t'
		end
		return false
	end
end