Merge: nitunit: Use markdown2
[nit.git] / lib / markdown2 / markdown_inline_parsing.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Parser for inline markdown
16 #
17 # Used to create the AST representation of inline nodes like emphasis, code, links
18 # images etc.
19 module markdown_inline_parsing
20
21 import markdown_ast
22
23 # Parser for inline content (text, links, emphasis, etc)
24 class MdInlineParser
25
# Delimiter processors enabled for this parser
#
# Built lazily; contains one asterisk processor followed by one underscore processor.
private var delimiter_processors: Array[MdDelimiterProcessor] is lazy do
	var processors = new Array[MdDelimiterProcessor]
	processors.add(new MdAsteriskDelimiterProcessor)
	processors.add(new MdUnderscoreDelimiterProcessor)
	return processors
end
33
# Delimiter processors indexed by the special character that triggers them
#
# Built lazily from `delimiter_processors`.
# As a side effect of the first access, every key of the map is appended to
# `special_characters`.
private var delimiter_processors_map: Map[Char, MdDelimiterProcessor] is lazy do
	var processors_by_char = new HashMap[Char, MdDelimiterProcessor]
	for processor in delimiter_processors do
		add_delimiter_processor(processor, processors_by_char)
	end
	special_characters.add_all(processors_by_char.keys)
	return processors_by_char
end
43
# Register `delimiter_processor` in `map`
#
# * Distinct opening and closing delimiters: one entry is added for each character.
# * Same character for both, not yet in `map`: a single entry is added.
# * Same character for both, already mapped to a processor that also uses a single
#   character: both processors are grouped in a `MdStaggeredDelimiterProcessor`.
# * Otherwise the plain registration is attempted, which fails on the conflict
#   assertion of `add_delimiter_processor_for_char`.
private fun add_delimiter_processor(delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
	var open_char = delimiter_processor.opening_delimiter
	var close_char = delimiter_processor.closing_delimiter

	if open_char != close_char then
		add_delimiter_processor_for_char(open_char, delimiter_processor, map)
		add_delimiter_processor_for_char(close_char, delimiter_processor, map)
		return
	end

	if not map.has_key(open_char) then
		add_delimiter_processor_for_char(open_char, delimiter_processor, map)
		return
	end

	var existing = map[open_char]
	if existing.opening_delimiter != existing.closing_delimiter then
		# The character is already taken by an asymmetric processor
		add_delimiter_processor_for_char(open_char, delimiter_processor, map)
		return
	end

	# Two processors share the same symmetric delimiter: stagger them
	var staggered: MdStaggeredDelimiterProcessor
	if existing isa MdStaggeredDelimiterProcessor then
		staggered = existing
	else
		staggered = new MdStaggeredDelimiterProcessor(open_char)
		staggered.add existing
	end
	staggered.add delimiter_processor
	map[open_char] = staggered
end
72
# Bind `delimiter_char` to `delimiter_processor` in `map`
#
# Asserts that `delimiter_char` is not bound yet; a conflict message is printed
# when the assertion fails.
private fun add_delimiter_processor_for_char(delimiter_char: Char, delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
	var already_registered = map.has_key(delimiter_char)
	assert not already_registered else
		print "Delimiter processor conflict with delimiter char `{delimiter_char}`"
	end
	map[delimiter_char] = delimiter_processor
end
80
81 # List of characters that have a special Markdown meaning
82 private var special_characters: Array[Char] = ['\n', '`', '[', ']', '\\', '!', '<', '&']
83
84 # Link references by ID, needs to be built up using `parse_reference` before calling `parse`
85 private var reference_map = new HashMap[String, MdLink]
86
87 # Current block under parsing
88 private var block: MdNode is noinit
89
90 # Current input string
91 private var input: String is noinit
92
93 # Current index
94 private var index: Int is noinit
95
96 # Current line
97 private var line: Int is noinit
98
99 # Current column
100 private var column: Int is noinit
101
102 # Current column offset
103 private var column_offset: Int is noinit
104
105 # Top delimiter (emphasis, strong emphasis or custom emphasis)
106 # Brackets are on a separate stack, different from the algorithm described in the spec.
107 private var last_delimiter: nullable MdDelimiter = null
108
109 # Top opening bracket (`[` or `![`)
110 private var last_bracket: nullable MdBracket = null
111
# Parse `input` as inline content and attach the resulting nodes to `block`
#
# `input` is trimmed before parsing.
# `offset` is stored as the column offset: columns start at `1 + offset` on each line.
# The starting line is taken from `block.location.line_start`.
fun parse(input: String, offset: Int, block: MdNode) do
	# Reset the whole parsing state
	self.block = block
	self.input = input.trim
	self.index = 0
	self.last_delimiter = null
	self.last_bracket = null
	self.line = block.location.line_start
	self.column_offset = offset
	self.column = 1 + column_offset

	# Consume inline elements until `parse_inline` reports the end of input
	loop
		if not parse_inline then break
	end

	process_delimiters(null)
	merge_child_text_nodes(block)
end
131
# Move the current position forward by `count` characters
#
# Both `index` and `column` are incremented; `line` is left untouched.
private fun advance(count: Int) do
	column += count
	index += count
end
137
# Attempt to parse a link reference definition at the start of `input`
#
# Expected shape: a link label, a colon, a destination and an optional title,
# followed by the end of the line.
# On success the link is stored in `reference_map` under its normalized label,
# unless that label is already present (the first definition is kept).
#
# Return how many characters were parsed as a reference.
# Returns 0 if none.
fun parse_reference(input: String): Int do
	self.input = input
	self.index = 0
	self.column = 0
	var start_index = index

	# label
	var label_length = parse_link_label
	if label_length == 0 then return 0
	advance label_length
	var raw_label = input.substring(0, label_length)

	# colon
	if peek != ':' then return 0
	advance 1

	# link url
	spnl
	var dest = parse_link_destination.first
	if dest == null or dest.is_empty then return 0

	# optional title; remember where we were to be able to give it up
	var before_title = index
	var before_column = column
	spnl
	var title = parse_link_title
	if title == null then
		# rewind before spaces
		index = before_title
		column = before_column
	end

	if index != input.length and match(re_line_end) == null then
		# Something follows on the same line: without a title this is not a reference
		if title == null then return 0
		# the potential title we found is not at the line end,
		# but it could still be a legal link reference if we discard the title
		title = null
		# rewind before spaces
		index = before_title
		column = before_column
		# and instead check if the link URL is at the line end
		if match(re_line_end) == null then return 0
	end

	var normalized_label = raw_label.normalize_reference
	if normalized_label.is_empty then return 0

	if not reference_map.has_key(normalized_label) then
		reference_map[normalized_label] = new MdLink(new MdLocation(0, 0, 0, 0), dest, title)
	end

	return index - start_index
end
206
207 # Line end pattern
208 private var re_line_end: Regex = "^ *(\n|$)".to_re
209
# Append a text node to the current block and return it
#
# When both `begin_index` and `end_index` are given, only the slice of `text`
# between them is used and the node ends at `column + length - 1`.
# Otherwise the whole `text` is used and the node ends at `column + text.length`.
# In both cases the node starts at the current `line` and `column`.
#
# NOTE(review): the two forms do not compute the end column the same way
# (inclusive vs. one past); confirm whether this is intended.
private fun append_text(text: String, begin_index, end_index: nullable Int): MdText do
	var literal = text
	var last_column = column + text.length
	if begin_index != null and end_index != null then
		var nb_chars = end_index - begin_index
		literal = text.substring(begin_index, nb_chars)
		last_column = column + nb_chars - 1
	end
	var node = new MdText(new MdLocation(line, column, line, last_column), literal)
	append_node(node)
	return node
end
237
# Add `node` as the last child of the block under parsing
private fun append_node(node: MdNode) do
	block.append_child(node)
end
240
# Parse the next inline element at the current index
#
# The character under the cursor selects the specialized parser to run.
# When that parser does not handle the character, the character is consumed
# and appended as plain text.
#
# Return false only when the end of the input is reached, true otherwise.
private fun parse_inline: Bool do
	var c = peek
	if c == '\0' then return false

	var handled: Bool
	if c == '\n' then
		handled = parse_newline
	else if c == '\\' then
		handled = parse_backslash
	else if c == '`' then
		handled = parse_backticks
	else if c == '[' then
		handled = parse_open_bracket
	else if c == '!' then
		handled = parse_bang
	else if c == ']' then
		handled = parse_close_bracket
	else if c == '<' then
		handled = parse_auto_link or parse_html_inline
	else if c == '&' then
		handled = parse_entity
	else if delimiter_processors_map.has_key(c) then
		handled = parse_delimiters(delimiter_processors_map[c], c)
	else
		handled = parse_string
	end
	if handled then return true

	# Only reached for a single special character that turned out to have
	# no special meaning here: keep it as a literal.
	advance 1
	append_text(c.to_s)
	return true
end
285
# Search `re` in the input from the current index
#
# On success, `index` is moved just after the match and the matched text is returned.
# Return null, leaving the position untouched, when there is no match or when the
# end of the input is already reached.
#
# `column` is moved to the column of the last consumed character, i.e. one short of
# the next free column: the callers build their node location from that value and
# then bump `column` by one themselves.
# The move is relative to the current `column` so that `column_offset` and the
# per-line reset done by `parse_newline` are preserved, instead of overwriting
# `column` with an absolute offset in `input`.
private fun match(re: Pattern): nullable String do
	if index >= input.length then return null
	var found = input.search_from(re, index)
	if found == null then return null

	# Number of characters consumed is `found.after - index`
	column += found.after - index - 1
	index = found.after
	return found.to_s
end
298
# Character under the cursor
#
# Return `\0` when the current index is past the end of the input.
private fun peek: Char do
	if index >= input.length then return '\0'
	return input.chars[index]
end
306
# Character just after the cursor
#
# Return `\0` when `index + 1` is past the end of the input.
private fun peek_next: Char do
	var next_index = index + 1
	if next_index >= input.length then return '\0'
	return input.chars[next_index]
end
314
# Skip zero or more spaces and tabs, including at most one newline
#
# Stops in front of a second newline or of any other character.
# Always return true.
private fun spnl: Bool do
	var newline_seen = false
	loop
		var c = peek
		if c == '\n' then
			if newline_seen then break
			newline_seen = true
		else if c != ' ' and c != '\t' then
			break
		end
		advance 1
	end
	return true
end
333
# Parse a line ending
#
# Trailing spaces are stripped from the preceding text node, if any.
# Two or more stripped spaces produce a hard line break, otherwise a soft line
# break is appended.
# The position then moves to the next line and its leading spaces are skipped.
# Always return true.
private fun parse_newline: Bool do
	advance 1 # the cursor is assumed to be on a `\n`

	var previous = block.last_child

	# `has_suffix` is a cheap filter that avoids the regex in the common case
	if previous != null and previous isa MdText and previous.literal.has_suffix(" ") then
		var literal = previous.literal
		var trailing = literal.search(re_final_space)
		var nb_spaces = 0
		if trailing != null then nb_spaces = trailing.length
		if nb_spaces > 0 then
			previous.literal = literal.substring(0, literal.length - nb_spaces)
		end
		previous.location.column_end = previous.location.column_end - nb_spaces

		# The break covers the stripped spaces and the newline itself
		var break_location = new MdLocation(line, column - nb_spaces - 1, line, column - 1)
		if nb_spaces >= 2 then
			append_node(new MdHardLineBreak(break_location, false))
		else
			append_node(new MdSoftLineBreak(break_location))
		end
	else
		append_node(new MdSoftLineBreak(new MdLocation(line, column - 1, line, column - 1)))
	end

	# Move to the beginning of the next line
	line += 1
	column = 1 + column_offset

	# gobble leading spaces in next line
	while peek == ' ' do
		advance 1
	end
	return true
end
372
373 # Final white spaces pattern
374 private var re_final_space: Regex = " *$".to_re
375
# Parse a backslash
#
# Depending on what follows the backslash, append:
# * a hard line break if it is a newline (the position moves to the next line),
# * the following character as text if it matches `re_escapable`,
# * a literal backslash otherwise.
# Always return true.
private fun parse_backslash: Bool do
	advance 1 # skip the backslash itself

	if peek == '\n' then
		append_node(new MdHardLineBreak(new MdLocation(line, column - 1, line, column), true))
		advance 1
		line += 1
		column = 1 + column_offset
		return true
	end

	var is_escape = index < input.length and input.substring(index, 1).has(re_escapable)
	if is_escape then
		append_text(input, index, index + 1)
		advance 1
	else
		append_text("\\")
	end
	return true
end
395
396 # Escapable characters pattern
397 private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"
398
399 # Escapable characters regex
400 private var re_escapable: Regex = "^{p_escapable}".to_re
401
# Attempt to parse a code span
#
# Looks for a closing run of backticks of the same length as the opening one.
# If found, a `MdCode` node is appended with its content trimmed and its
# whitespace runs collapsed to a single space.
# If not, the opening backticks are appended as literal text.
# Return false only when there is no backtick at the current index.
private fun parse_backticks: Bool do
	var column_before = column
	var opening = match(re_ticks_here)
	if opening == null then return false

	var content_start = index
	# Same value as `index` here, since `match` leaves both equal after a match
	var column_after_opening = column

	var closing = match(re_ticks)
	while closing != null do
		if closing == opening then
			var content_length = index - content_start - opening.length
			var content = input.substring(content_start, content_length)
			content = content.trim
			content = content.replace(re_whitespace, " ")
			append_node(new MdCode(new MdLocation(line, column_before, line, column), closing.to_s, content.trim))
			column += 1
			return true
		end
		closing = match(re_ticks)
	end

	# No closing backtick sequence: go back just after the opening one
	index = content_start
	column = column_after_opening + 1
	append_text(opening)
	return true
end
430
431 # Backticks starting pattern
432 private var re_ticks_here: Regex = "^`+".to_re
433
434 # Backticks pattern
435 private var re_ticks: Regex = "`+".to_re
436
# Attempt to parse a run of delimiters (emphasis, strong emphasis or custom ones)
#
# The run is appended as a text node and a `MdDelimiter` referencing it is pushed
# on top of the delimiter stack.
# Return false when `scan_delimiters` rejects the run.
private fun parse_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): Bool do
	var scan = scan_delimiters(delimiter_processor, delimiter_char)
	if scan == null then return false

	var run_length = scan.count
	var run_start = index
	var run_column = column

	# Consume the run, then locate the text node from the start of the run
	advance run_length
	var column_after = column
	column = run_column
	var text_node = append_text(input, run_start, index)
	column = column_after

	# Push the new entry above the current top of the stack
	var delimiter = new MdDelimiter(text_node, delimiter_char, scan.can_open, scan.can_close, self.last_delimiter)
	delimiter.length = run_length
	delimiter.original_length = run_length

	var below = delimiter.prev
	if below != null then below.next = delimiter
	self.last_delimiter = delimiter
	return true
end
464
# Parse a `[`
#
# Appends a `[` text node and pushes a link bracket on the bracket stack.
# Always return true.
private fun parse_open_bracket: Bool do
	var bracket_index = index
	advance 1

	var text_node = append_text("[")

	# Remember this opener, along with the delimiter stack top at this point
	var bracket = new MdBracket.link(text_node, bracket_index, column - 1, last_bracket, last_delimiter)
	add_bracket(bracket)
	return true
end
476
# Parse a `!`
#
# If it is followed by `[`, append a `![` text node and push an image bracket on
# the bracket stack.
# Otherwise append a literal `!`.
# Always return true.
private fun parse_bang: Bool do
	var bang_index = index
	advance 1

	if peek != '[' then
		append_text("!")
		return true
	end

	advance 1
	var text_node = append_text("![")

	# Remember this opener, along with the delimiter stack top at this point
	var bracket = new MdBracket.image(text_node, bang_index + 1, column - 2, last_bracket, last_delimiter)
	add_bracket(bracket)
	return true
end
495
# Try to match a `]` against the top of the bracket stack
#
# Appends either a link, an image, or a plain `]` character to the block's children.
# When a link or image is built, the nodes following the opener are moved into it
# and the opener is removed from the bracket stack.
# Always return true.
private fun parse_close_bracket: Bool do
	advance 1
	var after_bracket = index
	var after_bracket_column = column

	# Get previous `[` or `![`
	var opener = last_bracket
	if opener == null then
		# no matching opener, just return a literal
		append_text("]")
		return true
	end

	if not opener.allowed then
		# matching opener but it's not allowed, just return a literal
		append_text("]")
		remove_last_bracket
		return true
	end

	# check to see if we have a link or image
	var dest: nullable Couple[nullable String, Bool] = null
	var title = null
	var found = false

	# maybe an inline link like `[foo](/uri "title")`
	if peek == '(' then
		advance 1
		spnl
		dest = parse_link_destination
		if dest.first != null then
			spnl
			# title needs a whitespace before
			if input.substring(index - 1, 1).has(re_whitespace) then
				title = parse_link_title
				spnl
			end
			if peek == ')' then
				advance 1
				found = true
			else
				index = after_bracket
				column = after_bracket_column
			end
		end
	end

	# maybe a reference link like `[foo][bar]`, `[foo][]` or `[foo]`
	if not found then
		# see if there's a link label like `[bar]` or `[]`
		var label_start = index
		var label_length = parse_link_label
		advance label_length
		var ref = null
		if label_length > 2 then
			ref = input.substring(label_start, label_length)
		else if not opener.bracket_after then
			# If the second label is empty `[foo][]` or missing `[foo]`, then the first label
			# is the reference.
			# But it can only be a reference when there's no (unescaped) bracket in it.
			# If there is, we don't even need to try to lookup the reference.
			ref = input.substring(opener.index, after_bracket - opener.index)
		end

		if ref != null then
			var key = ref.normalize_reference
			if reference_map.has_key(key) then
				var target = reference_map[key]
				dest = new Couple[nullable String, Bool](target.destination, false)
				title = target.title
				found = true
			end
		end
	end

	if not found then
		# Last chance: let a sub module recognize a wikilink
		if parse_wikilink then return true

		# no link or image
		append_text("]")
		remove_last_bracket
		index = after_bracket
		column = after_bracket_column
		return true
	end

	# If we got here, the opener is confirmed: build the link or image
	var link_or_image: MdLinkOrImage
	if opener.is_image then
		link_or_image = new MdImage(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
	else
		link_or_image = new MdLink(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
	end
	link_or_image.has_brackets = dest.as(not null).second

	# Move every node that follows the opener text node into the new node
	var child = opener.node.next
	while child != null do
		var following = child.next
		link_or_image.append_child(child)
		child = following
	end
	append_node(link_or_image)

	# Process delimiters such as emphasis inside a link/image
	process_delimiters(opener.prev_delimiter)
	merge_child_text_nodes(link_or_image)
	# We don't need the corresponding text node anymore, we turned it into a node
	opener.node.unlink
	remove_last_bracket

	# Links within links are not allowed
	# We found this link, so there can be no other link around it.
	if not opener.is_image then
		var bracket = last_bracket
		while bracket != null do
			# disallow link openers, image openers stay allowed
			if not bracket.is_image then bracket.allowed = false
			bracket = bracket.prev
		end
	end
	return true
end
626
627 # Whitespace pattern
628 private var re_whitespace: Regex = "\\s+".to_re
629
# Push `bracket` on top of the `last_bracket` stack
#
# The previous top, if any, is flagged as having a bracket after it.
private fun add_bracket(bracket: MdBracket) do
	var top = last_bracket
	if top != null then top.bracket_after = true
	last_bracket = bracket
end
638
# Pop the top of the `last_bracket` stack
#
# Does nothing when the stack is empty.
private fun remove_last_bracket do
	var top = last_bracket
	if top != null then last_bracket = top.prev
end
645
# Wikilink placeholder
#
# Always return false here.
# Will be defined in sub module.
private fun parse_wikilink: Bool do
	return false
end
650
# Parse a link destination at the current index
#
# Return a couple made of the destination and a flag telling whether it started
# with `<`.
# The destination string is never null: it is empty when nothing could be read.
#
# Parsing stops at the end of the input or at the first space, tab or line ending.
# For a `<...>` destination it also stops after the closing `>`.
# For a bare destination it also stops at a `)` that is not balanced by a previous `(`.
# A backslash followed by a character of `escapable` yields that character alone.
private fun parse_link_destination: Couple[nullable String, Bool] do
	var buffer = new Buffer
	var parens = 0

	var has_bracket = peek == '<'
	if has_bracket then advance 1

	loop
		var c = peek
		if c == '\0' then break # end of input
		if c == ' ' or c == '\t' or c == '\n' or c == '\r' then break # no spaces allowed in urls

		if c == '\\' then
			var next = peek_next
			if escapable.has(next) then
				buffer.add next
				advance 2 # skip over the backslash
				continue
			end
			# not an escape: the backslash is kept as is below
		else if has_bracket and c == '>' then
			advance 1
			break
		else if not has_bracket and c == '(' then
			parens += 1
		else if not has_bracket and c == ')' then
			if parens == 0 then break
			parens -= 1
		end
		buffer.add c
		advance 1
	end
	return new Couple[nullable String, Bool](buffer.to_s, has_bracket)
end
690
# Attempt to parse a link title (sans quotes), returning the string or null if no match
#
# A title is enclosed in `'...'`, `"..."` or `(...)`.
# A backslash followed by a character of `escapable` yields that character alone.
#
# Return null when the current character does not open a title, when the end of
# the input is reached before the closing delimiter, or when a parenthesized title
# contains an unescaped `(`.
# In the last two cases the index has already moved: callers rewind it themselves.
private fun parse_link_title: nullable String do
	var opener = peek
	if opener != '\'' and opener != '"' and opener != '(' then return null

	# Quotes close with themselves, a parenthesis closes with `)` only
	var closer = opener
	if opener == '(' then closer = ')'

	var buffer = new Buffer
	loop
		advance 1
		var c = peek
		if c == closer then
			advance 1
			break
		else if c == '\\' then
			var next = peek_next
			if escapable.has(next) then
				buffer.add next
				# skip the backslash, the loop skips the escaped character
				advance 1
				continue
			end
		else if c == '\0' then
			return null
		else if opener == '(' and c == '(' then
			# An opening parenthesis must be escaped inside a `(...)` title
			return null
		end
		buffer.add c
	end
	return buffer.to_s
end
720
721 # Escapable characters
722 private var escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\^_`\{|\}~-"
723
# Attempt to parse a link label at the current index
#
# Return the number of characters of the label, brackets included, or 0 if there
# is no label here.
# The index is not moved.
#
# A label starts with `[` and ends at the first unescaped `]`.
# It is rejected if it contains an unescaped `[` or if it is longer than 1001
# characters, brackets included. The length is counted from the label start, so
# the position of the label inside the input does not matter.
private fun parse_link_label: Int do
	if peek != '[' then return 0

	var i = index + 1
	while i < input.length do
		var c = input[i]
		if c == '[' or c == ']' then
			# Escaped when preceded by a backslash that is not itself preceded by a backslash
			var escaped = input[i - 1] == '\\' and not (i - 2 > index and input[i - 2] == '\\')
			if c == '[' then
				if not escaped then return 0
			else
				var length = (i - index) + 1
				if length > 1001 then return 0
				if not escaped then return length
			end
		end
		i += 1
	end
	return 0
end
745
# Attempt to parse an autolink (URL or email in pointy brackets)
#
# The email pattern is tried first, then the URL pattern.
# On success a `MdLink` holding a single text child is appended; email
# destinations are prefixed with `mailto:`.
# Return false when neither pattern matches.
private fun parse_auto_link: Bool do
	var column_before = column

	var prefix = "mailto:"
	var m = match(re_autolink_email)
	if m == null then
		prefix = ""
		m = match(re_autolink_url)
	end
	if m == null then return false

	# Strip the surrounding `<` and `>`
	var dest = m.substring(1, m.length - 2)
	var node = new MdLink(new MdLocation(line, column_before, line, column), "{prefix}{dest}", null, true)
	node.append_child(new MdText(new MdLocation(line, column_before + 1, line, column - 1), dest))
	column += 1
	append_node(node)
	return true
end
769
770 # Autolink email pattern
771 private var re_autolink_email: Regex = "^<([a-zA-Z0-9.!#$%&'*+/=?^_`\{|\}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?(\\.[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?)*)>".to_re
772
773 # Autolink url pattern
774 private var re_autolink_url: Regex = "^<[a-zA-Z][a-zA-Z0-9.+-]\{1,31\}:[^<> ]*>".to_re
775
# Attempt to parse an inline HTML string
#
# On success a `MdHtmlInline` node holding the matched text is appended.
# Return false when `re_html_tag` does not match at the current index.
private fun parse_html_inline: Bool do
	var column_before = column
	var html = match(re_html_tag)
	if html == null then return false

	var location = new MdLocation(line, column_before, line, column)
	var node = new MdHtmlInline(location, html)
	column += 1
	append_node(node)
	return true
end
788
789 private var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
790 private var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
791 private var p_uquoted_value = "[^\"'=<>` \t\n]+"
792 private var p_squoted_value = "'[^']*'"
793 private var p_dquoted_value = "\"[^\"]*\""
794 private var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
795 private var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
796 private var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
797 private var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
798 private var p_closetag = "</{p_tagname}\\s*[>]"
799 private var p_html_comment = "<!---->|<!--(-?[^>-])(-?[^-])*-->"
800 private var p_processing_instruction = "[<][?].*?[?][>]"
801 private var p_declaration = "<![A-Z]+\\s+[^>]*>"
802 private var p_cdata = "<!\\[CDATA\\[.*\\]\\]>"
803 private var p_html_tag = "({p_opentag}|{p_closetag}|{p_html_comment}|{p_processing_instruction}|{p_declaration}|{p_cdata})"
804
# Case-insensitive pattern for an HTML tag at the start of the searched text
#
# Built from `p_html_tag`.
private var re_html_tag: Regex do
	var pattern = "^{p_html_tag}".to_re
	pattern.ignore_case = true
	return pattern
end
811
# Attempt to parse an HTML entity
#
# On success the entity is appended verbatim as a text node.
# Return false when `re_entity_here` does not match at the current index.
private fun parse_entity: Bool do
	var m = match(re_entity_here)
	if m == null then return false

	append_text(m)
	# `match` leaves `column` on the last matched character: step past it, like
	# the other users of `match` do, so the following nodes are not located one
	# column too early.
	column += 1
	return true
end
821
# Case-insensitive pattern for an HTML entity at the start of the searched text
#
# Accepts hexadecimal (`&#x...;`), decimal (`&#...;`) and named (`&name;`) forms.
private var re_entity_here: Regex do
	var pattern = "^&(#x[a-f0-9]\{1,8\}|#[0-9]\{1,8\}|[a-z][a-z0-9]\{1,31\});".to_re
	pattern.ignore_case = true
	return pattern
end
828
# Parse a run of ordinary characters
#
# Consumes characters up to the next one listed in `special_characters`, or up to
# the end of the input, and appends them as a single text node.
# Return false, appending nothing, when the run is empty.
private fun parse_string: Bool do
	var start_index = index
	var start_column = column
	var limit = input.length

	loop
		if index == limit then break
		if special_characters.has(input.chars[index]) then break
		advance 1
	end
	if start_index == index then return false

	# Locate the text node from the start of the run
	var end_column = column
	column = start_column
	append_text(input, start_index, index)
	column = end_column
	return true
end
851
# Scan the run of `delimiter_char` starting at the current index
#
# Return the length of the run and whether it can open and/or close, based on the
# characters found just before and just after it. The start and the end of the
# input count as a newline.
# Return null when the run is shorter than `delimiter_processor.min_length`.
# The position is left unchanged in both cases.
private fun scan_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): nullable MdDelimiterData do
	var start_index = index
	var start_column = column

	var delimiter_count = 0
	while peek == delimiter_char do
		delimiter_count += 1
		advance 1
	end
	var char_after = peek

	# This method only looks ahead: restore the position right away
	index = start_index
	column = start_column

	if delimiter_count < delimiter_processor.min_length then return null

	var before = if start_index > 0 then input.substring(start_index - 1, 1) else "\n"
	var after = if char_after != '\0' then char_after.to_s else "\n"

	var before_is_punctuation = before.has(re_punctuation)
	var before_is_whitespace = before.has(re_whitespace_char)
	var after_is_punctuation = after.has(re_punctuation)
	var after_is_whitespace = after.has(re_whitespace_char)

	var left_flanking = not after_is_whitespace and
		(not after_is_punctuation or before_is_whitespace or before_is_punctuation)
	var right_flanking = not before_is_whitespace and
		(not before_is_punctuation or after_is_whitespace or after_is_punctuation)

	var can_open: Bool
	var can_close: Bool
	if delimiter_char == '_' then
		# Underscores get stricter rules than the other delimiters
		can_open = left_flanking and (not right_flanking or before_is_punctuation)
		can_close = right_flanking and (not left_flanking or after_is_punctuation)
	else
		can_open = left_flanking and delimiter_char == delimiter_processor.opening_delimiter
		can_close = right_flanking and delimiter_char == delimiter_processor.closing_delimiter
	end

	return new MdDelimiterData(delimiter_count, can_open, can_close)
end
907
908 # Punctuation pattern
909 private var re_punctuation: Regex = "^[]!\"#\\$%&'()*+,.:;<=>?@^_`\{|\}~[-]".to_re
910
911 # Whitespace character start pattern
912 private var re_whitespace_char: Regex = "^[  \t\r\n]".to_re
913
# Process the stack of delimiters above `stack_bottom`
#
# Each delimiter that can close is paired with the nearest matching opener below
# it, and the pair is handed to the delimiter processor registered for its character.
# Once done, every delimiter left above `stack_bottom` is removed from the stack,
# its text node being kept.
# A null `stack_bottom` means the whole stack is processed.
private fun process_delimiters(stack_bottom: nullable MdDelimiter) do
	var openers_bottom = new HashMap[Char, nullable MdDelimiter]

	# Walk down to the lowest delimiter located above the stack bottom
	var closer = last_delimiter
	while closer != null and closer.prev != stack_bottom do
		closer = closer.prev
	end

	# Then move forward, looking for closers, and handling each
	while closer != null do
		var closing_char = closer.delimiter_char

		# Skip delimiters that cannot close or that have no registered processor
		if not closer.can_close or not delimiter_processors_map.has_key(closing_char) then
			closer = closer.next
			continue
		end

		var processor = delimiter_processors_map[closing_char]
		var opening_char = processor.opening_delimiter

		# Found a delimiter closer. Now look back for the first matching opener
		var use_delims = 0
		var opener_found = false
		var potential_opener_found = false
		var candidate = closer.prev

		while candidate != null and candidate != stack_bottom and (not openers_bottom.has_key(closing_char) or candidate != openers_bottom[closing_char]) do
			if candidate.can_open and candidate.delimiter_char == opening_char then
				potential_opener_found = true
				# A positive result is the number of delimiter characters to consume
				use_delims = processor.delimiter_use(candidate, closer)
				if use_delims > 0 then
					opener_found = true
					break
				end
			end
			candidate = candidate.prev
		end

		if not opener_found then
			if not potential_opener_found then
				# Set lower bound for future searches for openers.
				# Only do this when we didn't even have a potential opener
				# (one that matches the character and can open).
				# If an opener was rejected because of the number of delimiters
				# (e.g. because of the "multiple of 3" rule),
				# we want to consider it next time because the number of delimiters
				# can change as we continue processing.
				openers_bottom[closing_char] = closer.prev
				# We can remove a closer that can't be an opener,
				# once we've seen there's no matching opener.
				if not closer.can_open then remove_delimiters_keep_node(closer)
			end
			closer = closer.next
			continue
		end

		var opener = candidate.as(not null)
		var opener_node = opener.node
		var closer_node = closer.node

		# Remove the used delimiters from the stack entries and from the text nodes
		opener.length -= use_delims
		closer.length -= use_delims
		opener_node.literal = opener_node.literal.substring(0,
			opener_node.literal.length - use_delims)
		closer_node.literal = closer_node.literal.substring(0,
			closer_node.literal.length - use_delims)

		remove_delimiters_between(opener, closer)
		# The delimiter processor can re-parent the nodes between opener and closer,
		# so make sure they're contiguous already.
		# Exclusive because we want to keep opener/closer themselves.
		merge_text_nodes_between_exclusive(opener_node, closer_node)
		processor.process(opener_node, closer_node, use_delims)

		# No delimiter characters left to process, so we can remove the
		# delimiter and the now empty node
		if opener.length == 0 then remove_delimiters_and_node(opener)

		if closer.length == 0 then
			var above = closer.next
			remove_delimiters_and_node(closer)
			closer = above
		end
	end

	# Remove all remaining delimiters
	while last_delimiter != null and last_delimiter != stack_bottom do
		remove_delimiters_keep_node(last_delimiter)
	end
end
1015
# Remove from the stack the delimiters located strictly between `opener` and `closer`
#
# The text nodes of the removed delimiters are kept.
# Does nothing if `opener` or `closer` is null.
private fun remove_delimiters_between(opener, closer: nullable MdDelimiter) do
	if opener == null or closer == null then return

	var current = closer.prev
	while current != null and current != opener do
		# Read the link before the removal updates the stack
		var below = current.prev
		remove_delimiters_keep_node(current)
		current = below
	end
end
1027
# Remove `delim` from the stack and unlink its text node
#
# For used delimiters, e.g. `*` in `*foo*`.
# Does nothing if `delim` is null.
private fun remove_delimiters_and_node(delim: nullable MdDelimiter) do
	if delim == null then return

	delim.node.unlink
	remove_delimiter(delim)
end
1038
# Remove `delim` from the delimiter stack, leaving its node in place as text
#
# For unused delimiters such as `_` in `foo_bar`.
# Does nothing if `delim` is null.
private fun remove_delimiters_keep_node(delim: nullable MdDelimiter) do
	if delim == null then return
	remove_delimiter(delim)
end
1045
# Take `delim` out of the doubly linked delimiter stack
#
# The neighbours of `delim` are linked to each other.
# If `delim` has no `next`, it was the top of the stack and
# `last_delimiter` becomes its `prev`.
# Does nothing if `delim` is null.
private fun remove_delimiter(delim: nullable MdDelimiter) do
	if delim == null then return

	var before = delim.prev
	var after = delim.next
	if before != null then before.next = after
	if after != null then
		after.prev = before
	else
		# top of stack
		last_delimiter = before
	end
end
1062
# Merge the text nodes located strictly between `from` and `to`
#
# `from` and `to` themselves are left untouched.
# Does nothing if one of them is null or if no node lies between them.
private fun merge_text_nodes_between_exclusive(from, to: nullable MdNode) do
	if from == null then return
	if to == null then return
	if from == to then return

	var start = from.next
	# `to` directly follows `from`: no node between them
	if start == to then return
	merge_text_nodes_inclusive(start, to.prev)
end
1070
# Merge the text nodes found among the children of `node`
#
# Does nothing if `node` is null or has less than two children.
private fun merge_child_text_nodes(node: nullable MdNode) do
	if node == null then return

	var head = node.first_child
	var tail = node.last_child
	# Same first and last child: zero or one child, nothing to merge
	if head == tail then return
	merge_text_nodes_inclusive(head, tail)
end
1078
# Merge the text nodes from `from` to `to`, both included
#
# Siblings are visited from `from` following `next` until `to` is reached
# or there is no more sibling.
# Each run of consecutive `MdText` nodes is handed to `merge_if_needed`;
# any other kind of node ends the current run.
private fun merge_text_nodes_inclusive(from, to: nullable MdNode) do
	var run_start: nullable MdText = null
	var run_end: nullable MdText = null

	var current = from
	while current != null do
		if current isa MdText then
			# Open a new run or extend the current one
			if run_start == null then run_start = current
			run_end = current
		else
			# Not a text node: flush the run seen so far
			merge_if_needed(run_start, run_end)
			run_start = null
			run_end = null
		end
		if current == to then break
		current = current.next
	end
	# Flush the run that was still open when the walk stopped
	merge_if_needed(run_start, run_end)
end
1100
# Merge the text nodes from `first` to `last` into `first`
#
# The literal of each node following `first`, up to and including `last`,
# is appended to the literal of `first`, and the end of the location of
# `first` is moved to the end of the absorbed node.
# Absorbed nodes are unlinked.
#
# Does nothing if `first` or `last` is null, or if they are the same node.
private fun merge_if_needed(first, last: nullable MdText) do
	if first == null or last == null then return
	if first == last then return

	var merged = new Buffer
	merged.append(first.literal)

	# Node following `last`, read before anything gets unlinked
	var limit = last.next
	var current = first.next
	while current != null and current != limit do
		# Read the link first: `current` is unlinked below
		var following = current.next
		merged.append(current.as(MdText).literal)
		first.location.line_end = current.location.line_end
		first.location.column_end = current.location.column_end
		current.unlink
		current = following
	end
	first.literal = merged.write_to_string
end
1120 end
1121
# Processor for a custom kind of delimiter, in addition to the built-in `_` and `*`
interface MdDelimiterProcessor

	# Character opening a delimited node
	#
	# Must not clash with any built-in special character.
	fun opening_delimiter: Char is abstract

	# Character closing a delimited node
	#
	# Must not clash with any built-in special character.
	fun closing_delimiter: Char is abstract

	# Minimum number of delimiter characters needed to activate this processor
	#
	# Must be at least one.
	fun min_length: Int is abstract

	# Number of delimiter characters to use for the `opener`/`closer` pair
	#
	# Implementations can decide how many characters to use based on the
	# properties of the two delimiter runs.
	#
	# An implementation can also return 0 when it does not want to allow this
	# particular combination of delimiter runs.
	fun delimiter_use(opener, closer: MdDelimiter): Int is abstract

	# Process a matched pair of delimiters
	#
	# For example, by wrapping the nodes between `opener` and `closer` in a new node,
	# or by appending a new node after the opener.
	#
	# Removing the used delimiter characters from the delimiter nodes and
	# unlinking these nodes is done by the caller.
	fun process(opener, closer: MdText, delimiter_use: Int) is abstract
end
1158
# A run of one or more occurrences of the same delimiter character
#
# Used for paired delimiters like emphasis or strong emphasis.
# Delimiters are chained to each other through `prev` and `next`.
class MdDelimiter

	# Text node containing the delimiter characters
	var node: MdText

	# Character this delimiter is made of
	var delimiter_char: Char

	# Can `self` open a delimited node?
	var can_open: Bool

	# Can `self` close a delimited node?
	var can_close: Bool

	# Delimiter found before `self`, if any
	var prev: nullable MdDelimiter

	# Delimiter found after `self`, if any
	var next: nullable MdDelimiter

	# Number of characters of this delimiter run still left for processing
	var length = 1

	# Number of characters this delimiter run was originally made of
	#
	# At the start of processing, this is the same as `length`.
	var original_length = 1
end
1190
# An opening bracket that may start a link or an image
class MdBracket

	# Text node containing the bracket
	var node: MdText

	# Index of the bracket in the original string
	var index: Int

	# Column of the bracket
	var column: Int

	# Does this bracket open an image (as opposed to a link)?
	var is_image: Bool

	# Bracket found before `self`, if any
	var prev: nullable MdBracket

	# Delimiter found before `self`, if any
	var prev_delimiter: nullable MdDelimiter

	# Is this bracket allowed to form a link/image?
	var allowed = true

	# Is there an unescaped bracket (opening or closing) anywhere after this bracket?
	var bracket_after = false

	# Create a bracket opening a link (`is_image` is false)
	init link(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
		init(node, index, column, false, prev, prev_delimiter)
	end

	# Create a bracket opening an image (`is_image` is true)
	init image(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
		init(node, index, column, true, prev, prev_delimiter)
	end
end
1228
# Result of scanning a run of delimiter characters
private class MdDelimiterData

	# Number of successive delimiter characters found
	var count: Int

	# Can this delimiter run open an inline construct?
	var can_open: Bool

	# Can this delimiter run close an inline construct?
	var can_close: Bool
end
1241
# An implementation of MdDelimiterProcessor that dispatches all calls to others
#
# The sub processor called depends on the length of the delimiter run.
# All child processors must have different minimum lengths.
# A given delimiter run is dispatched to the child with the largest acceptable minimum length.
# If no child is applicable, the one with the largest minimum length is chosen.
class MdStaggeredDelimiterProcessor
	super MdDelimiterProcessor

	# Delimiter character
	var delim: Char

	# Sub processors to apply, kept sorted by decreasing `min_length`
	var processors = new Array[MdDelimiterProcessor]

	# Smallest `min_length` among the sub processors added so far
	#
	# Stays at 0 as long as no sub processor has been added.
	redef var min_length = 0

	redef fun opening_delimiter do return delim
	redef fun closing_delimiter do return delim

	# Add a new sub delimiter processor
	#
	# `dp` is inserted so that `processors` stays sorted by decreasing `min_length`,
	# and `min_length` is updated when `dp` has the smallest minimum length so far.
	#
	# Fails if a sub processor with the same `min_length` was already added.
	fun add(dp: MdDelimiterProcessor) do
		var len = dp.min_length
		var i = 0
		while i < processors.length do
			var p = processors[i]
			assert len != p.min_length else
				print "Cannot add two delimiter processor for `{delim}` " +
					"and mininimum length `{len}`"
			end
			if len > p.min_length then
				break
			end
			i += 1
		end
		# Reaching the end of the list means `dp` has the smallest minimum
		# length: it becomes the minimum length of the whole staggered processor.
		if i == processors.length then min_length = len
		processors.insert(dp, i)
	end

	# Find the corresponding processor for a length of `len` delimiter characters
	#
	# Returns the first sub processor whose `min_length` is at most `len`, that is
	# the one with the largest acceptable minimum length.
	# Falls back to the first sub processor when none is applicable.
	fun find_processor(len: Int): MdDelimiterProcessor do
		for processor in processors do
			if processor.min_length <= len then return processor
		end
		return processors.first
	end

	# Delegate to the sub processor selected by the length of `opener`
	redef fun delimiter_use(opener, closer) do
		return find_processor(opener.length).delimiter_use(opener, closer)
	end

	# Delegate to the sub processor selected by the number of delimiters used
	redef fun process(opener, closer, delimiter_use) do
		find_processor(delimiter_use).process(opener, closer, delimiter_use)
	end
end
1295
# Delimiter processor creating emphasis and strong emphasis nodes
class MdEmphasisDelimiterProcessor
	super MdDelimiterProcessor

	# Delimiter character
	var delimiter_char: Char

	redef var min_length = 1
	redef fun opening_delimiter do return delimiter_char
	redef fun closing_delimiter do return delimiter_char

	# Use 2 delimiters when both runs have at least 2 left, 1 otherwise
	#
	# Returns 0 when the pair is rejected by the "multiple of 3" rule.
	redef fun delimiter_use(opener, closer) do
		# "multiple of 3" rule for internal delimiter runs
		var internal_run = opener.can_close or closer.can_open
		var total_length = opener.original_length + closer.original_length
		if internal_run and total_length % 3 == 0 then return 0

		# calculate actual number of delimiters used from this closer
		if opener.length < 2 or closer.length < 2 then return 1
		return 2
	end

	# Wrap the nodes between `opener` and `closer` in a new emphasis node
	#
	# The node is a `MdEmphasis` when `delimiter_use` is 1 and
	# a `MdStrongEmphasis` otherwise. It is inserted right after `opener`.
	redef fun process(opener, closer, delimiter_use) do
		var single_delimiter = opening_delimiter.to_s
		var opener_location = opener.location
		var closer_location = closer.location

		var emphasis: MdNode
		if delimiter_use != 1 then
			# The columns are shifted by the length of the literals of the delimiter nodes
			var location = new MdLocation(
				opener_location.line_start,
				opener_location.column_start + opener.literal.length,
				closer_location.line_end,
				closer_location.column_end - closer.literal.length)
			emphasis = new MdStrongEmphasis(location, "{single_delimiter}{single_delimiter}")
		else
			var location = new MdLocation(
				opener_location.line_start,
				opener_location.column_start,
				closer_location.line_end,
				closer_location.column_end)
			emphasis = new MdEmphasis(location, single_delimiter)
		end

		# Move every node between the two delimiter nodes under the new node
		var child = opener.next
		while child != null and child != closer do
			# Read the link before `child` is moved under `emphasis`
			var following = child.next
			emphasis.append_child(child)
			child = following
		end
		opener.insert_after(emphasis)
	end
end
1349
# Emphasis delimiter processor for the `*` character
class MdAsteriskDelimiterProcessor
	super MdEmphasisDelimiterProcessor
	noautoinit

	redef var delimiter_char = '*'
end
1357
# Emphasis delimiter processor for the `_` character
class MdUnderscoreDelimiterProcessor
	super MdEmphasisDelimiterProcessor
	noautoinit

	redef var delimiter_char = '_'
end
1365
1366 # Utils
1367
redef class String

	# Copy of `self` where each backslash escape is replaced by the escaped character
	#
	# Only the sequences matched by `re_escaped` are affected.
	# Returns `self` when there is nothing to unescape.
	fun unescape_string: String do
		var match = search(re_escaped)
		if match == null then return self

		var buffer = new Buffer
		var last_end = 0
		while match != null do
			var escape_start = match.from
			# Text located before the backslash
			buffer.append substring(last_end, escape_start - last_end)
			# Escaped character, without its backslash
			buffer.append substring(escape_start + 1, 1)
			last_end = match.after
			match = search_from(re_escaped, last_end)
		end
		# Text located after the last escape
		if last_end < length then
			buffer.append substring(last_end, length - last_end)
		end
		return buffer.to_s
	end

	# Normalize a link reference name
	#
	# The first and last characters of `self` are dropped, then the result is
	# trimmed, lowercased, and each run of whitespace is replaced by a single space.
	private fun normalize_reference: String do
		var inner = substring(1, length - 2)
		var label = inner.trim.to_lower # TODO utf-8
		return label.replace(re_whitespace, " ")
	end
end
1396
redef class Sys
	# Pattern matching a run of whitespace characters
	private var re_whitespace: Regex = "\\s+".to_re

	# Bracket expression listing the characters that can be backslash-escaped
	private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"

	# Pattern matching a backslash followed by one of the `p_escapable` characters
	private var re_escaped: Regex = "\\\\{p_escapable}".to_re
end