lib/markdown2/markdown_inline_parsing.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 #     http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14
  15 # Parser for inline markdown
  16 #
   17 # Used to create the AST representation of inline nodes like emphasis, code, links,
   18 # images, etc.
  19 module markdown_inline_parsing
  20
  21 import markdown_ast
  22
  23 # Parser for inline content (text, links, emphasis, etc)
  24 class MdInlineParser
  25
  26         # List of delimiter processors to use
  27         private var delimiter_processors: Array[MdDelimiterProcessor] is lazy do
  28                 var delimiters = new Array[MdDelimiterProcessor]
  29                 delimiters.add new MdAsteriskDelimiterProcessor
  30                 delimiters.add new MdUnderscoreDelimiterProcessor
  31                 return delimiters
  32         end
  33
  34         # Map special characters to their delimiter processor
  35         private var delimiter_processors_map: Map[Char, MdDelimiterProcessor] is lazy do
  36                 var map = new HashMap[Char, MdDelimiterProcessor]
  37                 for delimiter_processor in delimiter_processors do
  38                         add_delimiter_processor(delimiter_processor, map)
  39                 end
  40                 special_characters.add_all map.keys
  41                 return map
  42         end
  43
  44         # Register a delimiter processor
  45         private fun add_delimiter_processor(delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
  46                 var opening = delimiter_processor.opening_delimiter
  47                 var closing = delimiter_processor.closing_delimiter
  48                 if opening == closing then
  49                         if map.has_key(opening) then
  50                                 var old = map[opening]
  51                                 if old.opening_delimiter == old.closing_delimiter then
  52                                         var s: MdStaggeredDelimiterProcessor
  53                                         if old isa MdStaggeredDelimiterProcessor then
  54                                                 s = old
  55                                         else
  56                                                 s = new MdStaggeredDelimiterProcessor(opening)
  57                                                 s.add old
  58                                         end
  59                                         s.add delimiter_processor
  60                                         map[opening] = s
  61                                 else
  62                                         add_delimiter_processor_for_char(opening, delimiter_processor, map)
  63                                 end
  64                         else
  65                                 add_delimiter_processor_for_char(opening, delimiter_processor, map)
  66                         end
  67                 else
  68                         add_delimiter_processor_for_char(opening, delimiter_processor, map)
  69                         add_delimiter_processor_for_char(closing, delimiter_processor, map)
  70                 end
  71         end
  72
  73         # Register a delimiter processor for a special character
   74         private fun add_delimiter_processor_for_char(delimiter_char: Char, delimiter_processor: MdDelimiterProcessor, map: Map[Char, MdDelimiterProcessor]) do
   75                 assert not map.has_key(delimiter_char) else # a char can only be claimed once
   76                         print "Delimiter processor conflict with delimiter char `{delimiter_char}`"
   77                 end
   78                 map[delimiter_char] = delimiter_processor # record the owner of `delimiter_char`
   79         end
  80
  81         # List of characters that have a special Markdown meaning
   82         private var special_characters: Array[Char] = ['\n', '`', '[', ']', '\\', '!', '<', '&']
   83
   84         # Link references by normalized label, filled by `parse_reference` (first definition wins)
   85         private var reference_map = new HashMap[String, MdLink]
   86
   87         # Block that receives the parsed inline nodes (set by `parse`)
   88         private var block: MdNode is noinit
   89
   90         # Input string currently scanned (set by `parse` and `parse_reference`)
   91         private var input: String is noinit
   92
   93         # Index of the next character to read in `input`
   94         private var index: Int is noinit
   95
   96         # Line of the current position, initialised from the block location by `parse`
   97         private var line: Int is noinit
   98
   99         # Column of the current position; restarts at `1 + column_offset` on each new line
  100         private var column: Int is noinit
  101
  102         # Offset added to the column at the start of every line (the `offset` given to `parse`)
  103         private var column_offset: Int is noinit
  104
  105         # Top delimiter (emphasis, strong emphasis or custom emphasis)
  106         # Brackets are on a separate stack, different from the algorithm described in the spec.
  107         private var last_delimiter: nullable MdDelimiter = null
  108
  109         # Top opening bracket (`[` or `![`)
  110         private var last_bracket: nullable MdBracket = null
 111
 112         # Parse `input` as inline and add resulting nodes as children to `block`
 113         fun parse(input: String, offset: Int, block: MdNode) do
 114                 self.block = block
 115                 self.input = input.trim
 116                 self.index = 0
 117                 self.last_delimiter = null
 118                 self.last_bracket = null
 119                 self.line = block.location.line_start
 120                 self.column_offset = offset
 121                 self.column = 1 + column_offset
 122
 123                 var more_to_parse = parse_inline
 124                 while more_to_parse do
 125                         more_to_parse = parse_inline
 126                 end
 127
 128                 process_delimiters(null)
 129                 merge_child_text_nodes(block)
 130         end
 131
 132         # Advance the current index of `count` characters
 133         private fun advance(count: Int) do
 134                 index += count
 135                 column += count
 136         end
 137
 138         # Attempt to parse a link reference
 139         #
 140         # Return how many characters were parsed as a reference.
 141         # Returns 0 if none.
 142         fun parse_reference(input: String): Int do
 143                 self.input = input
 144                 self.index = 0
 145                 self.column = 0
 146                 var dest
 147                 var title
 148                 var match_chars
 149                 var start_index = index
 150
 151                 # label
 152                 match_chars = parse_link_label
 153                 if match_chars == 0 then return 0
 154                 advance match_chars
 155
 156                 var raw_label = input.substring(0, match_chars)
 157
 158                 # colon
 159                 if peek != ':' then return 0
 160                 advance 1
 161
 162                 # link url
 163                 spnl
 164
 165                 dest = parse_link_destination.first
 166                 if dest == null or dest.is_empty then return 0
 167
 168                 var before_title = index
 169                 var before_column = column
 170                 spnl
 171                 title = parse_link_title
 172                 if title == null then
 173                         # rewind before spaces
 174                         index = before_title
 175                         column = before_column
 176                 end
 177
 178                 var at_line_end = true
 179                 if index != input.length and match(re_line_end) == null then
 180                         if title == null then
 181                                 at_line_end = false
 182                         else
 183                                 # the potential title we found is not at the line end,
 184                                 # but it could still be a legal link reference if we discard the title
 185                                 title = null
 186                                 # rewind before spaces
 187                                 index = before_title
 188                                 column = before_column
 189                                 # and instead check if the link URL is at the line end
 190                                 at_line_end = match(re_line_end) != null
 191                         end
 192                 end
 193
 194                 if not at_line_end then return 0
 195
 196                 var normalized_label = raw_label.normalize_reference
 197                 if normalized_label.is_empty then return 0
 198
 199                 if not reference_map.has_key(normalized_label) then
 200                         var link = new MdLink(new MdLocation(0, 0, 0, 0), dest, title)
 201                         reference_map[normalized_label] = link
 202                 end
 203
 204                 return index - start_index
 205         end
 206
 207         # Line end pattern
  208         private var re_line_end: Regex = "^ *(\n|$)".to_re # optional spaces, then a newline or `$`
 209
 210         # Append standard text to the current block
 211         #
 212         # Read `text` between `begin_index` and `end_index`.
 213         private fun append_text(text: String, begin_index, end_index: nullable Int): MdText do
 214                 var node: MdText
 215                 if begin_index != null and end_index != null then
 216                         var nb_chars = end_index - begin_index
 217                         var string = text.substring(begin_index, nb_chars)
 218                         node = new MdText(
 219                                 new MdLocation(
 220                                         line,
 221                                         column,
 222                                         line,
 223                                         column + nb_chars - 1
 224                                 ), string)
 225                 else
 226                         node = new MdText(
 227                                 new MdLocation(
 228                                         line,
 229                                         column,
 230                                         line,
 231                                         column + text.length
 232                                 ), text)
 233                 end
 234                 append_node(node)
 235                 return node
 236         end
 237
 238         # Append `node` to the current block
  239         private fun append_node(node: MdNode) do block.append_child(node) # `block` is the one given to `parse`
 240
 241         # Parse the next inline element in subject, advancing input index
 242         #
 243         # On success, add the result to block's children and return true.
 244         # On failure, return false.
 245         private fun parse_inline: Bool do
 246                 var res: Bool
 247                 var c = peek
 248                 if c == '\0' then return false
 249                 if c == '\n' then
 250                         res = parse_newline
 251                 else if c == '\\' then
 252                         res = parse_backslash
 253                 else if c == '`' then
 254                         res = parse_backticks
 255                 else if c == '[' then
 256                         res = parse_open_bracket
 257                 else if c == '!' then
 258                         res = parse_bang
 259                 else if c == ']' then
 260                         res = parse_close_bracket
 261                 else if c == '<' then
 262                         res = parse_auto_link or parse_html_inline
 263                 else if c == '&' then
 264                         res = parse_entity
 265                 else
 266                         if delimiter_processors_map.has_key(c) then
 267                                 res = parse_delimiters(delimiter_processors_map[c], c)
 268                         else
 269                                 res = parse_string
 270                         end
 271                 end
 272
 273                 if not res then
 274                         advance 1
 275                         # When we get here, it's only for a single special character that turned
 276                         # out to not have a special meaning.
 277                         # So we shouldn't have a single surrogate here, hence it should be ok
 278                         # to turn it into a String
 279                         var literal = c.to_s
 280                         append_text(literal)
 281                 end
 282
 283                 return true
 284         end
 285
  286         # Search `re` in the input from the current index; on success move the index
  287         # right after the match and return the matched text. Else return null.
 288         private fun match(re: Pattern): nullable String do
 289                 if index >= input.length then return null
 290                 var match = input.search_from(re, index)
 291                 if match != null then
 292                         index = match.after
 293                         column = match.after
 294                         return match.to_s
 295                 end
 296                 return null
 297         end
 298
 299         # Return the char at the current input index, or `\0`
 300         private fun peek: Char do
 301                 if index < input.length then
 302                         return input.chars[index]
 303                 end
 304                 return '\0'
 305         end
 306
 307         # Return the char at the current input index + 1, or `\0`
 308         private fun peek_next: Char do
 309                 if index + 1 < input.length then
 310                         return input.chars[index + 1]
 311                 end
 312                 return '\0'
 313         end
 314
  315         # Parse zero or more spaces or tabs, including at most one newline
 316         private fun spnl: Bool do
 317                 var found_nl = false
 318                 loop
 319                         var c = peek
 320                         if c == ' ' or c == '\t' then
 321                                 advance 1
 322                                 continue
 323                         else if c == '\n' then
 324                                 if found_nl then break
 325                                 found_nl = true
 326                                 advance 1
 327                                 continue
 328                         end
 329                         break
 330                 end
 331                 return true
 332         end
 333
 334         # Parse a new line
 335         #
 336         # If it was preceded by two spaces, return a hard line break,
 337         # otherwise a soft line break
 338         private fun parse_newline: Bool do
 339                 advance 1 # assume we're at a `\n`
 340
 341                 var last_child = block.last_child
 342
 343                 # check previous text for trailing spaces
 344                 # the `has_suffix` is an optimization to avoid an RE match in the common case
 345                 if last_child != null and last_child isa MdText and
 346                    (last_child.literal.has_suffix(" ")) then
 347                         var text = last_child
 348                         var literal = text.literal
 349                         var match = literal.search(re_final_space)
 350                         var spaces = if match != null then match.length else 0
 351                         if spaces > 0 then
 352                                 text.literal = literal.substring(0, literal.length - spaces)
 353                         end
 354                         last_child.location.column_end = last_child.location.column_end - spaces
 355                         if spaces >= 2 then
 356                                 append_node(new MdHardLineBreak(new MdLocation(line, column - spaces - 1, line, column - 1), false))
 357                         else
 358                                 append_node(new MdSoftLineBreak(new MdLocation(line, column - spaces - 1, line, column -1)))
 359                         end
 360                 else
 361                         append_node(new MdSoftLineBreak(new MdLocation(line, column - 1, line, column - 1)))
 362                 end
 363                 line += 1
 364                 column = 1 + column_offset
 365
 366                 # gobble leading spaces in next line
 367                 while peek == ' ' do
 368                         advance 1
 369                 end
 370                 return true
 371         end
 372
 373         # Final white spaces pattern
  374         private var re_final_space: Regex = " *$".to_re # run of spaces anchored by `$`
 375
 376         # Parse a backslash-escaped special character
 377         #
 378         # Add either the escaped characters, a hard line break (if the backslash is followed by
 379         # a new line), or a literal backslash to the block's children.
 380         private fun parse_backslash: Bool do
 381                 advance 1
 382                 if peek == '\n' then
 383                         append_node(new MdHardLineBreak(new MdLocation(line, column - 1, line, column), true))
 384                         advance 1
 385                         line += 1
 386                         column = 1 + column_offset
 387                 else if index < input.length and input.substring(index, 1).has(re_escapable) then
 388                     append_text(input, index, index + 1)
 389                     advance 1
 390                 else
 391                         append_text("\\")
 392                 end
 393                 return true
 394         end
 395
 396         # Escapable characters pattern
  397         private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"
  398
  399         # Regex matching one escapable character at the start of the searched text
  400         private var re_escapable: Regex = "^{p_escapable}".to_re
 401
 402         # Attempt to parse backticks
 403         #
 404         # Adding either a backtick code span or a literal sequence of backticks.
 405         private fun parse_backticks: Bool do
 406                 var column_before = column
 407                 var ticks = match(re_ticks_here)
 408                 if ticks == null then return false
 409
 410                 var after_open_ticks = index
 411                 var matched = match(re_ticks)
 412                 while matched != null do
 413                         if matched == ticks then
 414                                 var content = input.substring(after_open_ticks, index - after_open_ticks - ticks.length)
 415                                 content = content.trim
 416                                 content = content.replace(re_whitespace, " ")
 417                                 var node = new MdCode(new MdLocation(line, column_before, line, column), matched.to_s, content.trim)
 418                                 append_node(node)
 419                                 column += 1
 420                                 return true
 421                         end
 422                         matched = match(re_ticks)
 423                 end
 424                 # If we got here, we didn't match a closing backtick sequence
 425                 index = after_open_ticks
 426                 column = after_open_ticks + 1
 427                 append_text(ticks)
 428                 return true
 429         end
 430
 431         # Backticks starting pattern
  432         private var re_ticks_here: Regex = "^`+".to_re
  433
  434         # Unanchored run of backticks, used to look for a closing sequence
  435         private var re_ticks: Regex = "`+".to_re
 436
 437         # Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters
 438         private fun parse_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): Bool do
 439                 var res = scan_delimiters(delimiter_processor, delimiter_char)
 440                 if res == null then return false
 441
 442                 var length = res.count
 443                 var start_index = index
 444                 var start_column = column
 445
 446                 advance length
 447                 var column_before = column
 448                 column = start_column
 449                 var node = append_text(input, start_index, index)
 450                 column = column_before
 451
 452                 # Add entry to stack for this opener
 453                 var last_delimiter = new MdDelimiter(node, delimiter_char, res.can_open, res.can_close, last_delimiter)
 454                 last_delimiter.length = length
 455                 last_delimiter.original_length = length
 456
 457                 var prev = last_delimiter.prev
 458                 if prev != null then
 459                         prev.next = last_delimiter
 460                 end
 461                 self.last_delimiter = last_delimiter
 462                 return true
 463         end
 464
 465         # Add open bracket to delimiter stack and add a text node to block's children
 466         private fun parse_open_bracket: Bool do
 467                 var start_index = index
 468                 advance 1
 469
 470                 var node = append_text("[")
 471
 472                 # Add entry to stack for this opener
 473                 add_bracket(new MdBracket.link(node, start_index, column - 1, last_bracket, last_delimiter))
 474                 return true
 475         end
 476
 477         # If next character is `[`, add `!` delimiter to delimiter stack and add a text node to
 478         # block's children.
 479         # Otherwise just add a text node.
 480         private fun parse_bang: Bool do
 481                 var start_index = index
 482                 advance 1
 483
 484                 if peek == '[' then
 485                         advance 1
 486                         var node = append_text("![")
 487
 488                         # Add entry to stack for this opener
 489                         add_bracket(new MdBracket.image(node, start_index + 1, column - 2, last_bracket, last_delimiter))
 490                 else
 491                         append_text("!")
 492                 end
 493                 return true
 494         end
 495
 496         # Try match close bracket against an opening delimiter stack
 497         #
  498         # Add either a link or image, or a plain `]` character, to block's children.
 499         # If there is a matching delimiter, remove it from the delimiter stack.
 500         private fun parse_close_bracket: Bool do
 501                 advance 1
 502                 var start_index = index
 503                 var start_column = column
 504
 505                 # Get previous `[` or `![`
 506                 var opener = last_bracket
 507                 if opener == null then
 508                         # no matching opener, just return a literal
 509                         append_text("]")
 510                         return true
 511                 end
 512
 513                 if not opener.allowed then
 514                         # matching opener but it's not allowed, juste return a literal
 515                         append_text("]")
 516                         remove_last_bracket
 517                         return true
 518                 end
 519
 520                 # check to see if we have a link or image
 521                 var dest: nullable Couple[nullable String, Bool] = null
 522                 var title = null
 523                 var is_link_or_image = false
 524
 525                 # maybe an inline link like `[foo](\uri "title")`
 526                 if peek == '(' then
 527                         advance 1
 528                         spnl
 529                         dest = parse_link_destination
 530                         if dest.first != null then
 531                                 spnl
 532                                 # title needs a whitespace before
 533                                 if input.substring(index - 1, 1).has(re_whitespace) then
 534                                         title = parse_link_title
 535                                         spnl
 536                                 end
 537                                 if peek == ')' then
 538                                         advance 1
 539                                         is_link_or_image = true
 540                                 else
 541                                         index = start_index
 542                                         column = start_column
 543                                 end
 544                         end
 545                 end
 546
 547                 # maybe a reference link like `[foo][bar]`, `[foo][]` or `[foo]`
 548                 if not is_link_or_image then
 549                         # see if there's a link label like `[bar]` or `[]`
 550                         var before_label = index
 551                         var label_length = parse_link_label
 552                         advance label_length
 553                         var ref = null
 554                         if label_length > 2 then
 555                                 ref = input.substring(before_label, label_length)
 556                         else if not opener.bracket_after then
 557                                 # If the second label is empty `[foo][]` or missing `[foo]`, then the first label
 558                                 # is the reference.
 559                                 # But it can only be a reference when there's no (unescaped) bracket in it.
 560                                 # If there is, we don't even need to try to lookup the reference.
 561                                 ref = input.substring(opener.index, start_index - opener.index)
 562                         end
 563
 564                         if ref != null then
 565                                 var nref = ref.normalize_reference
 566                                 if reference_map.has_key(nref) then
 567                                         var link = reference_map[nref]
 568                                         dest = new Couple[nullable String, Bool](link.destination, false)
 569                                         title = link.title
 570                                         is_link_or_image = true
 571                                 end
 572                         end
 573                 end
 574
 575                 if is_link_or_image then
 576                         # If we got here, open is a potential opener
 577                         var link_or_image: MdLinkOrImage
 578                         if opener.is_image then
 579                                 link_or_image = new MdImage(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
 580                         else
 581                                 link_or_image = new MdLink(new MdLocation(line, opener.column, line, column - 1), dest.as(not null).first or else "", title)
 582                         end
 583                         link_or_image.has_brackets = dest.as(not null).second
 584
 585                         var node = opener.node.next
 586                         while node != null do
 587                                 var next = node.next
 588                                 link_or_image.append_child(node)
 589                                 node = next
 590                         end
 591                         append_node(link_or_image)
 592
 593                         # Process delimiters such as emphasis inside a link/image
 594                         process_delimiters(opener.prev_delimiter)
 595                         merge_child_text_nodes(link_or_image)
 596                         # We don't need the corresponding text node anymore, we turned it into a node
 597                         opener.node.unlink
 598                         remove_last_bracket
 599
 600                         # Links within links are not allowed
 601                         # We found this link, so there can be no other link around it.
 602                         if not opener.is_image then
 603                                 var bracket = last_bracket
 604                                 while bracket != null do
 605                                         if not bracket.is_image then
 606                                                 # disallow link opener
 607                                                 bracket.allowed = false
 608                                         end
 609                                         bracket = bracket.prev
 610                                 end
 611                         end
 612                         return true
 613                 end
 614
 615                 if not is_link_or_image then
 616                         if parse_wikilink then return true
 617                 end
 618
 619                 # no link or image
 620                 append_text("]")
 621                 remove_last_bracket
 622                 index = start_index
 623                 column = start_column
 624                 return true
 625         end
 626
 627         # Whitespace pattern
  628         private var re_whitespace: Regex = "\\s+".to_re # one or more `\s` characters
 629
 630         # Add a bracket token on top of the `last_bracket` stack
 631         private fun add_bracket(bracket: MdBracket) do
 632                 var last_bracket = self.last_bracket
 633                 if last_bracket != null then
 634                         last_bracket.bracket_after = true
 635                 end
 636                 self.last_bracket = bracket
 637         end
 638
 639         # Remove the last bracket on the `last_bracket` stack
 640         private fun remove_last_bracket do
 641                 var last_bracket = self.last_bracket
 642                 if last_bracket == null then return
 643                 self.last_bracket = last_bracket.prev
 644         end
 645
 646         # Wikilink placeholder
 647         #
 648         # Will be defined in sub module.
  649         private fun parse_wikilink: Bool do return false # default: nothing parsed
 650
  651         # Parse a link destination, returning its text and whether it started with `<`
 652         private fun parse_link_destination: Couple[nullable String, Bool] do
 653                 var buffer = new Buffer
 654
 655                 var c = peek
 656                 var parens = 0
 657
 658                 var has_bracket = c == '<'
 659                 if has_bracket then advance 1
 660
 661                 loop
 662                         c = peek
 663                         if c == '\0' then
 664                                 break # end of input
 665                         else if c == ' ' or c == '\t' or c == '\n' or c == '\r' then
 666                                 break # no spaces allowed in urls
 667                         else if c == '\\' then
 668                                 var next = peek_next
 669                                 if escapable.has(next) then
 670                                         buffer.add next
 671                                         advance 2 # skip over the backslash
 672                                         continue
 673                                 end
 674                         else if has_bracket and c == '>' then
 675                                 advance 1
 676                                 break
 677                         else if not has_bracket and c == '(' then
 678                                 parens += 1
 679                         else if not has_bracket and c == ')' then
 680                                 if parens == 0 then break
 681                                 parens -= 1
 682                         else if c == '\0' then
 683                                 break
 684                         end
 685                         buffer.add c
 686                         advance 1
 687                 end
 688                 return new Couple[nullable String, Bool](buffer.to_s, has_bracket)
 689         end
 690
 691         # Attempt to parse a link title (sans quotes), returning the string or null if no match
 692         private fun parse_link_title: nullable String do
 693                 var c = peek
 694                 if c != '\'' and c != '"' and c != '(' then
 695                         return null
 696                 end
 697                 var opener = c
 698
 699                 var buffer = new Buffer
 700                 loop
 701                         advance 1
 702                         c = peek
 703                         if c == opener or (opener == '(' and c == ')') then
 704                                 advance 1
 705                                 break
 706                         else if c == '\\' then
 707                                 var next = peek_next
 708                                 if escapable.has(next) then
 709                                         buffer.add next
 710                                         advance 1
 711                                         continue
 712                                 end
 713                         else if c == '\0' then
 714                                 return null
 715                         end
 716                         buffer.add c
 717                 end
 718                 return buffer.to_s
 719         end
 720
        # Escapable characters
        #
        # A backslash followed by one of these characters is read as that character
        # alone by `parse_link_destination` and `parse_link_title`.
        private var escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\^_`\{|\}~-"
 723
 724         # Attempt to parse a link label returning number of characters parsed
 725         private fun parse_link_label: Int do
 726                 var i = index
 727                 while i < input.length do
 728                         var c = input[i]
 729                         if i == index and c != '[' then
 730                                 return 0
 731                         else if c == '[' and i != index then
 732                                 if input[i - 1] != '\\' or (i - 2 > index and input[i - 2] == '\\') then
 733                                         return 0
 734                                 end
 735                         else if c == ']' then
 736                                 if i > 1001 then return 0
 737                                 if input[i - 1] != '\\' or (i - 2 > index and input[i - 2] == '\\') then
 738                                         return (i - index) + 1
 739                                 end
 740                         end
 741                         i += 1
 742                 end
 743                 return 0
 744         end
 745
 746         # Attempt to parse an autolink (URL or email in pointy brackets)
 747         private fun parse_auto_link: Bool do
 748                 var column_before = column
 749                 var m = match(re_autolink_email)
 750                 if m != null then
 751                         var dest = m.substring(1, m.length - 2)
 752                         var node = new MdLink(new MdLocation(line, column_before, line, column), "mailto:{dest}", null, true)
 753                         node.append_child(new MdText(new MdLocation(line, column_before + 1, line, column - 1), dest))
 754                         column += 1
 755                         append_node(node)
 756                         return true
 757                 end
 758                 m = match(re_autolink_url)
 759                 if m != null then
 760                         var dest = m.substring(1, m.length - 2)
 761                         var node = new MdLink(new MdLocation(line, column_before, line, column), dest, null, true)
 762                         node.append_child(new MdText(new MdLocation(line, column_before + 1, line, column - 1), dest))
 763                         column += 1
 764                         append_node(node)
 765                         return true
 766                 end
 767                 return false
 768         end
 769
        # Autolink email pattern
        #
        # Matches an email address written between `<` and `>`, anchored at the
        # start of the searched text. Used by `parse_auto_link`.
        private var re_autolink_email: Regex = "^<([a-zA-Z0-9.!#$%&'*+/=?^_`\{|\}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?(\\.[a-zA-Z0-9]([a-zA-Z0-9-]\{0,61\}[a-zA-Z0-9])?)*)>".to_re
 772
        # Autolink url pattern
        #
        # Matches `<scheme:rest>` anchored at the start of the searched text, where
        # the scheme is a letter followed by 1 to 31 letters, digits, `.`, `+` or `-`
        # and the rest contains no `<`, `>` or space. Used by `parse_auto_link`.
        private var re_autolink_url: Regex = "^<[a-zA-Z][a-zA-Z0-9.+-]\{1,31\}:[^<> ]*>".to_re
 775
 776         # Attempt to parse an inline HTML string
 777         private fun parse_html_inline: Bool do
 778                 var column_before = column
 779                 var m = match(re_html_tag)
 780                 if m != null then
 781                         var node = new MdHtmlInline(new MdLocation(line, column_before, line, column), m)
 782                         column += 1
 783                         append_node(node)
 784                         return true
 785                 end
 786                 return false
 787         end
 788
        # Building blocks of the inline HTML pattern
        #
        # Each attribute below is a regular expression fragment. They are combined
        # through string interpolation into `p_html_tag`, used by `re_html_tag`.

        # Tag name: a letter followed by letters, digits or `-`
        private var p_tagname = "[A-Za-z][A-Za-z0-9-]*"

        # Attribute name
        private var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"

        # Attribute value written without quotes
        private var p_uquoted_value = "[^\"'=<>` \t\n]+"

        # Attribute value written between single quotes
        private var p_squoted_value = "'[^']*'"

        # Attribute value written between double quotes
        private var p_dquoted_value = "\"[^\"]*\""

        # Any of the three forms of attribute value
        private var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"

        # `=` followed by an attribute value, with optional whitespace around `=`
        private var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"

        # Whitespace, an attribute name and an optional value
        private var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"

        # Opening tag, possibly self-closing (`/>`)
        private var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"

        # Closing tag
        private var p_closetag = "</{p_tagname}\\s*[>]"

        # HTML comment
        private var p_html_comment = "<!---->|<!--(-?[^>-])(-?[^-])*-->"

        # Processing instruction (`<? ... ?>`)
        private var p_processing_instruction = "[<][?].*?[?][>]"

        # Declaration (`<!NAME ... >`)
        private var p_declaration = "<![A-Z]+\\s+[^>]*>"

        # CDATA section
        private var p_cdata = "<!\\[CDATA\\[.*\\]\\]>"

        # Any inline HTML construct: alternation of all the patterns above
        private var p_html_tag = "({p_opentag}|{p_closetag}|{p_html_comment}|{p_processing_instruction}|{p_declaration}|{p_cdata})"
 804
 805         # HTML tag pattern
 806         private var re_html_tag: Regex do
 807                 var re = "^{p_html_tag}".to_re
 808                 re.ignore_case = true
 809                 return re
 810         end
 811
 812         # Attempt to parse an HTML entity
 813         private fun parse_entity: Bool do
 814                 var m = match(re_entity_here)
 815                 if m != null then
 816                         append_text(m)
 817                         return true
 818                 end
 819                 return false
 820         end
 821
 822         # HTML entity pattern
 823         private var re_entity_here: Regex do
 824                 var re = "^&(#x[a-f0-9]\{1,8\}|#[0-9]\{1,8\}|[a-z][a-z0-9]\{1,31\});".to_re
 825                 re.ignore_case = true
 826                 return re
 827         end
 828
 829         # Parse a run of ordinary characters
 830         #
 831         # Or a single character with a special meaning in markdown, as a plain string.
 832         private fun parse_string: Bool do
 833                 var begin = index
 834                 var begin_column = column
 835                 var length = input.length
 836                 while index != length do
 837                         if special_characters.has(input.chars[index]) then
 838                                 break
 839                         end
 840                         advance 1
 841                 end
 842                 if begin != index then
 843                         var column_before = column
 844                         column = begin_column
 845                         append_text(input, begin, index)
 846                         column = column_before
 847                         return true
 848                 end
 849                 return false
 850         end
 851
 852         # Scan a sequence of characters with code `delimiter_char`
 853         #
 854         # Return information about the number of delimiters and whether they are positioned
 855         # such as they can open and/or close emphasis or strong emphasis.
 856         private fun scan_delimiters(delimiter_processor: MdDelimiterProcessor, delimiter_char: Char): nullable MdDelimiterData do
 857                 var start_index = index
 858                 var start_column = column
 859
 860                 var delimiter_count = 0
 861                 while peek == delimiter_char do
 862                         delimiter_count += 1
 863                         advance 1
 864                 end
 865
 866                 if delimiter_count < delimiter_processor.min_length then
 867                         index = start_index
 868                         column = start_column
 869                         return null
 870                 end
 871
 872                 var before = "\n"
 873                 if start_index > 0 then
 874                         before = input.substring(start_index - 1, 1)
 875                 end
 876
 877                 var char_after = peek
 878                 var after = "\n"
 879                 if char_after != '\0' then
 880                         after = char_after.to_s
 881                 end
 882
 883                 var before_is_punctuation = before.has(re_punctuation)
 884                 var before_is_whitespace = before.has(re_whitespace_char)
 885                 var after_is_punctuation = after.has(re_punctuation)
 886                 var after_is_whitespace = after.has(re_whitespace_char)
 887
 888                 var left_flanking = not after_is_whitespace and
 889                         (not after_is_punctuation or before_is_whitespace or before_is_punctuation)
 890                 var right_flanking = not before_is_whitespace and
 891                         (not before_is_punctuation or after_is_whitespace or after_is_punctuation)
 892
 893                 var can_open
 894                 var can_close
 895                 if delimiter_char == '_' then
 896                         can_open = left_flanking and (not right_flanking or before_is_punctuation)
 897                         can_close = right_flanking and (not left_flanking or after_is_punctuation)
 898                 else
 899                         can_open = left_flanking and delimiter_char == delimiter_processor.opening_delimiter
 900                         can_close = right_flanking and delimiter_char == delimiter_processor.closing_delimiter
 901                 end
 902
 903                 index = start_index
 904                 column = start_column
 905                 return new MdDelimiterData(delimiter_count, can_open, can_close)
 906         end
 907
        # Punctuation pattern
        #
        # Matches a text starting with a punctuation character.
        # Used by `scan_delimiters` on the characters around a delimiter run.
        private var re_punctuation: Regex = "^[]!\"#\\$%&'()*+,.:;<=>?@^_`\{|\}~[-]".to_re
 910
        # Whitespace character start pattern
        #
        # Matches a text starting with a whitespace character.
        # Used by `scan_delimiters` on the characters around a delimiter run.
        #
        # NOTE(review): the character class appears to list the space character
        # twice; one of them may be meant as another kind of space (e.g. a
        # non-breaking space) - confirm.
        private var re_whitespace_char: Regex = "^[  \t\r\n]".to_re
 913
        # Process the stack of delimiters
        #
        # Walks the delimiter list from the first delimiter above `stack_bottom`
        # up to `last_delimiter`. For each delimiter that can close, looks back for
        # a matching opener and, when one is accepted by the delimiter processor,
        # lets that processor build the corresponding node.
        #
        # Once done, all the delimiters above `stack_bottom` are removed from the
        # list (their text nodes are kept).
        private fun process_delimiters(stack_bottom: nullable MdDelimiter) do
                # Per delimiter character, the delimiter below which no opener can be found
                var openers_bottom = new HashMap[Char, nullable MdDelimiter]

                # find first closer above stack bottom
                var closer = last_delimiter
                while closer != null and closer.prev != stack_bottom do
                        closer = closer.prev
                end
                # move forward, looking for closers, and handling each
                while closer != null do
                        var delimiter_char = closer.delimiter_char

                        if not closer.can_close then
                                closer = closer.next
                                continue
                        end

                        # no processor registered for this character: nothing to do
                        if not delimiter_processors_map.has_key(delimiter_char) then
                                closer = closer.next
                                continue
                        end

                        var delimiter_processor = delimiter_processors_map[delimiter_char]
                        var opening_delimiter_char = delimiter_processor.opening_delimiter

                        # Found delimiter closer. Now look back for first matching opener
                        var use_delims = 0
                        var opener_found = false
                        var potential_opener_found = false
                        var opener = closer.prev

                        while opener != null and opener != stack_bottom and (not openers_bottom.has_key(delimiter_char) or opener != openers_bottom[delimiter_char]) do

                                if opener.can_open and opener.delimiter_char == opening_delimiter_char then
                                        potential_opener_found = true
                                        # the processor decides how many delimiters get used, 0 meaning none
                                        use_delims = delimiter_processor.delimiter_use(opener, closer)
                                        if use_delims > 0 then
                                                opener_found = true
                                                break
                                        end
                                end
                                opener = opener.prev
                        end

                        if not opener_found then
                                if not potential_opener_found then
                                        # Set lower bound for future searches for openers.
                                        # Only do this when we didn't even have a potential opener
                                        # (one that matches the character and can open).
                                        # If an opener was rejected because of the number of delimiters
                                        # (e.g. because of the "multiple of 3" rule),
                                        # we want to consider it next time because the number of delimiters
                                        # can change as we continue processing.
                                        openers_bottom[delimiter_char] = closer.prev
                                        if not closer.can_open then
                                                # We can remove a closer that can't be an opener,
                                                # once we've seen there's no matching opener.
                                                remove_delimiters_keep_node(closer)
                                        end
                                end
                                closer = closer.next
                                continue
                        end

                        var opener_node = opener.as(not null).node
                        var closer_node = closer.node

                        # Remove number of used delimiters from stack and inline nodes
                        opener.as(not null).length -= use_delims
                        closer.length -= use_delims
                        opener_node.literal = opener_node.literal.substring(0,
                                opener_node.literal.length - use_delims)
                        closer_node.literal = closer_node.literal.substring(0,
                                closer_node.literal.length - use_delims)

                        remove_delimiters_between(opener, closer)
                        # The delimiter processor can re-parent the nodes between opener and closer,
                        # so make sure they're contiguous already.
                        # Exclusive because we want to keep opener/closer themselves.
                        merge_text_nodes_between_exclusive(opener_node, closer_node)
                        delimiter_processor.process(opener_node, closer_node, use_delims)

                        # No delimiter characters left to process, so we can remove
                        # the delimiter and the now empty node
                        if opener.as(not null).length == 0 then
                                remove_delimiters_and_node(opener)
                        end

                        # A closer with characters left is kept as the current closer and
                        # handled again by the next iteration
                        if closer.length == 0 then
                                var next = closer.next
                                remove_delimiters_and_node(closer)
                                closer = next
                        end
                end

                # Remove all delimiters
                while last_delimiter != null and last_delimiter != stack_bottom do
                        remove_delimiters_keep_node(last_delimiter)
                end
        end
1015
1016         # Remove all delimiters between `opener` and `closer`
1017         private fun remove_delimiters_between(opener, closer: nullable MdDelimiter) do
1018                 if opener == null or closer == null then return
1019
1020                 var delimiter = closer.prev
1021                 while delimiter != null and delimiter != opener do
1022                         var previous_delimiter = delimiter.prev
1023                         remove_delimiters_keep_node(delimiter)
1024                         delimiter = previous_delimiter
1025                 end
1026         end
1027
1028         # Remove the delimiter and the corresponding text node
1029         #
1030         # For used delimiters, e.g. `*` in `*foo*`.
1031         private fun remove_delimiters_and_node(delim: nullable MdDelimiter) do
1032                 if delim == null then return
1033
1034                 var node = delim.node
1035                 node.unlink
1036                 remove_delimiter(delim)
1037         end
1038
        # Remove the delimiter but keep the corresponding node as text
        #
        # For unused delimiters such as `_` in `foo_bar`.
        # Only the delimiter list is updated (see `remove_delimiter`), the node of
        # `delim` is left untouched.
        private fun remove_delimiters_keep_node(delim: nullable MdDelimiter) do
                remove_delimiter(delim)
        end
1045
1046         # Remove the delimiter `delim`
1047         private fun remove_delimiter(delim: nullable MdDelimiter) do
1048                 if delim == null then return
1049
1050                 var prev = delim.prev
1051                 if prev != null then
1052                         prev.next = delim.next
1053                 end
1054                 var next = delim.next
1055                 if next == null then
1056                         # top of stack
1057                         last_delimiter = prev
1058                 else
1059                         next.prev = prev
1060                 end
1061         end
1062
1063         # Merge all nodes between `from` and `to` excluding `from` and `to`
1064         private fun merge_text_nodes_between_exclusive(from, to: nullable MdNode) do
1065                 if from == null or to == null then return
1066                 # no node between them
1067                 if from == to or from.next == to then return
1068                 merge_text_nodes_inclusive(from.next, to.prev)
1069         end
1070
1071         # Merge all child nodes of `node` into one
1072         private fun merge_child_text_nodes(node: nullable MdNode) do
1073                 if node == null then return
1074                 # no children or just one child node, no need for merging
1075                 if node.first_child == node.last_child then return
1076                 merge_text_nodes_inclusive(node.first_child, node.last_child)
1077         end
1078
1079         # Merge all nodes between `from` and `to` including `from` and `to`
1080         private fun merge_text_nodes_inclusive(from, to: nullable MdNode) do
1081                 var first = null
1082                 var last = null
1083
1084                 var node = from
1085                 while node != null do
1086                         if node isa MdText then
1087                                 var text = node
1088                                 if first == null then first = text
1089                                 last = text
1090                         else
1091                                 merge_if_needed(first, last)
1092                                 first = null
1093                                 last = null
1094                         end
1095                         if node == to then break
1096                         node = node.next
1097                 end
1098                 merge_if_needed(first, last)
1099         end
1100
1101         # Merge all nodes between `first` and `last`
1102         private fun merge_if_needed(first, last: nullable MdText) do
1103                 if first != null and last != null and first != last then
1104                         var buffer = new Buffer
1105                         buffer.append(first.literal)
1106                         var node = first.next
1107                         var stop = last.next
1108                         while node != null and node != stop do
1109                                 buffer.append(node.as(MdText).literal)
1110                                 first.location.line_end = node.location.line_end
1111                                 first.location.column_end = node.location.column_end
1112                                 var unlink = node
1113                                 node = node.next
1114                                 unlink.unlink
1115                         end
1116                         var literal = buffer.write_to_string
1117                         first.literal = literal
1118                 end
1119         end
1120 end
1121
# Custom delimiter processor for additional delimiters besides `_` and `*`
interface MdDelimiterProcessor

        # The character that marks the beginning of a delimited node
        #
        # Must not clash with any built-in special characters.
        fun opening_delimiter: Char is abstract

        # The character that marks the ending of a delimited node
        #
        # Must not clash with any built-in special characters.
        fun closing_delimiter: Char is abstract

        # Minimum number of delimiter characters that are needed to activate this
        #
        # Must be at least one.
        fun min_length: Int is abstract

        # Determine how many (if any) of the delimiter characters should be used
        #
        # This allows implementations to decide how many characters to use based on the
        # properties of the delimiter runs.
        #
        # An implementation can also return 0 when it doesn't want to allow this particular
        # combination of delimiter runs.
        fun delimiter_use(opener, closer: MdDelimiter): Int is abstract

        # Process the matched delimiters
        #
        # For example, by wrapping the nodes between `opener` and `closer` in a new node,
        # or appending a new node after the opener.
        #
        # Note that removal of the delimiter from the delimiter nodes and unlinking
        # them is done by the caller.
        fun process(opener, closer: MdText, delimiter_use: Int) is abstract
end
1158
# A delimiter is one or more of the same delimiter character
#
# Used for paired delimiters like emphasis or strong emphasis.
# Delimiters are chained to each other through `prev` and `next`.
class MdDelimiter

        # Node containing the delimiter
        var node: MdText

        # Character used as delimiter
        var delimiter_char: Char

        # Can `self` open a delimiter?
        var can_open: Bool

        # Can `self` close a delimiter?
        var can_close: Bool

        # Previous delimiter found
        var prev: nullable MdDelimiter

        # Next delimiter found
        var next: nullable MdDelimiter

        # The number of characters in this delimiter run that are left for processing
        var length = 1

        # The number of characters originally in this delimiter run
        #
        # At the start of processing, this is the same as `length`.
        var original_length = 1
end
1190
# Opening bracket for links and images
#
# Brackets are stacked through `prev`.
class MdBracket

        # Node containing the bracket
        var node: MdText

        # Index of the bracket in the original string
        var index: Int

        # Column of the bracket
        var column: Int

        # Is this bracket opening an image?
        var is_image: Bool

        # Previous bracket
        var prev: nullable MdBracket

        # Previous delimiter
        var prev_delimiter: nullable MdDelimiter

        # Whether this bracket is allowed to form a link/image
        var allowed = true

        # Whether there is an unescaped bracket (opening or closing) anywhere after this bracket
        var bracket_after = false

        # Create a new bracket for a link
        #
        # Same as the default constructor with `is_image` set to `false`.
        init link(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
                init(node, index, column, false, prev, prev_delimiter)
        end

        # Create a new bracket for an image
        #
        # Same as the default constructor with `is_image` set to `true`.
        init image(node: MdText, index: Int, column: Int, prev: nullable MdBracket, prev_delimiter: nullable MdDelimiter) do
                init(node, index, column, true, prev, prev_delimiter)
        end
end
1228
# Data about a delimiter parsing
#
# Result of `MdInlineParser::scan_delimiters`.
private class MdDelimiterData

        # Number of successive delimiters found
        var count: Int

        # Can this delimiter open an inline construct?
        var can_open: Bool

        # Can this delimiter close an inline construct?
        var can_close: Bool
end
1241
# An implementation of MdDelimiterProcessor that dispatches all calls to others
#
# The sub processor called depends on the length of the delimiter run.
# All child processors must have different minimum lengths.
# A given delimiter run is dispatched to the child with the largest acceptable minimum length.
# If no child is applicable, the one with the largest minimum length is chosen.
class MdStaggeredDelimiterProcessor
        super MdDelimiterProcessor

        # Delimiter character
        var delim: Char

        # Sub processors to apply
        #
        # Kept sorted by decreasing `min_length` by `add`.
        var processors = new Array[MdDelimiterProcessor]

        # Smallest `min_length` among the sub processors
        #
        # Maintained by `add`. It is 0 as long as no sub processor was added.
        redef var min_length = 0
        redef fun opening_delimiter do return delim
        redef fun closing_delimiter do return delim

        # Add a new sub delimiter processor
        #
        # Fails if a sub processor with the same `min_length` was already added.
        fun add(dp: MdDelimiterProcessor) do
                var len = dp.min_length
                # Look for the first sub processor with a smaller minimum length
                var i = 0
                while i < processors.length do
                        var p = processors[i]
                        assert len != p.min_length else
                                print "Cannot add two delimiter processor for `{delim}` " +
                                        "and mininimum length `{len}`"
                        end
                        if len > p.min_length then
                                break
                        end
                        i += 1
                end
                # Inserted after all the others: `dp` has the smallest minimum length
                # so far, which is then the minimum length of `self` as a whole.
                if i == processors.length then min_length = len
                processors.insert(dp, i)
        end

        # Find the corresponding processor for a length of `len` delimiter characters
        #
        # Returns the first sub processor (in decreasing order of `min_length`) whose
        # `min_length` is at most `len`, or the first of the list if there is none.
        fun find_processor(len: Int): MdDelimiterProcessor do
                for processor in processors do
                        if processor.min_length <= len then return processor
                end
                return processors.first
        end

        # Dispatch to the sub processor selected from the length left in `opener`
        redef fun delimiter_use(opener, closer) do
                return find_processor(opener.length).delimiter_use(opener, closer)
        end

        # Dispatch to the sub processor selected from the number of delimiters used
        redef fun process(opener, closer, delimiter_use) do
                find_processor(delimiter_use).process(opener, closer, delimiter_use)
        end
end
1295
# A processor for emphasis tokens
#
# Produces `MdEmphasis` nodes when one delimiter is used and `MdStrongEmphasis`
# nodes otherwise.
class MdEmphasisDelimiterProcessor
        super MdDelimiterProcessor

        # Delimiter character
        var delimiter_char: Char

        redef var min_length = 1
        redef fun opening_delimiter do return delimiter_char
        redef fun closing_delimiter do return delimiter_char

        # Use 2 delimiters when both runs have at least 2 left, 1 otherwise
        #
        # Returns 0 when the pair is rejected by the "multiple of 3" rule.
        redef fun delimiter_use(opener, closer) do
                # "multiple of 3" rule for internal delimiter runs
                var is_internal = opener.can_close or closer.can_open
                if is_internal and (opener.original_length + closer.original_length) % 3 == 0 then
                        return 0
                end
                # calculate actual number of delimiters used from this closer
                if opener.length < 2 or closer.length < 2 then return 1
                return 2
        end

        # Wrap the nodes between `opener` and `closer` in a new emphasis node
        #
        # The new node is inserted right after `opener`.
        redef fun process(opener, closer, delimiter_use) do
                var mark = opening_delimiter.to_s
                var opener_location = opener.location
                var closer_location = closer.location

                var emphasis: MdNode
                if delimiter_use == 1 then
                        var location = new MdLocation(
                                opener_location.line_start,
                                opener_location.column_start,
                                closer_location.line_end,
                                closer_location.column_end)
                        emphasis = new MdEmphasis(location, mark)
                else
                        var location = new MdLocation(
                                opener_location.line_start,
                                opener_location.column_start + opener.literal.length,
                                closer_location.line_end,
                                closer_location.column_end - closer.literal.length)
                        emphasis = new MdStrongEmphasis(location, "{mark}{mark}")
                end

                # Move every node standing between the delimiters under the new node
                var child = opener.next
                while child != null and child != closer do
                        var following = child.next
                        emphasis.append_child(child)
                        child = following
                end
                opener.insert_after(emphasis)
        end
end
1349
1350 # Emphasis delimiter processor for the asterisk character (`*`)
1351 class MdAsteriskDelimiterProcessor
1352         super MdEmphasisDelimiterProcessor
1353         noautoinit
1354
1355         redef var delimiter_char = '*' # character handled by this processor
1356 end
1357
1358 # Emphasis delimiter processor for the underscore character (`_`)
1359 class MdUnderscoreDelimiterProcessor
1360         super MdEmphasisDelimiterProcessor
1361         noautoinit
1362
1363         redef var delimiter_char = '_' # character handled by this processor
1364 end
1365
1366 # Utils
1367
1368 redef class String
1369
1370         # Remove the backslash in front of every escaped character matched by `re_escaped`
1371         # (`self` itself is returned when it contains no such escape sequence)
1372         fun unescape_string: String do
1373                 # A single scan both detects the "nothing to unescape" case and seeds the loop
1374                 var match = search(re_escaped)
1375                 if match == null then return self
1376                 var buffer = new Buffer
1377                 var last_end = 0
1378                 while match != null do
1379                         buffer.append substring(last_end, match.from - last_end)
1380                         buffer.append substring(match.from + 1, 1) # escaped character, backslash dropped
1381                         last_end = match.after
1382                         match = search_from(re_escaped, last_end)
1383                 end
1384                 # Copy whatever follows the last escape sequence
1385                 if last_end < length then buffer.append substring(last_end, length - last_end)
1386                 return buffer.to_s
1387         end
1388
1389         # Normalize a link reference name: drop the first and last characters, trim, lowercase, collapse whitespace runs
1390         private fun normalize_reference: String do
1391                 var stripped = self.substring(1, length - 2).trim
1392                 var lowercase = stripped.to_lower # TODO utf-8
1393                 return lowercase.replace(re_whitespace, " ")
1394         end
1395 end
1396
1397 redef class Sys # patterns used by the `String` helpers `unescape_string` and `normalize_reference`
1398         private var p_escapable = "[]!\"#$%&\'()*+,./:;<=>?@\\[\\\\^_`\\\{|\\\}~-]"
1399         private var re_escaped: Regex = "\\\\{p_escapable}".to_re # a backslash followed by one character of the `p_escapable` class
1400         private var re_whitespace: Regex = "\\s+".to_re # a run of one or more whitespace characters
1401 end