lib/markdown2/markdown_block_parsing.nit

   1 # This file is part of NIT ( http://www.nitlanguage.org ).
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 #     http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14
  15 # Markdown blocks parsing
  16 #
  17 # Introduce the parsers for the different Markdown blocks such as headings, lists,
  18 # code blocks, etc.
  19 module markdown_block_parsing
  20
  21 import markdown_inline_parsing
  22
  23 # Markdown parser
  24 #
  25 # Used to create the AST representation of a Markdown document.
  26 class MdParser
  27
        # Inline parser used to parse block content
        #
        # Passed to the `parse_inlines` method of every registered block parser
        # at the end of `parse`.
        private var inline_parser = new MdInlineParser is lazy
  30
  31         # Block parsers factories
  32         private var block_parser_factories: Collection[MdBlockParserFactory] do
  33                 var factories = new Array[MdBlockParserFactory]
  34                 factories.add new MdBlockQuoteParserFactory
  35                 factories.add new MdHeadingParserFactory
  36                 factories.add new MdFencedCodeBlockParserFactory
  37                 factories.add new MdHtmlBlockParserFactory
  38                 factories.add new MdThematicBreakParserFactory
  39                 factories.add new MdListBlockParserFactory
  40                 factories.add new MdIndentedCodeBlockParserFactory
  41                 return factories
  42         end
  43
        # Active block parsers
        #
        # Used as a stack to parse nested blocks.
        # The top of the stack (last element) is returned by `active_block_parser`.
        private var active_block_parsers = new Array[MdBlockParser]

        # All block parsers registered by `activate_block_parser`
        #
        # Unlike `active_block_parsers`, an entry stays here after its parser is
        # deactivated; only `remove_active_block_parser` takes it out.
        # `parse` calls `parse_inlines` on every parser of this set.
        private var all_block_parsers = new HashSet[MdBlockParser]
  51
  52         # Return the active block parser
  53         #
  54         # The last entry in the `active_block_parsers` stack.
  55         private fun active_block_parser: MdBlockParser do
  56                 return active_block_parsers.last
  57         end
  58
  59         # Activate a `block_parser`
  60         #
  61         # Add the `block_parser` on the top of the `active_block_parsers` stack.
  62         # Also register it in `all_block_parsers`.
  63         private fun activate_block_parser(block_parser: MdBlockParser) do
  64                 active_block_parsers.add block_parser
  65                 all_block_parsers.add block_parser
  66         end
  67
  68         # Deactivate the `active_block_parser`
  69         private fun deactivate_block_parser do
  70                 active_block_parsers.pop
  71         end
  72
  73         # Deactivate and remove the `active_block_parser` from the `all_block_parsers` list
  74         private fun remove_active_block_parser do
  75                 var old = active_block_parser
  76                 deactivate_block_parser
  77                 all_block_parsers.remove(old)
  78                 old.block.unlink
  79         end
  80
        # Post-processors applied after the parsing of a document
        #
        # They are run in order by `post_process`.
        var post_processors = new Array[MdPostProcessor] is writable

        # Currently parsed line
        #
        # Set at the beginning of `incorporate_line`.
        private var line_string: String is noinit

        # Current index (offset) in input `line_string` (starts at 0)
        private var index = 0

        # Current column in input `line_string` (starts at 0)
        #
        # Tab causes column to go to next 4-space tab stop.
        private var column = 0

        # Is the current column within a tab character (partially consumed tab)
        #
        # Only `set_new_column` can set it to true, when the requested column
        # falls inside a tab.
        private var column_is_in_tab: Bool is noinit

        # Current line in input string (starts at 1)
        private var line = 1

        # Index of the next non-space character starting from `index`
        #
        # Computed by `find_next_non_space`.
        private var next_non_space_index = 0

        # Next non-space column
        #
        # Computed by `find_next_non_space`.
        private var next_non_space_column = 0

        # Current indent in columns
        #
        # Either by spaces or tab stop of 4, starting from `column`.
        private var indent = 0

        # Is the current `line` blank starting from `index`?
        #
        # Computed by `find_next_non_space`.
        private var is_blank: Bool is noinit

        # Does a node end with a blank line?
        #
        # Nodes without an entry are considered as not ending with a blank line
        # (see `is_last_line_blank`).
        private var last_line_blank = new HashMap[MdNode, Bool]
 117
 118         # Initialize parser state
 119         private fun initialize do
 120                 active_block_parsers.clear
 121                 all_block_parsers.clear
 122                 index = 0
 123                 column = 0
 124                 column_is_in_tab = false
 125                 line = 1
 126                 next_non_space_index = 0
 127                 next_non_space_column = 0
 128                 indent = 0
 129                 is_blank = false
 130                 last_line_blank.clear
 131         end
 132
 133         # Parse the `input` string as a MdDocument
 134         fun parse(input: String): MdDocument do
 135                 initialize
 136
 137                 var document_block_parser = new MdDocumentBlockParser(1, 1, 0)
 138                 activate_block_parser(document_block_parser)
 139                 var line_start = 0
 140                 var line_break = find_line_break(input, line_start)
 141                 while line_break != -1 do
 142                         var line_string = input.substring(line_start, line_break - line_start)
 143                         incorporate_line(line_string)
 144                         if line_break + 1 < input.length and
 145                            input.chars[line_break] == '\r' and
 146                            input.chars[line_break + 1] == '\n' then
 147                                 line_start = line_break + 2
 148                         else
 149                                 line_start = line_break + 1
 150                         end
 151                         line_break = find_line_break(input, line_start)
 152                         line += 1
 153                         column = 0
 154                 end
 155
 156                 # Finalize pending line
 157                 if input.length > 0 and (line_start == 0 or line_start < input.length) then
 158                         incorporate_line(input.substring(line_start, input.length - line_start))
 159                 end
 160                 finalize_blocks(active_block_parsers)
 161
 162                 # Walk through a block and its chiildren revursively
 163                 # Parsing string content into inline content where appropriate.
 164                 var all_block_parsers = all_block_parsers.to_a
 165                 var i = all_block_parsers.length - 1
 166                 while i >= 0 do
 167                         var block_parser = all_block_parsers[i]
 168                         block_parser.parse_inlines(inline_parser)
 169                         i -= 1
 170                 end
 171                 var document = document_block_parser.block
 172                 return document
 173         end
 174
 175         # Post-process the `document`
 176         fun post_process(document: MdDocument) do
 177                 for processor in post_processors do
 178                         processor.post_process(self, document)
 179                 end
 180         end
 181
        # Analyze a line of text and update the document
        #
        # We parse Markdown text by calling this on each line of `input`.
        #
        # The work is done in three steps:
        #
        # 1. ask each active block parser (but the first one) if it continues on
        #    this line,
        # 2. look for new blocks starting on this line,
        # 3. add what remains of the line as text to the appropriate block.
        private fun incorporate_line(input: String) do
                # Start at the very beginning of the new line
                line_string = input
                index = 0
                column = 0
                column_is_in_tab = false

                # For each containing block, try to parse the associated line start.
                # The parser at index 0 is skipped and always counted as matched.
                var matches = 1
                for i in [1 .. active_block_parsers.length[ do
                        var block_parser = active_block_parsers[i]
                        find_next_non_space

                        var result = block_parser.try_continue(self)
                        if result isa MdBlockContinue then
                                if result.is_finalize then
                                        # The block ends with this line: nothing else to do
                                        block_parser.finalize(self)
                                        return
                                else
                                        # Consume what the block parser recognized
                                        if result.new_index != -1 then
                                                set_new_index result.new_index
                                        else if result.new_column != -1 then
                                                set_new_column result.new_column
                                        end
                                end
                                matches += 1
                        else
                                # First parser that does not continue: stop here
                                break
                        end
                end

                # Active parsers that did not match this line
                var unmatched_block_parsers = active_block_parsers.subarray(
                        matches, active_block_parsers.length - matches)
                var last_matched_block_parser = active_block_parsers[matches - 1]
                var block_parser = last_matched_block_parser
                var all_closed = unmatched_block_parsers.is_empty

                # Try new block starts only if the last matched block is a paragraph
                # or a container, adding children to the last matched container.
                var try_block_starts = block_parser.block isa MdParagraph or
                        block_parser.block.is_container

                while try_block_starts do
                        find_next_non_space

                        # Optimize lookup: the factories are not asked for a blank line
                        # or for a line indented by less than 4 columns that starts
                        # with a letter.
                        if is_blank or (indent < 4 and line_string.chars[next_non_space_index].is_letter) then
                                set_new_index next_non_space_index
                                break
                        end

                        var block_start = find_block_start(block_parser)
                        if block_start == null then
                                set_new_index next_non_space_index
                                break
                        end

                        # A new block starts: the unmatched blocks cannot continue
                        if not all_closed then
                                finalize_blocks(unmatched_block_parsers)
                                all_closed = true
                        end

                        # Consume what the new block recognized
                        if block_start.new_index != -1 then
                                set_new_index block_start.new_index
                        else if block_start.new_column != -1 then
                                set_new_column block_start.new_column
                        end

                        # The new block takes the place of the active one
                        if block_start.replace_active_block_parser then
                                remove_active_block_parser
                        end

                        # Only a container can hold further block starts
                        for new_block_parser in block_start.block_parsers do
                                add_child(new_block_parser)
                                block_parser = new_block_parser
                                try_block_starts = new_block_parser.block.is_container
                        end
                end

                # What remains at the offset is a text line.
                # Add the text to the appropriate block.

                # First check for a lazy paragraph continuation
                if not all_closed and not is_blank and active_block_parser isa MdParagraphParser then
                        add_line
                else
                        # Finalize any blocks not matched
                        if not all_closed then
                                finalize_blocks(unmatched_block_parsers)
                        end
                        propagate_last_line_blank(block_parser, last_matched_block_parser)

                        if not block_parser.block.is_container then
                                # The block accepts the line as is
                                add_line
                        else if not is_blank then
                                # Create a paragraph container for the line
                                add_child(new MdParagraphParser(line, column + 1, block_parser.content_offset))
                                add_line
                        end
                end
        end
 285
 286         # Find what kind of block starts at `index` in `input`
 287         private fun find_block_start(block_parser: MdBlockParser): nullable MdBlockStart do
 288                 for block_parser_factory in block_parser_factories do
 289                         var result = block_parser_factory.try_start(self, block_parser)
 290                         if result != null then return result
 291                 end
 292                 return null
 293         end
 294
 295         # Add a `block_parser` block's as child of the active block parser block
 296         private fun add_child(block_parser: MdBlockParser) do
 297                 # Finalize non-parentable blocks
 298                 while not active_block_parser.block.can_contain(block_parser.block) do
 299                         active_block_parser.finalize(self)
 300                 end
 301                 # Append block block parser block to its parent
 302                 active_block_parser.block.append_child(block_parser.block)
 303                 activate_block_parser(block_parser)
 304         end
 305
 306         # Add line content to the active block parser
 307         #
 308         # We assume it can accept lines.
 309         private fun add_line do
 310                 var content = null
 311                 if column_is_in_tab then
 312                         # Out column is in a partially consumed tab.
 313                         # Expand the remaining columns to the next tab stop to spaces.
 314                         var after_tab = index + 1
 315                         var rest = line_string.substring(after_tab, line_string.length - after_tab)
 316                         var spaces = column.columns_to_next_tab_stop
 317                         var buffer = new Buffer
 318                         for i in [0 .. spaces[ do
 319                                 buffer.add ' '
 320                         end
 321                         buffer.append(rest)
 322                         content = buffer.write_to_string
 323                 else
 324                         content = line_string.substring(index, line_string.length - index)
 325                 end
 326                 active_block_parser.add_line(content)
 327         end
 328
 329         # Finalize blocks of previous line
 330         private fun finalize_blocks(block_parsers: Sequence[MdBlockParser]) do
 331                 var i = block_parsers.length - 1
 332                 while i >= 0 do
 333                         var block_parser = block_parsers[i]
 334                         block_parser.finalize(self)
 335                         i -= 1
 336                 end
 337         end
 338
 339         # Advance the `index` position to the next character
 340         #
 341         # Also set the `column`.
 342         # If the next character is a tab, compute the new column accordingly.
 343         private fun advance do
 344                 var c = line_string.chars[index]
 345                 if c == '\t' then
 346                         index += 1
 347                         column += column.columns_to_next_tab_stop
 348                 else
 349                         index += 1
 350                         column += 1
 351                 end
 352         end
 353
 354         # Move `index` to the next non-space character index in the `input` string
 355         #
 356         # Also set `next_non_space_index`, `next_non_space_column`, `is_blank` and `indent`.
 357         private fun find_next_non_space do
 358                 var i = index
 359                 var cols = column
 360
 361                 is_blank = true
 362                 while i < line_string.length do
 363                         var c = line_string.chars[i]
 364                         if c == ' ' then
 365                                 i += 1
 366                                 cols += 1
 367                                 continue
 368                         else if c == '\t' then
 369                                 i += 1
 370                                 cols += 4 - (cols % 4)
 371                                 continue
 372                         end
 373                         is_blank = false
 374                         break
 375                 end
 376
 377                 next_non_space_index = i
 378                 next_non_space_column = cols
 379                 indent = next_non_space_column - column
 380         end
 381
 382         # Return the position of the next line break
 383         #
 384         # We consider `\r` and `\n`.
 385         private fun find_line_break(input: String, start_index: Int): Int do
 386                 for i in [start_index .. input.length[ do
 387                         var char = input.chars[i]
 388                         if char == '\r' or char == '\n' then return i
 389                 end
 390                 return -1
 391         end
 392
 393         # Set the parser `index` at `new_index`
 394         #
 395         # Also set `column` and `column_is_in_tab`.
 396         private fun set_new_index(new_index: Int) do
 397                 if new_index >= next_non_space_index then
 398                         # We can start from here, no need to calculate tab stops again
 399                         index = next_non_space_index
 400                         column = next_non_space_column
 401                 end
 402                 while index < new_index and index != line_string.length do
 403                         advance
 404                 end
 405                 # If we're going to an index as opposed to a column, we're never within a tab
 406                 column_is_in_tab = false
 407         end
 408
 409         # Set the parser `column` at `new_column`
 410         #
 411         # Also set `index` and `column_is_in_tab`.
 412         private fun set_new_column(new_column: Int) do
 413                 if new_column >= next_non_space_column then
 414                         # We can start from here, no need to calculate tab stops again
 415                         index = next_non_space_index
 416                         column = next_non_space_column
 417                 end
 418                 while column < new_column and index != line_string.length do
 419                         advance
 420                 end
 421                 if column > new_column then
 422                         # Last character was a tab and we overshot our target
 423                         index -= 1
 424                         column = new_column
 425                         column_is_in_tab = true
 426                 else
 427                         column_is_in_tab = false
 428                 end
 429         end
 430
 431         # Does `block` end with a blank line?
 432         private fun ends_with_blank_line(block: nullable MdNode): Bool do
 433                 while block != null do
 434                         if is_last_line_blank(block) then return true
 435                         if block isa MdListBlock or block isa MdListItem then
 436                                 block = block.last_child
 437                         else
 438                                 break
 439                         end
 440                 end
 441                 return false
 442         end
 443
 444         # Propagate a blank line to all block_parser blocl's parents
 445         private fun propagate_last_line_blank(block_parser: MdBlockParser, last_matched_block_parser: MdBlockParser) do
 446                 var last_child = block_parser.block.last_child
 447                 if is_blank and last_child != null then
 448                         last_line_blank[last_child] = true
 449                 end
 450                 var block = block_parser.block
 451
 452                 # Block quotes lines are never blank as they start with `>`.
 453                 # We don't count blanks in fenced code for purposes of thight/loose lists.
 454                 # We also don't set `last_line_blank` on an empty list item.
 455                 var last_line_blank = is_blank and
 456                         not (block isa MdBlockQuote or
 457                              block isa MdFencedCodeBlock or
 458                                  (block isa MdListItem and block.first_child == null and
 459                                                                                   block_parser != last_matched_block_parser))
 460
 461                 # Propagate `last_line_blank` up through parents
 462                 var node: nullable MdNode = block_parser.block
 463                 while node != null do
 464                         self.last_line_blank[node] = last_line_blank
 465                         node = node.parent
 466                 end
 467         end
 468
 469         # Is last line blank for `node`?
 470         private fun is_last_line_blank(node: MdNode): Bool do
 471                 if not last_line_blank.has_key(node) then return false
 472                 return last_line_blank[node]
 473         end
 474 end
 475
 476 # Block parsing
 477
 478 # Parser for a specific block node
 479 abstract class MdBlockParser
 480
 481         # Kind of block under construction
 482         type BLOCK: MdBlock
 483
 484         # MdBlock under construction
 485         fun block: BLOCK is abstract
 486
 487         # Line Start
 488         var line_start: Int
 489
 490         # Column start
 491         var column_start: Int
 492
 493         # Location at start
 494         #
 495         # The location end it initialized at `-1` and will be set later in the
 496         # `finalize` method.
 497         var location: MdLocation is lazy do return new MdLocation(line_start, column_start, -1, -1)
 498
 499         # Column where the content starts
 500         var content_offset: Int
 501
 502         # Initialize the current `block`
 503         fun initialize(parser: MdParser) do end
 504
 505         # Can `self` continue from the current `index` in `parser`?
 506         #
 507         # Return a new `MdBlockContinue` if `self` can continue parsing.
 508         # Return null otherwise.
 509         fun try_continue(state: MdParser): nullable MdBlockContinue is abstract
 510
 511         # Add `line` to the current `block`
 512         fun add_line(line: String) do end
 513
 514         # Finalize the current `block`
 515         #
 516         # Deactivate `self` from `parser` and call `close_block`.
 517         fun finalize(parser: MdParser) do
 518                 if parser.active_block_parser == self then
 519                         parser.deactivate_block_parser
 520                 end
 521         end
 522
 523         # Parse `block` lines
 524         fun parse_inlines(inline_parser: MdInlineParser) do end
 525 end
 526
 527 # Result object for continuing parsing of a block
 528 class MdBlockContinue
 529
 530         # Index from which continue parsing
 531         var new_index: Int
 532
 533         # Column from which continue parsing
 534         var new_column: Int
 535
 536         # Is the block finalized?
 537         var is_finalize: Bool
 538
 539         # Continue from index
 540         init at_index(new_index: Int) do
 541                 init(new_index, -1, false)
 542         end
 543
 544         # Continue from column
 545         init at_column(new_column: Int) do
 546                 init(-1, new_column, false)
 547         end
 548
 549         # Block is finished
 550         init finished do
 551                 init(-1, -1, true)
 552         end
 553 end
 554
 555 # Block parser factory for a block node for determining when a block starts
 556 abstract class MdBlockParserFactory
 557
 558         # Can the associated block parser can start at the current line in `parser`?
 559         #
 560         # Return a new `MdBlockStart` if the block parser can start.
 561         # Return null otherwise.
 562         fun try_start(parser: MdParser, matched_block_parser: MdBlockParser):
 563                 nullable MdBlockStart is abstract
 564 end
 565
 566 # Result object from starting parsing of a block
 567 class MdBlockStart
 568
 569         # Block parsers for this block start
 570         var block_parsers: Array[MdBlockParser]
 571
 572         # Index where the parsing should start
 573         var new_index = -1
 574
 575         # Column where the parsing should start
 576         var new_column = -1
 577
 578         # Does the block starting with `self` terminate a previous block?
 579         var replace_active_block_parser = false
 580
 581         # Start from `new_index`
 582         fun at_index(new_index: Int): MdBlockStart do
 583                 self.new_index = new_index
 584                 return self
 585         end
 586
 587         # Start from `new_column`
 588         fun at_column(new_column: Int): MdBlockStart do
 589                 self.new_column = new_column
 590                 return self
 591         end
 592
 593         # Start replacing the active block parser
 594         fun replacing_active_block_parser: MdBlockStart do
 595                 self.replace_active_block_parser = true
 596                 return self
 597         end
 598 end
 599
 600 # Parser for the whole document
 601 class MdDocumentBlockParser
 602         super MdBlockParser
 603
 604         redef type BLOCK: MdDocument
 605         redef var block = new MdDocument(location) is lazy
 606
 607         # Always continue at current indent
 608         redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
 609
 610         redef fun finalize(parser) do
 611         end
 612
 613         # redef fun finalize(state) do
 614         redef fun parse_inlines(inline_parser) do
 615                 var last_child = block.last_child
 616                 if last_child != null then
 617                         location.line_end = last_child.location.line_end
 618                         location.column_end = last_child.location.column_end
 619                 end
 620         end
 621 end
 622
 623 # Headings parser
 624 class MdHeadingParser
 625         super MdBlockParser
 626
 627         redef type BLOCK: MdHeading
 628
 629         redef var block = new MdHeading(location, level, is_setext, has_atx_trailing) is lazy
 630
 631         redef var location = new MdLocation(line_start, column_start, line_end, column_end) is lazy
 632
 633         # Line end
 634         var line_end: Int
 635
 636         # Column end
 637         var column_end: Int
 638
 639         # Heading level
 640         var level: Int
 641
 642         # Heading content
 643         var content: String
 644
 645         # Heading has ATX trailing
 646         var has_atx_trailing: Bool
 647
 648         # Heading is setext format
 649         var is_setext: Bool
 650
 651         # Never continue parsing as an heading is a one liner
 652         redef fun try_continue(state) do return null
 653
 654         # Parse the heading content
 655         redef fun parse_inlines(inline_parser) do
 656                 inline_parser.parse(content, content_offset, block)
 657         end
 658 end
 659
 660 # Heading parser factory
 661 class MdHeadingParserFactory
 662         super MdBlockParserFactory
 663
 664         redef fun try_start(state, matched_block_parser) do
 665                 if state.indent >= 4 then return null
 666
 667                 var next_non_space = state.next_non_space_index
 668                 var line = state.line_string
 669                 var paragraph = null
 670                 if matched_block_parser isa MdParagraphParser then
 671                         paragraph = matched_block_parser.content
 672                 end
 673
 674                 var line_content = line.substring(next_non_space, line.length - next_non_space)
 675                 var match = line_content.search(re_atx_heading)
 676                 if match != null then
 677                         # ATX heading
 678                         var new_offset = next_non_space + match.subs.first.as(not null).length
 679                         var level = match.subs.first.as(not null).to_s.trim.length
 680                         # remove trailing ###s
 681                         var after_leading = line.substring(new_offset, line.length - new_offset)
 682                         var trailing = after_leading.search(re_atx_trailing)
 683                         var has_trailing = trailing != null
 684                         var trailing_length = if trailing != null then trailing.length else 0
 685                         var content = after_leading.replace(re_atx_trailing, "")
 686                         return (new MdBlockStart(
 687                                 [new MdHeadingParser(
 688                                         state.line,
 689                                         next_non_space + 1,
 690                                         new_offset + 1,
 691                                         state.line,
 692                                         new_offset + content.length + trailing_length,
 693                                         level,
 694                                         content,
 695                                         has_trailing, false)])
 696                                 ).at_index(line.length)
 697                 end
 698
 699                 if paragraph ==  null then return null
 700
 701                 match = line_content.search(re_setext_heading)
 702                 if match == null then return null
 703                 var level = 2
 704                 if match.subs.first.as(not null).to_s.chars.first == '=' then level = 1
 705                 var content = paragraph.to_s
 706                 return (new MdBlockStart(
 707                         [new MdHeadingParser(
 708                                 state.line - 1,
 709                                 next_non_space + 1,
 710                                 0,
 711                                 state.line,
 712                                 state.column + match.length,
 713                                 level,
 714                                 content,
 715                                 false, true)])
 716                         ).at_index(line.length).replacing_active_block_parser
 717         end
 718 end
 719
 720 # Parser for blockquotes, the blocks whose lines are introduced by a `>` marker
 721 class MdBlockQuoteParser
 722         super MdBlockParser
 723
 724         redef type BLOCK: MdBlockQuote
 725         redef var block = new MdBlockQuote(location) is lazy
 726
 727         # Continue the quote only if the line holds a `>` marker indented by less than 4
 728         redef fun try_continue(state) do
 729                 if state.indent >= 4 then return null
 730
 731                 var text = state.line_string
 732                 var marker_index = state.next_non_space_index
 733                 if marker_index >= text.length then return null
 734                 if text.chars[marker_index] != '>' then return null
 735
 736                 # the content starts after the marker and the optional space or tab following it
 737                 var content_column = state.column + state.indent + 1
 738                 if text.is_space_or_tab(marker_index + 1) then content_column += 1
 739                 return new MdBlockContinue.at_column(content_column)
 740         end
 741
 742         # Stretch the location of the quote up to the end of its last child, if any
 743         redef fun parse_inlines(inline_parser) do
 744                 var last = block.last_child
 745                 if last == null then return
 746
 747                 var child_location = last.location
 748                 location.line_end = child_location.line_end
 749                 location.column_end = child_location.column_end
 750         end
 751 end
 752
 753 # Factory of `MdBlockQuoteParser`
 754 class MdBlockQuoteParserFactory
 755         super MdBlockParserFactory
 756
 757         # Start a quote if the line holds a `>` marker indented by less than 4
 758         redef fun try_start(state, matched_block_parser) do
 759                 if state.indent >= 4 then return null
 760
 761                 var text = state.line_string
 762                 var marker_index = state.next_non_space_index
 763                 if marker_index >= text.length then return null
 764                 if text.chars[marker_index] != '>' then return null
 765
 766                 # the content starts after the marker and the optional space or tab following it
 767                 var content_column = state.column + state.indent + 1
 768                 if text.is_space_or_tab(marker_index + 1) then content_column += 1
 769
 770                 # the new parser takes over the line from `content_column`
 771                 var quote_parser = new MdBlockQuoteParser(
 772                         state.line,
 773                         state.column + 1,
 774                         content_column)
 775                 var block_start = new MdBlockStart([quote_parser])
 776                 return block_start.at_column(content_column)
 777         end
 778 end
 779
 780 # Parser for indented code blocks
 781 class MdIndentedCodeBlockParser
 782         super MdBlockParser
 783
 784         redef type BLOCK: MdIndentedCodeBlock
 785         redef var block = new MdIndentedCodeBlock(location, use_tabs) is lazy
 786
 787         # Is the block indented with tabs?
 788         var use_tabs: Bool
 789
 790         # Raw lines of the block, joined with `\n`
 791         var content = new Buffer
 792
 793         # Continue on lines indented by at least 4, and on blank lines
 794         redef fun try_continue(state) do
 795                 if state.indent >= 4 then return new MdBlockContinue.at_column(state.column + 4)
 796                 if state.is_blank then return new MdBlockContinue.at_index(state.next_non_space_index)
 797                 return null
 798         end
 799
 800         # Append `line` to `content`, separated from the previous line by a `\n`
 801         redef fun add_line(line) do
 802                 if not content.is_empty then content.add('\n')
 803                 content.append(line)
 804         end
 805
 806         # Set the literal and the end of the location of the block
 807         redef fun finalize(parser) do
 808                 super
 809
 810                 # add a whitespace-only line, then replace the trailing blank lines by one `\n`
 811                 add_line(" ")
 812                 var literal = content.to_s.replace_first(re_trailing_blank_lines, "\n")
 813                 block.literal = literal
 814
 815                 # the last line of code is taken from the element before last of the split
 816                 var lines = literal.split("\n")
 817                 var last_index = lines.length - 2
 818                 location.line_end = location.line_start + last_index
 819                 location.column_end = content_offset + lines[last_index].length + 4
 820         end
 821 end
 822
 823 # Factory of `MdIndentedCodeBlockParser`
 824 class MdIndentedCodeBlockParserFactory
 825         super MdBlockParserFactory
 826
 827         # Start a code block on a non-blank line indented by at least 4, except inside a paragraph
 828         redef fun try_start(state, matched_block_parser) do
 829                 if state.indent < 4 or state.is_blank then return null
 830
 831                 # an indented line can not interrupt a paragraph
 832                 var active_block = state.active_block_parser.block
 833                 if active_block isa MdParagraph then return null
 834
 835                 var tab_indented = state.line_string.has_prefix("\t")
 836                 var code_parser = new MdIndentedCodeBlockParser(
 837                         state.line, state.column + 1, state.column, tab_indented)
 838                 var block_start = new MdBlockStart([code_parser])
 839                 return block_start.at_column(state.column + 4)
 840         end
 841 end
 842
 843 # Parser for fenced code blocks
 844 class MdFencedCodeBlockParser
 845         super MdBlockParser
 846
 847         redef type BLOCK: MdFencedCodeBlock
 848         redef var block = new MdFencedCodeBlock(location, fence_char, fence_length, fence_indent) is lazy
 849
 850         # Character the opening fence is made of
 851         var fence_char: Char
 852
 853         # Number of characters in the opening fence
 854         var fence_length: Int
 855
 856         # Indentation of the opening fence
 857         var fence_indent: Int
 858
 859         # First line given to `add_line`, used as info string
 860         var first_line: nullable String = null
 861
 862         # Lines given to `add_line` after the first one, each followed by a `\n`
 863         var other_lines = new Buffer
 864
 865         # Finish on a closing fence, else continue after at most `fence_indent` leading spaces
 866         redef fun try_continue(state) do
 867                 var text = state.line_string
 868                 var fence_start = state.next_non_space_index
 869
 870                 if state.indent <= 3 and fence_start < text.length and text.chars[fence_start] == fence_char then
 871                         var rest = text.substring(fence_start, text.length - fence_start)
 872                         var closing = rest.search(re_closing_fence)
 873                         # a closing fence is at least as long as the opening one
 874                         if closing != null and closing.subs[0].as(not null).length >= fence_length then
 875                                 # closing fence - we're at end of line, so we can finalize now
 876                                 return new MdBlockContinue.finished
 877                         end
 878                 end
 879
 880                 # skip optional spaces of fence indent
 881                 var content_index = state.index
 882                 var limit = content_index + fence_indent
 883                 while content_index < limit and content_index < text.length and text.chars[content_index] == ' ' do
 884                         content_index += 1
 885                 end
 886                 return new MdBlockContinue.at_index(content_index)
 887         end
 888
 889         # Keep the first line apart, accumulate the next ones in `other_lines`
 890         redef fun add_line(line) do
 891                 if first_line == null then
 892                         first_line = line
 893                         return
 894                 end
 895                 other_lines.append(line)
 896                 other_lines.add('\n')
 897         end
 898
 899         # Set the info string, the literal and the end of the location of the block
 900         redef fun finalize(parser) do
 901                 super
 902
 903                 # the first line becomes the info string, unless it is empty once trimmed
 904                 var opening = first_line
 905                 if opening != null then
 906                         var info = opening.trim.unescape_string
 907                         if not info.is_empty then block.info = info
 908                 end
 909
 910                 var literal = other_lines.to_s
 911                 block.literal = literal
 912
 913                 var line_count = literal.split("\n").length
 914                 location.line_end = location.line_start + line_count
 915                 location.column_end = content_offset + fence_indent + fence_length
 916         end
 917 end
 918
 919 # Fenced code blocks parser factory
 920 class MdFencedCodeBlockParserFactory
 921         super MdBlockQuoteParserFactory
 922
 923         # Start a fenced code block if the line opens with a fence indented by less than 4
 924         redef fun try_start(state, matched_block_parser) do
 925                 if state.indent >= 4 then return null
 926
 927                 var next_non_space = state.next_non_space_index
 928                 var line = state.line_string
 929                 var match = line.substring(next_non_space, line.length - next_non_space).search(re_opening_fence)
 930                 if match == null then return null
 931
 932                 # the fence is captured by the first group (backticks) or by the third one (tildes)
 933                 var fence_length
 934                 var fence_char
 935                 var backticks = match.subs[0]
 936                 if backticks != null then
 937                         fence_length = backticks.length
 938                         fence_char = backticks.to_s.chars.first
 939                 else
 940                         var tildes = match.subs[2].as(not null)
 941                         fence_length = tildes.length
 942                         fence_char = tildes.to_s.chars.first
 943                 end
 944
 945                 # a fence is rejected only when its own character appears again after it:
 946                 # a tilde in the info string of a backtick fence must not prevent the block
 947                 var opening = match.to_s
 948                 if fence_char == '`' and opening.has("[^`]+`".to_re) then return null
 949                 if fence_char == '~' and opening.has("[^~]+~".to_re) then return null
 950
 951                 return (new MdBlockStart(
 952                         [new MdFencedCodeBlockParser(
 953                                 state.line, state.column + 1, state.column,
 954                                 fence_char, fence_length, state.indent)]
 955                         )).at_index(next_non_space + fence_length)
 956         end
 957 end
 958
 959 # Parser for list blocks, the containers of list items
 960 class MdListBlockParser
 961         super MdBlockParser
 962
 963         redef type BLOCK: MdListBlock
 964
 965         # Ordered or unordered list, according to `is_ordered`
 966         redef var block is lazy do
 967                 if not is_ordered then return new MdUnorderedList(location, bullet.as(not null))
 968                 return new MdOrderedList(location, digit.as(not null), delim.as(not null))
 969         end
 970
 971         # Is this list ordered?
 972         var is_ordered: Bool
 973
 974         # Bullet character, needed if the list is unordered
 975         var bullet: nullable Char
 976
 977         # Start number, needed if the list is ordered
 978         var digit: nullable Int
 979
 980         # Delimiter following the number, needed if the list is ordered
 981         var delim: nullable Char
 982
 983         # A list always accepts the next line, at its current index
 984         redef fun try_continue(state) do return new MdBlockContinue.at_index(state.index)
 985
 986         # Mark the list as not tight when a blank line separates two of its parts
 987         redef fun finalize(parser) do
 988                 super
 989                 var item = block.first_child
 990                 while item != null do
 991                         var next_item = item.next
 992                         # check for non-final list item ending with blank line
 993                         if parser.ends_with_blank_line(item) and next_item != null then
 994                                 block.is_tight = false
 995                                 break
 996                         end
 997                         # look for a blank line after a child of the item, when something follows it
 998                         var child = item.first_child
 999                         while child != null do
1000                                 var next_child = child.next
1001                                 if parser.ends_with_blank_line(child) and (next_item != null or next_child != null) then
1002                                         block.is_tight = false
1003                                         break
1004                                 end
1005                                 child = next_child
1006                         end
1007                         item = next_item
1008                 end
1009         end
1010
1011         # Stretch the location of the list up to the end of its last item, if any
1012         redef fun parse_inlines(inline_parser) do
1013                 var last = block.last_child
1014                 if last == null then return
1015                 location.line_end = last.location.line_end
1016                 location.column_end = last.location.column_end
1017         end
1018 end
1019
1020 # Factory of the parsers for list blocks and list items
1021 class MdListBlockParserFactory
1022         super MdBlockQuoteParserFactory
1023         # Start a list item, within a new list block unless it matches the list of `matched_block_parser`
1024         redef fun try_start(state, matched_block_parser) do
1025                 if state.indent >= 4 and not matched_block_parser isa MdListBlockParser then return null
1026
1027                 var marker_index = state.next_non_space_index
1028                 var marker_column = state.column + state.indent
1029                 # a marker found while a paragraph has content is accepted under stricter conditions
1030                 var in_paragraph = matched_block_parser isa MdParagraphParser and matched_block_parser.content != null
1031                 var list_data = parse_list_marker(state, state.line_string, marker_index, marker_column, in_paragraph)
1032                 if list_data == null then return null
1033
1034                 # the item parser continues the line at the content column found after the marker
1035                 var new_column = list_data.content_column
1036                 var list_item_parser = new MdListItemParser(
1037                         state.line,
1038                         state.column + 1,
1039                         new_column,
1040                         new_column - state.column)
1041
1042                 # prepend a new list block unless the marker matches the list already matched
1043                 if not matched_block_parser isa MdListBlockParser or not lists_match(matched_block_parser.block, list_data) then
1044                         var list_block_parser = new MdListBlockParser(state.line, state.column + 1, new_column - state.column, list_data.is_ordered, list_data.bullet, list_data.digit, list_data.delim)
1045                         list_block_parser.block.is_tight = true
1046
1047                         return (new MdBlockStart([list_block_parser, list_item_parser: MdBlockParser])).at_column(new_column)
1048                 end
1049                 return (new MdBlockStart([list_item_parser])).at_column(new_column)
1050         end
1051         # Parse the list marker at `marker_index` in `line`, return null if no marker is accepted
1052         private fun parse_list_marker(state: MdParser, line: String, marker_index, marker_column: Int, in_paragraph: Bool): nullable MdListData do
1053                 var rest = line.substring(marker_index, line.length - marker_index)
1054                 var match = rest.search(re_list_marker)
1055                 if match == null then return null
1056                 # an unordered marker gives a bullet, an ordered one gives a number and a delimiter
1057                 var is_ordered
1058                 var bullet = null
1059                 var digit = null
1060                 var delim = null
1061
1062                 var bullet_match = match.subs[0]
1063                 if bullet_match != null then
1064                         is_ordered = false
1065                         bullet = bullet_match.to_s.chars[0]
1066                 else
1067                         is_ordered = true
1068                         digit = match.subs[2].as(not null).to_s.to_i
1069                         delim = match.subs[3].as(not null).to_s.chars[0]
1070                 end
1071                 # length of the marker, without the space or tab matched after it
1072                 var marker_length = match.length
1073                 if match.to_s.has_suffix(" ") or match.to_s.has_suffix("\t") then
1074                         marker_length -= 1
1075                 end
1076                 var index_after_marker = marker_index + marker_length
1077
1078                 # marker doesn't include tabs, so counting them as column directly is ok
1079                 var column_after_marker = marker_column + marker_length
1080                 # the column within the line where the content starts
1081                 var content_column = column_after_marker
1082
1083                 # see at which column the content starts if there is content
1084                 var has_content = false
1085                 for i in [index_after_marker .. line.length[ do
1086                         var c = line.chars[i]
1087                         if c == '\t' then
1088                                 content_column += content_column.columns_to_next_tab_stop
1089                         else if c == ' ' then
1090                                 content_column += 1
1091                         else
1092                                 has_content = true
1093                                 break
1094                         end
1095                 end
1096
1097                 if in_paragraph then
1098                         # if the list item is ordered, then start number must be 1 to interrupt a paragraph
1099                         if is_ordered and digit != 1 then
1100                                 return null
1101                         end
1102                         # empty list item can not interrupt a paragraph
1103                         if not has_content then
1104                                 return null
1105                         end
1106                 end
1107
1108                 if not has_content or (content_column - column_after_marker) > 4 then
1109                         # if this line is blank or has a code block, default to 1 space after marker
1110                         content_column = column_after_marker + 1
1111                 end
1112                 return new MdListData(is_ordered, bullet, digit, delim, content_column)
1113         end
1114
1115         # Return true if the list `a` and the marker data `b` are of the same type
1116         #
1117         # Unordered lists are compared by bullet character, ordered lists by delimiter.
1118         # This is used in agglomerating list items into lists
1119         private fun lists_match(a: MdListBlock, b: MdListData): Bool do
1120                 if a isa MdUnorderedList and not b.is_ordered then
1121                         return a.bullet_marker == b.bullet
1122                 else if a isa MdOrderedList and b.is_ordered then
1123                         return a.delimiter == b.delim
1124                 end
1125                 return false
1126         end
1127 end
1128
1129 # Data parsed from a list marker
1130 private class MdListData
1131         # Is the marker the one of an ordered list?
1132         var is_ordered: Bool
1133         # Bullet character of the marker, null if the marker is ordered
1134         var bullet: nullable Char
1135         # Number of the marker, null if the marker is unordered
1136         var digit: nullable Int
1137         # Delimiter following the number, null if the marker is unordered
1138         var delim: nullable Char
1139
1140         # Column the content starts at
1141         var content_column: Int
1142 end
1143
1144 # Parser for list items
1145 class MdListItemParser
1146         super MdBlockParser
1147
1148         redef type BLOCK: MdListItem
1149         redef var block = new MdListItem(location) is lazy
1150
1151         # Indentation required from a line to continue the content of the item
1152         var content_indent: Int
1153
1154         # Continue on blank lines unless the item is empty, and on lines indented enough
1155         redef fun try_continue(state) do
1156                 if not state.is_blank then
1157                         if state.indent < content_indent then return null
1158                         return new MdBlockContinue.at_column(state.column + content_indent)
1159                 end
1160
1161                 # blank line after empty list item
1162                 if block.first_child == null then return null
1163                 return new MdBlockContinue.at_index(state.next_non_space_index)
1164         end
1165
1166         # Stretch the location of the item up to the end of its last child, if any
1167         redef fun parse_inlines(inline_parser) do
1168                 var last = block.last_child
1169                 if last == null then return
1170
1171                 var child_location = last.location
1172                 location.line_end = child_location.line_end
1173                 location.column_end = child_location.column_end
1174         end
1175 end
1176
1177 # Parser for thematic breaks
1178 class MdThematicBreakParser
1179         super MdBlockParser
1180
1181         redef type BLOCK: MdThematicBreak
1182         redef var block = new MdThematicBreak(location, pattern) is lazy
1183
1184         # Text of the thematic break
1185         var pattern: String
1186
1187         redef fun try_continue(state) do return null
1188
1189         # End the location on the starting line, at the column of the last character of `pattern`
1190         redef fun finalize(parser) do
1191                 super
1192                 location.column_end = column_start + pattern.length - 1
1193                 location.line_end = line_start
1194         end
1195 end
1196
1197 # Factory of `MdThematicBreakParser`
1198 class MdThematicBreakParserFactory
1199         super MdBlockQuoteParserFactory
1200
1201         # Start a thematic break if the line, indented by less than 4, matches `re_thematic_break`
1202         redef fun try_start(state, matched_block_parser) do
1203                 if state.indent >= 4 then return null
1204
1205                 # look for the break after the indentation
1206                 var text = state.line_string
1207                 var start = state.next_non_space_index
1208                 var rest = text.substring(start, text.length - start)
1209                 var found = rest.search(re_thematic_break)
1210                 if found == null then return null
1211
1212                 # the break takes the whole line
1213                 var break_parser = new MdThematicBreakParser(
1214                         state.line, state.column + 1, start, found.to_s)
1215                 var block_start = new MdBlockStart([break_parser])
1216                 return block_start.at_index(text.length)
1217         end
1218 end
1219
1220 # Parser for paragraphs
1221 class MdParagraphParser
1222         super MdBlockParser
1223
1224         redef type BLOCK: MdParagraph
1225
1226         redef var block = new MdParagraph(location) is lazy
1227
1228         # Text of the paragraph, null once `finalize` found nothing but reference definitions
1229         var content: nullable Buffer = new Buffer
1230
1231         # A paragraph goes on until a blank line
1232         redef fun try_continue(state) do
1233                 if not state.is_blank then return new MdBlockContinue.at_index(state.index)
1234                 return null
1235         end
1236
1237         # Append `line` to `content`, separated from the previous line by a `\n`
1238         redef fun add_line(line) do
1239                 var buffer = content
1240                 if buffer == null then return
1241                 if not buffer.is_empty then buffer.add('\n')
1242                 buffer.append(line)
1243         end
1244
1245         # Strip the leading link reference definitions, unlink the paragraph if nothing else remains
1246         redef fun finalize(parser) do
1247                 super
1248                 var inline_parser = parser.inline_parser
1249                 var buffer = content
1250                 if buffer == null then return
1251
1252                 var text = buffer.to_s
1253                 var has_definitions = false
1254                 var consumed = inline_parser.parse_reference(text)
1255                 # try parsing the beginning as link reference definitions
1256                 loop
1257                         if text.length <= 3 or text.chars[0] != '[' or consumed == 0 then break
1258                         text = text.substring(consumed, text.length - consumed)
1259                         has_definitions = true
1260                         consumed = inline_parser.parse_reference(text)
1261                 end
1262
1263                 if not has_definitions or not text.is_blank then
1264                         content = new Buffer.from_text(text)
1265                 else
1266                         block.unlink
1267                         content = null
1268                 end
1269         end
1270
1271         # Parse the inlines of `content`, then end the location with the one of the last child
1272         redef fun parse_inlines(inline_parser) do
1273                 var buffer = content
1274                 if buffer == null then return
1275                 inline_parser.parse(buffer.to_s, content_offset, block)
1276
1277                 var last = block.last_child
1278                 if last == null then return
1279                 location.line_end = last.location.line_end
1280                 location.column_end = last.location.column_end
1281         end
1282 end
1283
1284 # Parser for HTML blocks
1285 class MdHtmlBlockParser
1286         super MdBlockParser
1287
1288         redef type BLOCK: MdHtmlBlock
1289         redef var block = new MdHtmlBlock(location) is lazy
1290
1291         # Pattern that closes the block
1292         #
1293         # Or null if the block is ended by a blank line instead.
1294         var closing_pattern: nullable Pattern
1295
1296         # Was `closing_pattern` found in a line of the block?
1297         var finished = false
1298
1299         # Lines of the block, joined with `\n`
1300         var content = new Buffer
1301
1302         # Stop after the closing line, or on a blank line if there is no closing pattern
1303         redef fun try_continue(state) do
1304                 # blank line ends type 6 and 7 blocks
1305                 if finished or (state.is_blank and closing_pattern == null) then return null
1306                 return new MdBlockContinue.at_index(state.index)
1307         end
1308
1309         # Append `line` to `content` and check if it closes the block
1310         redef fun add_line(line) do
1311                 if not content.is_empty then
1312                         content.add('\n')
1313                 end
1314                 content.append(line)
1315
1316                 var closer = closing_pattern
1317                 if closer == null then return
1318                 if line.has(closer) then finished = true
1319         end
1320
1321         # Set the literal and the end of the location of the block
1322         redef fun finalize(parser) do
1323                 super
1324
1325                 var literal = content.to_s
1326                 block.literal = literal
1327
1328                 var lines = literal.split("\n")
1329                 location.line_end = location.line_start + lines.length - 1
1330                 location.column_end = lines.last.length
1331         end
1332 end
1333
1334 # Html blocks parser factory
1335 class MdHtmlBlockParserFactory
1336         super MdBlockParserFactory
1337
1338         # Start an HTML block if the line, indented by less than 4, matches an opening pattern
1339         redef fun try_start(state, matched_block_parser) do
1340                 if state.indent >= 4 then return null
1341
1342                 # a line without content has no character to read at `next_non_space`
1343                 var next_non_space = state.next_non_space_index
1344                 var line = state.line_string
1345                 if next_non_space >= line.length then return null
1346                 if line.chars[next_non_space] != '<' then return null
1347
1348                 # the tested text is the same for every kind of block
1349                 var rest = line.substring(next_non_space, line.length - next_non_space)
1350                 for block_type in [0..6] do
1351                         # type 7 can not interrupt a paragraph
1352                         if block_type == 6 and matched_block_parser.block isa MdParagraph then continue
1353                         var patterns = re_html_blocks[block_type]
1354                         if not rest.has(patterns.first.as(not null)) then continue
1355                         var html_parser = new MdHtmlBlockParser(
1356                                 state.line, state.column + 1, next_non_space, patterns.last)
1357                         return (new MdBlockStart([html_parser])).at_index(state.index)
1358                 end
1359                 return null
1360         end
1361 end
1362
1363 # Post Processing
1364
1365 # Markdown post processor
1366 #
1367 # A visitor of the Markdown AST, called after the parsing done by a `MdParser`.
1368 abstract class MdPostProcessor
1369         super MdVisitor
1370
1371         # Document being processed, set only during a call to `post_process`
1372         var document: nullable MdDocument = null
1373
1374         # Visit `document`, which is kept in `self.document` for the duration of the visit
1375         fun post_process(parser: MdParser, document: MdDocument) do
1376                 self.document = document
1377                 enter_visit(document)
1378                 self.document = null
1379         end
1380
1381         # Dispatch the visit of `node` to `MdNode::post_process`
1382         redef fun visit(node) do
1383                 node.post_process(self)
1384         end
1385 end
1386
1387 redef class MdNode
1388
1389         # Accept the visit of the post processor `v`, by default with `visit_all(v)`
1390         fun post_process(v: MdPostProcessor) do visit_all(v)
1391 end
1392
1393 # Utils
1394
1395 redef class Sys
1396         # Opening of an ATX heading: 1 to 6 `#`, followed by spaces, tabs or the end of the line
1397         private var re_atx_heading: Regex = "^(#\{1,6\})([ \t]+|$)".to_re
1398
1399         # Closing sequence of an ATX heading: a run of `#` ending the line, spaces and tabs aside
1400         private var re_atx_trailing: Regex = "(^|[ \t]+)#+[ \t]*$".to_re
1401
1402         # Underline of a setext heading: a run of `=` or of `-`, then optional spaces or tabs
1403         private var re_setext_heading: Regex = "^(=+|-+)[ \t]*$".to_re
1404
1405         # Trailing blank lines: newlines ending the text, each optionally followed by spaces or tabs
1406         var re_trailing_blank_lines: Regex = "(\n[ \t]*)+$".to_re
1407
1408         # Opening code fence: at least 3 backticks or 3 tildes, then the rest of the line
1409         var re_opening_fence: Regex = "^(`\{3,\})(.*)|^(~\{3,\})(.*)".to_re
1410
1411         # Closing code fence: at least 3 backticks or 3 tildes, optionally followed by spaces
1412         var re_closing_fence: Regex = "^(`\{3,\}|~\{3,\})( *$)".to_re
1413
1414         # List marker: a bullet, or 1 to 9 digits and a `.` or `)`; then a space, a tab or the end
1415         var re_list_marker: Regex = "^([*+-])( |\t|$)|^([0-9]\{1,9\})([.)])( |\t|$)".to_re
1416
1417         # Thematic break: at least 3 `*`, 3 `_` or 3 `-`, that spaces or tabs may separate
1418         var re_thematic_break: Regex = "^((\\*[ \t]*)\{3,\}|(_[ \t]*)\{3,\}|(-[ \t]*)\{3,\})[ \t]*$".to_re
1419
1420         # HTML blocks patterns, as pairs of an opening pattern and a nullable closing pattern
1421         var re_html_blocks: Array[Array[nullable Regex]] do
1422                 var blocks = new Array[Array[nullable Regex]]
1423                 # entry 0: `script`, `pre` and `style` tags, closed by one of their end tags
1424                 var re0_opening = "^<(script|pre|style)(\\s|>|$)".to_re
1425                 re0_opening.ignore_case = true
1426                 var re0_closing = "</(script|pre|style)>".to_re
1427                 re0_closing.ignore_case = true
1428                 blocks.add([re0_opening, re0_closing])
1429                 # entry 1: from `<!--` to `-->`
1430                 blocks.add([
1431                         "^<!--".to_re,
1432                         "-->".to_re
1433                 ])
1434                 # entry 2: from `<?` to `?>`
1435                 blocks.add([
1436                         "^<[?]".to_re,
1437                         "\\?>".to_re
1438                 ])
1439                 # entry 3: from `<!` followed by an uppercase letter to `>`
1440                 blocks.add([
1441                         "^<![A-Z]".to_re,
1442                         ">".to_re
1443                 ])
1444                 # entry 4: from `<![CDATA[` to `]]>`
1445                 blocks.add([
1446                         "^<!\\[CDATA\\[".to_re,
1447                         "\\]\\]>".to_re
1448                 ])
1449                 # entry 5: opening or closing tag of the listed names, without closing pattern
1450                 var re5_opening = "^</?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(\\s|[/]?[>]|$)".to_re
1451                 re5_opening.ignore_case = true
1452                 blocks.add([re5_opening, null])
1453                 # entry 6: a complete opening or closing tag alone on its line, without closing pattern
1454                 var p_tagname = "[A-Za-z][A-Za-z0-9-]*"
1455                 var p_attribute_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
1456                 var p_uquoted_value = "[^\"'=<>`\\x00-\\x20]+"
1457                 var p_squoted_value = "'[^']*'"
1458                 var p_dquoted_value = "\"[^\"]*\""
1459                 var p_attribute_value = "({p_uquoted_value}|{p_squoted_value}|{p_dquoted_value})"
1460                 var p_attribute_value_spec = "(\\s*=\\s*{p_attribute_value})"
1461                 var p_attribute = "(\\s{p_attribute_name}{p_attribute_value_spec}?)"
1462                 var p_opentag = "<{p_tagname}{p_attribute}*\\s*/?>"
1463                 var p_closetag = "</{p_tagname}\\s*[>]"
1464                 var re6_opening = "^({p_opentag}|{p_closetag})\\s*$".to_re
1465                 re6_opening.ignore_case = true
1466                 blocks.add([re6_opening, null])
1467
1468                 return blocks
1469         end
1470 end
1471
1472 redef class Int
1473
1474         # Columns from `self` to the next tab stop, tab stops being every 4 columns
1475         private fun columns_to_next_tab_stop: Int do return 4 - (self % 4)
1476 end
1477
1478 redef class String
1479
1480         # Does this string hold nothing but blank characters?
1481         #
1482         # Blank characters are spaces, tabs, line feeds and carriage returns.
1483         # The empty string is blank.
1484         private fun is_blank: Bool do
1485                 for c in chars do
1486                         var blank = c == ' ' or c == '\t' or c == '\n' or c == '\r'
1487                         if not blank then return false
1488                 end
1489                 return true
1490         end
1491
1492         # Is the character at `index` a space or a tab?
1493         #
1494         # Return false if `index >= self.length`.
1495         private fun is_space_or_tab(index: Int): Bool do
1496                 if index < length then
1497                         var c = chars[index]
1498                         if c == ' ' then return true
1499                         return c == '\t'
1500                 end
1501                 return false
1502         end
1503 end