Merge: Nitcc custom lexer
[nit.git] / lib / nitcc_runtime.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Runtime library required by parsers and lexers generated by nitcc
16 module nitcc_runtime
17
# An abstract LR parser engine generated by nitcc
#
# Generated subclasses provide the automaton (`start_state`); this class
# implements the generic shift/reduce machinery driven by `LRState::action`.
abstract class Parser
	# The list of tokens to process, consumed from the front
	# FIXME: provide something better, like a lexer?
	var tokens = new CircularArray[NToken]

	# Look at the next token without consuming it
	# Used by generated parsers
	fun peek_token: NToken do return tokens.first

	# Consume and return the next token
	# Used by generated parsers
	fun get_token: NToken do return tokens.shift

	# Consume the next token and shift to the state `dest`.
	# The token is pushed on `node_stack` and the previous state on `state_stack`.
	# Used by generated parsers
	fun shift(dest: LRState)
	do
		var t = get_token
		#print "shift {t} -> {dest}"
		node_stack.push t
		state_stack.push state
		state = dest
	end

	# After a reduction on `goto`, let the current state select the next state
	# Used by generated parsers
	fun goto(goto: LRGoto)
	do
		#print "reduce from {state} -> {prod}"
		state.goto(self, goto)
	end

	# Push a new state on the stack of states
	# Used by generated parsers
	fun push(dest: LRState)
	do
		#print "push prod {prod} -> {dest}"
		state_stack.push state
		state = dest
	end

	# Pop and return the last node
	# Also pop (and discard) the associated state
	# Used by generated parsers
	fun pop: Node
	do
		var res = node_stack.pop
		state = state_stack.pop
		return res
	end

	# Produce a parse error and stop the parsing
	#
	# The whole `node_stack` (including the offending token) is moved into
	# the `error_tree` of a `NError` node — the token itself when it is
	# already a `NLexerError`, a fresh `NParserError` otherwise.
	# That error node becomes the only node on the stack, so `parse`
	# returns it as its result.
	# Used by generated parsers
	fun parse_error
	do
		var token = peek_token
		#print "* parse error in state {state} on token {token}"
		#print "  expected: {state.error_msg}"
		#print "  node_stack={node_stack.join(", ")}"
		#print "  state_stack={state_stack.join(", ")}"
		node_stack.push(token)
		var error: NError
		if token isa NLexerError then
			error = token
		else
			error = new NParserError
			error.position = token.position
			error.text = token.text
			error.token = token
		end
		error.error_tree.children.add_all(node_stack)
		error.expected = state.error_msg
		node_stack.clear
		node_stack.add error
		stop = true
	end

	# The starting state for parsing
	# Used by generated parsers
	protected fun start_state: LRState is abstract

	# The current state
	# Used by generated parsers
	var state: LRState is noinit

	init
	do
		state = start_state
	end

	# The stack of nodes
	# Used by generated parsers
	var node_stack = new Array[Node]

	# The stack of states
	# Used by generated parsers
	var state_stack = new Array[LRState]

	# Should the parser stop?
	# Set by `parse_error`; reset at the beginning of `parse`.
	# Used by generated parsers
	var stop = true is writable

	# Parse a full sequence of tokens and return a complete syntactic tree
	#
	# On error, the returned node is a `NError` (see `parse_error`).
	fun parse: Node
	do
		state = start_state
		state_stack.clear
		node_stack.clear
		stop = false
		while not stop do
			#print "* current state {state}"
			#print "  tokens={tokens.join(" ")}"
			#print "  node_stack={node_stack.join(" ")}"
			#print "  state_stack={state_stack.join(" ")}"
			state.action(self)
		end
		#print "* over"
		return node_stack.first
	end
end
139
# A state in a parser LR automaton generated by nitcc
# Used by generated parsers
abstract class LRState
	# Perform the action (shift, reduce, accept or error) of this state on `parser`
	fun action(parser: Parser) is abstract
	# After a reduction on `goto`, move `parser` to its next state
	fun goto(parser: Parser, goto: LRGoto) is abstract
	# Human-readable description of what is expected in this state
	fun error_msg: String do return "FIXME"
end
147
# A concrete production in a parser LR automaton generated by nitcc
# Used by generated parsers as the argument of `Parser::goto`
abstract class LRGoto
end
152
153 ###
154
# An abstract lexer engine generated by nitcc
#
# Generated subclasses provide the DFA (`start_state`); this class
# implements the generic longest-match token scanning.
abstract class Lexer
	# The input stream of characters
	var stream: String

	# The starting state for lexing
	# Used by generated lexers
	protected fun start_state: DFAState is abstract

	# Lex the whole stream of characters and return the sequence of tokens
	#
	# Ignored tokens are skipped.
	# Lexing stops after the first `NEof` or `NError` token,
	# which is included in the result.
	fun lex: CircularArray[NToken]
	do
		var res = new CircularArray[NToken]
		loop
			var t = next_token
			if t != null then res.add t
			if t isa NEof or t isa NError then break
		end
		return res
	end

	# Cursor current position (in chars, starting from 0)
	var pos_start = 0

	# Cursor current line (starting from 1)
	var line_start = 1

	# Cursor current column (in chars, starting from 1)
	var col_start = 1

	# Move the cursor and return the next token.
	#
	# Runs the DFA from `start_state`, remembering the last accepting
	# state seen so the longest match is produced.
	# Returns a `NEof` at the end of the stream.
	# Returns a `NLexerError` when no accepting state was reached.
	# Returns `null` if the token is ignored.
	fun next_token: nullable NToken
	do
		var state = start_state
		var pos = pos_start
		var pos_end = pos_start - 1
		var line = line_start
		var line_end = line_start - 1
		var col = col_start
		var col_end = col_start - 1
		var last_state: nullable DFAState = null
		var text = stream
		var length = text.length
		loop
			# Record the last accepting state to implement longest-match.
			if state.is_accept then
				pos_end = pos - 1
				line_end = line
				col_end = col
				last_state = state
			end
			var c
			var next
			if pos >= length then
				c = '\0'
				next = null
			else
				c = text.chars[pos]
				next = state.trans(c)
			end
			if next == null then
				# No transition: emit whatever was recognized so far.
				var token
				if pos_start < length then
					if last_state == null then
						# No accepting state was ever reached: lexer error.
						# The offending characters (at least one) become the error text.
						token = new NLexerError
						var position = new Position(pos_start, pos, line_start, line, col_start, col)
						token.position = position
						token.text = text.substring(pos_start, pos-pos_start+1)
					else if not last_state.is_ignored then
						var position = new Position(pos_start, pos_end, line_start, line_end, col_start, col_end)
						token = last_state.make_token(position, text)
					else
						# Ignored token (e.g. blanks or comments): skip it.
						token = null
					end
				else
					# The whole stream is consumed: end of file.
					token = new NEof
					var position = new Position(pos, pos, line, line, col, col)
					token.position = position
					token.text = ""
				end
				# Advance the cursor past the recognized token.
				# NOTE(review): on the error and EOF paths `pos_end`/`line_end`/`col_end`
				# may still hold their initial (stale) values; lexing stops on those
				# tokens anyway, but confirm before reusing the lexer afterwards.
				pos_start = pos_end + 1
				line_start = line_end
				col_start = col_end

				return token
			end
			state = next
			pos += 1
			col += 1
			if c == '\n' then
				line += 1
				col = 1
			end
		end
	end
end
253
# A state in a lexer DFA automaton generated by nitcc
# Used by generated lexers
interface DFAState
	# Is `self` an accepting state?
	fun is_accept: Bool do return false
	# The state reached on character `c`, or `null` if there is no transition
	fun trans(c: Char): nullable DFAState do return null
	# Build the token recognized by this accepting state;
	# `position` locates the token in the full `source` text
	fun make_token(position: Position, source: String): nullable NToken is abstract
	# Should tokens accepted by this state be discarded by the lexer?
	fun is_ignored: Bool do return false
end
262
263 ###
264
# An abstract visitor on syntactic trees generated by nitcc
abstract class Visitor
	# Public entry point: dispatch the visit of the node `n`.
	#
	# Clients call this; it should not be redefined.
	fun enter_visit(n: Node) do visit(n)

	# The specific implementation of a visit.
	#
	# Concrete visitors redefine this to implement their behavior.
	# It should not be called directly (use `enter_visit` instead).
	#
	# By default, the children of `n` are recursively visited.
	protected fun visit(n: Node) do n.visit_children(self)
end
286
# Print a node (using `to_s`) on a line and recursively each child, indented
class TreePrinterVisitor
	super Visitor
	# The stream where the tree is written to
	var writer: Writer
	# The current indentation level (number of nesting levels, not characters)
	private var indent = 0
	redef fun visit(n)
	do
		# NOTE(review): the indentation literal below may have lost a character
		# during extraction — confirm against upstream whether it is one or two spaces.
		for i in [0..indent[ do writer.write(" ")
		writer.write(n.to_s)
		writer.write("\n")
		indent += 1
		super
		indent -= 1
	end
end
302
# A position into an input stream
# Used to give a position to tokens
class Position
	# Offset of the first character (in chars, starting from 0)
	var pos_start: Int
	# Offset of the last character (inclusive, see `extract`)
	var pos_end: Int
	# Line of the first character (starting from 1)
	var line_start: Int
	# Line of the last character
	var line_end: Int
	# Column of the first character (in chars, starting from 1)
	var col_start: Int
	# End column
	# NOTE(review): the lexer and `underline` treat this as one-past-the-last
	# column (e.g. cols 5..8 underline 3 characters) — confirm.
	var col_end: Int

	redef fun to_s do return "{line_start}:{col_start}-{line_end}:{col_end}"

	# Extract the content covered by `self` from the given source
	fun extract(source: String): String
	do
		return source.substring(pos_start, pos_end-pos_start+1)
	end

	# Get the lines covered by `self` and underline the target columns.
	#
	# This is useful to pretty print errors or to debug the output
	#
	# ~~~
	# var src = "var Foo = new Array[Int]"
	# var pos = new Position(0,0, 1, 1, 5, 8)
	#
	# assert pos.underline(src) == """
	# var Foo = new Array[Int]
	#     ^^^"""
	# ~~~
	fun underline(source: Text): String
	do
		var res = new FlatBuffer

		# All the concerned lines
		var lines = source.split("\n")
		for line in [line_start..line_end] do
			res.append lines[line-1]
			res.append "\n"
		end

		# Cover all columns, no matter their lines
		# (the locals shadow the attributes of the same names)
		var col_start = col_start.min(col_end)
		var col_end = self.col_start.max(col_end)

		# "    ^^^"
		var ptr = " "*(col_start-1).max(0) + "^"*(col_end-col_start)
		res.append ptr

		return res.to_s
	end
end
355
# A node of a syntactic tree
abstract class Node
	# The name of the node (as used in the grammar file)
	fun node_name: String do return class_name

	# A point of view on the direct children of the node
	fun children: SequenceRead[nullable Node] is abstract

	# A point of view of a depth-first visit of all non-null children
	var depth: Collection[Node] = new DephCollection(self) is lazy

	# Visit all the non-null children of the node with the visitor `v`
	protected fun visit_children(v: Visitor)
	do
		for c in children do if c != null then v.enter_visit(c)
	end

	# The position of the node in the input stream
	var position: nullable Position = null is writable

	# Produce a graphviz file for the syntactic tree rooted at `self`.
	fun to_dot(filepath: String)
	do
		var f = new FileWriter.open(filepath)
		f.write("digraph g \{\n")
		f.write("rankdir=BT;\n")

		var a = new Array[NToken]
		to_dot_visitor(f, a)

		# Chain all the collected tokens with invisible edges on a same rank
		# so they are laid out in their source order.
		f.write("\{ rank=same\n")
		var first = true
		for n in a do
			if first then
				first = false
			else
				f.write("->")
			end
			f.write("n{n.object_id}")
		end
		f.write("[style=invis];\n")
		f.write("\}\n")

		f.write("\}\n")
		f.close
	end

	# Write the dot declaration of `self` and its children into `f`,
	# collecting the visited tokens into `a` (see `to_dot`).
	private fun to_dot_visitor(f: Writer, a: Array[NToken])
	do
		f.write("n{object_id} [label=\"{node_name}\"];\n")
		for x in children do
			if x == null then continue
			f.write("n{x.object_id} -> n{object_id};\n")
			x.to_dot_visitor(f,a )
		end
	end

	redef fun to_s do
		var pos = position
		if pos == null then
			return "{node_name}"
		else
			return "{node_name}@({pos})"
		end
	end
end
422
# Depth-first collection of the non-null nodes of a subtree (see `Node::depth`)
# (sic: `Deph`)
private class DephCollection
	super Collection[Node]
	# The root of the subtree to traverse
	var node: Node
	redef fun iterator do return new DephIterator([node].iterator)
end
428
# Depth-first iterator used by `DephCollection`
#
# Maintains an explicit stack of child iterators instead of recursing.
private class DephIterator
	super Iterator[Node]

	# Iterators of the ancestors of the current item, innermost last
	var stack = new Array[Iterator[nullable Node]]

	init(i: Iterator[nullable Node]) is old_style_init do
		stack.push i
	end

	redef fun is_ok do return not stack.is_empty
	redef fun item do return stack.last.item.as(not null)
	redef fun next
	do
		var i = stack.last
		# First descend into the children of the current item.
		stack.push i.item.children.iterator
		i.next
		# Then pop exhausted iterators and skip null children until the next
		# node is found (or the stack is empty and the iteration is over).
		while is_ok do
			if not stack.last.is_ok then
				stack.pop
				continue
			end
			if stack.last.item == null then
				stack.last.next
				continue
			end
			return
		end
	end
end
458
# A token produced by the lexer and used in a syntactic tree
abstract class NToken
	super Node

	# Tokens are leaves: a shared (`once`) empty list of children
	redef fun children do return once new Array[Node]

	redef fun to_dot_visitor(f, a)
	do
		var labe = "{node_name}"
		var pos = position
		if pos != null then labe += "\\n{pos}"
		var text = self.text
		# Do not repeat the text of anonymous literal tokens like `'='`
		if node_name != "'{text}'" then
			labe += "\\n'{text.escape_to_c}'"
		end
		f.write("n{object_id} [label=\"{labe}\",shape=box];\n")
		a.add(self)
	end

	# The text associated with the token
	var text: String = "" is writable

	redef fun to_s do
		var res = super
		var text = self.text
		# Do not repeat the text of anonymous literal tokens like `'='`
		if node_name != "'{text}'" then
			res += "='{text.escape_to_c}'"
		end
		return res
	end
end
490
# The special token for the end of stream
# Produced by `Lexer::next_token` when the whole stream is consumed
class NEof
	super NToken
end
495
# A special token used to represent a parser or lexer error
abstract class NError
	super NToken

	# All the partially built trees during parsing (aka the `node_stack`)
	var error_tree = new Nodes[Node]

	# The unexpected thing, as a human-readable string
	fun unexpected: String is abstract

	# The things expected instead (if any), as a human-readable string
	var expected: nullable String = null

	# The error message, built from `expected` and `unexpected`
	fun message: String
	do
		var exp = expected
		var res = "Unexpected {unexpected}"
		if exp != null then res += "; is acceptable instead: {exp}"
		return res
	end
end
518
# A lexer error as a token for the unexpected character
class NLexerError
	super NError

	# Describe the first offending character
	# (the lexer always stores at least one character in `text` on error)
	redef fun unexpected do return "character '{text.chars.first}'"
end
525
# A parser error linked to an unexpected token
class NParserError
	super NError

	# The unexpected token, if any
	var token: nullable NToken = null

	redef fun unexpected
	do
		# Guard against a missing token: `token` defaults to `null`,
		# so dereferencing it directly would crash when no token was attached.
		var token = self.token
		if token == null then return "end of text"
		var res = token.node_name
		var text = token.text
		# Do not repeat the text of anonymous literal tokens like `'='`
		if not text.is_empty and res != "'{text}'" then
			res += " '{text.escape_to_c}'"
		end
		return res
	end
end
543
# A homogeneous sequence of nodes, used to represent unbounded lists (and the `+` modifier)
class Nodes[T: Node]
	super Node
	# The children, all of the statically known type `T`
	redef var children: Array[T] = new Array[T]
end
549
# A production with specific, named and statically typed children
abstract class NProd
	super Node
	# A read-only view on the children, backed by `number_of_children` and `child`
	redef var children: SequenceRead[nullable Node] = new NProdChildren(self)

	# The exact number of direct children
	# Used to implement `children` by generated parsers
	fun number_of_children: Int is abstract

	# The specific child got by its index
	# Used to implement `children` by generated parsers
	fun child(x: Int): nullable Node is abstract
end
563
564
# Adapter exposing the `child`/`number_of_children` API of a `NProd` as a `SequenceRead`
private class NProdChildren
	super SequenceRead[nullable Node]
	# The production whose children are exposed
	var prod: NProd
	redef fun iterator do return new NProdIterator(prod)
	redef fun length do return prod.number_of_children
	redef fun is_empty do return prod.number_of_children == 0
	redef fun [](i) do return prod.child(i)
end
573
# Iterator over the children of a `NProd` (see `NProdChildren`)
private class NProdIterator
	super IndexedIterator[nullable Node]
	# The production being iterated
	var prod: NProd
	redef var index = 0
	redef fun is_ok do return index < prod.number_of_children
	redef fun next do index += 1
	redef fun item do return prod.child(index)
end
582
# All-in-one abstract class to test generated parsers on a given input
abstract class TestParser
	# How to get a new lexer on a given stream of characters
	fun new_lexer(text: String): Lexer is abstract

	# How to get a new parser
	fun new_parser: Parser is abstract

	# The name of the language (used to name the generated files)
	fun name: String is abstract

	# Use the class as the main entry point of the program
	# - parse arguments and options of the command
	# - test the parser (see `work`)
	fun main: Node
	do
		if args.is_empty then
			print "usage {name}_test <filepath> | - | -e <text>"
			exit 0
		end

		var filepath = args.shift
		var text
		if filepath == "-" then
			# Read the input from the standard input
			text = sys.stdin.read_all
		else if filepath == "-e" then
			# The input is the next command-line argument
			if args.is_empty then
				print "Error: -e need a text"
				exit 1
			end
			text = args.shift
		else
			# The input is the content of the given file
			var f = new FileReader.open(filepath)
			text = f.read_all
			f.close
		end

		if not args.is_empty then
			print "Error: superfluous arguments."
			exit 1
		end

		return work(text)
	end

	# Produce a full syntactic tree for a given stream of characters
	# Also produce statistics and output files
	# (`{name}.tokens.out`, `{name}.ast.out` and `{name}.ast.dot`)
	fun work(text: String): Node
	do
		print "INPUT: {text.length} chars"
		var l = new_lexer(text)
		var tokens = l.lex

		var tokout = "{name}.tokens.out"
		print "TOKEN: {tokens.length} tokens (see {tokout})"

		# Dump each token on its own line
		var f = new FileWriter.open(tokout)
		for t in tokens do
			f.write "{t.to_s}\n"
		end
		f.close

		var p = new_parser
		p.tokens.add_all(tokens)

		var n = p.parse

		var astout = "{name}.ast.out"
		f = new FileWriter.open(astout)
		var tpv = new TreePrinterVisitor(f)
		var astdotout = "{name}.ast.dot"
		if n isa NError then
			print "Syntax error: {n.message}"
			print "ERROR: {n} (see {astout} and {astdotout})"
			tpv.enter_visit(n)
			# Continue with the partial trees gathered in the error node
			n = n.error_tree
		else
			print "ROOT: {n}; {n.depth.length} nodes (see {astout} and {astdotout})"
		end
		tpv.enter_visit(n)
		n.to_dot(astdotout)
		f.close

		return n
	end
end