nitcc: lexer do not crash on empty input stream
[nit.git] / lib / nitcc_runtime.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Runtime library required by parsers and lexers generated by nitcc
16 module nitcc_runtime
17
# An abstract LR parser engine generated by nitcc.
#
# Generated subclasses provide the concrete `start_state` and LR tables.
abstract class Parser
	# The list of tokens to parse.
	# FIXME: provide something better, like a lexer?
	var tokens = new List[NToken]

	# Look at the next token without consuming it.
	# Used by generated parsers
	fun peek_token: NToken do return tokens.first

	# Consume and return the next token.
	# Used by generated parsers
	fun get_token: NToken do return tokens.shift

	# Consume the next token and shift to the state `dest`.
	# The token is pushed on `node_stack` and the previous state on `state_stack`.
	# Used by generated parsers
	fun shift(dest: LRState)
	do
		var t = get_token
		#print "shift {t} -> {dest}"
		node_stack.push t
		state_stack.push state
		state = dest
	end

	# After a reduction on `goto` go to the next state
	# Used by generated parsers
	fun goto(goto: LRGoto)
	do
		#print "reduce from {state} -> {prod}"
		state.goto(self, goto)
	end

	# Push a new state on the stack of states.
	# Used by generated parsers
	fun push(dest: LRState)
	do
		#print "push prod {prod} -> {dest}"
		state_stack.push state
		state = dest
	end

	# Pop and return the last node.
	# Also pop (and discard) the associated state.
	# Used by generated parsers
	fun pop: Node
	do
		var res = node_stack.pop
		state = state_stack.pop
		return res
	end

	# Produce a parse error and stop the parsing.
	# The error node replaces the whole `node_stack`; the partially built
	# trees are kept aside in `error.error_tree`.
	# Used by generated parsers
	fun parse_error
	do
		var token = peek_token
		#print "* parse error in state {state} on token {token}"
		#print " expected: {state.error_msg}"
		#print " node_stack={node_stack.join(", ")}"
		#print " state_stack={state_stack.join(", ")}"
		var error: NError
		if token isa NLexerError then
			# A lexer error is already an error token: reuse it as-is.
			error = token
		else
			error = new NParserError
			error.position = token.position
			error.text = token.text
			error.token = token
		end
		error.error_tree.items.add_all(node_stack)
		error.expected = state.error_msg
		node_stack.clear
		node_stack.add error
		stop = true
	end

	# The starting state for parsing
	# Used by generated parsers
	protected fun start_state: LRState is abstract

	# The current state
	# Used by generated parsers
	var state: LRState

	init
	do
		state = start_state
	end

	# The stack of nodes
	# Used by generated parsers
	var node_stack = new Array[Node]

	# The stack of states
	# Used by generated parsers
	var state_stack = new Array[LRState]

	# Should the parser stop?
	# Used by generated parsers
	var stop writable = true

	# Parse a full sequence of tokens and return a complete syntactic tree.
	# On error, the returned root is a `NError` (see `parse_error`).
	fun parse: Node
	do
		state = start_state
		state_stack.clear
		node_stack.clear
		stop = false
		while not stop do
			#print "* current state {state}"
			#print " tokens={tokens.join(" ")}"
			#print " node_stack={node_stack.join(" ")}"
			#print " state_stack={state_stack.join(" ")}"
			state.action(self)
		end
		#print "* over"
		return node_stack.first
	end
end
138
# A state in a parser LR automaton generated by nitcc
# Used by generated parsers
abstract class LRState
	# Perform the next parsing action (shift, reduce, accept or error) on `parser`.
	fun action(parser: Parser) is abstract

	# Select the state to go to after a reduction on `goto`.
	fun goto(parser: Parser, goto: LRGoto) is abstract

	# Human-readable description of what is acceptable in this state.
	# FIXME: the default implementation is a placeholder.
	fun error_msg: String do return "FIXME"
end
146
# A concrete production in a parser LR automaton generated by nitcc
# Used by generated parsers
abstract class LRGoto
end
151
152 ###
153
# An abstract lexer engine generated by nitcc
abstract class Lexer
	# The input stream of characters
	var stream: String

	# The starting state for lexing
	# Used by generated parsers
	protected fun start_state: DFAState is abstract

	# Lexize a stream of characters and return a sequence of tokens
	#
	# Runs the DFA with a longest-match strategy: the last accepting state
	# seen (`last_state`) wins, then lexing restarts just after the accepted
	# text. The result ends with a `NEof` token, or a `NLexerError` token
	# when no token can be recognized.
	fun lex: List[NToken]
	do
		var res = new List[NToken]
		var state = start_state
		var pos = 0
		var pos_start = 0
		var pos_end = 0
		var line = 1
		var line_start = 1
		var line_end = 0
		var col = 1
		var col_start = 1
		var col_end = 0
		# Last accepting state reached since the last restart (null if none).
		var last_state: nullable DFAState = null
		var text = stream
		var length = text.length
		loop
			if state.is_accept then
				# Remember the longest match seen so far.
				pos_end = pos - 1
				line_end = line
				col_end = col
				last_state = state
			end
			var c
			if pos >= length then
				# Virtual end-of-stream character: forces a dead end so the
				# loop terminates, even on an empty input stream.
				c = '\0'
			else
				c = text[pos]
			end
			var next = state.trans(c)
			if next == null then
				# Dead end: emit the longest match (or an error), then restart.
				if pos_start < length then
					if last_state == null then
						# No accepting state was ever reached: lexer error token.
						var token = new NLexerError
						var position = new Position(pos_start, pos, line_start, line, col_start, col)
						token.position = position
						token.text = text.substring(pos_start, pos-pos_start+1)
						res.add token
						break
					end
					var position = new Position(pos_start, pos_end, line_start, line_end, col_start, col_end)
					var token = last_state.make_token(position, text.substring(pos_start, pos_end-pos_start+1))
					# A null token means "ignore" (decided by the generated DFA state).
					if token != null then res.add(token)
				end
				if pos >= length then
					# End of stream: terminate with an explicit NEof token.
					var token = new NEof
					var position = new Position(pos, pos, line, line, col, col)
					token.position = position
					token.text = ""
					res.add token
					break
				end
				# Restart the DFA right after the accepted text.
				# NOTE(review): line/col restart from the *end* position of the
				# accepted token (col_start = col_end, not col_end + 1) — start
				# columns may be off by one; confirm against generated lexers.
				state = start_state
				pos_start = pos_end + 1
				pos = pos_start
				line_start = line_end
				line = line_start
				col_start = col_end
				col = col_start

				last_state = null
				continue
			end
			state = next
			pos += 1
			col += 1
			if c == '\n' then
				line += 1
				col = 1
			end
		end
		return res
	end
end
238
# A state in a lexer automaton generated by nitcc
# Used by generated lexers
interface DFAState
	# Is `self` an accepting state (a token may end here)?
	fun is_accept: Bool do return false

	# The transition from `self` on the character `c`.
	# Return null if there is no transition (dead end).
	fun trans(c: Char): nullable DFAState do return null

	# Build the token recognized as `text` at `position`.
	# Return null if the token is to be ignored by the lexer.
	fun make_token(position: Position, text: String): nullable NToken is abstract
end
246
247 ###
248
# An abstract visitor on syntactic trees generated by nitcc
abstract class Visitor
	# The main entry point to visit a node `n`
	# Should not be redefined
	fun enter_visit(n: Node)
	do
		visit(n)
	end

	# The specific implementation of a visit
	#
	# Should be redefined by concrete visitors
	#
	# Should not be called directly (use `enter_visit` instead)
	#
	# By default, the visitor just recursively visits the children of `n`
	protected fun visit(n: Node)
	do
		n.visit_children(self)
	end
end
270
# Print a node (using `to_s`) on a line and recursively each children indented
class TreePrinterVisitor
	super Visitor

	# The output stream where the tree is written.
	var writer: OStream

	# Current depth in the tree; drives the indentation of each line.
	private var indent = 0

	init(writer: OStream) do self.writer = writer

	redef fun visit(n)
	do
		# Indent, print the node on its own line, then visit the children
		# one level deeper.
		for i in [0..indent[ do writer.write(" ")
		writer.write(n.to_s)
		writer.write("\n")
		indent += 1
		super
		indent -= 1
	end
end
287
# A position into a input stream
# Used to give position to tokens
class Position
	# Offset of the first character in the stream.
	var pos_start: Int
	# Offset of the last character in the stream.
	var pos_end: Int
	# Line of the first character (first line is 1).
	var line_start: Int
	# Line of the last character.
	var line_end: Int
	# Column of the first character (first column is 1).
	var col_start: Int
	# Column of the last character.
	var col_end: Int

	redef fun to_s do return "{line_start}:{col_start}-{line_end}:{col_end}"
end
299
# A node of a syntactic tree
abstract class Node
	# The name of the node (as used in the grammar file)
	fun node_name: String do return class_name

	# A point of view on the direct childrens of the node
	fun children: SequenceRead[nullable Node] is abstract

	# Visit all the non-null children of the node with the visitor `v`
	protected fun visit_children(v: Visitor)
	do
		for c in children do
			if c == null then continue
			v.enter_visit(c)
		end
	end

	# The position of the node in the input stream
	var position: nullable Position writable = null

	# Produce a graphviz file for the syntactic tree rooted at `self`.
	fun to_dot(filepath: String)
	do
		var f = new OFStream.open(filepath)
		f.write("digraph g \{\n")
		f.write("rankdir=BT;\n")

		# Collect every token of the tree to align them on the same rank.
		var leaves = new Array[NToken]
		to_dot_visitor(f, leaves)

		f.write("\{ rank=same\n")
		var is_first = true
		for leaf in leaves do
			if not is_first then f.write("->")
			is_first = false
			f.write("n{leaf.object_id}")
		end
		f.write("[style=invis];\n")
		f.write("\}\n")

		f.write("\}\n")
		f.close
	end

	# Write the dot node for `self` and the edges to its children on `f`.
	# Tokens are accumulated in `a` so `to_dot` can rank them together.
	private fun to_dot_visitor(f: OStream, a: Array[NToken])
	do
		f.write("n{object_id} [label=\"{node_name}\"];\n")
		for child in children do
			if child == null then continue
			f.write("n{child.object_id} -> n{object_id};\n")
			child.to_dot_visitor(f, a)
		end
	end

	redef fun to_s do
		# Append the position, when known.
		var pos = position
		if pos != null then return "{node_name}@({pos})"
		return "{node_name}"
	end
end
363
# A token produced by the lexer and used in a syntactic tree
abstract class NToken
	super Node

	redef fun children do return once new Array[Node]

	redef fun to_dot_visitor(f, a)
	do
		# Build the label: node name, then position, then the text —
		# unless the name is already the quoted text (keyword-like tokens).
		var lbl = "{node_name}"
		var pos = position
		if pos != null then lbl += "\\n{pos}"
		var text = self.text
		if "'{text}'" != node_name then
			lbl += "\\n'{text.escape_to_c}'"
		end
		f.write("n{object_id} [label=\"{lbl}\",shape=box];\n")
		a.add(self)
	end

	# The text associated with the token
	var text: String writable = ""

	redef fun to_s do
		var res = super
		# Show the text only when it adds information over the node name.
		var text = self.text
		if "'{text}'" != node_name then
			res += "='{text.escape_to_c}'"
		end
		return res
	end
end
395
# The special token for the end of stream.
# Produced by `Lexer::lex` when the end of the stream is reached.
class NEof
	super NToken
end
400
# A special token used to represent a parser or lexer error
abstract class NError
	super NToken

	# All the partially built tree during parsing (aka the node_stack)
	var error_tree = new Nodes[Node]

	# The things unexpected
	fun unexpected: String is abstract

	# The things expected (if any)
	var expected: nullable String = null

	# The error message, built from `unexpected` and (when set) `expected`.
	fun message: String
	do
		var res = "Unexpected {unexpected}"
		var exp = expected
		if exp != null then res += "; is acceptable instead: {exp}"
		return res
	end
end
423
# A lexer error as a token for the unexpected character
class NLexerError
	super NError

	# NOTE(review): assumes `text` is not empty; `Lexer::lex` fills it with
	# at least the offending character before emitting this token.
	redef fun unexpected do return "character '{text.first}'"
end
430
# A parser error linked to a unexpected token
class NParserError
	super NError

	# The unexpected token (set by `Parser::parse_error`).
	var token: nullable NToken

	# NOTE(review): `token` is dereferenced without a null check; it appears
	# to always be set by `Parser::parse_error` before use — confirm.
	redef fun unexpected do return token.node_name
end
439
# A homogeneous sequence of nodes, used to represent unbounded lists (and the + modifier)
class Nodes[T: Node]
	super Node
	redef fun children do return items

	# The actual items of the sequence.
	var items = new Array[T]
end
446
# A production with a specific, named and statically typed childrens
class NProd
	super Node

	# A lazy view on the children, backed by `child` and `number_of_children`.
	redef var children: SequenceRead[nullable Node] = new NProdChildren(self)

	# The exact number of direct children
	# Used to implement `children` by generated parsers
	fun number_of_children: Int is abstract

	# The specific children got by its index
	# Used to implement `children` by generated parsers
	fun child(x: Int): nullable Node is abstract
end
460
461
# Adapter exposing the children of a `NProd` as a `SequenceRead`.
# Delegates everything to `NProd::child` and `NProd::number_of_children`.
private class NProdChildren
	super SequenceRead[nullable Node]

	# The production whose children are exposed.
	var prod: NProd

	redef fun iterator do return new NProdIterator(prod)
	redef fun length do return prod.number_of_children
	redef fun is_empty do return prod.number_of_children == 0
	redef fun [](i) do return prod.child(i)
end
470
# Iterator over the children of a `NProd`.
private class NProdIterator
	super IndexedIterator[nullable Node]

	# The production being iterated.
	var prod: NProd

	redef var index = 0
	redef fun is_ok do return index < prod.number_of_children
	redef fun next do index += 1
	redef fun item do return prod.child(index)
end
479
# All-in-one abstract class to test generated parsers on a given input
abstract class TestParser
	# How to get a new lexer on a given stream of character
	fun new_lexer(text: String): Lexer is abstract

	# How to get a new parser
	fun new_parser: Parser is abstract

	# The name of the language (used for generated files)
	fun name: String is abstract

	# Use the class as the main entry point of the program
	# - parse arguments and options of the command
	# - test the parser (see `work`)
	fun main: Node
	do
		if args.is_empty then
			print "usage {name}_test <filepath> | - | -e <text>"
			exit 0
		end

		var arg = args.shift
		var text
		if arg == "-" then
			# Read the input from the standard input.
			text = stdin.read_all
		else if arg == "-e" then
			# The input is the next command-line argument itself.
			if args.is_empty then
				print "Error: -e need a text"
				exit 1
			end
			text = args.shift
		else
			# The argument is a file path: read the whole file.
			var f = new IFStream.open(arg)
			text = f.read_all
			f.close
		end

		if not args.is_empty then
			print "Error: superfluous arguments."
			exit 1
		end

		return work(text)
	end

	# Produce a full syntactic tree for a given stream of character
	# Produce also statistics and output files
	fun work(text: String): Node
	do
		print "INPUT: {text.length} chars"
		var lexer = new_lexer(text)
		var tokens = lexer.lex

		# Dump the tokens in a dedicated file.
		var tokout = "{name}.tokens.out"
		print "TOKEN: {tokens.length} tokens (see {tokout})"

		var f = new OFStream.open(tokout)
		for t in tokens do f.write("{t.to_s}\n")
		f.close

		var parser = new_parser
		parser.tokens.add_all(tokens)

		var n = parser.parse

		# Dump the tree (text and dot forms) in dedicated files.
		var astout = "{name}.ast.out"
		f = new OFStream.open(astout)
		var tpv = new TreePrinterVisitor(f)
		var astdotout = "{name}.ast.dot"
		if n isa NError then
			print "Syntax error: {n.message}"
			print "ERROR: {n} (see {astout} and {astdotout})"
			tpv.enter_visit(n)
			n = n.error_tree
		else
			print "ROOT: {n} (see {astout} and {astdotout})"
		end
		tpv.enter_visit(n)
		n.to_dot(astdotout)
		f.close

		return n
	end
end