nitcc: move up `text` of token earlier to improve generated code
[nit.git] / contrib / nitcc / src / autom.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Finite automaton (NFA & DFA)
16 module autom
17
18 # For the class Token
19 import grammar
20
21 # A finite automaton
22 class Automaton
23 # The start state
24 var start: State is noinit
25
26 # State that are accept states
27 var accept = new Array[State]
28
29 # All states
30 var states = new Array[State]
31
32 # Tokens associated on accept states.
33 # Use `add_tag` to update
34 var tags = new HashMap[State, Set[Token]]
35
36 # Accept states associated on tokens.
37 # Use `add_tag` to update
38 var retrotags = new HashMap[Token, Set[State]]
39
40 # Tag all accept states
41 fun tag_accept(t: Token)
42 do
43 for s in accept do add_tag(s, t)
44 end
45
46 # Add a token to a state
47 fun add_tag(s: State, t: Token)
48 do
49 if not tags.has_key(s) then
50 var set = new ArraySet[Token]
51 tags[s] = set
52 set.add t
53 else
54 tags[s].add t
55 end
56
57 if not retrotags.has_key(t) then
58 var set = new ArraySet[State]
59 retrotags[t] = set
60 set.add s
61 else
62 retrotags[t].add s
63 end
64
65 assert tags[s].has(t)
66 assert retrotags[t].has(s)
67 end
68
69 # Remove all occurrences of a tag in an automaton
70 fun clear_tag(t: Token)
71 do
72 if not retrotags.has_key(t) then return
73 for s in retrotags[t] do
74 if not tags.has_key(s) then continue
75 tags[s].remove(t)
76 if tags[s].is_empty then tags.keys.remove(s)
77 end
78 retrotags.keys.remove(t)
79 end
80
81 # Remove tokens from conflicting state according the inclusion of language.
82 # REQUIRE: self isa DFA automaton
83 fun solve_token_inclusion
84 do
85 for s, ts in tags do
86 if ts.length <= 1 then continue
87 var losers = new Array[Token]
88 for t1 in ts do
89 for t2 in ts do
90 if t1 == t2 then continue
91 if retrotags[t1].length > retrotags[t2].length and retrotags[t1].has_all(retrotags[t2]) then
92 losers.add(t1)
93 break
94 end
95 end
96 end
97 for t in losers do
98 ts.remove(t)
99 retrotags[t].remove s
100 end
101 end
102 end
103
104 # Initialize a new automaton for the empty language.
105 # One state, no accept, no transition.
106 init empty
107 do
108 var state = new State
109 start = state
110 states.add state
111 end
112
113 # Initialize a new automaton for the empty-string language.
114 # One state, is accept, no transition.
115 init epsilon
116 do
117 var state = new State
118 start = state
119 accept.add state
120 states.add state
121 end
122
123 # Initialize a new automation for the language that accepts only a single symbol.
124 # Two state, the second is accept, one transition on `symbol`.
125 init atom(symbol: Int)
126 do
127 var s = new State
128 var a = new State
129 var sym = new TSymbol(symbol, symbol)
130 s.add_trans(a, sym)
131 start = s
132 accept.add a
133 states.add s
134 states.add a
135 end
136
137 # Initialize a new automation for the language that accepts only a range of symbols
138 # Two state, the second is accept, one transition for `from` to `to`
139 init cla(first: Int, last: nullable Int)
140 do
141 var s = new State
142 var a = new State
143 var sym = new TSymbol(first, last)
144 s.add_trans(a, sym)
145 start = s
146 accept.add a
147 states.add s
148 states.add a
149 end
150
151 # Concatenate `other` to `self`.
152 # Other is modified and invalidated.
153 fun concat(other: Automaton)
154 do
155 var s2 = other.start
156 for a1 in accept do
157 a1.add_trans(s2, null)
158 end
159 accept = other.accept
160 states.add_all other.states
161 end
162
163 # `self` become the alternation of `self` and `other`.
164 # `other` is modified and invalidated.
165 fun alternate(other: Automaton)
166 do
167 var s = new State
168 var a = new State
169 s.add_trans(start, null)
170 for a1 in accept do
171 a1.add_trans(a, null)
172 end
173 s.add_trans(other.start, null)
174 for a2 in other.accept do
175 a2.add_trans(a, null)
176 accept.add(a2)
177 end
178
179 start = s
180 accept = [a]
181
182 states.add s
183 states.add a
184 states.add_all other.states
185 end
186
187 # Return a new automaton that recognize `self` but not `other`.
188 # For a theoretical POV, this is the subtraction of languages.
189 # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap.
190 fun except(other: Automaton): Automaton
191 do
192 var ta = new Token("1")
193 self.tag_accept(ta)
194 var tb = new Token("2")
195 other.tag_accept(tb)
196
197 var c = new Automaton.empty
198 c.absorb(self)
199 c.absorb(other)
200 c = c.to_dfa
201 c.accept.clear
202 for s in c.retrotags[ta] do
203 if not c.tags[s].has(tb) then
204 c.accept.add(s)
205 end
206 end
207 c.clear_tag(ta)
208 c.clear_tag(tb)
209 return c
210 end
211
212 # `self` absorbs all states, transitions, tags, and acceptations of `other`.
213 # An epsilon transition is added between `self.start` and `other.start`.
214 fun absorb(other: Automaton)
215 do
216 states.add_all other.states
217 start.add_trans(other.start, null)
218 for s, ts in other.tags do for t in ts do add_tag(s, t)
219 accept.add_all other.accept
220 end
221
222 # Do the Kleene closure (*) on self
223 fun close
224 do
225 for a1 in accept do
226 a1.add_trans(start, null)
227 start.add_trans(a1, null)
228 end
229 end
230
231 # Do the + on self
232 fun plus
233 do
234 for a1 in accept do
235 a1.add_trans(start, null)
236 end
237 end
238
239 # Do the ? on self
240 fun optionnal
241 do
242 alternate(new Automaton.epsilon)
243 end
244
245 # Remove all transitions on a given symbol
246 fun minus_sym(symbol: TSymbol)
247 do
248 var f = symbol.first
249 var l = symbol.last
250 for s in states do
251 for t in s.outs.to_a do
252 if t.symbol == null then continue
253
254 # Check overlaps
255 var tf = t.symbol.first
256 var tl = t.symbol.last
257 if l != null and tf > l then continue
258 if tl != null and f > tl then continue
259
260 t.delete
261
262 # Add left and right part if non empty
263 if tf < f then
264 var sym = new TSymbol(tf,f-1)
265 s.add_trans(t.to, sym)
266 end
267 if l != null then
268 if tl == null then
269 var sym = new TSymbol(l+1, null)
270 s.add_trans(t.to, sym)
271 else if tl > l then
272 var sym = new TSymbol(l+1, tl)
273 s.add_trans(t.to, sym)
274 end
275 end
276 end
277 end
278 end
279
280 # Fully duplicate an automaton
281 fun dup: Automaton
282 do
283 var res = new Automaton.empty
284 var map = new HashMap[State, State]
285 map[start] = res.start
286 for s in states do
287 if s == start then continue
288 var s2 = new State
289 map[s] = s2
290 res.states.add(s2)
291 end
292 for s in accept do
293 res.accept.add map[s]
294 end
295 for s, ts in tags do for t in ts do
296 res.add_tag(map[s], t)
297 end
298 for s in states do
299 for t in s.outs do
300 map[s].add_trans(map[t.to], t.symbol)
301 end
302 end
303 return res
304 end
305
306 # Reverse an automaton in place
307 fun reverse
308 do
309 for s in states do
310 var tmp = s.ins
311 s.ins = s.outs
312 s.outs = tmp
313 for t in s.outs do
314 var tmp2 = t.from
315 t.from = t.to
316 t.to = tmp2
317 end
318 end
319 var st = start
320 if accept.length == 1 then
321 start = accept.first
322 else
323 var st2 = new State
324 start = st2
325 states.add(st2)
326
327 for s in accept do
328 st2.add_trans(s, null)
329 end
330 end
331 accept.clear
332 accept.add(st)
333 end
334
335 # Remove states (and transitions) that does not reach an accept state
336 fun trim
337 do
338 # Good states are those we want to keep
339 var goods = new HashSet[State]
340 goods.add_all(accept)
341
342 var todo = accept.to_a
343
344 # Propagate goodness
345 while not todo.is_empty do
346 var s = todo.pop
347 for t in s.ins do
348 var s2 = t.from
349 if goods.has(s2) then continue
350 goods.add(s2)
351 todo.add(s2)
352 end
353 end
354
355 # What are the bad state then?
356 var bads = new Array[State]
357 for s in states do
358 if not goods.has(s) then bads.add(s)
359 end
360
361 # Remove their transitions
362 for s in bads do
363 for t in s.ins do t.delete
364 for t in s.outs do t.delete
365 end
366
367 # Keep only the good stuff
368 states.clear
369 states.add_all(goods)
370 end
371
372 # Generate a minimal DFA
373 # REQUIRE: self is a DFA
374 fun to_minimal_dfa: Automaton
375 do
376 trim
377
378 var distincts = new HashMap[State, Set[State]]
379 for s in states do
380 distincts[s] = new HashSet[State]
381 end
382
383 # split accept states
384 for s1 in states do
385 for s2 in states do
386 if distincts[s1].has(s2) then continue
387 if not accept.has(s1) then continue
388 if not accept.has(s2) then
389 distincts[s1].add(s2)
390 distincts[s2].add(s1)
391 continue
392 end
393 if tags[s1] != tags[s2] then
394 distincts[s1].add(s2)
395 distincts[s2].add(s1)
396 continue
397 end
398 end
399 end
400
401 var changed = true
402 var ints = new Array[Int]
403 while changed do
404 changed = false
405 for s1 in states do for s2 in states do
406 if distincts[s1].has(s2) then continue
407 ints.clear
408 for t in s1.outs do
409 var sym = t.symbol
410 assert sym != null
411 ints.add sym.first
412 var l = sym.last
413 if l != null then ints.add l
414 end
415 for i in ints do
416 var ds1 = s1.trans(i)
417 var ds2 = s2.trans(i)
418 if ds1 == null and ds2 == null then continue
419 if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue
420 distincts[s1].add(s2)
421 distincts[s2].add(s1)
422 changed = true
423 break
424 end
425 end
426 end
427
428 for s1 in states do for s2 in states do
429 if distincts[s1].has(s2) then continue
430 s1.add_trans(s2, null)
431 end
432
433 return to_dfa
434 end
435
436 # Produce a graphvis file for the automaton
437 fun to_dot(filepath: String)
438 do
439 var names = new HashMap[State, String]
440 var ni = 0
441 for s in states do
442 names[s] = ni.to_s
443 ni += 1
444 end
445
446 var f = new FileWriter.open(filepath)
447 f.write("digraph g \{\n")
448
449 for s in states do
450 f.write("s{names[s]}[shape=oval")
451 #f.write("label=\"\",")
452 if accept.has(s) then
453 f.write(",color=blue")
454 end
455 if tags.has_key(s) then
456 f.write(",label=\"")
457 for token in tags[s] do
458 f.write("{token.name.escape_to_c}\\n")
459 end
460 f.write("\"")
461 else
462 f.write(",label=\"\"")
463 end
464 f.write("];\n")
465 var outs = new HashMap[State, Array[nullable TSymbol]]
466 for t in s.outs do
467 var a
468 var s2 = t.to
469 var c = t.symbol
470 if outs.has_key(s2) then
471 a = outs[s2]
472 else
473 a = new Array[nullable TSymbol]
474 outs[s2] = a
475 end
476 a.add(c)
477 end
478 for s2, a in outs do
479 var labe = ""
480 for c in a do
481 if not labe.is_empty then labe += "\n"
482 if c == null then
483 labe += "''"
484 else
485 labe += c.to_s
486 end
487 end
488 f.write("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
489 end
490 end
491 f.write("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
492
493 f.write("\}\n")
494 f.close
495 end
496
497 # Transform a NFA to a DFA.
498 # note: the DFA is not minimized.
499 fun to_dfa: Automaton
500 do
501 trim
502
503 var dfa = new Automaton.empty
504 var n2d = new ArrayMap[Set[State], State]
505 var seen = new ArraySet[Set[State]]
506 var alphabet = new HashSet[Int]
507 var st = eclosure([start])
508 var todo = [st]
509 n2d[st] = dfa.start
510 seen.add(st)
511 while not todo.is_empty do
512 var nfa_states = todo.pop
513 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
514 var dfa_state = n2d[nfa_states]
515 alphabet.clear
516 for s in nfa_states do
517 # Collect important values to build the alphabet
518 for t in s.outs do
519 var sym = t.symbol
520 if sym == null then continue
521 alphabet.add(sym.first)
522 var l = sym.last
523 if l != null then alphabet.add(l)
524 end
525
526 # Mark accept and tags
527 if accept.has(s) then
528 if tags.has_key(s) then
529 for t in tags[s] do
530 dfa.add_tag(dfa_state, t)
531 end
532 end
533 dfa.accept.add(dfa_state)
534 end
535 end
536
537 # From the important values, build a sequence of TSymbols
538 var a = alphabet.to_a
539 default_comparator.sort(a)
540 var tsyms = new Array[TSymbol]
541 var last = 0
542 for i in a do
543 if last > 0 and last <= i-1 then
544 tsyms.add(new TSymbol(last,i-1))
545 end
546 tsyms.add(new TSymbol(i,i))
547 last = i+1
548 end
549 if last > 0 then
550 tsyms.add(new TSymbol(last,null))
551 end
552 #print "Alphabet: {tsyms.join(", ")}"
553
554 var lastst: nullable Transition = null
555 for sym in tsyms do
556 var nfa_dest = eclosure(trans(nfa_states, sym.first))
557 if nfa_dest.is_empty then
558 lastst = null
559 continue
560 end
561 #print "{nfa_states} -> {sym} -> {nfa_dest}"
562 var dfa_dest
563 if seen.has(nfa_dest) then
564 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
565 dfa_dest = n2d[nfa_dest]
566 else
567 #print "* new {nfa_dest.inspect}={nfa_dest}"
568 dfa_dest = new State
569 dfa.states.add(dfa_dest)
570 n2d[nfa_dest] = dfa_dest
571 todo.add(nfa_dest)
572 seen.add(nfa_dest)
573 end
574 if lastst != null and lastst.to == dfa_dest then
575 lastst.symbol.last = sym.last
576 else
577 lastst = dfa_state.add_trans(dfa_dest, sym)
578 end
579 end
580 end
581 return dfa
582 end
583
584 # Epsilon-closure on a state of states.
585 # Used by `to_dfa`.
586 private fun eclosure(states: Collection[State]): Set[State]
587 do
588 var res = new ArraySet[State]
589 res.add_all(states)
590 var todo = states.to_a
591 while not todo.is_empty do
592 var s = todo.pop
593 for t in s.outs do
594 if t.symbol != null then continue
595 var to = t.to
596 if res.has(to) then continue
597 res.add(to)
598 todo.add(to)
599 end
600 end
601 return res
602 end
603
604 # Trans on a set of states.
605 # Used by `to_dfa`.
606 fun trans(states: Collection[State], symbol: Int): Set[State]
607 do
608 var res = new ArraySet[State]
609 for s in states do
610 for t in s.outs do
611 var sym = t.symbol
612 if sym == null then continue
613 if sym.first > symbol then continue
614 var l = sym.last
615 if l != null and l < symbol then continue
616 var to = t.to
617 if res.has(to) then continue
618 res.add(to)
619 end
620 end
621 return res
622 end
623
624 # Generate the Nit source code of the lexer.
625 # `filepath` is the name of the output file.
626 # `parser` is the name of the parser module (used to import the token classes).
627 fun gen_to_nit(filepath: String, name: String, parser: nullable String)
628 do
629 var gen = new DFAGenerator(filepath, name, self, parser)
630 gen.gen_to_nit
631 end
632 end
633
634 # Generate the Nit source code of the lexer
635 private class DFAGenerator
636 var filepath: String
637 var name: String
638 var automaton: Automaton
639 var parser: nullable String
640
641 var out: Writer is noinit
642
643 init do
644 self.out = new FileWriter.open(filepath)
645 end
646
647 fun add(s: String) do out.write(s)
648
649 fun gen_to_nit
650 do
651 var names = new HashMap[State, String]
652 var i = 0
653 for s in automaton.states do
654 names[s] = i.to_s
655 i += 1
656 end
657
658 add "# Lexer generated by nitcc for the grammar {name}\n"
659 add "module {name}_lexer is no_warning \"missing-doc\"\n"
660 add("import nitcc_runtime\n")
661
662 var p = parser
663 if p != null then add("import {p}\n")
664
665 add("class Lexer_{name}\n")
666 add("\tsuper Lexer\n")
667 add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
668 add("end\n")
669
670 for s in automaton.states do
671 var n = names[s]
672 add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
673 end
674
675 add("class MyNToken\n")
676 add("\tsuper NToken\n")
677 add("end\n")
678
679 for s in automaton.states do
680 var n = names[s]
681 add("private class DFAState{n}\n")
682 add("\tsuper DFAState\n")
683 if automaton.accept.has(s) then
684 var token
685 if automaton.tags.has_key(s) then
686 token = automaton.tags[s].first
687 else
688 token = null
689 end
690 add("\tredef fun is_accept do return true\n")
691 var is_ignored = false
692 if token != null and token.name == "Ignored" then
693 is_ignored = true
694 add("\tredef fun is_ignored do return true\n")
695 end
696 add("\tredef fun make_token(position, text) do\n")
697 if is_ignored then
698 add("\t\treturn null\n")
699 else
700 if token == null then
701 add("\t\tvar t = new MyNToken\n")
702 else
703 add("\t\tvar t = new {token.cname}\n")
704 end
705 add("\t\tt.position = position\n")
706 add("\t\tt.text = text\n")
707 add("\t\treturn t\n")
708 end
709 add("\tend\n")
710 end
711 var trans = new ArrayMap[TSymbol, State]
712 for t in s.outs do
713 var sym = t.symbol
714 assert sym != null
715 trans[sym] = t.to
716 end
717 if trans.is_empty then
718 # Do nothing, inherit the trans
719 else
720 add("\tredef fun trans(char) do\n")
721
722 add("\t\tvar c = char.code_point\n")
723 var haslast = false
724 var last = -1
725 for sym, next in trans do
726 assert not haslast
727 assert sym.first > last
728 if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n")
729 var l = sym.last
730 if l == null then
731 add("\t\treturn dfastate_{names[next]}\n")
732 haslast= true
733 else
734 add("\t\tif c <= {l} then return dfastate_{names[next]}\n")
735 last = l
736 end
737 end
738 if not haslast then add("\t\treturn null\n")
739 add("\tend\n")
740 end
741 add("end\n")
742 end
743
744 self.out.close
745 end
746 end
747
748 redef class Token
749 # The associated text (if any, ie defined in the parser part)
750 var text: nullable String is noautoinit, writable
751 end
752
753 # A state in a finite automaton
754 class State
755 # Outgoing transitions
756 var outs = new Array[Transition]
757
758 # Ingoing transitions
759 var ins = new Array[Transition]
760
761 # Add a transitions to `to` on `symbol` (null means epsilon)
762 fun add_trans(to: State, symbol: nullable TSymbol): Transition
763 do
764 var t = new Transition(self, to, symbol)
765 outs.add(t)
766 to.ins.add(t)
767 return t
768 end
769
770 # Get the first state following the transition `i`.
771 # Null if no transition for `i`.
772 fun trans(i: Int): nullable State
773 do
774 for t in outs do
775 var sym = t.symbol
776 assert sym != null
777 var f = sym.first
778 var l = sym.last
779 if i < f then continue
780 if l != null and i > l then continue
781 return t.to
782 end
783 return null
784 end
785 end
786
787 # A range of symbols on a transition
788 class TSymbol
789 # The first symbol in the range
790 var first: Int
791
792 # The last symbol if any.
793 #
794 # `null` means infinity.
795 var last: nullable Int
796
797 redef fun to_s
798 do
799 var res
800 var f = first
801 if f <= 32 then
802 res = "#{f}"
803 else
804 res = f.code_point.to_s
805 end
806 var l = last
807 if f == l then return res
808 res += " .. "
809 if l == null then return res
810 if l <= 32 or l >= 127 then return res + "#{l}"
811 return res + l.code_point.to_s
812 end
813 end
814
815 # A transition in a finite automaton
816 class Transition
817 # The source state
818 var from: State
819 # The destination state
820 var to: State
821 # The symbol on the transition (null means epsilon)
822 var symbol: nullable TSymbol
823
824 # Remove the transition from the automaton.
825 # Detach from `from` and `to`.
826 fun delete
827 do
828 from.outs.remove(self)
829 to.ins.remove(self)
830 end
831 end