nitcc: DFAStates are private
[nit.git] / contrib / nitcc / src / autom.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Finite automaton (NFA & DFA)
16 module autom
17
18 # For the class Token
19 import grammar
20
21 # A finite automaton
22 class Automaton
23 # The start state
24 var start: State
25
26 # State that are accect states
27 var accept = new Array[State]
28
29 # All states
30 var states = new Array[State]
31
32 # Tokens associated on accept states
33 # use `add_tag` to update
34 var tags = new HashMap[State, Set[Token]]
35
36 # Accept states associated on tokens
37 # use `add_tag` to update
38 var retrotags = new HashMap[Token, Set[State]]
39
40 # Tag all accept states
41 fun tag_accept(t: Token)
42 do
43 for s in accept do add_tag(s, t)
44 end
45
46 # Add a token to a state
47 fun add_tag(s: State, t: Token)
48 do
49 if not tags.has_key(s) then
50 var set = new ArraySet[Token]
51 tags[s] = set
52 set.add t
53 else
54 tags[s].add t
55 end
56
57 if not retrotags.has_key(t) then
58 var set = new ArraySet[State]
59 retrotags[t] = set
60 set.add s
61 else
62 retrotags[t].add s
63 end
64
65 assert tags[s].has(t)
66 assert retrotags[t].has(s)
67 end
68
69 # Remove all occurences of a tag in an automaton
70 fun clear_tag(t: Token)
71 do
72 if not retrotags.has_key(t) then return
73 for s in retrotags[t] do
74 if not tags.has_key(s) then continue
75 tags[s].remove(t)
76 if tags[s].is_empty then tags.keys.remove(s)
77 end
78 retrotags.keys.remove(t)
79 end
80
81 # Remove tokens from conflicting state according the the inclusion of language
82 # REQUIRE: self isa DFA automaton
83 fun solve_token_inclusion
84 do
85 for s, ts in tags do
86 if ts.length <= 1 then continue
87 var losers = new Array[Token]
88 for t1 in ts do
89 for t2 in ts do
90 if t1 == t2 then continue
91 if retrotags[t1].length > retrotags[t2].length and retrotags[t1].has_all(retrotags[t2]) then
92 losers.add(t1)
93 break
94 end
95 end
96 end
97 for t in losers do
98 ts.remove(t)
99 retrotags[t].remove s
100 end
101 end
102 end
103
104 # Initialize a new automaton for the empty language
105 # one state, no accept, no transition
106 init empty
107 do
108 var state = new State
109 start = state
110 states.add state
111 end
112
113 # Initialize a new automaton for the empty-string language
114 # one state, is accept, no transition
115 init epsilon
116 do
117 var state = new State
118 start = state
119 accept.add state
120 states.add state
121 end
122
123 # Initialize a new automation for the language that accepts only a single symbol
124 # Two state, the second is accept, one transition on `symbol`
125 init atom(symbol: Int)
126 do
127 var s = new State
128 var a = new State
129 var sym = new TSymbol(symbol, symbol)
130 s.add_trans(a, sym)
131 start = s
132 accept.add a
133 states.add s
134 states.add a
135 end
136
137 # Initialize a new automation for the language that accepts only a range of symbols
138 # Two state, the second is accept, one transition for `from` to `to`
139 init cla(first: Int, last: nullable Int)
140 do
141 var s = new State
142 var a = new State
143 var sym = new TSymbol(first, last)
144 s.add_trans(a, sym)
145 start = s
146 accept.add a
147 states.add s
148 states.add a
149 end
150
151 # Contatenate `other` to `self`
152 # other is modified and invalidated.
153 fun concat(other: Automaton)
154 do
155 var s2 = other.start
156 for a1 in accept do
157 a1.add_trans(s2, null)
158 end
159 accept = other.accept
160 states.add_all other.states
161 end
162
163 # `self` become the alternation of `self` and `other`
164 # `other` is modified and invalidated.
165 fun alternate(other: Automaton)
166 do
167 var s = new State
168 var a = new State
169 s.add_trans(start, null)
170 for a1 in accept do
171 a1.add_trans(a, null)
172 end
173 s.add_trans(other.start, null)
174 for a2 in other.accept do
175 a2.add_trans(a, null)
176 accept.add(a2)
177 end
178
179 start = s
180 accept = [a]
181
182 states.add s
183 states.add a
184 states.add_all other.states
185 end
186
187 # `self` absorbs all states, transisions, tags, and acceptations of `other`
188 # An epsilon transition is added between `self.start` and `other.start`
189 fun absorb(other: Automaton)
190 do
191 states.add_all other.states
192 start.add_trans(other.start, null)
193 for s, ts in other.tags do for t in ts do add_tag(s, t)
194 accept.add_all other.accept
195 end
196
197 # Do the Kleene closure (*) on self
198 fun close
199 do
200 for a1 in accept do
201 a1.add_trans(start, null)
202 start.add_trans(a1, null)
203 end
204 end
205
206 # Do the + on self
207 fun plus
208 do
209 for a1 in accept do
210 a1.add_trans(start, null)
211 end
212 end
213
214 # Do the ? on self
215 fun optionnal
216 do
217 alternate(new Automaton.epsilon)
218 end
219
220 # Remove all transitions on a given symbol
221 fun minus_sym(symbol: TSymbol)
222 do
223 var f = symbol.first
224 var l = symbol.last
225 for s in states do
226 for t in s.outs.to_a do
227 if t.symbol == null then continue
228
229 # Check overlaps
230 var tf = t.symbol.first
231 var tl = t.symbol.last
232 if l != null and tf > l then continue
233 if tl != null and f > tl then continue
234
235 t.delete
236
237 # Add left and right part if non empty
238 if tf < f then
239 var sym = new TSymbol(tf,f-1)
240 s.add_trans(t.to, sym)
241 end
242 if l != null then
243 if tl == null then
244 var sym = new TSymbol(l+1, null)
245 s.add_trans(t.to, sym)
246 else if tl > l then
247 var sym = new TSymbol(l+1, tl)
248 s.add_trans(t.to, sym)
249 end
250 end
251 end
252 end
253 end
254
255 # Fully duplicate an automaton
256 fun dup: Automaton
257 do
258 var res = new Automaton.empty
259 var map = new HashMap[State, State]
260 map[start] = res.start
261 for s in states do
262 if s == start then continue
263 var s2 = new State
264 map[s] = s2
265 res.states.add(s2)
266 end
267 for s in accept do
268 res.accept.add map[s]
269 end
270 for s, ts in tags do for t in ts do
271 res.add_tag(map[s], t)
272 end
273 for s in states do
274 for t in s.outs do
275 map[s].add_trans(map[t.to], t.symbol)
276 end
277 end
278 return res
279 end
280
281 # Reverse an automaton in place
282 fun reverse
283 do
284 for s in states do
285 var tmp = s.ins
286 s.ins = s.outs
287 s.outs = tmp
288 for t in s.outs do
289 var tmp2 = t.from
290 t.from = t.to
291 t.to = tmp2
292 end
293 end
294 var st = start
295 if accept.length == 1 then
296 start = accept.first
297 else
298 var st2 = new State
299 start = st2
300 states.add(st2)
301
302 for s in accept do
303 st2.add_trans(s, null)
304 end
305 end
306 accept.clear
307 accept.add(st)
308 end
309
310 # Generate a minimal DFA
311 # REQUIRE: self is a DFA
312 fun to_minimal_dfa: Automaton
313 do
314 var distincts = new HashMap[State, Set[State]]
315 for s in states do
316 distincts[s] = new HashSet[State]
317 end
318
319 # split accept states
320 for s1 in states do
321 for s2 in states do
322 if distincts[s1].has(s2) then continue
323 if not accept.has(s1) then continue
324 if not accept.has(s2) then
325 distincts[s1].add(s2)
326 distincts[s2].add(s1)
327 continue
328 end
329 if tags[s1] != tags[s2] then
330 distincts[s1].add(s2)
331 distincts[s2].add(s1)
332 continue
333 end
334 end
335 end
336
337 var changed = true
338 var ints = new Array[Int]
339 while changed do
340 changed = false
341 for s1 in states do for s2 in states do
342 if distincts[s1].has(s2) then continue
343 ints.clear
344 for t in s1.outs do
345 var sym = t.symbol
346 assert sym != null
347 ints.add sym.first
348 var l = sym.last
349 if l != null then ints.add l
350 end
351 for i in ints do
352 var ds1 = s1.trans(i)
353 var ds2 = s2.trans(i)
354 if ds1 == null and ds2 == null then continue
355 if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue
356 distincts[s1].add(s2)
357 distincts[s2].add(s1)
358 changed = true
359 break
360 end
361 end
362 end
363
364 for s1 in states do for s2 in states do
365 if distincts[s1].has(s2) then continue
366 s1.add_trans(s2, null)
367 end
368
369 return to_dfa
370 end
371
372 # Produce a graphvis file for the automaton
373 fun to_dot(filepath: String)
374 do
375 var f = new OFStream.open(filepath)
376 f.write("digraph g \{\n")
377
378 for s in states do
379 f.write("s{s.object_id}[shape=oval")
380 #f.write("label=\"\",")
381 if accept.has(s) then
382 f.write(",color=blue")
383 end
384 if tags.has_key(s) then
385 f.write(",label=\"")
386 for token in tags[s] do
387 f.write("{token.name.escape_to_c}\\n")
388 end
389 f.write("\"")
390 else
391 f.write(",label=\"\"")
392 end
393 f.write("];\n")
394 var outs = new HashMap[State, Array[nullable TSymbol]]
395 for t in s.outs do
396 var a
397 var s2 = t.to
398 var c = t.symbol
399 if outs.has_key(s2) then
400 a = outs[s2]
401 else
402 a = new Array[nullable TSymbol]
403 outs[s2] = a
404 end
405 a.add(c)
406 end
407 for s2, a in outs do
408 var labe = ""
409 for c in a do
410 if not labe.is_empty then labe += "\n"
411 if c == null then
412 labe += "''"
413 else
414 labe += c.to_s
415 end
416 end
417 f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n")
418 end
419 end
420 f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n")
421
422 f.write("\}\n")
423 f.close
424 end
425
426 # Transform a NFA to a DFA
427 # note: the DFA is not miminized
428 fun to_dfa: Automaton
429 do
430 var dfa = new Automaton.empty
431 var n2d = new ArrayMap[Set[State], State]
432 var seen = new ArraySet[Set[State]]
433 var alphabet = new HashSet[Int]
434 var st = eclosure([start])
435 var todo = [st]
436 n2d[st] = dfa.start
437 seen.add(st)
438 while not todo.is_empty do
439 var nfa_states = todo.pop
440 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
441 var dfa_state = n2d[nfa_states]
442 alphabet.clear
443 for s in nfa_states do
444 # Collect important values to build the alphabet
445 for t in s.outs do
446 var sym = t.symbol
447 if sym == null then continue
448 alphabet.add(sym.first)
449 var l = sym.last
450 if l != null then alphabet.add(l)
451 end
452
453 # Mark accept and tags
454 if accept.has(s) then
455 if tags.has_key(s) then
456 for t in tags[s] do
457 dfa.add_tag(dfa_state, t)
458 end
459 end
460 dfa.accept.add(dfa_state)
461 end
462 end
463
464 # From the important values, build a sequence of TSymbols
465 var a = alphabet.to_a
466 (new ComparableSorter[Int]).sort(a)
467 var tsyms = new Array[TSymbol]
468 var last = 0
469 for i in a do
470 if last > 0 and last <= i-1 then
471 tsyms.add(new TSymbol(last,i-1))
472 end
473 tsyms.add(new TSymbol(i,i))
474 last = i+1
475 end
476 if last > 0 then
477 tsyms.add(new TSymbol(last,null))
478 end
479 #print "Alphabet: {tsyms.join(", ")}"
480
481 var lastst: nullable Transition = null
482 for sym in tsyms do
483 var nfa_dest = eclosure(trans(nfa_states, sym.first))
484 if nfa_dest.is_empty then
485 lastst = null
486 continue
487 end
488 #print "{nfa_states} -> {sym} -> {nfa_dest}"
489 var dfa_dest
490 if seen.has(nfa_dest) then
491 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
492 dfa_dest = n2d[nfa_dest]
493 else
494 #print "* new {nfa_dest.inspect}={nfa_dest}"
495 dfa_dest = new State
496 dfa.states.add(dfa_dest)
497 n2d[nfa_dest] = dfa_dest
498 todo.add(nfa_dest)
499 seen.add(nfa_dest)
500 end
501 if lastst != null and lastst.to == dfa_dest then
502 lastst.symbol.last = sym.last
503 else
504 lastst = dfa_state.add_trans(dfa_dest, sym)
505 end
506 end
507 end
508 return dfa
509 end
510
511 # epsilon-closure on a state of states
512 # used by `to_dfa`
513 private fun eclosure(states: Collection[State]): Set[State]
514 do
515 var res = new ArraySet[State]
516 res.add_all(states)
517 var todo = states.to_a
518 while not todo.is_empty do
519 var s = todo.pop
520 for t in s.outs do
521 if t.symbol != null then continue
522 var to = t.to
523 if res.has(to) then continue
524 res.add(to)
525 todo.add(to)
526 end
527 end
528 return res
529 end
530
531 # trans on a set of states
532 # Used by `to_dfa`
533 fun trans(states: Collection[State], symbol: Int): Set[State]
534 do
535 var res = new ArraySet[State]
536 for s in states do
537 for t in s.outs do
538 var sym = t.symbol
539 if sym == null then continue
540 if sym.first > symbol then continue
541 var l = sym.last
542 if l != null and l < symbol then continue
543 var to = t.to
544 if res.has(to) then continue
545 res.add(to)
546 end
547 end
548 return res
549 end
550
551 # Generate the Nit source code of the lexer
552 # `filepath` is the name of the ouptit file
553 # `parser` is the name of the parser module (used to import the token classes)
554 fun gen_to_nit(filepath: String, name: String, parser: nullable String)
555 do
556 var gen = new DFAGenerator(filepath, name, self, parser)
557 gen.gen_to_nit
558 end
559 end
560
561 # Generate the Nit source code of the lexer
562 private class DFAGenerator
563 var filepath: String
564 var name: String
565 var automaton: Automaton
566 var parser: nullable String
567
568 var out: OStream
569 init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do
570 self.filepath = filepath
571 self.name = name
572 self.automaton = automaton
573 self.parser = parser
574 self.out = new OFStream.open(filepath)
575 end
576
577 fun add(s: String) do out.write(s)
578
579 fun gen_to_nit
580 do
581 var names = new HashMap[State, String]
582 var i = 0
583 for s in automaton.states do
584 names[s] = i.to_s
585 i += 1
586 end
587
588 add "# Lexer generated by nitcc for the grammar {name}"
589 add("import nitcc_runtime\n")
590
591 var p = parser
592 if p != null then add("import {p}\n")
593
594 add("class Lexer_{name}\n")
595 add("\tsuper Lexer\n")
596 add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
597 add("end\n")
598
599 add("redef class Object\n")
600 for s in automaton.states do
601 var n = names[s]
602 add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
603 end
604 add("end\n")
605
606 add("class MyNToken\n")
607 add("\tsuper NToken\n")
608 add("end\n")
609
610 for s in automaton.states do
611 var n = names[s]
612 add("private class DFAState{n}\n")
613 add("\tsuper DFAState\n")
614 if automaton.accept.has(s) then
615 var token
616 if automaton.tags.has_key(s) then
617 token = automaton.tags[s].first
618 else
619 token = null
620 end
621 add("\tredef fun is_accept do return true\n")
622 add("\tredef fun make_token(position, text) do\n")
623 if token != null and token.name == "Ignored" then
624 add("\t\treturn null\n")
625 else
626 if token == null then
627 add("\t\tvar t = new MyNToken\n")
628 else
629 add("\t\tvar t = new {token.cname}\n")
630 end
631 add("\t\tt.position = position\n")
632 add("\t\tt.text = text\n")
633 add("\t\treturn t\n")
634 end
635 add("\tend\n")
636 end
637 var trans = new ArrayMap[TSymbol, State]
638 for t in s.outs do
639 var sym = t.symbol
640 assert sym != null
641 trans[sym] = t.to
642 end
643 if trans.is_empty then
644 # Do nothing, inherit the trans
645 else
646 add("\tredef fun trans(char) do\n")
647
648 add("\t\tvar c = char.ascii\n")
649 var haslast = false
650 var last = -1
651 for sym, next in trans do
652 assert not haslast
653 assert sym.first > last
654 if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n")
655 var l = sym.last
656 if l == null then
657 add("\t\treturn dfastate_{names[next]}\n")
658 haslast= true
659 else
660 add("\t\tif c <= {l} then return dfastate_{names[next]}\n")
661 last = l
662 end
663 end
664 if not haslast then add("\t\treturn null\n")
665 add("\tend\n")
666 end
667 add("end\n")
668 end
669
670 self.out.close
671 end
672 end
673
674 # A state in a finite automaton
675 class State
676 # Outgoing transitions
677
678 var outs = new Array[Transition]
679 # Ingoing tyransitions
680 var ins = new Array[Transition]
681
682 # Add a transitions to `to` on `symbol` (null means epsilon)
683 fun add_trans(to: State, symbol: nullable TSymbol): Transition
684 do
685 var t = new Transition(self, to, symbol)
686 outs.add(t)
687 to.ins.add(t)
688 return t
689 end
690
691 fun trans(i: Int): nullable State
692 do
693 for t in outs do
694 var sym = t.symbol
695 assert sym != null
696 var f = sym.first
697 var l = sym.last
698 if i < f then continue
699 if l != null and i > l then continue
700 return t.to
701 end
702 return null
703 end
704 end
705
706 # A range of symbols on a transition
707 class TSymbol
708 var first: Int
709 var last: nullable Int
710
711 redef fun to_s
712 do
713 var res
714 var f = first
715 if f <= 32 then
716 res = "#{f}"
717 else
718 res = f.ascii.to_s
719 end
720 var l = last
721 if f == l then return res
722 res += " .. "
723 if l == null then return res
724 if l <= 32 or l >= 127 then return res + "#{l}"
725 return res + l.ascii.to_s
726 end
727 end
728
729 # A transition in a finite automaton
730 class Transition
731 # The source state
732 var from: State
733 # The destination state
734 var to: State
735 # The symbol on the transition (null means epsilon)
736 var symbol: nullable TSymbol
737
738 # Remove the transition from the automaton
739 # Detash from `from` and `to`
740 fun delete
741 do
742 from.outs.remove(self)
743 to.ins.remove(self)
744 end
745 end