nitcc: trim also remove tags on bad states
[nit.git] / contrib / nitcc / src / autom.nit
index 21d23ee..c771408 100644 (file)
@@ -252,8 +252,8 @@ class Automaton
                                if t.symbol == null then continue
 
                                # Check overlaps
-                               var tf = t.symbol.first
-                               var tl = t.symbol.last
+                               var tf = t.symbol.as(not null).first
+                               var tl = t.symbol.as(not null).last
                                if l != null and tf > l then continue
                                if tl != null and f > tl then continue
 
@@ -358,39 +358,47 @@ class Automaton
                        if not goods.has(s) then bads.add(s)
                end
 
-               # Remove their transitions
+               # Remove their transitions and tags
                for s in bads do
-                       for t in s.ins do t.delete
-                       for t in s.outs do t.delete
+                       for t in s.ins.to_a do t.delete
+                       for t in s.outs.to_a do t.delete
+                       if tags.has_key(s) then
+                               for t in tags[s] do retrotags[t].remove(s)
+                               tags.keys.remove(s)
+                       end
                end
 
                # Keep only the good stuff
                states.clear
                states.add_all(goods)
+               states.add(start)
        end
 
        # Generate a minimal DFA
        # REQUIRE: self is a DFA
        fun to_minimal_dfa: Automaton
        do
+               assert_valid
+
                trim
 
+               # Graph of known distinct states.
                var distincts = new HashMap[State, Set[State]]
                for s in states do
                        distincts[s] = new HashSet[State]
                end
 
-               # split accept states
-               for s1 in states do
+               # split accept states.
+               # An accept state is distinct with a non accept state.
+               for s1 in accept do
                        for s2 in states do
                                if distincts[s1].has(s2) then continue
-                               if not accept.has(s1) then continue
                                if not accept.has(s2) then
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
                                        continue
                                end
-                               if tags[s1] != tags[s2] then
+                               if tags.get_or_null(s1) != tags.get_or_null(s2) then
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
                                        continue
@@ -398,24 +406,36 @@ class Automaton
                        end
                end
 
+               # Fixed point algorithm.
+               # * Get 2 states s1 and s2 not yet distinguished.
+               # * Get a symbol w.
+               # * If s1.trans(w) and s2.trans(w) are distinguished, then
+               #   distinguish s1 and s2.
                var changed = true
-               var ints = new Array[Int]
+               var ints = new Array[Int] # List of symbols to check
                while changed do
                        changed = false
                        for s1 in states do for s2 in states do
                                if distincts[s1].has(s2) then continue
+
+                               # The transitions use intervals. Therefore, for the states s1 and s2,
+                               # we need to check only the meaningful symbols. They are the `first`
+                               # symbol of each interval and the first one after the interval (`last+1`).
                                ints.clear
+                               # Check only `s1`; `s2` will be checked later when s1 and s2 are switched.
                                for t in s1.outs do
                                        var sym = t.symbol
                                        assert sym != null
                                        ints.add sym.first
                                        var l = sym.last
-                                       if l != null then ints.add l
+                                       if l != null then ints.add l + 1
                                end
+
+                               # Check each symbol
                                for i in ints do
                                        var ds1 = s1.trans(i)
                                        var ds2 = s2.trans(i)
-                                       if ds1 == null and ds2 == null then continue
+                                       if ds1 == ds2 then continue
                                        if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
@@ -425,6 +445,8 @@ class Automaton
                        end
                end
 
+               # We need to unify not-distinguished states.
+               # Just add an epsilon-transition and DFAize the automaton.
                for s1 in states do for s2 in states do
                        if distincts[s1].has(s2) then continue
                        s1.add_trans(s2, null)
@@ -433,8 +455,25 @@ class Automaton
                return to_dfa
        end
 
-       # Produce a graphvis file for the automaton
-       fun to_dot(filepath: String)
+       # Assert that `self` is a valid automaton or abort
+       fun assert_valid
+       do
+               assert states.has(start)
+               assert states.has_all(accept)
+               for s in states do
+                       for t in s.outs do assert states.has(t.to)
+                       for t in s.ins do assert states.has(t.from)
+               end
+               assert states.has_all(tags.keys)
+               for t, ss in retrotags do
+                       assert states.has_all(ss)
+               end
+       end
+
+       # Produce a graphviz string from the automatom
+       #
+       # Set `merge_transitions = false` to generate one edge by transition (default true).
+       fun to_dot(merge_transitions: nullable Bool): Writable
        do
                var names = new HashMap[State, String]
                var ni = 0
@@ -443,25 +482,27 @@ class Automaton
                        ni += 1
                end
 
-               var f = new FileWriter.open(filepath)
-                f.write("digraph g \{\n")
+               var f = new Buffer
+               f.append("digraph g \{\n")
+               f.append("rankdir=LR;")
 
+               var state_nb = 0
                for s in states do
-                       f.write("s{names[s]}[shape=oval")
+                       f.append("s{names[s]}[shape=circle")
                        #f.write("label=\"\",")
                        if accept.has(s) then
-                               f.write(",color=blue")
+                               f.append(",shape=doublecircle")
                        end
                        if tags.has_key(s) then
-                               f.write(",label=\"")
+                               f.append(",label=\"")
                                for token in tags[s] do
-                                       f.write("{token.name.escape_to_c}\\n")
+                                       f.append("{token.name.escape_to_dot}\\n")
                                end
-                               f.write("\"")
+                               f.append("\"")
                        else
-                               f.write(",label=\"\"")
+                               f.append(",label=\"{state_nb}\"")
                        end
-                       f.write("];\n")
+                       f.append("];\n")
                        var outs = new HashMap[State, Array[nullable TSymbol]]
                        for t in s.outs do
                                var a
@@ -478,31 +519,39 @@ class Automaton
                        for s2, a in outs do
                                var labe = ""
                                for c in a do
+                                       if merge_transitions == false then labe = ""
                                        if not labe.is_empty then labe += "\n"
                                        if c == null then
-                                               labe += "''"
+                                               labe += "ε"
                                        else
                                                labe += c.to_s
                                        end
+                                       if merge_transitions == false then
+                                               f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_dot}\"];\n")
+                                       end
+                               end
+                               if merge_transitions or else true then
+                                       f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
                                end
-                               f.write("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
                        end
+                       state_nb += 1
                end
-               f.write("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
-
-                f.write("\}\n")
-               f.close
+               f.append("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
+               f.append("\}\n")
+               return f
        end
 
        # Transform a NFA to a DFA.
        # note: the DFA is not minimized.
        fun to_dfa: Automaton
        do
+               assert_valid
+
                trim
 
                var dfa = new Automaton.empty
                var n2d = new ArrayMap[Set[State], State]
-               var seen = new ArraySet[Set[State]] 
+               var seen = new ArraySet[Set[State]]
                var alphabet = new HashSet[Int]
                var st = eclosure([start])
                var todo = [st]
@@ -572,7 +621,7 @@ class Automaton
                                        seen.add(nfa_dest)
                                end
                                if lastst != null and lastst.to == dfa_dest then
-                                       lastst.symbol.last = sym.last
+                                       lastst.symbol.as(not null).last = sym.last
                                else
                                        lastst = dfa_state.add_trans(dfa_dest, sym)
                                end
@@ -581,6 +630,58 @@ class Automaton
                return dfa
        end
 
+       # Transform a NFA to a epsilonless NFA.
+       fun to_nfa_noe: Automaton
+       do
+               assert_valid
+
+               trim
+
+               var dfa = new Automaton.empty
+               var n2d = new ArrayMap[Set[State], State]
+               var seen = new ArraySet[Set[State]]
+               var st = eclosure([start])
+               var todo = [st]
+               n2d[st] = dfa.start
+               seen.add(st)
+               while not todo.is_empty do
+                       var nfa_states = todo.pop
+                       #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
+                       var dfa_state = n2d[nfa_states]
+                       for s in nfa_states do
+                               for t in s.outs do
+                                       if t.symbol == null then continue
+                                       var nfa_dest = eclosure([t.to])
+                                       #print "{nfa_states} -> {sym} -> {nfa_dest}"
+                                       var dfa_dest
+                                       if seen.has(nfa_dest) then
+                                               #print "* reuse {nfa_dest.inspect}={nfa_dest}"
+                                               dfa_dest = n2d[nfa_dest]
+                                       else
+                                               #print "* new {nfa_dest.inspect}={nfa_dest}"
+                                               dfa_dest = new State
+                                               dfa.states.add(dfa_dest)
+                                               n2d[nfa_dest] = dfa_dest
+                                               todo.add(nfa_dest)
+                                               seen.add(nfa_dest)
+                                       end
+                                       dfa_state.add_trans(dfa_dest, t.symbol)
+                               end
+
+                               # Mark accept and tags
+                               if accept.has(s) then
+                                       if tags.has_key(s) then
+                                               for t in tags[s] do
+                                                       dfa.add_tag(dfa_state, t)
+                                               end
+                                       end
+                                       dfa.accept.add(dfa_state)
+                               end
+                       end
+               end
+               return dfa
+       end
+
        # Epsilon-closure on a state of states.
        # Used by `to_dfa`.
        private fun eclosure(states: Collection[State]): Set[State]
@@ -593,7 +694,7 @@ class Automaton
                        for t in s.outs do
                                if t.symbol != null then continue
                                var to = t.to
-                               if res.has(to) then continue 
+                               if res.has(to) then continue
                                res.add(to)
                                todo.add(to)
                        end
@@ -614,7 +715,7 @@ class Automaton
                                var l = sym.last
                                if l != null and l < symbol then continue
                                var to = t.to
-                               if res.has(to) then continue 
+                               if res.has(to) then continue
                                res.add(to)
                        end
                end
@@ -656,7 +757,7 @@ private class DFAGenerator
                end
 
                add "# Lexer generated by nitcc for the grammar {name}\n"
-               add "module {name}_lexer is no_warning \"missing-doc\"\n"
+               add "module {name}_lexer is generated, no_warning \"missing-doc\"\n"
                add("import nitcc_runtime\n")
 
                var p = parser
@@ -693,17 +794,23 @@ private class DFAGenerator
                                        is_ignored = true
                                        add("\tredef fun is_ignored do return true\n")
                                end
-                               add("\tredef fun make_token(position, text) do\n")
+                               add("\tredef fun make_token(position, source) do\n")
                                if is_ignored then
                                        add("\t\treturn null\n")
                                else
                                        if token == null then
                                                add("\t\tvar t = new MyNToken\n")
+                                               add("\t\tt.text = position.extract(source)\n")
                                        else
                                                add("\t\tvar t = new {token.cname}\n")
+                                               var ttext = token.text
+                                               if ttext == null then
+                                                       add("\t\tt.text = position.extract(source)\n")
+                                               else
+                                                       add("\t\tt.text = \"{ttext.escape_to_nit}\"\n")
+                                               end
                                        end
                                        add("\t\tt.position = position\n")
-                                       add("\t\tt.text = text\n")
                                        add("\t\treturn t\n")
                                end
                                add("\tend\n")
@@ -719,23 +826,56 @@ private class DFAGenerator
                        else
                                add("\tredef fun trans(char) do\n")
 
-                               add("\t\tvar c = char.code_point\n")
-                               var haslast = false
+                               # Collect the sequence of tests in the dispatch sequence
+                               # The point here is that for each transition, there is a first and a last
+                               # So holes have to be identified
+                               var dispatch = new HashMap[Int, nullable State]
+                               var haslast: nullable State = null
+
                                var last = -1
                                for sym, next in trans do
-                                       assert not haslast
+                                       assert haslast == null
                                        assert sym.first > last
-                                       if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n")
+                                       if sym.first > last + 1 then
+                                               dispatch[sym.first-1] = null
+                                       end
                                        var l = sym.last
                                        if l == null then
-                                               add("\t\treturn dfastate_{names[next]}\n")
-                                               haslast= true
+                                               haslast = next
                                        else
-                                               add("\t\tif c <= {l} then return dfastate_{names[next]}\n")
+                                               dispatch[l] = next
                                                last = l
                                        end
                                end
-                               if not haslast then add("\t\treturn null\n")
+
+                               if dispatch.is_empty and haslast != null then
+                                       # Only one transition that accepts everything (quite rare)
+                               else
+                                       # We need to check
+                                       add("\t\tvar c = char.code_point\n")
+                               end
+
+                               # Generate a sequence of `if` for the dispatch
+                               if haslast != null and last >= 0 then
+                                       # Special case: handle up-bound first if not an error
+                                       add("\t\tif c > {last} then return dfastate_{names[haslast]}\n")
+                                       # previous become the new last case
+                                       haslast = dispatch[last]
+                                       dispatch.keys.remove(last)
+                               end
+                               for c, next in dispatch do
+                                       if next == null then
+                                               add("\t\tif c <= {c} then return null\n")
+                                       else
+                                               add("\t\tif c <= {c} then return dfastate_{names[next]}\n")
+                                       end
+                               end
+                               if haslast == null then
+                                       add("\t\treturn null\n")
+                               else
+                                       add("\t\treturn dfastate_{names[haslast]}\n")
+                               end
+
                                add("\tend\n")
                        end
                        add("end\n")