nitcc: trim also remove tags on bad states
[nit.git] / contrib / nitcc / src / autom.nit
index 0aa09be..c771408 100644 (file)
@@ -21,20 +21,20 @@ import grammar
 # A finite automaton
 class Automaton
        # The start state
-       var start: State
+       var start: State is noinit
 
-       # State that are accect states
+       # State that are accept states
        var accept = new Array[State]
 
        # All states
        var states = new Array[State]
 
-       # Tokens associated on accept states
-       # use `add_tag` to update
+       # Tokens associated on accept states.
+       # Use `add_tag` to update
        var tags = new HashMap[State, Set[Token]]
 
-       # Accept states associated on tokens
-       # use `add_tag` to update
+       # Accept states associated on tokens.
+       # Use `add_tag` to update
        var retrotags = new HashMap[Token, Set[State]]
 
        # Tag all accept states
@@ -66,7 +66,7 @@ class Automaton
                assert retrotags[t].has(s)
        end
 
-       # Remove all occurences of a tag in an automaton
+       # Remove all occurrences of a tag in an automaton
        fun clear_tag(t: Token)
        do
                if not retrotags.has_key(t) then return
@@ -78,7 +78,7 @@ class Automaton
                retrotags.keys.remove(t)
        end
 
-       # Remove tokens from conflicting state according the the inclusion of language
+       # Remove tokens from conflicting state according the inclusion of language.
        # REQUIRE: self isa DFA automaton
        fun solve_token_inclusion
        do
@@ -101,8 +101,8 @@ class Automaton
                end
        end
 
-       # Initialize a new automaton for the empty language
-       # one state, no accept, no transition
+       # Initialize a new automaton for the empty language.
+       # One state, no accept, no transition.
        init empty
        do
                var state = new State
@@ -110,8 +110,8 @@ class Automaton
                states.add state
        end
 
-       # Initialize a new automaton for the empty-string language
-       # one state, is accept, no transition
+       # Initialize a new automaton for the empty-string language.
+       # One state, is accept, no transition.
        init epsilon
        do
                var state = new State
@@ -120,8 +120,8 @@ class Automaton
                states.add state
        end
 
-       # Initialize a new automation for the language that accepts only a single symbol
-       # Two state, the second is accept, one transition on `symbol`
+       # Initialize a new automation for the language that accepts only a single symbol.
+       # Two state, the second is accept, one transition on `symbol`.
        init atom(symbol: Int)
        do
                var s = new State
@@ -148,8 +148,8 @@ class Automaton
                states.add a
        end
 
-       # Contatenate `other` to `self`
-       # other is modified and invalidated.
+       # Concatenate `other` to `self`.
+       # Other is modified and invalidated.
        fun concat(other: Automaton)
        do
                var s2 = other.start
@@ -160,7 +160,7 @@ class Automaton
                states.add_all other.states
        end
 
-       # `self` become the alternation of `self` and `other`
+       # `self` become the alternation of `self` and `other`.
        # `other` is modified and invalidated.
        fun alternate(other: Automaton)
        do
@@ -184,8 +184,33 @@ class Automaton
                states.add_all other.states
        end
 
-       # `self` absorbs all states, transisions, tags, and acceptations of `other`
-       # An epsilon transition is added between `self.start` and `other.start`
+       # Return a new automaton that recognize `self` but not `other`.
+       # For a theoretical POV, this is the subtraction of languages.
+       # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap.
+       fun except(other: Automaton): Automaton
+       do
+               var ta = new Token("1")
+               self.tag_accept(ta)
+               var tb = new Token("2")
+               other.tag_accept(tb)
+
+               var c = new Automaton.empty
+               c.absorb(self)
+               c.absorb(other)
+               c = c.to_dfa
+               c.accept.clear
+               for s in c.retrotags[ta] do
+                       if not c.tags[s].has(tb) then
+                               c.accept.add(s)
+                       end
+               end
+               c.clear_tag(ta)
+               c.clear_tag(tb)
+               return c
+       end
+
+       # `self` absorbs all states, transitions, tags, and acceptations of `other`.
+       # An epsilon transition is added between `self.start` and `other.start`.
        fun absorb(other: Automaton)
        do
                states.add_all other.states
@@ -227,8 +252,8 @@ class Automaton
                                if t.symbol == null then continue
 
                                # Check overlaps
-                               var tf = t.symbol.first
-                               var tl = t.symbol.last
+                               var tf = t.symbol.as(not null).first
+                               var tl = t.symbol.as(not null).last
                                if l != null and tf > l then continue
                                if tl != null and f > tl then continue
 
@@ -333,39 +358,47 @@ class Automaton
                        if not goods.has(s) then bads.add(s)
                end
 
-               # Remove their transitions
+               # Remove their transitions and tags
                for s in bads do
-                       for t in s.ins do t.delete
-                       for t in s.outs do t.delete
+                       for t in s.ins.to_a do t.delete
+                       for t in s.outs.to_a do t.delete
+                       if tags.has_key(s) then
+                               for t in tags[s] do retrotags[t].remove(s)
+                               tags.keys.remove(s)
+                       end
                end
 
                # Keep only the good stuff
                states.clear
                states.add_all(goods)
+               states.add(start)
        end
 
        # Generate a minimal DFA
        # REQUIRE: self is a DFA
        fun to_minimal_dfa: Automaton
        do
+               assert_valid
+
                trim
 
+               # Graph of known distinct states.
                var distincts = new HashMap[State, Set[State]]
                for s in states do
                        distincts[s] = new HashSet[State]
                end
 
-               # split accept states
-               for s1 in states do
+               # split accept states.
+               # An accept state is distinct with a non accept state.
+               for s1 in accept do
                        for s2 in states do
                                if distincts[s1].has(s2) then continue
-                               if not accept.has(s1) then continue
                                if not accept.has(s2) then
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
                                        continue
                                end
-                               if tags[s1] != tags[s2] then
+                               if tags.get_or_null(s1) != tags.get_or_null(s2) then
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
                                        continue
@@ -373,24 +406,36 @@ class Automaton
                        end
                end
 
+               # Fixed point algorithm.
+               # * Get 2 states s1 and s2 not yet distinguished.
+               # * Get a symbol w.
+               # * If s1.trans(w) and s2.trans(w) are distinguished, then
+               #   distinguish s1 and s2.
                var changed = true
-               var ints = new Array[Int]
+               var ints = new Array[Int] # List of symbols to check
                while changed do
                        changed = false
                        for s1 in states do for s2 in states do
                                if distincts[s1].has(s2) then continue
+
+                               # The transitions use intervals. Therefore, for the states s1 and s2,
+                               # we need to check only the meaningful symbols. They are the `first`
+                               # symbol of each interval and the first one after the interval (`last+1`).
                                ints.clear
+                               # Check only `s1`; `s2` will be checked later when s1 and s2 are switched.
                                for t in s1.outs do
                                        var sym = t.symbol
                                        assert sym != null
                                        ints.add sym.first
                                        var l = sym.last
-                                       if l != null then ints.add l
+                                       if l != null then ints.add l + 1
                                end
+
+                               # Check each symbol
                                for i in ints do
                                        var ds1 = s1.trans(i)
                                        var ds2 = s2.trans(i)
-                                       if ds1 == null and ds2 == null then continue
+                                       if ds1 == ds2 then continue
                                        if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue
                                        distincts[s1].add(s2)
                                        distincts[s2].add(s1)
@@ -400,6 +445,8 @@ class Automaton
                        end
                end
 
+               # We need to unify not-distinguished states.
+               # Just add an epsilon-transition and DFAize the automaton.
                for s1 in states do for s2 in states do
                        if distincts[s1].has(s2) then continue
                        s1.add_trans(s2, null)
@@ -408,8 +455,25 @@ class Automaton
                return to_dfa
        end
 
-       # Produce a graphvis file for the automaton
-       fun to_dot(filepath: String)
+       # Assert that `self` is a valid automaton or abort
+       fun assert_valid
+       do
+               assert states.has(start)
+               assert states.has_all(accept)
+               for s in states do
+                       for t in s.outs do assert states.has(t.to)
+                       for t in s.ins do assert states.has(t.from)
+               end
+               assert states.has_all(tags.keys)
+               for t, ss in retrotags do
+                       assert states.has_all(ss)
+               end
+       end
+
+       # Produce a graphviz string from the automatom
+       #
+       # Set `merge_transitions = false` to generate one edge by transition (default true).
+       fun to_dot(merge_transitions: nullable Bool): Writable
        do
                var names = new HashMap[State, String]
                var ni = 0
@@ -418,25 +482,27 @@ class Automaton
                        ni += 1
                end
 
-               var f = new OFStream.open(filepath) 
-                f.write("digraph g \{\n")
+               var f = new Buffer
+               f.append("digraph g \{\n")
+               f.append("rankdir=LR;")
 
+               var state_nb = 0
                for s in states do
-                       f.write("s{names[s]}[shape=oval")
+                       f.append("s{names[s]}[shape=circle")
                        #f.write("label=\"\",")
                        if accept.has(s) then
-                               f.write(",color=blue")
+                               f.append(",shape=doublecircle")
                        end
                        if tags.has_key(s) then
-                               f.write(",label=\"")
+                               f.append(",label=\"")
                                for token in tags[s] do
-                                       f.write("{token.name.escape_to_c}\\n")
+                                       f.append("{token.name.escape_to_dot}\\n")
                                end
-                               f.write("\"")
+                               f.append("\"")
                        else
-                               f.write(",label=\"\"")
+                               f.append(",label=\"{state_nb}\"")
                        end
-                       f.write("];\n")
+                       f.append("];\n")
                        var outs = new HashMap[State, Array[nullable TSymbol]]
                        for t in s.outs do
                                var a
@@ -453,31 +519,39 @@ class Automaton
                        for s2, a in outs do
                                var labe = ""
                                for c in a do
+                                       if merge_transitions == false then labe = ""
                                        if not labe.is_empty then labe += "\n"
                                        if c == null then
-                                               labe += "''"
+                                               labe += "ε"
                                        else
                                                labe += c.to_s
                                        end
+                                       if merge_transitions == false then
+                                               f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_dot}\"];\n")
+                                       end
+                               end
+                               if merge_transitions or else true then
+                                       f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
                                end
-                               f.write("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
                        end
+                       state_nb += 1
                end
-               f.write("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
-
-                f.write("\}\n")
-               f.close
+               f.append("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
+               f.append("\}\n")
+               return f
        end
 
-       # Transform a NFA to a DFA
-       # note: the DFA is not miminized
+       # Transform a NFA to a DFA.
+       # note: the DFA is not minimized.
        fun to_dfa: Automaton
        do
+               assert_valid
+
                trim
 
                var dfa = new Automaton.empty
                var n2d = new ArrayMap[Set[State], State]
-               var seen = new ArraySet[Set[State]] 
+               var seen = new ArraySet[Set[State]]
                var alphabet = new HashSet[Int]
                var st = eclosure([start])
                var todo = [st]
@@ -511,7 +585,7 @@ class Automaton
 
                        # From the important values, build a sequence of TSymbols
                        var a = alphabet.to_a
-                       (new ComparableSorter[Int]).sort(a)
+                       default_comparator.sort(a)
                        var tsyms = new Array[TSymbol]
                        var last = 0
                        for i in a do
@@ -547,7 +621,7 @@ class Automaton
                                        seen.add(nfa_dest)
                                end
                                if lastst != null and lastst.to == dfa_dest then
-                                       lastst.symbol.last = sym.last
+                                       lastst.symbol.as(not null).last = sym.last
                                else
                                        lastst = dfa_state.add_trans(dfa_dest, sym)
                                end
@@ -556,8 +630,60 @@ class Automaton
                return dfa
        end
 
-       # epsilon-closure on a state of states
-       # used by `to_dfa`
+       # Transform a NFA to a epsilonless NFA.
+       fun to_nfa_noe: Automaton
+       do
+               assert_valid
+
+               trim
+
+               var dfa = new Automaton.empty
+               var n2d = new ArrayMap[Set[State], State]
+               var seen = new ArraySet[Set[State]]
+               var st = eclosure([start])
+               var todo = [st]
+               n2d[st] = dfa.start
+               seen.add(st)
+               while not todo.is_empty do
+                       var nfa_states = todo.pop
+                       #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
+                       var dfa_state = n2d[nfa_states]
+                       for s in nfa_states do
+                               for t in s.outs do
+                                       if t.symbol == null then continue
+                                       var nfa_dest = eclosure([t.to])
+                                       #print "{nfa_states} -> {sym} -> {nfa_dest}"
+                                       var dfa_dest
+                                       if seen.has(nfa_dest) then
+                                               #print "* reuse {nfa_dest.inspect}={nfa_dest}"
+                                               dfa_dest = n2d[nfa_dest]
+                                       else
+                                               #print "* new {nfa_dest.inspect}={nfa_dest}"
+                                               dfa_dest = new State
+                                               dfa.states.add(dfa_dest)
+                                               n2d[nfa_dest] = dfa_dest
+                                               todo.add(nfa_dest)
+                                               seen.add(nfa_dest)
+                                       end
+                                       dfa_state.add_trans(dfa_dest, t.symbol)
+                               end
+
+                               # Mark accept and tags
+                               if accept.has(s) then
+                                       if tags.has_key(s) then
+                                               for t in tags[s] do
+                                                       dfa.add_tag(dfa_state, t)
+                                               end
+                                       end
+                                       dfa.accept.add(dfa_state)
+                               end
+                       end
+               end
+               return dfa
+       end
+
+       # Epsilon-closure on a state of states.
+       # Used by `to_dfa`.
        private fun eclosure(states: Collection[State]): Set[State]
        do
                var res = new ArraySet[State]
@@ -568,7 +694,7 @@ class Automaton
                        for t in s.outs do
                                if t.symbol != null then continue
                                var to = t.to
-                               if res.has(to) then continue 
+                               if res.has(to) then continue
                                res.add(to)
                                todo.add(to)
                        end
@@ -576,8 +702,8 @@ class Automaton
                return res
        end
 
-       # trans on a set of states
-       # Used by `to_dfa`
+       # Trans on a set of states.
+       # Used by `to_dfa`.
        fun trans(states: Collection[State], symbol: Int): Set[State]
        do
                var res = new ArraySet[State]
@@ -589,16 +715,16 @@ class Automaton
                                var l = sym.last
                                if l != null and l < symbol then continue
                                var to = t.to
-                               if res.has(to) then continue 
+                               if res.has(to) then continue
                                res.add(to)
                        end
                end
                return res
        end
 
-       # Generate the Nit source code of the lexer
-       # `filepath` is the name of the ouptit file
-       # `parser` is the name of the parser module (used to import the token classes)
+       # Generate the Nit source code of the lexer.
+       # `filepath` is the name of the output file.
+       # `parser` is the name of the parser module (used to import the token classes).
        fun gen_to_nit(filepath: String, name: String, parser: nullable String)
        do
                var gen = new DFAGenerator(filepath, name, self, parser)
@@ -613,13 +739,10 @@ private class DFAGenerator
        var automaton: Automaton
        var parser: nullable String
 
-       var out: OStream
-       init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do
-               self.filepath = filepath
-               self.name = name
-               self.automaton = automaton
-               self.parser = parser
-               self.out = new OFStream.open(filepath)
+       var out: Writer is noinit
+
+       init do
+               self.out = new FileWriter.open(filepath)
        end
 
        fun add(s: String) do out.write(s)
@@ -633,7 +756,8 @@ private class DFAGenerator
                        i += 1
                end
 
-               add "# Lexer generated by nitcc for the grammar {name}"
+               add "# Lexer generated by nitcc for the grammar {name}\n"
+               add "module {name}_lexer is generated, no_warning \"missing-doc\"\n"
                add("import nitcc_runtime\n")
 
                var p = parser
@@ -644,12 +768,10 @@ private class DFAGenerator
                add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
                add("end\n")
 
-               add("redef class Object\n")
                for s in automaton.states do
                        var n = names[s]
-                       add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
+                       add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
                end
-               add("end\n")
 
                add("class MyNToken\n")
                add("\tsuper NToken\n")
@@ -667,17 +789,28 @@ private class DFAGenerator
                                        token = null
                                end
                                add("\tredef fun is_accept do return true\n")
-                               add("\tredef fun make_token(position, text) do\n")
+                               var is_ignored = false
                                if token != null and token.name == "Ignored" then
+                                       is_ignored = true
+                                       add("\tredef fun is_ignored do return true\n")
+                               end
+                               add("\tredef fun make_token(position, source) do\n")
+                               if is_ignored then
                                        add("\t\treturn null\n")
                                else
                                        if token == null then
                                                add("\t\tvar t = new MyNToken\n")
+                                               add("\t\tt.text = position.extract(source)\n")
                                        else
                                                add("\t\tvar t = new {token.cname}\n")
+                                               var ttext = token.text
+                                               if ttext == null then
+                                                       add("\t\tt.text = position.extract(source)\n")
+                                               else
+                                                       add("\t\tt.text = \"{ttext.escape_to_nit}\"\n")
+                                               end
                                        end
                                        add("\t\tt.position = position\n")
-                                       add("\t\tt.text = text\n")
                                        add("\t\treturn t\n")
                                end
                                add("\tend\n")
@@ -693,23 +826,56 @@ private class DFAGenerator
                        else
                                add("\tredef fun trans(char) do\n")
 
-                               add("\t\tvar c = char.ascii\n")
-                               var haslast = false
+                               # Collect the sequence of tests in the dispatch sequence
+                               # The point here is that for each transition, there is a first and a last
+                               # So holes have to be identified
+                               var dispatch = new HashMap[Int, nullable State]
+                               var haslast: nullable State = null
+
                                var last = -1
                                for sym, next in trans do
-                                       assert not haslast
+                                       assert haslast == null
                                        assert sym.first > last
-                                       if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n")
+                                       if sym.first > last + 1 then
+                                               dispatch[sym.first-1] = null
+                                       end
                                        var l = sym.last
                                        if l == null then
-                                               add("\t\treturn dfastate_{names[next]}\n")
-                                               haslast= true
+                                               haslast = next
                                        else
-                                               add("\t\tif c <= {l} then return dfastate_{names[next]}\n")
+                                               dispatch[l] = next
                                                last = l
                                        end
                                end
-                               if not haslast then add("\t\treturn null\n")
+
+                               if dispatch.is_empty and haslast != null then
+                                       # Only one transition that accepts everything (quite rare)
+                               else
+                                       # We need to check
+                                       add("\t\tvar c = char.code_point\n")
+                               end
+
+                               # Generate a sequence of `if` for the dispatch
+                               if haslast != null and last >= 0 then
+                                       # Special case: handle up-bound first if not an error
+                                       add("\t\tif c > {last} then return dfastate_{names[haslast]}\n")
+                                       # previous become the new last case
+                                       haslast = dispatch[last]
+                                       dispatch.keys.remove(last)
+                               end
+                               for c, next in dispatch do
+                                       if next == null then
+                                               add("\t\tif c <= {c} then return null\n")
+                                       else
+                                               add("\t\tif c <= {c} then return dfastate_{names[next]}\n")
+                                       end
+                               end
+                               if haslast == null then
+                                       add("\t\treturn null\n")
+                               else
+                                       add("\t\treturn dfastate_{names[haslast]}\n")
+                               end
+
                                add("\tend\n")
                        end
                        add("end\n")
@@ -719,12 +885,17 @@ private class DFAGenerator
        end
 end
 
+redef class Token
+       # The associated text (if any, ie defined in the parser part)
+       var text: nullable String is noautoinit, writable
+end
+
 # A state in a finite automaton
 class State
        # Outgoing transitions
-
        var outs = new Array[Transition]
-       # Ingoing tyransitions
+
+       # Ingoing transitions
        var ins = new Array[Transition]
 
        # Add a transitions to `to` on `symbol` (null means epsilon)
@@ -736,6 +907,8 @@ class State
                return t
        end
 
+       # Get the first state following the transition `i`.
+       # Null if no transition for `i`.
        fun trans(i: Int): nullable State
        do
                for t in outs do
@@ -753,7 +926,12 @@ end
 
 # A range of symbols on a transition
 class TSymbol
+       # The first symbol in the range
        var first: Int
+
+       # The last symbol if any.
+       #
+       # `null` means infinity.
        var last: nullable Int
 
        redef fun to_s
@@ -763,14 +941,14 @@ class TSymbol
                if f <= 32 then
                        res = "#{f}"
                else
-                       res = f.ascii.to_s
+                       res = f.code_point.to_s
                end
                var l = last
                if f == l then return res
                res += " .. "
                if l == null then return res
                if l <= 32 or l >= 127 then return res + "#{l}"
-               return res + l.ascii.to_s
+               return res + l.code_point.to_s
        end
 end
 
@@ -783,8 +961,8 @@ class Transition
        # The symbol on the transition (null means epsilon)
        var symbol: nullable TSymbol
 
-       # Remove the transition from the automaton
-       # Detash from `from` and `to`
+       # Remove the transition from the automaton.
+       # Detach from `from` and `to`.
        fun delete
        do
                from.outs.remove(self)