nitcc: generate a faster DFA::trans sequence of `if`
[nit.git] / contrib / nitcc / src / autom.nit
index 40775f8..84df722 100644 (file)
@@ -21,20 +21,20 @@ import grammar
 # A finite automaton
 class Automaton
        # The start state
-       var start: State
+       var start: State is noinit
 
-       # State that are accect states
+       # State that are accept states
        var accept = new Array[State]
 
        # All states
        var states = new Array[State]
 
-       # Tokens associated on accept states
-       # use `add_tag` to update
+       # Tokens associated on accept states.
+       # Use `add_tag` to update
        var tags = new HashMap[State, Set[Token]]
 
-       # Accept states associated on tokens
-       # use `add_tag` to update
+       # Accept states associated on tokens.
+       # Use `add_tag` to update
        var retrotags = new HashMap[Token, Set[State]]
 
        # Tag all accept states
@@ -66,7 +66,7 @@ class Automaton
                assert retrotags[t].has(s)
        end
 
-       # Remove all occurences of a tag in an automaton
+       # Remove all occurrences of a tag in an automaton
        fun clear_tag(t: Token)
        do
                if not retrotags.has_key(t) then return
@@ -78,7 +78,7 @@ class Automaton
                retrotags.keys.remove(t)
        end
 
-       # Remove tokens from conflicting state according the the inclusion of language
+       # Remove tokens from conflicting state according the inclusion of language.
        # REQUIRE: self isa DFA automaton
        fun solve_token_inclusion
        do
@@ -101,8 +101,8 @@ class Automaton
                end
        end
 
-       # Initialize a new automaton for the empty language
-       # one state, no accept, no transition
+       # Initialize a new automaton for the empty language.
+       # One state, no accept, no transition.
        init empty
        do
                var state = new State
@@ -110,8 +110,8 @@ class Automaton
                states.add state
        end
 
-       # Initialize a new automaton for the empty-string language
-       # one state, is accept, no transition
+       # Initialize a new automaton for the empty-string language.
+       # One state, is accept, no transition.
        init epsilon
        do
                var state = new State
@@ -120,8 +120,8 @@ class Automaton
                states.add state
        end
 
-       # Initialize a new automation for the language that accepts only a single symbol
-       # Two state, the second is accept, one transition on `symbol`
+       # Initialize a new automation for the language that accepts only a single symbol.
+       # Two state, the second is accept, one transition on `symbol`.
        init atom(symbol: Int)
        do
                var s = new State
@@ -148,8 +148,8 @@ class Automaton
                states.add a
        end
 
-       # Contatenate `other` to `self`
-       # other is modified and invalidated.
+       # Concatenate `other` to `self`.
+       # Other is modified and invalidated.
        fun concat(other: Automaton)
        do
                var s2 = other.start
@@ -160,7 +160,7 @@ class Automaton
                states.add_all other.states
        end
 
-       # `self` become the alternation of `self` and `other`
+       # `self` become the alternation of `self` and `other`.
        # `other` is modified and invalidated.
        fun alternate(other: Automaton)
        do
@@ -184,8 +184,33 @@ class Automaton
                states.add_all other.states
        end
 
-       # `self` absorbs all states, transisions, tags, and acceptations of `other`
-       # An epsilon transition is added between `self.start` and `other.start`
+       # Return a new automaton that recognize `self` but not `other`.
+       # For a theoretical POV, this is the subtraction of languages.
+       # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap.
+       fun except(other: Automaton): Automaton
+       do
+               var ta = new Token("1")
+               self.tag_accept(ta)
+               var tb = new Token("2")
+               other.tag_accept(tb)
+
+               var c = new Automaton.empty
+               c.absorb(self)
+               c.absorb(other)
+               c = c.to_dfa
+               c.accept.clear
+               for s in c.retrotags[ta] do
+                       if not c.tags[s].has(tb) then
+                               c.accept.add(s)
+                       end
+               end
+               c.clear_tag(ta)
+               c.clear_tag(tb)
+               return c
+       end
+
+       # `self` absorbs all states, transitions, tags, and acceptations of `other`.
+       # An epsilon transition is added between `self.start` and `other.start`.
        fun absorb(other: Automaton)
        do
                states.add_all other.states
@@ -307,10 +332,49 @@ class Automaton
                accept.add(st)
        end
 
+       # Remove states (and transitions) that does not reach an accept state
+       fun trim
+       do
+               # Good states are those we want to keep
+               var goods = new HashSet[State]
+               goods.add_all(accept)
+
+               var todo = accept.to_a
+
+               # Propagate goodness
+               while not todo.is_empty do
+                       var s = todo.pop
+                       for t in s.ins do
+                               var s2 = t.from
+                               if goods.has(s2) then continue
+                               goods.add(s2)
+                               todo.add(s2)
+                       end
+               end
+
+               # What are the bad state then?
+               var bads = new Array[State]
+               for s in states do
+                       if not goods.has(s) then bads.add(s)
+               end
+
+               # Remove their transitions
+               for s in bads do
+                       for t in s.ins do t.delete
+                       for t in s.outs do t.delete
+               end
+
+               # Keep only the good stuff
+               states.clear
+               states.add_all(goods)
+       end
+
        # Generate a minimal DFA
        # REQUIRE: self is a DFA
        fun to_minimal_dfa: Automaton
        do
+               trim
+
                var distincts = new HashMap[State, Set[State]]
                for s in states do
                        distincts[s] = new HashSet[State]
@@ -372,11 +436,18 @@ class Automaton
        # Produce a graphvis file for the automaton
        fun to_dot(filepath: String)
        do
-               var f = new OFStream.open(filepath) 
+               var names = new HashMap[State, String]
+               var ni = 0
+               for s in states do
+                       names[s] = ni.to_s
+                       ni += 1
+               end
+
+               var f = new FileWriter.open(filepath)
                 f.write("digraph g \{\n")
 
                for s in states do
-                       f.write("s{s.object_id}[shape=oval")
+                       f.write("s{names[s]}[shape=oval")
                        #f.write("label=\"\",")
                        if accept.has(s) then
                                f.write(",color=blue")
@@ -387,6 +458,8 @@ class Automaton
                                        f.write("{token.name.escape_to_c}\\n")
                                end
                                f.write("\"")
+                       else
+                               f.write(",label=\"\"")
                        end
                        f.write("];\n")
                        var outs = new HashMap[State, Array[nullable TSymbol]]
@@ -412,19 +485,21 @@ class Automaton
                                                labe += c.to_s
                                        end
                                end
-                               f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n")
+                               f.write("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
                        end
                end
-               f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n")
+               f.write("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
 
                 f.write("\}\n")
                f.close
        end
 
-       # Transform a NFA to a DFA
-       # note: the DFA is not miminized
+       # Transform a NFA to a DFA.
+       # note: the DFA is not minimized.
        fun to_dfa: Automaton
        do
+               trim
+
                var dfa = new Automaton.empty
                var n2d = new ArrayMap[Set[State], State]
                var seen = new ArraySet[Set[State]] 
@@ -461,7 +536,7 @@ class Automaton
 
                        # From the important values, build a sequence of TSymbols
                        var a = alphabet.to_a
-                       (new ComparableSorter[Int]).sort(a)
+                       default_comparator.sort(a)
                        var tsyms = new Array[TSymbol]
                        var last = 0
                        for i in a do
@@ -506,8 +581,8 @@ class Automaton
                return dfa
        end
 
-       # epsilon-closure on a state of states
-       # used by `to_dfa`
+       # Epsilon-closure on a state of states.
+       # Used by `to_dfa`.
        private fun eclosure(states: Collection[State]): Set[State]
        do
                var res = new ArraySet[State]
@@ -526,8 +601,8 @@ class Automaton
                return res
        end
 
-       # trans on a set of states
-       # Used by `to_dfa`
+       # Trans on a set of states.
+       # Used by `to_dfa`.
        fun trans(states: Collection[State], symbol: Int): Set[State]
        do
                var res = new ArraySet[State]
@@ -546,9 +621,9 @@ class Automaton
                return res
        end
 
-       # Generate the Nit source code of the lexer
-       # `filepath` is the name of the ouptit file
-       # `parser` is the name of the parser module (used to import the token classes)
+       # Generate the Nit source code of the lexer.
+       # `filepath` is the name of the output file.
+       # `parser` is the name of the parser module (used to import the token classes).
        fun gen_to_nit(filepath: String, name: String, parser: nullable String)
        do
                var gen = new DFAGenerator(filepath, name, self, parser)
@@ -563,13 +638,10 @@ private class DFAGenerator
        var automaton: Automaton
        var parser: nullable String
 
-       var out: OStream
-       init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do
-               self.filepath = filepath
-               self.name = name
-               self.automaton = automaton
-               self.parser = parser
-               self.out = new OFStream.open(filepath)
+       var out: Writer is noinit
+
+       init do
+               self.out = new FileWriter.open(filepath)
        end
 
        fun add(s: String) do out.write(s)
@@ -583,7 +655,8 @@ private class DFAGenerator
                        i += 1
                end
 
-               add "# Lexer generated by nitcc for the grammar {name}"
+               add "# Lexer generated by nitcc for the grammar {name}\n"
+               add "module {name}_lexer is no_warning \"missing-doc\"\n"
                add("import nitcc_runtime\n")
 
                var p = parser
@@ -594,12 +667,10 @@ private class DFAGenerator
                add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
                add("end\n")
 
-               add("redef class Object\n")
                for s in automaton.states do
                        var n = names[s]
-                       add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
+                       add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
                end
-               add("end\n")
 
                add("class MyNToken\n")
                add("\tsuper NToken\n")
@@ -607,7 +678,7 @@ private class DFAGenerator
 
                for s in automaton.states do
                        var  n = names[s]
-                       add("class DFAState{n}\n")
+                       add("private class DFAState{n}\n")
                        add("\tsuper DFAState\n")
                        if automaton.accept.has(s) then
                                var token
@@ -617,17 +688,28 @@ private class DFAGenerator
                                        token = null
                                end
                                add("\tredef fun is_accept do return true\n")
-                               add("\tredef fun make_token(position, text) do\n")
+                               var is_ignored = false
                                if token != null and token.name == "Ignored" then
+                                       is_ignored = true
+                                       add("\tredef fun is_ignored do return true\n")
+                               end
+                               add("\tredef fun make_token(position, source) do\n")
+                               if is_ignored then
                                        add("\t\treturn null\n")
                                else
                                        if token == null then
                                                add("\t\tvar t = new MyNToken\n")
+                                               add("\t\tt.text = position.extract(source)\n")
                                        else
                                                add("\t\tvar t = new {token.cname}\n")
+                                               var ttext = token.text
+                                               if ttext == null then
+                                                       add("\t\tt.text = position.extract(source)\n")
+                                               else
+                                                       add("\t\tt.text = \"{ttext.escape_to_nit}\"\n")
+                                               end
                                        end
                                        add("\t\tt.position = position\n")
-                                       add("\t\tt.text = text\n")
                                        add("\t\treturn t\n")
                                end
                                add("\tend\n")
@@ -643,23 +725,51 @@ private class DFAGenerator
                        else
                                add("\tredef fun trans(char) do\n")
 
-                               add("\t\tvar c = char.ascii\n")
-                               var haslast = false
+                               add("\t\tvar c = char.code_point\n")
+
+                               # Collect the sequence of tests in the dispatch sequence
+                               # The point here is that for each transition, there is a first and a last
+                               # So holes hare to be identified
+                               var dispatch = new HashMap[Int, nullable State]
+                               var haslast: nullable State = null
+
                                var last = -1
                                for sym, next in trans do
-                                       assert not haslast
+                                       assert haslast == null
                                        assert sym.first > last
-                                       if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n")
+                                       if sym.first > last + 1 then
+                                               dispatch[sym.first-1] = null
+                                       end
                                        var l = sym.last
                                        if l == null then
-                                               add("\t\treturn dfastate_{names[next]}\n")
-                                               haslast= true
+                                               haslast = next
                                        else
-                                               add("\t\tif c <= {l} then return dfastate_{names[next]}\n")
+                                               dispatch[l] = next
                                                last = l
                                        end
                                end
-                               if not haslast then add("\t\treturn null\n")
+
+                               # Generate a sequence of `if` for the dispatch
+                               if haslast != null then
+                                       # Special case: handle up-bound first if not an error
+                                       add("\t\tif c > {last} then return dfastate_{names[haslast]}\n")
+                                       # previous become the new last case
+                                       haslast = dispatch[last]
+                                       dispatch.keys.remove(last)
+                               end
+                               for c, next in dispatch do
+                                       if next == null then
+                                               add("\t\tif c <= {c} then return null\n")
+                                       else
+                                               add("\t\tif c <= {c} then return dfastate_{names[next]}\n")
+                                       end
+                               end
+                               if haslast == null then
+                                       add("\t\treturn null\n")
+                               else
+                                       add("\t\treturn dfastate_{names[haslast]}\n")
+                               end
+
                                add("\tend\n")
                        end
                        add("end\n")
@@ -669,12 +779,17 @@ private class DFAGenerator
        end
 end
 
+redef class Token
+       # The associated text (if any, ie defined in the parser part)
+       var text: nullable String is noautoinit, writable
+end
+
 # A state in a finite automaton
 class State
        # Outgoing transitions
-
        var outs = new Array[Transition]
-       # Ingoing tyransitions
+
+       # Ingoing transitions
        var ins = new Array[Transition]
 
        # Add a transitions to `to` on `symbol` (null means epsilon)
@@ -686,6 +801,8 @@ class State
                return t
        end
 
+       # Get the first state following the transition `i`.
+       # Null if no transition for `i`.
        fun trans(i: Int): nullable State
        do
                for t in outs do
@@ -703,7 +820,12 @@ end
 
 # A range of symbols on a transition
 class TSymbol
+       # The first symbol in the range
        var first: Int
+
+       # The last symbol if any.
+       #
+       # `null` means infinity.
        var last: nullable Int
 
        redef fun to_s
@@ -713,14 +835,14 @@ class TSymbol
                if f <= 32 then
                        res = "#{f}"
                else
-                       res = f.ascii.to_s
+                       res = f.code_point.to_s
                end
                var l = last
                if f == l then return res
                res += " .. "
                if l == null then return res
                if l <= 32 or l >= 127 then return res + "#{l}"
-               return res + l.ascii.to_s
+               return res + l.code_point.to_s
        end
 end
 
@@ -733,8 +855,8 @@ class Transition
        # The symbol on the transition (null means epsilon)
        var symbol: nullable TSymbol
 
-       # Remove the transition from the automaton
-       # Detash from `from` and `to`
+       # Remove the transition from the automaton.
+       # Detach from `from` and `to`.
        fun delete
        do
                from.outs.remove(self)