# This file is part of NIT ( http://www.nitlanguage.org ). # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Finite automaton (NFA & DFA) module autom # For the class Token import grammar # A finite automaton class Automaton # The start state var start: State is noinit # State that are accept states var accept = new Array[State] # All states var states = new Array[State] # Tokens associated on accept states. # Use `add_tag` to update var tags = new HashMap[State, Set[Token]] # Accept states associated on tokens. # Use `add_tag` to update var retrotags = new HashMap[Token, Set[State]] # Tag all accept states fun tag_accept(t: Token) do for s in accept do add_tag(s, t) end # Add a token to a state fun add_tag(s: State, t: Token) do if not tags.has_key(s) then var set = new ArraySet[Token] tags[s] = set set.add t else tags[s].add t end if not retrotags.has_key(t) then var set = new ArraySet[State] retrotags[t] = set set.add s else retrotags[t].add s end assert tags[s].has(t) assert retrotags[t].has(s) end # Remove all occurrences of a tag in an automaton fun clear_tag(t: Token) do if not retrotags.has_key(t) then return for s in retrotags[t] do if not tags.has_key(s) then continue tags[s].remove(t) if tags[s].is_empty then tags.keys.remove(s) end retrotags.keys.remove(t) end # Remove tokens from conflicting state according the inclusion of language. # REQUIRE: self isa DFA automaton fun solve_token_inclusion do for s, ts in tags do if ts.length <= 1 then continue var losers = new Array[Token] for t1 in ts do for t2 in ts do if t1 == t2 then continue if retrotags[t1].length > retrotags[t2].length and retrotags[t1].has_all(retrotags[t2]) then losers.add(t1) break end end end for t in losers do ts.remove(t) retrotags[t].remove s end end end # Initialize a new automaton for the empty language. # One state, no accept, no transition. init empty do var state = new State start = state states.add state end # Initialize a new automaton for the empty-string language. # One state, is accept, no transition. init epsilon do var state = new State start = state accept.add state states.add state end # Initialize a new automation for the language that accepts only a single symbol. # Two state, the second is accept, one transition on `symbol`. init atom(symbol: Int) do var s = new State var a = new State var sym = new TSymbol(symbol, symbol) s.add_trans(a, sym) start = s accept.add a states.add s states.add a end # Initialize a new automation for the language that accepts only a range of symbols # Two state, the second is accept, one transition for `from` to `to` init cla(first: Int, last: nullable Int) do var s = new State var a = new State var sym = new TSymbol(first, last) s.add_trans(a, sym) start = s accept.add a states.add s states.add a end # Concatenate `other` to `self`. # Other is modified and invalidated. fun concat(other: Automaton) do var s2 = other.start for a1 in accept do a1.add_trans(s2, null) end accept = other.accept states.add_all other.states end # `self` become the alternation of `self` and `other`. # `other` is modified and invalidated. fun alternate(other: Automaton) do var s = new State var a = new State s.add_trans(start, null) for a1 in accept do a1.add_trans(a, null) end s.add_trans(other.start, null) for a2 in other.accept do a2.add_trans(a, null) accept.add(a2) end start = s accept = [a] states.add s states.add a states.add_all other.states end # Return a new automaton that recognize `self` but not `other`. # For a theoretical POV, this is the subtraction of languages. # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap. fun except(other: Automaton): Automaton do var ta = new Token("1") self.tag_accept(ta) var tb = new Token("2") other.tag_accept(tb) var c = new Automaton.empty c.absorb(self) c.absorb(other) c = c.to_dfa c.accept.clear for s in c.retrotags[ta] do if not c.tags[s].has(tb) then c.accept.add(s) end end c.clear_tag(ta) c.clear_tag(tb) return c end # `self` absorbs all states, transitions, tags, and acceptations of `other`. # An epsilon transition is added between `self.start` and `other.start`. fun absorb(other: Automaton) do states.add_all other.states start.add_trans(other.start, null) for s, ts in other.tags do for t in ts do add_tag(s, t) accept.add_all other.accept end # Do the Kleene closure (*) on self fun close do for a1 in accept do a1.add_trans(start, null) start.add_trans(a1, null) end end # Do the + on self fun plus do for a1 in accept do a1.add_trans(start, null) end end # Do the ? on self fun optionnal do alternate(new Automaton.epsilon) end # Remove all transitions on a given symbol fun minus_sym(symbol: TSymbol) do var f = symbol.first var l = symbol.last for s in states do for t in s.outs.to_a do if t.symbol == null then continue # Check overlaps var tf = t.symbol.as(not null).first var tl = t.symbol.as(not null).last if l != null and tf > l then continue if tl != null and f > tl then continue t.delete # Add left and right part if non empty if tf < f then var sym = new TSymbol(tf,f-1) s.add_trans(t.to, sym) end if l != null then if tl == null then var sym = new TSymbol(l+1, null) s.add_trans(t.to, sym) else if tl > l then var sym = new TSymbol(l+1, tl) s.add_trans(t.to, sym) end end end end end # Fully duplicate an automaton fun dup: Automaton do var res = new Automaton.empty var map = new HashMap[State, State] map[start] = res.start for s in states do if s == start then continue var s2 = new State map[s] = s2 res.states.add(s2) end for s in accept do res.accept.add map[s] end for s, ts in tags do for t in ts do res.add_tag(map[s], t) end for s in states do for t in s.outs do map[s].add_trans(map[t.to], t.symbol) end end return res end # Reverse an automaton in place fun reverse do for s in states do var tmp = s.ins s.ins = s.outs s.outs = tmp for t in s.outs do var tmp2 = t.from t.from = t.to t.to = tmp2 end end var st = start if accept.length == 1 then start = accept.first else var st2 = new State start = st2 states.add(st2) for s in accept do st2.add_trans(s, null) end end accept.clear accept.add(st) end # Remove states (and transitions) that does not reach an accept state fun trim do # Good states are those we want to keep var goods = new HashSet[State] goods.add_all(accept) var todo = accept.to_a # Propagate goodness while not todo.is_empty do var s = todo.pop for t in s.ins do var s2 = t.from if goods.has(s2) then continue goods.add(s2) todo.add(s2) end end # What are the bad state then? var bads = new Array[State] for s in states do if not goods.has(s) then bads.add(s) end # Remove their transitions and tags for s in bads do for t in s.ins.to_a do t.delete for t in s.outs.to_a do t.delete if tags.has_key(s) then for t in tags[s] do retrotags[t].remove(s) tags.keys.remove(s) end end # Keep only the good stuff states.clear states.add_all(goods) states.add(start) end # Generate a minimal DFA # REQUIRE: self is a DFA fun to_minimal_dfa: Automaton do assert_valid trim # Graph of known distinct states. var distincts = new HashMap[State, Set[State]] for s in states do distincts[s] = new HashSet[State] end # split accept states. # An accept state is distinct with a non accept state. for s1 in accept do for s2 in states do if distincts[s1].has(s2) then continue if not accept.has(s2) then distincts[s1].add(s2) distincts[s2].add(s1) continue end if tags.get_or_null(s1) != tags.get_or_null(s2) then distincts[s1].add(s2) distincts[s2].add(s1) continue end end end # Fixed point algorithm. # * Get 2 states s1 and s2 not yet distinguished. # * Get a symbol w. # * If s1.trans(w) and s2.trans(w) are distinguished, then # distinguish s1 and s2. var changed = true var ints = new Array[Int] # List of symbols to check while changed do changed = false for s1 in states do for s2 in states do if distincts[s1].has(s2) then continue # The transitions use intervals. Therefore, for the states s1 and s2, # we need to check only the meaningful symbols. They are the `first` # symbol of each interval and the first one after the interval (`last+1`). ints.clear # Check only `s1`; `s2` will be checked later when s1 and s2 are switched. for t in s1.outs do var sym = t.symbol assert sym != null ints.add sym.first var l = sym.last if l != null then ints.add l + 1 end # Check each symbol for i in ints do var ds1 = s1.trans(i) var ds2 = s2.trans(i) if ds1 == ds2 then continue if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue distincts[s1].add(s2) distincts[s2].add(s1) changed = true break end end end # We need to unify not-distinguished states. # Just add an epsilon-transition and DFAize the automaton. for s1 in states do for s2 in states do if distincts[s1].has(s2) then continue s1.add_trans(s2, null) end return to_dfa end # Assert that `self` is a valid automaton or abort fun assert_valid do assert states.has(start) assert states.has_all(accept) for s in states do for t in s.outs do assert states.has(t.to) for t in s.ins do assert states.has(t.from) end assert states.has_all(tags.keys) for t, ss in retrotags do assert states.has_all(ss) end end # Produce a graphviz string from the automatom # # Set `merge_transitions = false` to generate one edge by transition (default true). fun to_dot(merge_transitions: nullable Bool): Writable do var names = new HashMap[State, String] var ni = 0 for s in states do names[s] = ni.to_s ni += 1 end var f = new Buffer f.append("digraph g \{\n") f.append("rankdir=LR;") var state_nb = 0 for s in states do f.append("s{names[s]}[shape=circle") #f.write("label=\"\",") if accept.has(s) then f.append(",shape=doublecircle") end if tags.has_key(s) then f.append(",label=\"") for token in tags[s] do f.append("{token.name.escape_to_dot}\\n") end f.append("\"") else f.append(",label=\"{state_nb}\"") end f.append("];\n") var outs = new HashMap[State, Array[nullable TSymbol]] for t in s.outs do var a var s2 = t.to var c = t.symbol if outs.has_key(s2) then a = outs[s2] else a = new Array[nullable TSymbol] outs[s2] = a end a.add(c) end for s2, a in outs do var labe = "" for c in a do if merge_transitions == false then labe = "" if not labe.is_empty then labe += "\n" if c == null then labe += "ε" else labe += c.to_s end if merge_transitions == false then f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_dot}\"];\n") end end if merge_transitions or else true then f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n") end end state_nb += 1 end f.append("empty->s{names[start]}; empty[label=\"\",shape=none];\n") f.append("\}\n") return f end # Transform a NFA to a DFA. # note: the DFA is not minimized. fun to_dfa: Automaton do assert_valid trim var dfa = new Automaton.empty var n2d = new ArrayMap[Set[State], State] var seen = new ArraySet[Set[State]] var alphabet = new HashSet[Int] var st = eclosure([start]) var todo = [st] n2d[st] = dfa.start seen.add(st) while not todo.is_empty do var nfa_states = todo.pop #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})" var dfa_state = n2d[nfa_states] alphabet.clear for s in nfa_states do # Collect important values to build the alphabet for t in s.outs do var sym = t.symbol if sym == null then continue alphabet.add(sym.first) var l = sym.last if l != null then alphabet.add(l) end # Mark accept and tags if accept.has(s) then if tags.has_key(s) then for t in tags[s] do dfa.add_tag(dfa_state, t) end end dfa.accept.add(dfa_state) end end # From the important values, build a sequence of TSymbols var a = alphabet.to_a default_comparator.sort(a) var tsyms = new Array[TSymbol] var last = 0 for i in a do if last > 0 and last <= i-1 then tsyms.add(new TSymbol(last,i-1)) end tsyms.add(new TSymbol(i,i)) last = i+1 end if last > 0 then tsyms.add(new TSymbol(last,null)) end #print "Alphabet: {tsyms.join(", ")}" var lastst: nullable Transition = null for sym in tsyms do var nfa_dest = eclosure(trans(nfa_states, sym.first)) if nfa_dest.is_empty then lastst = null continue end #print "{nfa_states} -> {sym} -> {nfa_dest}" var dfa_dest if seen.has(nfa_dest) then #print "* reuse {nfa_dest.inspect}={nfa_dest}" dfa_dest = n2d[nfa_dest] else #print "* new {nfa_dest.inspect}={nfa_dest}" dfa_dest = new State dfa.states.add(dfa_dest) n2d[nfa_dest] = dfa_dest todo.add(nfa_dest) seen.add(nfa_dest) end if lastst != null and lastst.to == dfa_dest then lastst.symbol.as(not null).last = sym.last else lastst = dfa_state.add_trans(dfa_dest, sym) end end end return dfa end # Transform a NFA to a epsilonless NFA. fun to_nfa_noe: Automaton do assert_valid trim var dfa = new Automaton.empty var n2d = new ArrayMap[Set[State], State] var seen = new ArraySet[Set[State]] var st = eclosure([start]) var todo = [st] n2d[st] = dfa.start seen.add(st) while not todo.is_empty do var nfa_states = todo.pop #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})" var dfa_state = n2d[nfa_states] for s in nfa_states do for t in s.outs do if t.symbol == null then continue var nfa_dest = eclosure([t.to]) #print "{nfa_states} -> {sym} -> {nfa_dest}" var dfa_dest if seen.has(nfa_dest) then #print "* reuse {nfa_dest.inspect}={nfa_dest}" dfa_dest = n2d[nfa_dest] else #print "* new {nfa_dest.inspect}={nfa_dest}" dfa_dest = new State dfa.states.add(dfa_dest) n2d[nfa_dest] = dfa_dest todo.add(nfa_dest) seen.add(nfa_dest) end dfa_state.add_trans(dfa_dest, t.symbol) end # Mark accept and tags if accept.has(s) then if tags.has_key(s) then for t in tags[s] do dfa.add_tag(dfa_state, t) end end dfa.accept.add(dfa_state) end end end return dfa end # Epsilon-closure on a state of states. # Used by `to_dfa`. private fun eclosure(states: Collection[State]): Set[State] do var res = new ArraySet[State] res.add_all(states) var todo = states.to_a while not todo.is_empty do var s = todo.pop for t in s.outs do if t.symbol != null then continue var to = t.to if res.has(to) then continue res.add(to) todo.add(to) end end return res end # Trans on a set of states. # Used by `to_dfa`. fun trans(states: Collection[State], symbol: Int): Set[State] do var res = new ArraySet[State] for s in states do for t in s.outs do var sym = t.symbol if sym == null then continue if sym.first > symbol then continue var l = sym.last if l != null and l < symbol then continue var to = t.to if res.has(to) then continue res.add(to) end end return res end # Generate the Nit source code of the lexer. # `filepath` is the name of the output file. # `parser` is the name of the parser module (used to import the token classes). fun gen_to_nit(filepath: String, name: String, parser: nullable String) do var gen = new DFAGenerator(filepath, name, self, parser) gen.gen_to_nit end end # Generate the Nit source code of the lexer private class DFAGenerator var filepath: String var name: String var automaton: Automaton var parser: nullable String var out: Writer is noinit init do self.out = new FileWriter.open(filepath) end fun add(s: String) do out.write(s) fun gen_to_nit do var names = new HashMap[State, String] var i = 0 for s in automaton.states do names[s] = i.to_s i += 1 end add "# Lexer generated by nitcc for the grammar {name}\n" add "module {name}_lexer is generated, no_warning \"missing-doc\"\n" add("import nitcc_runtime\n") var p = parser if p != null then add("import {p}\n") add("class Lexer_{name}\n") add("\tsuper Lexer\n") add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n") add("end\n") for s in automaton.states do var n = names[s] add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n") end add("class MyNToken\n") add("\tsuper NToken\n") add("end\n") for s in automaton.states do var n = names[s] add("private class DFAState{n}\n") add("\tsuper DFAState\n") if automaton.accept.has(s) then var token if automaton.tags.has_key(s) then token = automaton.tags[s].first else token = null end add("\tredef fun is_accept do return true\n") var is_ignored = false if token != null and token.name == "Ignored" then is_ignored = true add("\tredef fun is_ignored do return true\n") end add("\tredef fun make_token(position, source) do\n") if is_ignored then add("\t\treturn null\n") else if token == null then add("\t\tvar t = new MyNToken\n") add("\t\tt.text = position.extract(source)\n") else add("\t\tvar t = new {token.cname}\n") var ttext = token.text if ttext == null then add("\t\tt.text = position.extract(source)\n") else add("\t\tt.text = \"{ttext.escape_to_nit}\"\n") end end add("\t\tt.position = position\n") add("\t\treturn t\n") end add("\tend\n") end var trans = new ArrayMap[TSymbol, State] for t in s.outs do var sym = t.symbol assert sym != null trans[sym] = t.to end if trans.is_empty then # Do nothing, inherit the trans else add("\tredef fun trans(char) do\n") # Collect the sequence of tests in the dispatch sequence # The point here is that for each transition, there is a first and a last # So holes have to be identified var dispatch = new HashMap[Int, nullable State] var haslast: nullable State = null var last = -1 for sym, next in trans do assert haslast == null assert sym.first > last if sym.first > last + 1 then dispatch[sym.first-1] = null end var l = sym.last if l == null then haslast = next else dispatch[l] = next last = l end end if dispatch.is_empty and haslast != null then # Only one transition that accepts everything (quite rare) else # We need to check add("\t\tvar c = char.code_point\n") end # Generate a sequence of `if` for the dispatch if haslast != null and last >= 0 then # Special case: handle up-bound first if not an error add("\t\tif c > {last} then return dfastate_{names[haslast]}\n") # previous become the new last case haslast = dispatch[last] dispatch.keys.remove(last) end for c, next in dispatch do if next == null then add("\t\tif c <= {c} then return null\n") else add("\t\tif c <= {c} then return dfastate_{names[next]}\n") end end if haslast == null then add("\t\treturn null\n") else add("\t\treturn dfastate_{names[haslast]}\n") end add("\tend\n") end add("end\n") end self.out.close end end redef class Token # The associated text (if any, ie defined in the parser part) var text: nullable String is noautoinit, writable end # A state in a finite automaton class State # Outgoing transitions var outs = new Array[Transition] # Ingoing transitions var ins = new Array[Transition] # Add a transitions to `to` on `symbol` (null means epsilon) fun add_trans(to: State, symbol: nullable TSymbol): Transition do var t = new Transition(self, to, symbol) outs.add(t) to.ins.add(t) return t end # Get the first state following the transition `i`. # Null if no transition for `i`. fun trans(i: Int): nullable State do for t in outs do var sym = t.symbol assert sym != null var f = sym.first var l = sym.last if i < f then continue if l != null and i > l then continue return t.to end return null end end # A range of symbols on a transition class TSymbol # The first symbol in the range var first: Int # The last symbol if any. # # `null` means infinity. var last: nullable Int redef fun to_s do var res var f = first if f <= 32 then res = "#{f}" else res = f.code_point.to_s end var l = last if f == l then return res res += " .. " if l == null then return res if l <= 32 or l >= 127 then return res + "#{l}" return res + l.code_point.to_s end end # A transition in a finite automaton class Transition # The source state var from: State # The destination state var to: State # The symbol on the transition (null means epsilon) var symbol: nullable TSymbol # Remove the transition from the automaton. # Detach from `from` and `to`. fun delete do from.outs.remove(self) to.ins.remove(self) end end