X-Git-Url: http://nitlanguage.org diff --git a/contrib/nitcc/src/autom.nit b/contrib/nitcc/src/autom.nit index 2e40556..aa8c218 100644 --- a/contrib/nitcc/src/autom.nit +++ b/contrib/nitcc/src/autom.nit @@ -21,20 +21,20 @@ import grammar # A finite automaton class Automaton # The start state - var start: State + var start: State is noinit - # State that are accect states + # State that are accept states var accept = new Array[State] # All states var states = new Array[State] - # Tokens associated on accept states - # use `add_tag` to update + # Tokens associated on accept states. + # Use `add_tag` to update var tags = new HashMap[State, Set[Token]] - # Accept states associated on tokens - # use `add_tag` to update + # Accept states associated on tokens. + # Use `add_tag` to update var retrotags = new HashMap[Token, Set[State]] # Tag all accept states @@ -66,7 +66,7 @@ class Automaton assert retrotags[t].has(s) end - # Remove all occurences of a tag in an automaton + # Remove all occurrences of a tag in an automaton fun clear_tag(t: Token) do if not retrotags.has_key(t) then return @@ -78,7 +78,7 @@ class Automaton retrotags.keys.remove(t) end - # Remove tokens from conflicting state according the the inclusion of language + # Remove tokens from conflicting state according the inclusion of language. # REQUIRE: self isa DFA automaton fun solve_token_inclusion do @@ -101,8 +101,8 @@ class Automaton end end - # Initialize a new automaton for the empty language - # one state, no accept, no transition + # Initialize a new automaton for the empty language. + # One state, no accept, no transition. init empty do var state = new State @@ -110,8 +110,8 @@ class Automaton states.add state end - # Initialize a new automaton for the empty-string language - # one state, is accept, no transition + # Initialize a new automaton for the empty-string language. + # One state, is accept, no transition. init epsilon do var state = new State @@ -120,8 +120,8 @@ class Automaton states.add state end - # Initialize a new automation for the language that accepts only a single symbol - # Two state, the second is accept, one transition on `symbol` + # Initialize a new automation for the language that accepts only a single symbol. + # Two state, the second is accept, one transition on `symbol`. init atom(symbol: Int) do var s = new State @@ -148,8 +148,8 @@ class Automaton states.add a end - # Contatenate `other` to `self` - # other is modified and invalidated. + # Concatenate `other` to `self`. + # Other is modified and invalidated. fun concat(other: Automaton) do var s2 = other.start @@ -160,7 +160,7 @@ class Automaton states.add_all other.states end - # `self` become the alternation of `self` and `other` + # `self` become the alternation of `self` and `other`. # `other` is modified and invalidated. fun alternate(other: Automaton) do @@ -184,8 +184,33 @@ class Automaton states.add_all other.states end - # `self` absorbs all states, transisions, tags, and acceptations of `other` - # An epsilon transition is added between `self.start` and `other.start` + # Return a new automaton that recognize `self` but not `other`. + # For a theoretical POV, this is the subtraction of languages. + # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap. + fun except(other: Automaton): Automaton + do + var ta = new Token("1") + self.tag_accept(ta) + var tb = new Token("2") + other.tag_accept(tb) + + var c = new Automaton.empty + c.absorb(self) + c.absorb(other) + c = c.to_dfa + c.accept.clear + for s in c.retrotags[ta] do + if not c.tags[s].has(tb) then + c.accept.add(s) + end + end + c.clear_tag(ta) + c.clear_tag(tb) + return c + end + + # `self` absorbs all states, transitions, tags, and acceptations of `other`. + # An epsilon transition is added between `self.start` and `other.start`. fun absorb(other: Automaton) do states.add_all other.states @@ -307,16 +332,59 @@ class Automaton accept.add(st) end + # Remove states (and transitions) that does not reach an accept state + fun trim + do + # Good states are those we want to keep + var goods = new HashSet[State] + goods.add_all(accept) + + var todo = accept.to_a + + # Propagate goodness + while not todo.is_empty do + var s = todo.pop + for t in s.ins do + var s2 = t.from + if goods.has(s2) then continue + goods.add(s2) + todo.add(s2) + end + end + + # What are the bad state then? + var bads = new Array[State] + for s in states do + if not goods.has(s) then bads.add(s) + end + + # Remove their transitions + for s in bads do + for t in s.ins.to_a do t.delete + for t in s.outs.to_a do t.delete + end + + # Keep only the good stuff + states.clear + states.add_all(goods) + end + # Generate a minimal DFA # REQUIRE: self is a DFA fun to_minimal_dfa: Automaton do + assert_valid + + trim + + # Graph of known distinct states. var distincts = new HashMap[State, Set[State]] for s in states do distincts[s] = new HashSet[State] end - # split accept states + # split accept states. + # An accept state is distinct with a non accept state. for s1 in states do for s2 in states do if distincts[s1].has(s2) then continue @@ -326,7 +394,7 @@ class Automaton distincts[s2].add(s1) continue end - if tags[s1] != tags[s2] then + if tags.get_or_null(s1) != tags.get_or_null(s2) then distincts[s1].add(s2) distincts[s2].add(s1) continue @@ -334,24 +402,36 @@ class Automaton end end + # Fixed point algorithm. + # * Get 2 states s1 and s2 not yet distinguished. + # * Get a symbol w. + # * If s1.trans(w) and s2.trans(w) are distinguished, then + # distinguish s1 and s2. var changed = true - var ints = new Array[Int] + var ints = new Array[Int] # List of symbols to check while changed do changed = false for s1 in states do for s2 in states do if distincts[s1].has(s2) then continue + + # The transitions use intervals. Therefore, for the states s1 and s2, + # we need to check only the meaningful symbols. They are the `first` + # symbol of each interval and the first one after the interval (`last+1`). ints.clear + # Check only `s1`; `s2` will be checked later when s1 and s2 are switched. for t in s1.outs do var sym = t.symbol assert sym != null ints.add sym.first var l = sym.last - if l != null then ints.add l + if l != null then ints.add l + 1 end + + # Check each symbol for i in ints do var ds1 = s1.trans(i) var ds2 = s2.trans(i) - if ds1 == null and ds2 == null then continue + if ds1 == ds2 then continue if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue distincts[s1].add(s2) distincts[s2].add(s1) @@ -360,8 +440,9 @@ class Automaton end end end - print '2' + # We need to unify not-distinguished states. + # Just add an epsilon-transition and DFAize the automaton. for s1 in states do for s2 in states do if distincts[s1].has(s2) then continue s1.add_trans(s2, null) @@ -370,14 +451,36 @@ class Automaton return to_dfa end + # Assert that `self` is a valid automaton or abort + fun assert_valid + do + assert states.has(start) + assert states.has_all(accept) + for s in states do + for t in s.outs do assert states.has(t.to) + for t in s.ins do assert states.has(t.from) + end + assert states.has_all(tags.keys) + for t, ss in retrotags do + assert states.has_all(ss) + end + end + # Produce a graphvis file for the automaton fun to_dot(filepath: String) do - var f = new OFStream.open(filepath) + var names = new HashMap[State, String] + var ni = 0 + for s in states do + names[s] = ni.to_s + ni += 1 + end + + var f = new FileWriter.open(filepath) f.write("digraph g \{\n") for s in states do - f.write("s{s.object_id}[shape=oval") + f.write("s{names[s]}[shape=oval") #f.write("label=\"\",") if accept.has(s) then f.write(",color=blue") @@ -388,6 +491,8 @@ class Automaton f.write("{token.name.escape_to_c}\\n") end f.write("\"") + else + f.write(",label=\"\"") end f.write("];\n") var outs = new HashMap[State, Array[nullable TSymbol]] @@ -413,19 +518,23 @@ class Automaton labe += c.to_s end end - f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n") + f.write("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n") end end - f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n") + f.write("empty->s{names[start]}; empty[label=\"\",shape=none];\n") f.write("\}\n") f.close end - # Transform a NFA to a DFA - # note: the DFA is not miminized + # Transform a NFA to a DFA. + # note: the DFA is not minimized. fun to_dfa: Automaton do + assert_valid + + trim + var dfa = new Automaton.empty var n2d = new ArrayMap[Set[State], State] var seen = new ArraySet[Set[State]] @@ -462,7 +571,7 @@ class Automaton # From the important values, build a sequence of TSymbols var a = alphabet.to_a - (new ComparableSorter[Int]).sort(a) + default_comparator.sort(a) var tsyms = new Array[TSymbol] var last = 0 for i in a do @@ -507,8 +616,8 @@ class Automaton return dfa end - # epsilon-closure on a state of states - # used by `to_dfa` + # Epsilon-closure on a state of states. + # Used by `to_dfa`. private fun eclosure(states: Collection[State]): Set[State] do var res = new ArraySet[State] @@ -527,8 +636,8 @@ class Automaton return res end - # trans on a set of states - # Used by `to_dfa` + # Trans on a set of states. + # Used by `to_dfa`. fun trans(states: Collection[State], symbol: Int): Set[State] do var res = new ArraySet[State] @@ -547,9 +656,9 @@ class Automaton return res end - # Generate the Nit source code of the lexer - # `filepath` is the name of the ouptit file - # `parser` is the name of the parser module (used to import the token classes) + # Generate the Nit source code of the lexer. + # `filepath` is the name of the output file. + # `parser` is the name of the parser module (used to import the token classes). fun gen_to_nit(filepath: String, name: String, parser: nullable String) do var gen = new DFAGenerator(filepath, name, self, parser) @@ -564,13 +673,10 @@ private class DFAGenerator var automaton: Automaton var parser: nullable String - var out: OStream - init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do - self.filepath = filepath - self.name = name - self.automaton = automaton - self.parser = parser - self.out = new OFStream.open(filepath) + var out: Writer is noinit + + init do + self.out = new FileWriter.open(filepath) end fun add(s: String) do out.write(s) @@ -584,7 +690,8 @@ private class DFAGenerator i += 1 end - add "# Lexer generated by nitcc for the grammar {name}" + add "# Lexer generated by nitcc for the grammar {name}\n" + add "module {name}_lexer is generated, no_warning \"missing-doc\"\n" add("import nitcc_runtime\n") var p = parser @@ -595,12 +702,10 @@ private class DFAGenerator add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n") add("end\n") - add("redef class Object\n") for s in automaton.states do var n = names[s] - add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n") + add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n") end - add("end\n") add("class MyNToken\n") add("\tsuper NToken\n") @@ -608,7 +713,7 @@ private class DFAGenerator for s in automaton.states do var n = names[s] - add("class DFAState{n}\n") + add("private class DFAState{n}\n") add("\tsuper DFAState\n") if automaton.accept.has(s) then var token @@ -618,17 +723,28 @@ private class DFAGenerator token = null end add("\tredef fun is_accept do return true\n") - add("\tredef fun make_token(position, text) do\n") + var is_ignored = false if token != null and token.name == "Ignored" then + is_ignored = true + add("\tredef fun is_ignored do return true\n") + end + add("\tredef fun make_token(position, source) do\n") + if is_ignored then add("\t\treturn null\n") else if token == null then add("\t\tvar t = new MyNToken\n") + add("\t\tt.text = position.extract(source)\n") else add("\t\tvar t = new {token.cname}\n") + var ttext = token.text + if ttext == null then + add("\t\tt.text = position.extract(source)\n") + else + add("\t\tt.text = \"{ttext.escape_to_nit}\"\n") + end end add("\t\tt.position = position\n") - add("\t\tt.text = text\n") add("\t\treturn t\n") end add("\tend\n") @@ -644,23 +760,56 @@ private class DFAGenerator else add("\tredef fun trans(char) do\n") - add("\t\tvar c = char.ascii\n") - var haslast = false + # Collect the sequence of tests in the dispatch sequence + # The point here is that for each transition, there is a first and a last + # So holes have to be identified + var dispatch = new HashMap[Int, nullable State] + var haslast: nullable State = null + var last = -1 for sym, next in trans do - assert not haslast + assert haslast == null assert sym.first > last - if sym.first > last + 1 then add("\t\tif c <= {sym.first-1} then return null\n") + if sym.first > last + 1 then + dispatch[sym.first-1] = null + end var l = sym.last if l == null then - add("\t\treturn dfastate_{names[next]}\n") - haslast= true + haslast = next else - add("\t\tif c <= {l} then return dfastate_{names[next]}\n") + dispatch[l] = next last = l end end - if not haslast then add("\t\treturn null\n") + + if dispatch.is_empty and haslast != null then + # Only one transition that accepts everything (quite rare) + else + # We need to check + add("\t\tvar c = char.code_point\n") + end + + # Generate a sequence of `if` for the dispatch + if haslast != null and last >= 0 then + # Special case: handle up-bound first if not an error + add("\t\tif c > {last} then return dfastate_{names[haslast]}\n") + # previous become the new last case + haslast = dispatch[last] + dispatch.keys.remove(last) + end + for c, next in dispatch do + if next == null then + add("\t\tif c <= {c} then return null\n") + else + add("\t\tif c <= {c} then return dfastate_{names[next]}\n") + end + end + if haslast == null then + add("\t\treturn null\n") + else + add("\t\treturn dfastate_{names[haslast]}\n") + end + add("\tend\n") end add("end\n") @@ -670,12 +819,17 @@ private class DFAGenerator end end +redef class Token + # The associated text (if any, ie defined in the parser part) + var text: nullable String is noautoinit, writable +end + # A state in a finite automaton class State # Outgoing transitions - var outs = new Array[Transition] - # Ingoing tyransitions + + # Ingoing transitions var ins = new Array[Transition] # Add a transitions to `to` on `symbol` (null means epsilon) @@ -687,6 +841,8 @@ class State return t end + # Get the first state following the transition `i`. + # Null if no transition for `i`. fun trans(i: Int): nullable State do for t in outs do @@ -704,7 +860,12 @@ end # A range of symbols on a transition class TSymbol + # The first symbol in the range var first: Int + + # The last symbol if any. + # + # `null` means infinity. var last: nullable Int redef fun to_s @@ -714,14 +875,14 @@ class TSymbol if f <= 32 then res = "#{f}" else - res = f.ascii.to_s + res = f.code_point.to_s end var l = last if f == l then return res res += " .. " if l == null then return res if l <= 32 or l >= 127 then return res + "#{l}" - return res + l.ascii.to_s + return res + l.code_point.to_s end end @@ -734,8 +895,8 @@ class Transition # The symbol on the transition (null means epsilon) var symbol: nullable TSymbol - # Remove the transition from the automaton - # Detash from `from` and `to` + # Remove the transition from the automaton. + # Detach from `from` and `to`. fun delete do from.outs.remove(self)