# A finite automaton
class Automaton
# The start state
- var start: State
+ var start: State is noinit
- # State that are accect states
+ # State that are accept states
var accept = new Array[State]
# All states
var states = new Array[State]
- # Tokens associated on accept states
- # use `add_tag` to update
+ # Tokens associated on accept states.
+ # Use `add_tag` to update
var tags = new HashMap[State, Set[Token]]
- # Accept states associated on tokens
- # use `add_tag` to update
+ # Accept states associated on tokens.
+ # Use `add_tag` to update
var retrotags = new HashMap[Token, Set[State]]
# Tag all accept states
assert retrotags[t].has(s)
end
- # Remove all occurences of a tag in an automaton
+ # Remove all occurrences of a tag in an automaton
fun clear_tag(t: Token)
do
if not retrotags.has_key(t) then return
retrotags.keys.remove(t)
end
- # Remove tokens from conflicting state according the the inclusion of language
+ # Remove tokens from conflicting state according the inclusion of language.
# REQUIRE: self isa DFA automaton
fun solve_token_inclusion
do
end
end
- # Initialize a new automaton for the empty language
- # one state, no accept, no transition
+ # Initialize a new automaton for the empty language.
+ # One state, no accept, no transition.
init empty
do
var state = new State
states.add state
end
- # Initialize a new automaton for the empty-string language
- # one state, is accept, no transition
+ # Initialize a new automaton for the empty-string language.
+ # One state, is accept, no transition.
init epsilon
do
var state = new State
states.add state
end
- # Initialize a new automation for the language that accepts only a single symbol
- # Two state, the second is accept, one transition on `symbol`
+ # Initialize a new automation for the language that accepts only a single symbol.
+ # Two state, the second is accept, one transition on `symbol`.
init atom(symbol: Int)
do
var s = new State
var a = new State
- s.add_trans(a, symbol)
+ var sym = new TSymbol(symbol, symbol)
+ s.add_trans(a, sym)
start = s
accept.add a
states.add s
# Initialize a new automation for the language that accepts only a range of symbols
# Two state, the second is accept, one transition for `from` to `to`
- init cla(from, to: Int)
+ init cla(first: Int, last: nullable Int)
do
var s = new State
var a = new State
- for symbol in [from..to] do
- s.add_trans(a, symbol)
- end
+ var sym = new TSymbol(first, last)
+ s.add_trans(a, sym)
start = s
accept.add a
states.add s
states.add a
end
- # Contatenate `other` to `self`
- # other is modified and invalidated.
+ # Concatenate `other` to `self`.
+ # Other is modified and invalidated.
fun concat(other: Automaton)
do
var s2 = other.start
states.add_all other.states
end
- # `self` become the alternation of `self` and `other`
+ # `self` become the alternation of `self` and `other`.
# `other` is modified and invalidated.
fun alternate(other: Automaton)
do
states.add_all other.states
end
- # `self` absorbs all states, transisions, tags, and acceptations of `other`
- # An epsilon transition is added between `self.start` and `other.start`
+ # Return a new automaton that recognize `self` but not `other`.
+ # For a theoretical POV, this is the subtraction of languages.
+ # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap.
+ fun except(other: Automaton): Automaton
+ do
+ var ta = new Token("1")
+ self.tag_accept(ta)
+ var tb = new Token("2")
+ other.tag_accept(tb)
+
+ var c = new Automaton.empty
+ c.absorb(self)
+ c.absorb(other)
+ c = c.to_dfa
+ c.accept.clear
+ for s in c.retrotags[ta] do
+ if not c.tags[s].has(tb) then
+ c.accept.add(s)
+ end
+ end
+ c.clear_tag(ta)
+ c.clear_tag(tb)
+ return c
+ end
+
+ # `self` absorbs all states, transitions, tags, and acceptations of `other`.
+ # An epsilon transition is added between `self.start` and `other.start`.
fun absorb(other: Automaton)
do
states.add_all other.states
end
# Remove all transitions on a given symbol
- fun minus_sym(symbol: Int)
+ fun minus_sym(symbol: TSymbol)
do
+ var f = symbol.first
+ var l = symbol.last
for s in states do
for t in s.outs.to_a do
- if t.symbol == symbol then t.delete
+ if t.symbol == null then continue
+
+ # Check overlaps
+ var tf = t.symbol.as(not null).first
+ var tl = t.symbol.as(not null).last
+ if l != null and tf > l then continue
+ if tl != null and f > tl then continue
+
+ t.delete
+
+ # Add left and right part if non empty
+ if tf < f then
+ var sym = new TSymbol(tf,f-1)
+ s.add_trans(t.to, sym)
+ end
+ if l != null then
+ if tl == null then
+ var sym = new TSymbol(l+1, null)
+ s.add_trans(t.to, sym)
+ else if tl > l then
+ var sym = new TSymbol(l+1, tl)
+ s.add_trans(t.to, sym)
+ end
+ end
end
end
end
return res
end
- # Produce a graphvis file for the automaton
- fun to_dot(filepath: String)
+ # Reverse an automaton in place
+ fun reverse
+ do
+ for s in states do
+ var tmp = s.ins
+ s.ins = s.outs
+ s.outs = tmp
+ for t in s.outs do
+ var tmp2 = t.from
+ t.from = t.to
+ t.to = tmp2
+ end
+ end
+ var st = start
+ if accept.length == 1 then
+ start = accept.first
+ else
+ var st2 = new State
+ start = st2
+ states.add(st2)
+
+ for s in accept do
+ st2.add_trans(s, null)
+ end
+ end
+ accept.clear
+ accept.add(st)
+ end
+
+ # Remove states (and transitions) that does not reach an accept state
+ fun trim
+ do
+ # Good states are those we want to keep
+ var goods = new HashSet[State]
+ goods.add_all(accept)
+
+ var todo = accept.to_a
+
+ # Propagate goodness
+ while not todo.is_empty do
+ var s = todo.pop
+ for t in s.ins do
+ var s2 = t.from
+ if goods.has(s2) then continue
+ goods.add(s2)
+ todo.add(s2)
+ end
+ end
+
+ # What are the bad state then?
+ var bads = new Array[State]
+ for s in states do
+ if not goods.has(s) then bads.add(s)
+ end
+
+ # Remove their transitions and tags
+ for s in bads do
+ for t in s.ins.to_a do t.delete
+ for t in s.outs.to_a do t.delete
+ if tags.has_key(s) then
+ for t in tags[s] do retrotags[t].remove(s)
+ tags.keys.remove(s)
+ end
+ end
+
+ # Keep only the good stuff
+ states.clear
+ states.add_all(goods)
+ states.add(start)
+ end
+
+ # Generate a minimal DFA
+ # REQUIRE: self is a DFA
+ fun to_minimal_dfa: Automaton
+ do
+ assert_valid
+
+ trim
+
+ # Graph of known distinct states.
+ var distincts = new HashMap[State, Set[State]]
+ for s in states do
+ distincts[s] = new HashSet[State]
+ end
+
+ # split accept states.
+ # An accept state is distinct with a non accept state.
+ for s1 in accept do
+ for s2 in states do
+ if distincts[s1].has(s2) then continue
+ if not accept.has(s2) then
+ distincts[s1].add(s2)
+ distincts[s2].add(s1)
+ continue
+ end
+ if tags.get_or_null(s1) != tags.get_or_null(s2) then
+ distincts[s1].add(s2)
+ distincts[s2].add(s1)
+ continue
+ end
+ end
+ end
+
+ # Fixed point algorithm.
+ # * Get 2 states s1 and s2 not yet distinguished.
+ # * Get a symbol w.
+ # * If s1.trans(w) and s2.trans(w) are distinguished, then
+ # distinguish s1 and s2.
+ var changed = true
+ var ints = new Array[Int] # List of symbols to check
+ while changed do
+ changed = false
+ for s1 in states do for s2 in states do
+ if distincts[s1].has(s2) then continue
+
+ # The transitions use intervals. Therefore, for the states s1 and s2,
+ # we need to check only the meaningful symbols. They are the `first`
+ # symbol of each interval and the first one after the interval (`last+1`).
+ ints.clear
+ # Check only `s1`; `s2` will be checked later when s1 and s2 are switched.
+ for t in s1.outs do
+ var sym = t.symbol
+ assert sym != null
+ ints.add sym.first
+ var l = sym.last
+ if l != null then ints.add l + 1
+ end
+
+ # Check each symbol
+ for i in ints do
+ var ds1 = s1.trans(i)
+ var ds2 = s2.trans(i)
+ if ds1 == ds2 then continue
+ if ds1 != null and ds2 != null and not distincts[ds1].has(ds2) then continue
+ distincts[s1].add(s2)
+ distincts[s2].add(s1)
+ changed = true
+ break
+ end
+ end
+ end
+
+ # We need to unify not-distinguished states.
+ # Just add an epsilon-transition and DFAize the automaton.
+ for s1 in states do for s2 in states do
+ if distincts[s1].has(s2) then continue
+ s1.add_trans(s2, null)
+ end
+
+ return to_dfa
+ end
+
+ # Assert that `self` is a valid automaton or abort
+ fun assert_valid
+ do
+ assert states.has(start)
+ assert states.has_all(accept)
+ for s in states do
+ for t in s.outs do assert states.has(t.to)
+ for t in s.ins do assert states.has(t.from)
+ end
+ assert states.has_all(tags.keys)
+ for t, ss in retrotags do
+ assert states.has_all(ss)
+ end
+ end
+
+ # Produce a graphviz string from the automatom
+ #
+ # Set `merge_transitions = false` to generate one edge by transition (default true).
+ fun to_dot(merge_transitions: nullable Bool): Writable
do
- var f = new OFStream.open(filepath)
- f.write("digraph g \{\n")
+ var names = new HashMap[State, String]
+ var ni = 0
+ for s in states do
+ names[s] = ni.to_s
+ ni += 1
+ end
+
+ var f = new Buffer
+ f.append("digraph g \{\n")
+ f.append("rankdir=LR;")
+ var state_nb = 0
for s in states do
- f.write("s{s.object_id}[")
+ f.append("s{names[s]}[shape=circle")
#f.write("label=\"\",")
if accept.has(s) then
- f.write("color=blue")
- if tags.has_key(s) then
- f.write(",label=\"")
- for token in tags[s] do
- f.write("{token.name.escape_to_c}\\n")
- end
- f.write("\"")
+ f.append(",shape=doublecircle")
+ end
+ if tags.has_key(s) then
+ f.append(",label=\"")
+ for token in tags[s] do
+ f.append("{token.name.escape_to_dot}\\n")
end
+ f.append("\"")
+ else
+ f.append(",label=\"{state_nb}\"")
end
- f.write("];\n")
- var outs = new HashMap[State, Array[nullable Int]]
+ f.append("];\n")
+ var outs = new HashMap[State, Array[nullable TSymbol]]
for t in s.outs do
var a
var s2 = t.to
if outs.has_key(s2) then
a = outs[s2]
else
- a = new Array[nullable Int]
+ a = new Array[nullable TSymbol]
outs[s2] = a
end
a.add(c)
end
for s2, a in outs do
var labe = ""
- var lastc: nullable Int = null
- var elip = 0
- a.add(-1)
for c in a do
+ if merge_transitions == false then labe = ""
+ if not labe.is_empty then labe += "\n"
if c == null then
- if not labe.is_empty then labe += "\n"
- labe += "''"
- else if lastc == c - 1 then
- elip += 1
- lastc = c
+ labe += "ε"
else
- if elip == 1 then
- assert lastc != null
- labe += "\n{sym_to_s(lastc)}"
- else if elip > 1 then
- assert lastc != null
- labe += " .. {sym_to_s(lastc)}"
- end
- if c == -1 then break
- if not labe.is_empty then labe += "\n"
- labe += sym_to_s(c)
- lastc = c
+ labe += c.to_s
+ end
+ if merge_transitions == false then
+ f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_dot}\"];\n")
end
end
- f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n")
+ if merge_transitions or else true then
+ f.append("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\"];\n")
+ end
end
+ state_nb += 1
end
- f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n")
-
- f.write("\}\n")
- f.close
- end
-
- # Transform a symbol to a string
- # Used by `to_dot`
- private fun sym_to_s(symbol: nullable Int): String
- do
- if symbol == null then
- return "''"
- else if symbol <= 32 then
- return "#{symbol}"
- else
- return symbol.ascii.to_s
- end
+ f.append("empty->s{names[start]}; empty[label=\"\",shape=none];\n")
+ f.append("\}\n")
+ return f
end
- # Transform a NFA to a DFA
- # note: the DFA is not miminized
+ # Transform a NFA to a DFA.
+ # note: the DFA is not minimized.
fun to_dfa: Automaton
do
+ assert_valid
+
+ trim
+
var dfa = new Automaton.empty
var n2d = new ArrayMap[Set[State], State]
- var seen = new ArraySet[Set[State]]
- var ts = new HashSet[Int]
+ var seen = new ArraySet[Set[State]]
+ var alphabet = new HashSet[Int]
var st = eclosure([start])
var todo = [st]
n2d[st] = dfa.start
var nfa_states = todo.pop
#print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
var dfa_state = n2d[nfa_states]
- ts.clear
+ alphabet.clear
for s in nfa_states do
+ # Collect important values to build the alphabet
+ for t in s.outs do
+ var sym = t.symbol
+ if sym == null then continue
+ alphabet.add(sym.first)
+ var l = sym.last
+ if l != null then alphabet.add(l)
+ end
+
+ # Mark accept and tags
if accept.has(s) then
if tags.has_key(s) then
for t in tags[s] do
end
dfa.accept.add(dfa_state)
end
+ end
+
+ # From the important values, build a sequence of TSymbols
+ var a = alphabet.to_a
+ default_comparator.sort(a)
+ var tsyms = new Array[TSymbol]
+ var last = 0
+ for i in a do
+ if last > 0 and last <= i-1 then
+ tsyms.add(new TSymbol(last,i-1))
+ end
+ tsyms.add(new TSymbol(i,i))
+ last = i+1
+ end
+ if last > 0 then
+ tsyms.add(new TSymbol(last,null))
+ end
+ #print "Alphabet: {tsyms.join(", ")}"
+
+ var lastst: nullable Transition = null
+ for sym in tsyms do
+ var nfa_dest = eclosure(trans(nfa_states, sym.first))
+ if nfa_dest.is_empty then
+ lastst = null
+ continue
+ end
+ #print "{nfa_states} -> {sym} -> {nfa_dest}"
+ var dfa_dest
+ if seen.has(nfa_dest) then
+ #print "* reuse {nfa_dest.inspect}={nfa_dest}"
+ dfa_dest = n2d[nfa_dest]
+ else
+ #print "* new {nfa_dest.inspect}={nfa_dest}"
+ dfa_dest = new State
+ dfa.states.add(dfa_dest)
+ n2d[nfa_dest] = dfa_dest
+ todo.add(nfa_dest)
+ seen.add(nfa_dest)
+ end
+ if lastst != null and lastst.to == dfa_dest then
+ lastst.symbol.as(not null).last = sym.last
+ else
+ lastst = dfa_state.add_trans(dfa_dest, sym)
+ end
+ end
+ end
+ return dfa
+ end
+
+ # Transform a NFA to a epsilonless NFA.
+ fun to_nfa_noe: Automaton
+ do
+ assert_valid
+
+ trim
+
+ var dfa = new Automaton.empty
+ var n2d = new ArrayMap[Set[State], State]
+ var seen = new ArraySet[Set[State]]
+ var st = eclosure([start])
+ var todo = [st]
+ n2d[st] = dfa.start
+ seen.add(st)
+ while not todo.is_empty do
+ var nfa_states = todo.pop
+ #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
+ var dfa_state = n2d[nfa_states]
+ for s in nfa_states do
for t in s.outs do
- var sym = t.symbol
- if sym == null or ts.has(sym) then continue
- ts.add(sym)
- var nfa_dest = eclosure(trans(nfa_states, sym))
+ if t.symbol == null then continue
+ var nfa_dest = eclosure([t.to])
#print "{nfa_states} -> {sym} -> {nfa_dest}"
var dfa_dest
if seen.has(nfa_dest) then
todo.add(nfa_dest)
seen.add(nfa_dest)
end
- dfa_state.add_trans(dfa_dest, sym)
+ dfa_state.add_trans(dfa_dest, t.symbol)
+ end
+
+ # Mark accept and tags
+ if accept.has(s) then
+ if tags.has_key(s) then
+ for t in tags[s] do
+ dfa.add_tag(dfa_state, t)
+ end
+ end
+ dfa.accept.add(dfa_state)
end
end
end
return dfa
end
- # epsilon-closure on a state of states
- # used by `to_dfa`
+ # Epsilon-closure on a state of states.
+ # Used by `to_dfa`.
private fun eclosure(states: Collection[State]): Set[State]
do
var res = new ArraySet[State]
for t in s.outs do
if t.symbol != null then continue
var to = t.to
- if res.has(to) then continue
+ if res.has(to) then continue
res.add(to)
todo.add(to)
end
return res
end
- # trans on a set of states
- # Used by `to_dfa`
+ # Trans on a set of states.
+ # Used by `to_dfa`.
fun trans(states: Collection[State], symbol: Int): Set[State]
do
var res = new ArraySet[State]
for s in states do
for t in s.outs do
- if t.symbol != symbol then continue
+ var sym = t.symbol
+ if sym == null then continue
+ if sym.first > symbol then continue
+ var l = sym.last
+ if l != null and l < symbol then continue
var to = t.to
- if res.has(to) then continue
+ if res.has(to) then continue
res.add(to)
end
end
return res
end
- # Generate the Nit source code of the lexer
- # `filepath` is the name of the ouptit file
- # `parser` is the name of the parser module (used to import the token classes)
+ # Generate the Nit source code of the lexer.
+ # `filepath` is the name of the output file.
+ # `parser` is the name of the parser module (used to import the token classes).
fun gen_to_nit(filepath: String, name: String, parser: nullable String)
do
var gen = new DFAGenerator(filepath, name, self, parser)
var automaton: Automaton
var parser: nullable String
- var out: OStream
- init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do
- self.filepath = filepath
- self.name = name
- self.automaton = automaton
- self.parser = parser
- self.out = new OFStream.open(filepath)
+ var out: Writer is noinit
+
+ init do
+ self.out = new FileWriter.open(filepath)
end
fun add(s: String) do out.write(s)
i += 1
end
- add "# Lexer generated by nitcc for the grammar {name}"
+ add "# Lexer generated by nitcc for the grammar {name}\n"
+ add "module {name}_lexer is generated, no_warning \"missing-doc\"\n"
add("import nitcc_runtime\n")
var p = parser
add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
add("end\n")
- add("redef class Object\n")
for s in automaton.states do
var n = names[s]
- add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
+ add("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
end
- add("end\n")
add("class MyNToken\n")
add("\tsuper NToken\n")
for s in automaton.states do
var n = names[s]
- add("class DFAState{n}\n")
+ add("private class DFAState{n}\n")
add("\tsuper DFAState\n")
if automaton.accept.has(s) then
var token
token = null
end
add("\tredef fun is_accept do return true\n")
- add("\tredef fun make_token(position, text) do\n")
+ var is_ignored = false
if token != null and token.name == "Ignored" then
+ is_ignored = true
+ add("\tredef fun is_ignored do return true\n")
+ end
+ add("\tredef fun make_token(position, source) do\n")
+ if is_ignored then
add("\t\treturn null\n")
else
if token == null then
add("\t\tvar t = new MyNToken\n")
+ add("\t\tt.text = position.extract(source)\n")
else
add("\t\tvar t = new {token.cname}\n")
+ var ttext = token.text
+ if ttext == null then
+ add("\t\tt.text = position.extract(source)\n")
+ else
+ add("\t\tt.text = \"{ttext.escape_to_nit}\"\n")
+ end
end
add("\t\tt.position = position\n")
- add("\t\tt.text = text\n")
add("\t\treturn t\n")
end
add("\tend\n")
end
- var trans = new ArrayMap[Int, State]
+ var trans = new ArrayMap[TSymbol, State]
for t in s.outs do
var sym = t.symbol
assert sym != null
end
if trans.is_empty then
# Do nothing, inherit the trans
- else if trans.length == 1 then
- var sym = trans.keys.first
- var next = trans.values.first
- add("\tredef fun trans(c) do\n")
- add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
- add("\t\treturn null\n")
- add("\tend\n")
else
- add("\tredef fun trans(c) do\n")
+ add("\tredef fun trans(char) do\n")
+
+ # Collect the sequence of tests in the dispatch sequence
+ # The point here is that for each transition, there is a first and a last
+ # So holes have to be identified
+ var dispatch = new HashMap[Int, nullable State]
+ var haslast: nullable State = null
+
+ var last = -1
for sym, next in trans do
- add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
+ assert haslast == null
+ assert sym.first > last
+ if sym.first > last + 1 then
+ dispatch[sym.first-1] = null
+ end
+ var l = sym.last
+ if l == null then
+ haslast = next
+ else
+ dispatch[l] = next
+ last = l
+ end
+ end
+
+ if dispatch.is_empty and haslast != null then
+ # Only one transition that accepts everything (quite rare)
+ else
+ # We need to check
+ add("\t\tvar c = char.code_point\n")
+ end
+
+ # Generate a sequence of `if` for the dispatch
+ if haslast != null and last >= 0 then
+ # Special case: handle up-bound first if not an error
+ add("\t\tif c > {last} then return dfastate_{names[haslast]}\n")
+ # previous become the new last case
+ haslast = dispatch[last]
+ dispatch.keys.remove(last)
+ end
+ for c, next in dispatch do
+ if next == null then
+ add("\t\tif c <= {c} then return null\n")
+ else
+ add("\t\tif c <= {c} then return dfastate_{names[next]}\n")
+ end
end
- add("\t\treturn null\n")
+ if haslast == null then
+ add("\t\treturn null\n")
+ else
+ add("\t\treturn dfastate_{names[haslast]}\n")
+ end
+
add("\tend\n")
end
add("end\n")
end
end
+redef class Token
+ # The associated text (if any, ie defined in the parser part)
+ var text: nullable String is noautoinit, writable
+end
+
# A state in a finite automaton
class State
# Outgoing transitions
-
var outs = new Array[Transition]
- # Ingoing tyransitions
+
+ # Ingoing transitions
var ins = new Array[Transition]
# Add a transitions to `to` on `symbol` (null means epsilon)
- fun add_trans(to: State, symbol: nullable Int): Transition
+ fun add_trans(to: State, symbol: nullable TSymbol): Transition
do
var t = new Transition(self, to, symbol)
outs.add(t)
to.ins.add(t)
return t
end
+
+ # Get the first state following the transition `i`.
+ # Null if no transition for `i`.
+ fun trans(i: Int): nullable State
+ do
+ for t in outs do
+ var sym = t.symbol
+ assert sym != null
+ var f = sym.first
+ var l = sym.last
+ if i < f then continue
+ if l != null and i > l then continue
+ return t.to
+ end
+ return null
+ end
+end
+
+# A range of symbols on a transition
+class TSymbol
+ # The first symbol in the range
+ var first: Int
+
+ # The last symbol if any.
+ #
+ # `null` means infinity.
+ var last: nullable Int
+
+ redef fun to_s
+ do
+ var res
+ var f = first
+ if f <= 32 then
+ res = "#{f}"
+ else
+ res = f.code_point.to_s
+ end
+ var l = last
+ if f == l then return res
+ res += " .. "
+ if l == null then return res
+ if l <= 32 or l >= 127 then return res + "#{l}"
+ return res + l.code_point.to_s
+ end
end
# A transition in a finite automaton
# The destination state
var to: State
# The symbol on the transition (null means epsilon)
- var symbol: nullable Int
+ var symbol: nullable TSymbol
- # Remove the transition from the automaton
- # Detash from `from` and `to`
+ # Remove the transition from the automaton.
+ # Detach from `from` and `to`.
fun delete
do
from.outs.remove(self)