nitcc: add Automation::tag_acept and Automaton::absorb
[nit.git] / contrib / nitcc / src / autom.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Finite automaton (NFA & DFA)
16 module autom
17
18 # For the class Token
19 import grammar
20
21 # A finite automaton
22 class Automaton
23 # The start state
24 var start: State
25
26 # State that are accect states
27 var accept = new Array[State]
28
29 # All states
30 var states = new Array[State]
31
32 # Tokens associated on accept states
33 # use `add_tag` to update
34 var tags = new HashMap[State, Set[Token]]
35
36 # Accept states associated on tokens
37 # use `add_tag` to update
38 var retrotags = new HashMap[Token, Set[State]]
39
40 # Tag all accept states
41 fun tag_accept(t: Token)
42 do
43 for s in accept do add_tag(s, t)
44 end
45
46 # Add a token to a state
47 fun add_tag(s: State, t: Token)
48 do
49 if not tags.has_key(s) then
50 var set = new ArraySet[Token]
51 tags[s] = set
52 set.add t
53 else
54 tags[s].add t
55 end
56
57 if not retrotags.has_key(t) then
58 var set = new ArraySet[State]
59 retrotags[t] = set
60 set.add s
61 else
62 retrotags[t].add s
63 end
64 end
65
66 # Remove tokens from conflicting state according the the inclusion of language
67 # REQUIRE: self isa DFA automaton
68 fun solve_token_inclusion
69 do
70 for s, ts in tags do
71 if ts.length <= 1 then continue
72 var losers = new Array[Token]
73 for t1 in ts do
74 for t2 in ts do
75 if t1 == t2 then continue
76 if retrotags[t1].length > retrotags[t2].length and retrotags[t1].has_all(retrotags[t2]) then
77 losers.add(t1)
78 break
79 end
80 end
81 end
82 for t in losers do
83 ts.remove(t)
84 retrotags[t].remove s
85 end
86 end
87 end
88
89 # Initialize a new automaton for the empty language
90 # one state, no accept, no transition
91 init empty
92 do
93 var state = new State
94 start = state
95 states.add state
96 end
97
98 # Initialize a new automaton for the empty-string language
99 # one state, is accept, no transition
100 init epsilon
101 do
102 var state = new State
103 start = state
104 accept.add state
105 states.add state
106 end
107
108 # Initialize a new automation for the language that accepts only a single symbol
109 # Two state, the second is accept, one transition on `symbol`
110 init atom(symbol: Int)
111 do
112 var s = new State
113 var a = new State
114 s.add_trans(a, symbol)
115 start = s
116 accept.add a
117 states.add s
118 states.add a
119 end
120
121 # Initialize a new automation for the language that accepts only a range of symbols
122 # Two state, the second is accept, one transition for `from` to `to`
123 init cla(from, to: Int)
124 do
125 var s = new State
126 var a = new State
127 for symbol in [from..to] do
128 s.add_trans(a, symbol)
129 end
130 start = s
131 accept.add a
132 states.add s
133 states.add a
134 end
135
136 # Contatenate `other` to `self`
137 # other is modified and invalidated.
138 fun concat(other: Automaton)
139 do
140 var s2 = other.start
141 for a1 in accept do
142 a1.add_trans(s2, null)
143 end
144 accept = other.accept
145 states.add_all other.states
146 end
147
148 # `self` become the alternation of `self` and `other`
149 # `other` is modified and invalidated.
150 fun alternate(other: Automaton)
151 do
152 var s = new State
153 var a = new State
154 s.add_trans(start, null)
155 for a1 in accept do
156 a1.add_trans(a, null)
157 end
158 s.add_trans(other.start, null)
159 for a2 in other.accept do
160 a2.add_trans(a, null)
161 accept.add(a2)
162 end
163
164 start = s
165 accept = [a]
166
167 states.add s
168 states.add a
169 states.add_all other.states
170 end
171
172 # `self` absorbs all states, transisions, tags, and acceptations of `other`
173 # An epsilon transition is added between `self.start` and `other.start`
174 fun absorb(other: Automaton)
175 do
176 states.add_all other.states
177 start.add_trans(other.start, null)
178 for s, ts in other.tags do for t in ts do add_tag(s, t)
179 accept.add_all other.accept
180 end
181
182 # Do the Kleene closure (*) on self
183 fun close
184 do
185 for a1 in accept do
186 a1.add_trans(start, null)
187 start.add_trans(a1, null)
188 end
189 end
190
191 # Do the + on self
192 fun plus
193 do
194 for a1 in accept do
195 a1.add_trans(start, null)
196 end
197 end
198
199 # Do the ? on self
200 fun optionnal
201 do
202 alternate(new Automaton.epsilon)
203 end
204
205 # Remove all transitions on a given symbol
206 fun minus_sym(symbol: Int)
207 do
208 for s in states do
209 for t in s.outs.to_a do
210 if t.symbol == symbol then t.delete
211 end
212 end
213 end
214
215 # Fully duplicate an automaton
216 fun dup: Automaton
217 do
218 var res = new Automaton.empty
219 var map = new HashMap[State, State]
220 map[start] = res.start
221 for s in states do
222 if s == start then continue
223 var s2 = new State
224 map[s] = s2
225 res.states.add(s2)
226 end
227 for s in accept do
228 res.accept.add map[s]
229 end
230 for s, ts in tags do for t in ts do
231 res.add_tag(map[s], t)
232 end
233 for s in states do
234 for t in s.outs do
235 map[s].add_trans(map[t.to], t.symbol)
236 end
237 end
238 return res
239 end
240
241 # Produce a graphvis file for the automaton
242 fun to_dot(filepath: String)
243 do
244 var f = new OFStream.open(filepath)
245 f.write("digraph g \{\n")
246
247 for s in states do
248 f.write("s{s.object_id}[")
249 #f.write("label=\"\",")
250 if accept.has(s) then
251 f.write("color=blue")
252 if tags.has_key(s) then
253 f.write(",label=\"")
254 for token in tags[s] do
255 f.write("{token.name.escape_to_c}\\n")
256 end
257 f.write("\"")
258 end
259 end
260 f.write("];\n")
261 var outs = new HashMap[State, Array[nullable Int]]
262 for t in s.outs do
263 var a
264 var s2 = t.to
265 var c = t.symbol
266 if outs.has_key(s2) then
267 a = outs[s2]
268 else
269 a = new Array[nullable Int]
270 outs[s2] = a
271 end
272 a.add(c)
273 end
274 for s2, a in outs do
275 var labe = ""
276 var lastc: nullable Int = null
277 var elip = 0
278 a.add(-1)
279 for c in a do
280 if c == null then
281 if not labe.is_empty then labe += "\n"
282 labe += "''"
283 else if lastc == c - 1 then
284 elip += 1
285 lastc = c
286 else
287 if elip == 1 then
288 assert lastc != null
289 labe += "\n{sym_to_s(lastc)}"
290 else if elip > 1 then
291 assert lastc != null
292 labe += " .. {sym_to_s(lastc)}"
293 end
294 if c == -1 then break
295 if not labe.is_empty then labe += "\n"
296 labe += sym_to_s(c)
297 lastc = c
298 end
299 end
300 f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n")
301 end
302 end
303 f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n")
304
305 f.write("\}\n")
306 f.close
307 end
308
309 # Transform a symbol to a string
310 # Used by `to_dot`
311 private fun sym_to_s(symbol: nullable Int): String
312 do
313 if symbol == null then
314 return "''"
315 else if symbol <= 32 then
316 return "#{symbol}"
317 else
318 return symbol.ascii.to_s
319 end
320 end
321
322 # Transform a NFA to a DFA
323 # note: the DFA is not miminized
324 fun to_dfa: Automaton
325 do
326 var dfa = new Automaton.empty
327 var n2d = new ArrayMap[Set[State], State]
328 var seen = new ArraySet[Set[State]]
329 var ts = new HashSet[Int]
330 var st = eclosure([start])
331 var todo = [st]
332 n2d[st] = dfa.start
333 seen.add(st)
334 while not todo.is_empty do
335 var nfa_states = todo.pop
336 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
337 var dfa_state = n2d[nfa_states]
338 ts.clear
339 for s in nfa_states do
340 if accept.has(s) then
341 if tags.has_key(s) then
342 for t in tags[s] do
343 dfa.add_tag(dfa_state, t)
344 end
345 end
346 dfa.accept.add(dfa_state)
347 end
348 for t in s.outs do
349 var sym = t.symbol
350 if sym == null or ts.has(sym) then continue
351 ts.add(sym)
352 var nfa_dest = eclosure(trans(nfa_states, sym))
353 #print "{nfa_states} -> {sym} -> {nfa_dest}"
354 var dfa_dest
355 if seen.has(nfa_dest) then
356 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
357 dfa_dest = n2d[nfa_dest]
358 else
359 #print "* new {nfa_dest.inspect}={nfa_dest}"
360 dfa_dest = new State
361 dfa.states.add(dfa_dest)
362 n2d[nfa_dest] = dfa_dest
363 todo.add(nfa_dest)
364 seen.add(nfa_dest)
365 end
366 dfa_state.add_trans(dfa_dest, sym)
367 end
368 end
369 end
370 return dfa
371 end
372
373 # epsilon-closure on a state of states
374 # used by `to_dfa`
375 private fun eclosure(states: Collection[State]): Set[State]
376 do
377 var res = new ArraySet[State]
378 res.add_all(states)
379 var todo = states.to_a
380 while not todo.is_empty do
381 var s = todo.pop
382 for t in s.outs do
383 if t.symbol != null then continue
384 var to = t.to
385 if res.has(to) then continue
386 res.add(to)
387 todo.add(to)
388 end
389 end
390 return res
391 end
392
393 # trans on a set of states
394 # Used by `to_dfa`
395 fun trans(states: Collection[State], symbol: Int): Set[State]
396 do
397 var res = new ArraySet[State]
398 for s in states do
399 for t in s.outs do
400 if t.symbol != symbol then continue
401 var to = t.to
402 if res.has(to) then continue
403 res.add(to)
404 end
405 end
406 return res
407 end
408
409 # Generate the Nit source code of the lexer
410 # `filepath` is the name of the ouptit file
411 # `parser` is the name of the parser module (used to import the token classes)
412 fun gen_to_nit(filepath: String, name: String, parser: nullable String)
413 do
414 var gen = new DFAGenerator(filepath, name, self, parser)
415 gen.gen_to_nit
416 end
417 end
418
419 # Generate the Nit source code of the lexer
420 private class DFAGenerator
421 var filepath: String
422 var name: String
423 var automaton: Automaton
424 var parser: nullable String
425
426 var out: OStream
427 init(filepath: String, name: String, automaton: Automaton, parser: nullable String) do
428 self.filepath = filepath
429 self.name = name
430 self.automaton = automaton
431 self.parser = parser
432 self.out = new OFStream.open(filepath)
433 end
434
435 fun add(s: String) do out.write(s)
436
437 fun gen_to_nit
438 do
439 var names = new HashMap[State, String]
440 var i = 0
441 for s in automaton.states do
442 names[s] = i.to_s
443 i += 1
444 end
445
446 add "# Lexer generated by nitcc for the grammar {name}"
447 add("import nitcc_runtime\n")
448
449 var p = parser
450 if p != null then add("import {p}\n")
451
452 add("class Lexer_{name}\n")
453 add("\tsuper Lexer\n")
454 add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
455 add("end\n")
456
457 add("redef class Object\n")
458 for s in automaton.states do
459 var n = names[s]
460 add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
461 end
462 add("end\n")
463
464 add("class MyNToken\n")
465 add("\tsuper NToken\n")
466 add("end\n")
467
468 for s in automaton.states do
469 var n = names[s]
470 add("class DFAState{n}\n")
471 add("\tsuper DFAState\n")
472 if automaton.accept.has(s) then
473 var token
474 if automaton.tags.has_key(s) then
475 token = automaton.tags[s].first
476 else
477 token = null
478 end
479 add("\tredef fun is_accept do return true\n")
480 add("\tredef fun make_token(position, text) do\n")
481 if token != null and token.name == "Ignored" then
482 add("\t\treturn null\n")
483 else
484 if token == null then
485 add("\t\tvar t = new MyNToken\n")
486 else
487 add("\t\tvar t = new {token.cname}\n")
488 end
489 add("\t\tt.position = position\n")
490 add("\t\tt.text = text\n")
491 add("\t\treturn t\n")
492 end
493 add("\tend\n")
494 end
495 var trans = new ArrayMap[Int, State]
496 for t in s.outs do
497 var sym = t.symbol
498 assert sym != null
499 trans[sym] = t.to
500 end
501 if trans.is_empty then
502 # Do nothing, inherit the trans
503 else if trans.length == 1 then
504 var sym = trans.keys.first
505 var next = trans.values.first
506 add("\tredef fun trans(c) do\n")
507 add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
508 add("\t\treturn null\n")
509 add("\tend\n")
510 else
511 add("\tredef fun trans(c) do\n")
512 for sym, next in trans do
513 add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
514 end
515 add("\t\treturn null\n")
516 add("\tend\n")
517 end
518 add("end\n")
519 end
520
521 self.out.close
522 end
523 end
524
525 # A state in a finite automaton
526 class State
527 # Outgoing transitions
528
529 var outs = new Array[Transition]
530 # Ingoing tyransitions
531 var ins = new Array[Transition]
532
533 # Add a transitions to `to` on `symbol` (null means epsilon)
534 fun add_trans(to: State, symbol: nullable Int): Transition
535 do
536 var t = new Transition(self, to, symbol)
537 outs.add(t)
538 to.ins.add(t)
539 return t
540 end
541 end
542
543 # A transition in a finite automaton
544 class Transition
545 # The source state
546 var from: State
547 # The destination state
548 var to: State
549 # The symbol on the transition (null means epsilon)
550 var symbol: nullable Int
551
552 # Remove the transition from the automaton
553 # Detash from `from` and `to`
554 fun delete
555 do
556 from.outs.remove(self)
557 to.ins.remove(self)
558 end
559 end