2d1ca78f4baeea92169e4dbc6929264316799145
[nit.git] / contrib / nitcc / src / autom.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Finite automaton (NFA & DFA)
16 module autom
17
18 # For the class Token
19 import grammar
20
21 # A finite automaton
22 class Automaton
23 # The start state
24 var start: State
25
26 # State that are accect states
27 var accept = new Array[State]
28
29 # All states
30 var states = new Array[State]
31
32 # Tokens associated on accept states
33 # use `add_tag` to update
34 var tags = new HashMap[State, Set[Token]]
35
36 # Accept states associated on tokens
37 # use `add_tag` to update
38 var retrotags = new HashMap[Token, Set[State]]
39
40 # Add a token to a state
41 fun add_tag(s: State, t: Token)
42 do
43 if not tags.has_key(s) then
44 var set = new ArraySet[Token]
45 tags[s] = set
46 set.add t
47 else
48 tags[s].add t
49 end
50
51 if not retrotags.has_key(t) then
52 var set = new ArraySet[State]
53 retrotags[t] = set
54 set.add s
55 else
56 retrotags[t].add s
57 end
58 end
59
60 # Remove tokens from conflicting state according the the inclusion of language
61 # REQUIRE: self isa DFA automaton
62 fun solve_token_inclusion
63 do
64 for s, ts in tags do
65 if ts.length <= 1 then continue
66 var losers = new Array[Token]
67 for t1 in ts do
68 for t2 in ts do
69 if t1 == t2 then continue
70 if retrotags[t1].length > retrotags[t2].length and retrotags[t1].has_all(retrotags[t2]) then
71 losers.add(t1)
72 break
73 end
74 end
75 end
76 for t in losers do
77 ts.remove(t)
78 retrotags[t].remove s
79 end
80 end
81 end
82
83 # Initialize a new automaton for the empty language
84 # one state, no accept, no transition
85 init empty
86 do
87 var state = new State
88 start = state
89 states.add state
90 end
91
92 # Initialize a new automaton for the empty-string language
93 # one state, is accept, no transition
94 init epsilon
95 do
96 var state = new State
97 start = state
98 accept.add state
99 states.add state
100 end
101
102 # Initialize a new automation for the language that accepts only a single symbol
103 # Two state, the second is accept, one transition on `symbol`
104 init atom(symbol: Int)
105 do
106 var s = new State
107 var a = new State
108 s.add_trans(a, symbol)
109 start = s
110 accept.add a
111 states.add s
112 states.add a
113 end
114
115 # Initialize a new automation for the language that accepts only a range of symbols
116 # Two state, the second is accept, one transition for `from` to `to`
117 init cla(from, to: Int)
118 do
119 var s = new State
120 var a = new State
121 for symbol in [from..to] do
122 s.add_trans(a, symbol)
123 end
124 start = s
125 accept.add a
126 states.add s
127 states.add a
128 end
129
130 # Contatenate `other` to `self`
131 # other is modified and invalidated.
132 fun concat(other: Automaton)
133 do
134 var s2 = other.start
135 for a1 in accept do
136 a1.add_trans(s2, null)
137 end
138 accept = other.accept
139 states.add_all other.states
140 end
141
142 # `self` become the alternation of `self` and `other`
143 # `other` is modified and invalidated.
144 fun alternate(other: Automaton)
145 do
146 var s = new State
147 var a = new State
148 s.add_trans(start, null)
149 for a1 in accept do
150 a1.add_trans(a, null)
151 end
152 s.add_trans(other.start, null)
153 for a2 in other.accept do
154 a2.add_trans(a, null)
155 accept.add(a2)
156 end
157
158 start = s
159 accept = [a]
160
161 states.add s
162 states.add a
163 states.add_all other.states
164 end
165
166 # Do the Kleene closure (*) on self
167 fun close
168 do
169 for a1 in accept do
170 a1.add_trans(start, null)
171 start.add_trans(a1, null)
172 end
173 end
174
175 # Do the + on self
176 fun plus
177 do
178 for a1 in accept do
179 a1.add_trans(start, null)
180 end
181 end
182
183 # Do the ? on self
184 fun optionnal
185 do
186 alternate(new Automaton.epsilon)
187 end
188
189 # Remove all transitions on a given symbol
190 fun minus_sym(symbol: Int)
191 do
192 for s in states do
193 for t in s.outs.to_a do
194 if t.symbol == symbol then t.delete
195 end
196 end
197 end
198
199 # Fully duplicate an automaton
200 fun dup: Automaton
201 do
202 var res = new Automaton.empty
203 var map = new HashMap[State, State]
204 map[start] = res.start
205 for s in states do
206 if s == start then continue
207 var s2 = new State
208 map[s] = s2
209 res.states.add(s2)
210 end
211 for s in accept do
212 res.accept.add map[s]
213 end
214 for s, ts in tags do for t in ts do
215 res.add_tag(map[s], t)
216 end
217 for s in states do
218 for t in s.outs do
219 map[s].add_trans(map[t.to], t.symbol)
220 end
221 end
222 return res
223 end
224
225 # Produce a graphvis file for the automaton
226 fun to_dot(filepath: String)
227 do
228 var f = new OFStream.open(filepath)
229 f.write("digraph g \{\n")
230
231 for s in states do
232 f.write("s{s.object_id}[")
233 #f.write("label=\"\",")
234 if accept.has(s) then
235 f.write("color=blue")
236 if tags.has_key(s) then
237 f.write(",label=\"")
238 for token in tags[s] do
239 f.write("{token.name.escape_to_c}\\n")
240 end
241 f.write("\"")
242 end
243 end
244 f.write("];\n")
245 var outs = new HashMap[State, Array[nullable Int]]
246 for t in s.outs do
247 var a
248 var s2 = t.to
249 var c = t.symbol
250 if outs.has_key(s2) then
251 a = outs[s2]
252 else
253 a = new Array[nullable Int]
254 outs[s2] = a
255 end
256 a.add(c)
257 end
258 for s2, a in outs do
259 var labe = ""
260 var lastc: nullable Int = null
261 var elip = 0
262 a.add(-1)
263 for c in a do
264 if c == null then
265 if not labe.is_empty then labe += "\n"
266 labe += "''"
267 else if lastc == c - 1 then
268 elip += 1
269 lastc = c
270 else
271 if elip == 1 then
272 assert lastc != null
273 labe += "\n{sym_to_s(lastc)}"
274 else if elip > 1 then
275 assert lastc != null
276 labe += " .. {sym_to_s(lastc)}"
277 end
278 if c == -1 then break
279 if not labe.is_empty then labe += "\n"
280 labe += sym_to_s(c)
281 lastc = c
282 end
283 end
284 f.write("s{s.object_id}->s{s2.object_id} [label=\"{labe.escape_to_c}\"];\n")
285 end
286 end
287 f.write("empty->s{start.object_id}; empty[label=\"\",shape=none];\n")
288
289 f.write("\}\n")
290 f.close
291 end
292
293 # Transform a symbol to a string
294 # Used by `to_dot`
295 private fun sym_to_s(symbol: nullable Int): String
296 do
297 if symbol == null then
298 return "''"
299 else if symbol <= 32 then
300 return "#{symbol}"
301 else
302 return symbol.ascii.to_s
303 end
304 end
305
306 # Transform a NFA to a DFA
307 # note: the DFA is not miminized
308 fun to_dfa: Automaton
309 do
310 var dfa = new Automaton.empty
311 var n2d = new ArrayMap[Set[State], State]
312 var seen = new ArraySet[Set[State]]
313 var ts = new HashSet[Int]
314 var st = eclosure([start])
315 var todo = [st]
316 n2d[st] = dfa.start
317 seen.add(st)
318 while not todo.is_empty do
319 var nfa_states = todo.pop
320 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
321 var dfa_state = n2d[nfa_states]
322 ts.clear
323 for s in nfa_states do
324 if accept.has(s) then
325 if tags.has_key(s) then
326 for t in tags[s] do
327 dfa.add_tag(dfa_state, t)
328 end
329 end
330 dfa.accept.add(dfa_state)
331 end
332 for t in s.outs do
333 var sym = t.symbol
334 if sym == null or ts.has(sym) then continue
335 ts.add(sym)
336 var nfa_dest = eclosure(trans(nfa_states, sym))
337 #print "{nfa_states} -> {sym} -> {nfa_dest}"
338 var dfa_dest
339 if seen.has(nfa_dest) then
340 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
341 dfa_dest = n2d[nfa_dest]
342 else
343 #print "* new {nfa_dest.inspect}={nfa_dest}"
344 dfa_dest = new State
345 dfa.states.add(dfa_dest)
346 n2d[nfa_dest] = dfa_dest
347 todo.add(nfa_dest)
348 seen.add(nfa_dest)
349 end
350 dfa_state.add_trans(dfa_dest, sym)
351 end
352 end
353 end
354 return dfa
355 end
356
357 # epsilon-closure on a state of states
358 # used by `to_dfa`
359 private fun eclosure(states: Collection[State]): Set[State]
360 do
361 var res = new ArraySet[State]
362 res.add_all(states)
363 var todo = states.to_a
364 while not todo.is_empty do
365 var s = todo.pop
366 for t in s.outs do
367 if t.symbol != null then continue
368 var to = t.to
369 if res.has(to) then continue
370 res.add(to)
371 todo.add(to)
372 end
373 end
374 return res
375 end
376
377 # trans on a set of states
378 # Used by `to_dfa`
379 fun trans(states: Collection[State], symbol: Int): Set[State]
380 do
381 var res = new ArraySet[State]
382 for s in states do
383 for t in s.outs do
384 if t.symbol != symbol then continue
385 var to = t.to
386 if res.has(to) then continue
387 res.add(to)
388 end
389 end
390 return res
391 end
392
393 # Generate the Nit source code of the lexer
394 # `filepath` is the name of the ouptit file
395 # `parser` is the name of the parser module (used to import the token classes)
396 fun gen_to_nit(filepath: String, parser: nullable String)
397 do
398 var gen = new DFAGenerator(filepath, self, parser)
399 gen.gen_to_nit
400 end
401 end
402
403 # Generate the Nit source code of the lexer
404 private class DFAGenerator
405 var filepath: String
406 var automaton: Automaton
407 var parser: nullable String
408
409 var out: OStream
410 init(filepath: String, automaton: Automaton, parser: nullable String) do
411 self.filepath = filepath
412 self.automaton = automaton
413 self.parser = parser
414 self.out = new OFStream.open(filepath)
415 end
416
417 fun add(s: String) do out.write(s)
418
419 fun gen_to_nit
420 do
421 var names = new HashMap[State, String]
422 var i = 0
423 for s in automaton.states do
424 names[s] = i.to_s
425 i += 1
426 end
427
428 add "# Lexer generated by nitcc"
429 add("import nitcc_runtime\n")
430
431 var p = parser
432 if p != null then add("import {p}\n")
433
434 add("class MyLexer\n")
435 add("\tsuper Lexer\n")
436 add("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
437 add("end\n")
438
439 add("redef class Object\n")
440 for s in automaton.states do
441 var n = names[s]
442 add("\tprivate fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
443 end
444 add("end\n")
445
446 add("class MyNToken\n")
447 add("\tsuper NToken\n")
448 add("end\n")
449
450 for s in automaton.states do
451 var n = names[s]
452 add("class DFAState{n}\n")
453 add("\tsuper DFAState\n")
454 if automaton.accept.has(s) then
455 var token
456 if automaton.tags.has_key(s) then
457 token = automaton.tags[s].first
458 else
459 token = null
460 end
461 add("\tredef fun is_accept do return true\n")
462 add("\tredef fun make_token(position, text) do\n")
463 if token != null and token.name == "Ignored" then
464 add("\t\treturn null\n")
465 else
466 if token == null then
467 add("\t\tvar t = new MyNToken\n")
468 else
469 add("\t\tvar t = new {token.cname}\n")
470 end
471 add("\t\tt.position = position\n")
472 add("\t\tt.text = text\n")
473 add("\t\treturn t\n")
474 end
475 add("\tend\n")
476 end
477 var trans = new ArrayMap[Int, State]
478 for t in s.outs do
479 var sym = t.symbol
480 assert sym != null
481 trans[sym] = t.to
482 end
483 if trans.is_empty then
484 # Do nothing, inherit the trans
485 else if trans.length == 1 then
486 var sym = trans.keys.first
487 var next = trans.values.first
488 add("\tredef fun trans(c) do\n")
489 add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
490 add("\t\treturn null\n")
491 add("\tend\n")
492 else
493 add("\tredef fun trans(c) do\n")
494 for sym, next in trans do
495 add("\t\tif c.ascii == {sym} then return dfastate_{names[next]}\n")
496 end
497 add("\t\treturn null\n")
498 add("\tend\n")
499 end
500 add("end\n")
501 end
502
503 self.out.close
504 end
505 end
506
507 # A state in a finite automaton
508 class State
509 # Outgoing transitions
510
511 var outs = new Array[Transition]
512 # Ingoing tyransitions
513 var ins = new Array[Transition]
514
515 # Add a transitions to `to` on `symbol` (null means epsilon)
516 fun add_trans(to: State, symbol: nullable Int): Transition
517 do
518 var t = new Transition(self, to, symbol)
519 outs.add(t)
520 to.ins.add(t)
521 return t
522 end
523 end
524
525 # A transition in a finite automaton
526 class Transition
527 # The source state
528 var from: State
529 # The destination state
530 var to: State
531 # The symbol on the transition (null means epsilon)
532 var symbol: nullable Int
533
534 # Remove the transition from the automaton
535 # Detash from `from` and `to`
536 fun delete
537 do
538 from.outs.remove(self)
539 to.ins.remove(self)
540 end
541 end