1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
15 # Finite automaton (NFA & DFA)
24 var start
: State is noinit
26 # State that are accept states
27 var accept
= new Array[State]
30 var states
= new Array[State]
32 # Tokens associated on accept states.
33 # Use `add_tag` to update
34 var tags
= new HashMap[State, Set[Token]]
36 # Accept states associated on tokens.
37 # Use `add_tag` to update
38 var retrotags
= new HashMap[Token, Set[State]]
40 # Tag all accept states
41 fun tag_accept
(t
: Token)
43 for s
in accept
do add_tag
(s
, t
)
46 # Add a token to a state
47 fun add_tag
(s
: State, t
: Token)
49 if not tags
.has_key
(s
) then
50 var set
= new ArraySet[Token]
57 if not retrotags
.has_key
(t
) then
58 var set
= new ArraySet[State]
66 assert retrotags
[t
].has
(s
)
69 # Remove all occurrences of a tag in an automaton
70 fun clear_tag
(t
: Token)
72 if not retrotags
.has_key
(t
) then return
73 for s
in retrotags
[t
] do
74 if not tags
.has_key
(s
) then continue
76 if tags
[s
].is_empty
then tags
.keys
.remove
(s
)
78 retrotags
.keys
.remove
(t
)
81 # Remove tokens from conflicting state according the inclusion of language.
82 # REQUIRE: self isa DFA automaton
83 fun solve_token_inclusion
86 if ts
.length
<= 1 then continue
87 var losers
= new Array[Token]
90 if t1
== t2
then continue
91 if retrotags
[t1
].length
> retrotags
[t2
].length
and retrotags
[t1
].has_all
(retrotags
[t2
]) then
104 # Initialize a new automaton for the empty language.
105 # One state, no accept, no transition.
108 var state
= new State
113 # Initialize a new automaton for the empty-string language.
114 # One state, is accept, no transition.
117 var state
= new State
123 # Initialize a new automation for the language that accepts only a single symbol.
124 # Two state, the second is accept, one transition on `symbol`.
125 init atom
(symbol
: Int)
129 var sym
= new TSymbol(symbol
, symbol
)
137 # Initialize a new automation for the language that accepts only a range of symbols
138 # Two state, the second is accept, one transition for `from` to `to`
139 init cla
(first
: Int, last
: nullable Int)
143 var sym
= new TSymbol(first
, last
)
151 # Concatenate `other` to `self`.
152 # Other is modified and invalidated.
153 fun concat
(other
: Automaton)
157 a1
.add_trans
(s2
, null)
159 accept
= other
.accept
160 states
.add_all other
.states
163 # `self` become the alternation of `self` and `other`.
164 # `other` is modified and invalidated.
165 fun alternate
(other
: Automaton)
169 s
.add_trans
(start
, null)
171 a1
.add_trans
(a
, null)
173 s
.add_trans
(other
.start
, null)
174 for a2
in other
.accept
do
175 a2
.add_trans
(a
, null)
184 states
.add_all other
.states
187 # Return a new automaton that recognize `self` but not `other`.
188 # For a theoretical POV, this is the subtraction of languages.
189 # Note: the implementation use `to_dfa` internally, so the theoretical complexity is not cheap.
190 fun except
(other
: Automaton): Automaton
192 var ta
= new Token("1")
194 var tb
= new Token("2")
197 var c
= new Automaton.empty
202 for s
in c
.retrotags
[ta
] do
203 if not c
.tags
[s
].has
(tb
) then
212 # `self` absorbs all states, transitions, tags, and acceptations of `other`.
213 # An epsilon transition is added between `self.start` and `other.start`.
214 fun absorb
(other
: Automaton)
216 states
.add_all other
.states
217 start
.add_trans
(other
.start
, null)
218 for s
, ts
in other
.tags
do for t
in ts
do add_tag
(s
, t
)
219 accept
.add_all other
.accept
222 # Do the Kleene closure (*) on self
226 a1
.add_trans
(start
, null)
227 start
.add_trans
(a1
, null)
235 a1
.add_trans
(start
, null)
242 alternate
(new Automaton.epsilon
)
245 # Remove all transitions on a given symbol
246 fun minus_sym
(symbol
: TSymbol)
251 for t
in s
.outs
.to_a
do
252 if t
.symbol
== null then continue
255 var tf
= t
.symbol
.as(not null).first
256 var tl
= t
.symbol
.as(not null).last
257 if l
!= null and tf
> l
then continue
258 if tl
!= null and f
> tl
then continue
262 # Add left and right part if non empty
264 var sym
= new TSymbol(tf
,f-1
)
265 s
.add_trans
(t
.to
, sym
)
269 var sym
= new TSymbol(l
+1, null)
270 s
.add_trans
(t
.to
, sym
)
272 var sym
= new TSymbol(l
+1, tl
)
273 s
.add_trans
(t
.to
, sym
)
280 # Fully duplicate an automaton
283 var res
= new Automaton.empty
284 var map
= new HashMap[State, State]
285 map
[start
] = res
.start
287 if s
== start
then continue
293 res
.accept
.add map
[s
]
295 for s
, ts
in tags
do for t
in ts
do
296 res
.add_tag
(map
[s
], t
)
300 map
[s
].add_trans
(map
[t
.to
], t
.symbol
)
306 # Reverse an automaton in place
320 if accept
.length
== 1 then
328 st2
.add_trans
(s
, null)
335 # Remove states (and transitions) that does not reach an accept state
338 # Good states are those we want to keep
339 var goods
= new HashSet[State]
340 goods
.add_all
(accept
)
342 var todo
= accept
.to_a
345 while not todo
.is_empty
do
349 if goods
.has
(s2
) then continue
355 # What are the bad state then?
356 var bads
= new Array[State]
358 if not goods
.has
(s
) then bads
.add
(s
)
361 # Remove their transitions and tags
363 for t
in s
.ins
.to_a
do t
.delete
364 for t
in s
.outs
.to_a
do t
.delete
365 if tags
.has_key
(s
) then
366 for t
in tags
[s
] do retrotags
[t
].remove
(s
)
371 # Keep only the good stuff
373 states
.add_all
(goods
)
377 # Generate a minimal DFA
378 # REQUIRE: self is a DFA
379 fun to_minimal_dfa
: Automaton
385 # Graph of known distinct states.
386 var distincts
= new HashMap[State, Set[State]]
388 distincts
[s
] = new HashSet[State]
391 # split accept states.
392 # An accept state is distinct with a non accept state.
395 if distincts
[s1
].has
(s2
) then continue
396 if not accept
.has
(s2
) then
397 distincts
[s1
].add
(s2
)
398 distincts
[s2
].add
(s1
)
401 if tags
.get_or_null
(s1
) != tags
.get_or_null
(s2
) then
402 distincts
[s1
].add
(s2
)
403 distincts
[s2
].add
(s1
)
409 # Fixed point algorithm.
410 # * Get 2 states s1 and s2 not yet distinguished.
412 # * If s1.trans(w) and s2.trans(w) are distinguished, then
413 # distinguish s1 and s2.
415 var ints
= new Array[Int] # List of symbols to check
418 for s1
in states
do for s2
in states
do
419 if distincts
[s1
].has
(s2
) then continue
421 # The transitions use intervals. Therefore, for the states s1 and s2,
422 # we need to check only the meaningful symbols. They are the `first`
423 # symbol of each interval and the first one after the interval (`last+1`).
425 # Check only `s1`; `s2` will be checked later when s1 and s2 are switched.
431 if l
!= null then ints
.add l
+ 1
436 var ds1
= s1
.trans
(i
)
437 var ds2
= s2
.trans
(i
)
438 if ds1
== ds2
then continue
439 if ds1
!= null and ds2
!= null and not distincts
[ds1
].has
(ds2
) then continue
440 distincts
[s1
].add
(s2
)
441 distincts
[s2
].add
(s1
)
448 # We need to unify not-distinguished states.
449 # Just add an epsilon-transition and DFAize the automaton.
450 for s1
in states
do for s2
in states
do
451 if distincts
[s1
].has
(s2
) then continue
452 s1
.add_trans
(s2
, null)
458 # Assert that `self` is a valid automaton or abort
461 assert states
.has
(start
)
462 assert states
.has_all
(accept
)
464 for t
in s
.outs
do assert states
.has
(t
.to
)
465 for t
in s
.ins
do assert states
.has
(t
.from
)
467 assert states
.has_all
(tags
.keys
)
468 for t
, ss
in retrotags
do
469 assert states
.has_all
(ss
)
473 # Produce a graphviz string from the automatom
475 # Set `merge_transitions = false` to generate one edge by transition (default true).
476 fun to_dot
(merge_transitions
: nullable Bool): Writable
478 var names
= new HashMap[State, String]
486 f
.append
("digraph g \{\n")
487 f
.append
("rankdir=LR;")
491 f
.append
("s{names[s]}[shape=circle")
492 #f.write("label=\"\",")
493 if accept
.has
(s
) then
494 f
.append
(",shape=doublecircle")
496 if tags
.has_key
(s
) then
497 f
.append
(",label=\"")
498 for token in tags[s] do
499 f.append("{token.name.escape_to_dot}\\n
")
503 f
.append
(",label=\"{state_nb}\
"")
506 var outs
= new HashMap[State, Array[nullable TSymbol]]
511 if outs
.has_key
(s2
) then
514 a
= new Array[nullable TSymbol]
522 if merge_transitions
== false then labe
= ""
523 if not labe
.is_empty
then labe
+= "\n"
529 if merge_transitions
== false then
530 f
.append
("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_dot}\
"];\n")
533 if merge_transitions
or else true then
534 f
.append
("s{names[s]}->s{names[s2]} [label=\"{labe.escape_to_c}\
"];\n")
539 f
.append
("empty->s{names[start]}; empty[label=\"\
",shape=none];\n")
544 # Transform a NFA to a DFA.
545 # note: the DFA is not minimized.
546 fun to_dfa
: Automaton
552 var dfa
= new Automaton.empty
553 var n2d
= new ArrayMap[Set[State], State]
554 var seen
= new ArraySet[Set[State]]
555 var alphabet
= new HashSet[Int]
556 var st
= eclosure
([start
])
560 while not todo
.is_empty
do
561 var nfa_states
= todo
.pop
562 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
563 var dfa_state
= n2d
[nfa_states
]
565 for s
in nfa_states
do
566 # Collect important values to build the alphabet
569 if sym
== null then continue
570 alphabet
.add
(sym
.first
)
572 if l
!= null then alphabet
.add
(l
)
575 # Mark accept and tags
576 if accept
.has
(s
) then
577 if tags
.has_key
(s
) then
579 dfa
.add_tag
(dfa_state
, t
)
582 dfa
.accept
.add
(dfa_state
)
586 # From the important values, build a sequence of TSymbols
587 var a
= alphabet
.to_a
588 default_comparator
.sort
(a
)
589 var tsyms
= new Array[TSymbol]
592 if last
> 0 and last
<= i-1
then
593 tsyms
.add
(new TSymbol(last
,i-1
))
595 tsyms
.add
(new TSymbol(i
,i
))
599 tsyms
.add
(new TSymbol(last
,null))
601 #print "Alphabet: {tsyms.join(", ")}"
603 var lastst
: nullable Transition = null
605 var nfa_dest
= eclosure
(trans
(nfa_states
, sym
.first
))
606 if nfa_dest
.is_empty
then
610 #print "{nfa_states} -> {sym} -> {nfa_dest}"
612 if seen
.has
(nfa_dest
) then
613 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
614 dfa_dest
= n2d
[nfa_dest
]
616 #print "* new {nfa_dest.inspect}={nfa_dest}"
618 dfa
.states
.add
(dfa_dest
)
619 n2d
[nfa_dest
] = dfa_dest
623 if lastst
!= null and lastst
.to
== dfa_dest
then
624 lastst
.symbol
.as(not null).last
= sym
.last
626 lastst
= dfa_state
.add_trans
(dfa_dest
, sym
)
633 # Transform a NFA to a epsilonless NFA.
634 fun to_nfa_noe
: Automaton
640 var dfa
= new Automaton.empty
641 var n2d
= new ArrayMap[Set[State], State]
642 var seen
= new ArraySet[Set[State]]
643 var st
= eclosure
([start
])
647 while not todo
.is_empty
do
648 var nfa_states
= todo
.pop
649 #print "* work on {nfa_states.inspect}={nfa_states} (remains {todo.length}/{seen.length})"
650 var dfa_state
= n2d
[nfa_states
]
651 for s
in nfa_states
do
653 if t
.symbol
== null then continue
654 var nfa_dest
= eclosure
([t
.to
])
655 #print "{nfa_states} -> {sym} -> {nfa_dest}"
657 if seen
.has
(nfa_dest
) then
658 #print "* reuse {nfa_dest.inspect}={nfa_dest}"
659 dfa_dest
= n2d
[nfa_dest
]
661 #print "* new {nfa_dest.inspect}={nfa_dest}"
663 dfa
.states
.add
(dfa_dest
)
664 n2d
[nfa_dest
] = dfa_dest
668 dfa_state
.add_trans
(dfa_dest
, t
.symbol
)
671 # Mark accept and tags
672 if accept
.has
(s
) then
673 if tags
.has_key
(s
) then
675 dfa
.add_tag
(dfa_state
, t
)
678 dfa
.accept
.add
(dfa_state
)
685 # Epsilon-closure on a state of states.
687 private fun eclosure
(states
: Collection[State]): Set[State]
689 var res
= new ArraySet[State]
691 var todo
= states
.to_a
692 while not todo
.is_empty
do
695 if t
.symbol
!= null then continue
697 if res
.has
(to
) then continue
705 # Trans on a set of states.
707 fun trans
(states
: Collection[State], symbol
: Int): Set[State]
709 var res
= new ArraySet[State]
713 if sym
== null then continue
714 if sym
.first
> symbol
then continue
716 if l
!= null and l
< symbol
then continue
718 if res
.has
(to
) then continue
725 # Generate the Nit source code of the lexer.
726 # `filepath` is the name of the output file.
727 # `parser` is the name of the parser module (used to import the token classes).
728 fun gen_to_nit
(filepath
: String, name
: String, parser
: nullable String)
730 var gen
= new DFAGenerator(filepath
, name
, self, parser
)
735 # Generate the Nit source code of the lexer
736 private class DFAGenerator
739 var automaton
: Automaton
740 var parser
: nullable String
742 var out
: Writer is noinit
745 self.out
= new FileWriter.open
(filepath
)
748 fun add
(s
: String) do out
.write
(s
)
752 var names
= new HashMap[State, String]
754 for s
in automaton
.states
do
759 add
"# Lexer generated by nitcc for the grammar {name}\n"
760 add
"module {name}_lexer is generated, no_warning \"missing-doc\
"\n"
761 add
("import nitcc_runtime\n")
764 if p
!= null then add
("import {p}\n")
766 add
("class Lexer_{name}\n")
767 add
("\tsuper Lexer\n")
768 add
("\tredef fun start_state do return dfastate_{names[automaton.start]}\n")
771 for s
in automaton
.states
do
773 add
("private fun dfastate_{n}: DFAState{n} do return once new DFAState{n}\n")
776 add
("class MyNToken\n")
777 add
("\tsuper NToken\n")
780 for s
in automaton
.states
do
782 add
("private class DFAState{n}\n")
783 add
("\tsuper DFAState\n")
784 if automaton
.accept
.has
(s
) then
786 if automaton
.tags
.has_key
(s
) then
787 token
= automaton
.tags
[s
].first
791 add
("\tredef fun is_accept do return true\n")
792 var is_ignored
= false
793 if token
!= null and token
.name
== "Ignored" then
795 add
("\tredef fun is_ignored do return true\n")
797 add
("\tredef fun make_token(position, source) do\n")
799 add
("\t\treturn null\n")
801 if token
== null then
802 add
("\t\tvar t = new MyNToken\n")
803 add
("\t\tt.text = position.extract(source)\n")
805 add
("\t\tvar t = new {token.cname}\n")
806 var ttext
= token
.text
807 if ttext
== null then
808 add
("\t\tt.text = position.extract(source)\n")
810 add
("\t\tt.text = \"{ttext.escape_to_nit}\
"\n")
813 add
("\t\tt.position = position\n")
814 add
("\t\treturn t\n")
818 var trans
= new ArrayMap[TSymbol, State]
824 if trans
.is_empty
then
825 # Do nothing, inherit the trans
827 add
("\tredef fun trans(char) do\n")
829 # Collect the sequence of tests in the dispatch sequence
830 # The point here is that for each transition, there is a first and a last
831 # So holes have to be identified
832 var dispatch
= new HashMap[Int, nullable State]
833 var haslast
: nullable State = null
836 for sym
, next
in trans
do
837 assert haslast
== null
838 assert sym
.first
> last
839 if sym
.first
> last
+ 1 then
840 dispatch
[sym
.first-1
] = null
851 if dispatch
.is_empty
and haslast
!= null then
852 # Only one transition that accepts everything (quite rare)
855 add
("\t\tvar c = char.code_point\n")
858 # Generate a sequence of `if` for the dispatch
859 if haslast
!= null and last
>= 0 then
860 # Special case: handle up-bound first if not an error
861 add
("\t\tif c > {last} then return dfastate_{names[haslast]}\n")
862 # previous become the new last case
863 haslast
= dispatch
[last
]
864 dispatch
.keys
.remove
(last
)
866 for c
, next
in dispatch
do
868 add
("\t\tif c <= {c} then return null\n")
870 add
("\t\tif c <= {c} then return dfastate_{names[next]}\n")
873 if haslast
== null then
874 add
("\t\treturn null\n")
876 add
("\t\treturn dfastate_{names[haslast]}\n")
889 # The associated text (if any, ie defined in the parser part)
890 var text
: nullable String is noautoinit
, writable
893 # A state in a finite automaton
895 # Outgoing transitions
896 var outs
= new Array[Transition]
898 # Ingoing transitions
899 var ins
= new Array[Transition]
901 # Add a transitions to `to` on `symbol` (null means epsilon)
902 fun add_trans
(to
: State, symbol
: nullable TSymbol): Transition
904 var t
= new Transition(self, to
, symbol
)
910 # Get the first state following the transition `i`.
911 # Null if no transition for `i`.
912 fun trans
(i
: Int): nullable State
919 if i
< f
then continue
920 if l
!= null and i
> l
then continue
927 # A range of symbols on a transition
929 # The first symbol in the range
932 # The last symbol if any.
934 # `null` means infinity.
935 var last
: nullable Int
944 res
= f
.code_point
.to_s
947 if f
== l
then return res
949 if l
== null then return res
950 if l
<= 32 or l
>= 127 then return res
+ "#{l}"
951 return res
+ l
.code_point
.to_s
955 # A transition in a finite automaton
959 # The destination state
961 # The symbol on the transition (null means epsilon)
962 var symbol
: nullable TSymbol
964 # Remove the transition from the automaton.
965 # Detach from `from` and `to`.
968 from
.outs
.remove
(self)