# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Internal algorithm and data structures for the Nit lexer
module lexer_work

intrude import parser_nodes
private import tables

redef class Token
	# The text of the token, cached to avoid re-extracting it from the source
	private var cached_text: nullable String

	redef fun text
	do
		# Lazily extract the text from the location on the first access
		var res = _cached_text
		if res != null then return res
		res = location.text
		_cached_text = res
		return res
	end

	redef fun text=(text)
	do
		_cached_text = text
	end

	# The index of the token kind, used by the parser tables
	fun parser_index: Int is abstract
end
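
# Example (illustrative sketch): thanks to the cache above, the source
# substring is extracted from the location at most once per token.
# Assuming `tok` is a `Token` produced by a `Lexer`:
#
#     var a = tok.text    # first access: computed from `location.text`, then cached
#     var b = tok.text    # later accesses: returned directly from the cache
#     tok.text = "other"  # the setter simply replaces the cached value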

redef class EOF
	# Initialize a new EOF token at the given location, with an empty text
	init init_tk(loc: Location)
	do
		_cached_text = ""
		_location = loc
	end
end

redef class AError
	# The error message
	var message: String

	init init_error(message: String, loc: Location)
	do
		init_tk(loc)
		self.message = message
	end
end

redef class ALexerError
	# The raw text that the lexer could not tokenize
	var string: String

	init init_lexer_error(message: String, loc: Location, string: String)
	do
		init_error(message, loc)
		self.string = string
	end
end

redef class AParserError
	# The unexpected token that caused the parser error
	var token: Token

	init init_parser_error(message: String, loc: Location, token: Token)
	do
		init_error(message, loc)
		self.token = token
	end
end

# The lexer extracts Nit tokens from an input stream.
# It is best used with the Parser.
class Lexer
	super TablesCapable
	# Last peeked token
	var token: nullable Token

	# Lexer current state
	private var state: Int = 0

	# The source file
	var file: SourceFile

	# Current character position in the stream
	var stream_pos: Int = 0

	# Current line number in the input stream
	var line: Int = 0

	# Current column in the input stream
	var pos: Int = 0

	# Was the last character a carriage-return?
	var cr: Bool = false

	# Constant state values
	private fun state_initial: Int do return 0 end

	# Create a new lexer for a source file
	init(file: SourceFile)
	do
		self.file = file
	end

	# The last peeked token, used to chain tokens together
	private var last_token: nullable Token = null

	# Give the next token (but do not consume it)
	fun peek: Token
	do
		var t = _token
		if t != null then return t

		# Loop until a token is produced; `get_token` returns null for discarded tokens
		t = get_token
		while t == null do t = get_token

		# Chain the new token with the previous one, if any
		if isset t._location then
			var l = last_token
			if l != null then
				l.next_token = t
				t.prev_token = l
			else
				file.first_token = t
			end
			last_token = t
		end

		_token = t
		return t
	end

	# Give and consume the next token
	fun next: Token
	do
		var result = peek
		_token = null
		return result
	end
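
	# Example (illustrative sketch): a typical driver loop over `next`.
	# It assumes a `SourceFile` instance `sf`, and that the generated
	# module providing `make_token` is also imported:
	#
	#     var lexer = new Lexer(sf)
	#     loop
	#         var tok = lexer.next
	#         print "{tok.location}: {tok.text}"
	#         if tok isa EOF then break
	#     end
	#
	# `peek` can be called any number of times without advancing the
	# stream; only `next` consumes the peeked token.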

	# Primitive method to return a token, or return null if it is discarded
	# It is used to implement `peek` and `next`
	protected fun get_token: nullable Token
	do
		var dfa_state = 0

		var sp = _stream_pos
		var start_stream_pos = sp
		var start_pos = _pos
		var start_line = _line
		var file = self.file
		var string = file.string
		var string_len = string.length

		# State, token kind, length and position of the last accepting state seen
		var accept_state = -1
		var accept_token = -1
		var accept_length = -1
		var accept_pos = -1
		var accept_line = -1

		loop
			if sp >= string_len then
				# End of input: force the exit of the main loop
				dfa_state = -1
			else
				var c = string[sp].ascii
				sp += 1

				# Update the line and column bookkeeping.
				# LF, CR and CR-LF are all accepted as end-of-lines.
				var cr = _cr
				var line = _line
				var pos = _pos
				if c == 10 then
					if cr then
						# A LF just after a CR: part of the same CR-LF end-of-line
						cr = false
						file.line_starts[line] = sp
					else
						line = line + 1
						pos = 0
						file.line_starts[line] = sp
					end
				else if c == 13 then
					line = line + 1
					pos = 0
					cr = true
					file.line_starts[line] = sp
				else
					pos = pos + 1
					cr = false
				end

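				# How the `lexer_goto` table is laid out (illustrative; the
				# actual values are generated): `lexer_goto(state, 0)` holds
				# the number of transition entries of `state`, and entry `i`
				# is the triple at offsets `3*i+1 .. 3*i+3`:
				# (lowest char, highest char, next state).
				# For instance, a row holding the single entry (97, 122, 5)
				# would send any character in 'a'..'z' (codes 97..122) from
				# that state to state 5. Entries are sorted by character
				# range, so the loop below can binary-search them.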
				loop
					var old_state = dfa_state
					if dfa_state < -1 then
						# Negative states (except -1) encode a state as `-2 - state`
						old_state = -2 - dfa_state
					end

					dfa_state = -1

					# Binary search for `c` among the transition entries of `old_state`
					var low = 0
					var high = lexer_goto(old_state, 0) - 1

					if high >= 0 then
						while low <= high do
							var middle = (low + high) / 2
							var offset = middle * 3 + 1 # +1 because the length is at 0

							if c < lexer_goto(old_state, offset) then
								high = middle - 1
							else if c > lexer_goto(old_state, offset+1) then
								low = middle + 1
							else
								dfa_state = lexer_goto(old_state, offset+2)
								break
							end
						end
					end
					if dfa_state > -2 then break
				end

				_cr = cr
				_line = line
				_pos = pos
			end

			if dfa_state >= 0 then
				# Remember the last accepting state seen, then keep scanning
				var tok = lexer_accept(dfa_state)
				if tok != -1 then
					accept_state = dfa_state
					accept_token = tok
					accept_length = sp - start_stream_pos
					accept_pos = _pos
					accept_line = _line
				end
			else
				if accept_state != -1 then
					# Roll back to the last accepting state and produce its token
					var location = new Location(file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
					_pos = accept_pos
					_line = accept_line
					_stream_pos = start_stream_pos + accept_length
					if accept_token == 0 then
						# Token 0 means the token is discarded
						return null
					end
					return make_token(accept_token, location)
				else
					_stream_pos = sp
					var location = new Location(file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
					if sp > start_stream_pos then
						# No accepting state was reached: the input is invalid
						var text = string.substring(start_stream_pos, sp-start_stream_pos)
						var token = new ALexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
						file.last_token = token
						return token
					else
						# Nothing was read: the end of the input is reached
						var token = new EOF.init_tk(location)
						file.last_token = token
						return token
					end
				end
			end
		end
	end

	# Allocate the right Token object for a given token identifier
	protected fun make_token(accept_token: Int, location: Location): Token is abstract
end
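
# Illustrative sketch of a concrete `make_token`. The real
# implementation is generated alongside the lexer tables; the token
# indices and classes below are made-up placeholders:
#
#     redef class Lexer
#         redef fun make_token(accept_token: Int, location: Location): Token
#         do
#             if accept_token == 1 then return new TEol.init_tk(location)
#             if accept_token == 2 then return new TComment.init_tk(location)
#             # ... one case per token kind, generated from the grammar ...
#             abort
#         end
#     end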