0b2258b21c15c17ca2316e2986ffe5a75a86561e
[nit.git] / src / parser / lexer_work.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Internal algorithm and data structures for the Nit lexer
16 module lexer_work
17
18 intrude import parser_nodes
19 private import tables
20
# Reopen `Token` to add lazy text caching and the parser-table index hook.
redef class Token
	# Cached text of the token; `null` until first computed or explicitly set.
	var _text: nullable String

	# The text of the token, extracted lazily from `location` and memoized.
	redef fun text
	do
		# Fast path: serve the cached value when one exists.
		var cached = _text
		if cached != null then return cached
		# Slow path: extract from the source location, then memoize.
		var extracted = location.text
		_text = extracted
		return extracted
	end

	# Overwrite the cached text (also bypasses future extraction).
	redef fun text=(text) do _text = text

	# Index of this token class in the generated parser tables.
	fun parser_index: Int is abstract
end
40
# Reopen `EOF` to give it a parser-table index and a basic constructor.
redef class EOF
	# The EOF token occupies a fixed slot (97) in the generated parser tables.
	redef fun parser_index: Int do return 97

	# Initialize an EOF token at `loc`; its text is always empty.
	init init_tk(loc: Location)
	do
		_text = ""
		_location = loc
	end
end
53
# Reopen `AError`, the common root of error tokens, to carry a human-readable message.
redef class AError
	# The error message associated with this error token (exposed via the
	# legacy `readable` annotation as the `message` getter).
	readable var _message: String

	# Initialize the error token with its `message` and its source location `loc`.
	init init_error(message: String, loc: Location)
	do
		init_tk(loc)
		_message = message
	end
end
63
# Reopen `ALexerError` to record the raw text that failed to lex.
redef class ALexerError
	# The unrecognized input text that triggered the lexer error
	# (exposed via the legacy `readable` annotation as the `string` getter).
	readable var _string: String

	# Initialize with the error `message`, source location `loc`,
	# and the offending raw `string`.
	init init_lexer_error(message: String, loc: Location, string: String)
	do
		init_error(message, loc)
		_string = string
	end
end
73
# Reopen `AParserError` to record the token on which parsing failed.
redef class AParserError
	# The unexpected token that triggered the parser error
	# (exposed via the legacy `readable` annotation as the `token` getter).
	readable var _token: Token

	# Initialize with the error `message`, source location `loc`,
	# and the offending `token`.
	init init_parser_error(message: String, loc: Location, token: Token)
	do
		init_error(message, loc)
		_token = token
	end
end
83
# The lexer extracts Nit tokens from an input stream.
# It is better used with the Parser.
class Lexer
	super TablesCapable
	# Last peeked token (cached by `peek`, consumed by `next`)
	var _token: nullable Token

	# Lexer current state
	var _state: Int = 0

	# The source file being lexed
	readable var _file: SourceFile

	# Current character offset in the stream
	var _stream_pos: Int = 0

	# Current line number in the input stream (0-based)
	var _line: Int = 0

	# Current column in the input stream (0-based)
	var _pos: Int = 0

	# Was the last character a carriage-return? (used to fold CR+LF into one newline)
	var _cr: Bool = false

	# Constant state values
	private fun state_initial: Int do return 0 end

	# Create a new lexer for a stream (and a name)
	init(file: SourceFile)
	do
		_file = file
	end

	# The last peeked token, kept to chain tokens with prev/next links
	private var last_token: nullable Token = null

	# Give the next token (but do not consume it)
	fun peek: Token
	do
		# Serve the already-peeked token, if any.
		var t = _token
		if t != null then return t

		# Pull tokens until one is not discarded
		# (`get_token` returns null for discarded tokens, e.g. blanks).
		t = get_token
		while t == null do t = get_token

		# Thread the token into the doubly-linked token chain of the file.
		if t._location != null then
			var l = last_token
			if l != null then
				l.next_token = t
				t.prev_token = l
			else
				_file.first_token = t
			end
			last_token = t
		end

		_token = t
		return t
	end

	# Give and consume the next token
	fun next: Token
	do
		var result = peek
		_token = null
		return result
	end

	# Primitive method to return a token, or return null if it is discarded
	# Is used to implement `peek` and `next`
	protected fun get_token: nullable Token
	do
		# Current DFA state; 0 is the initial state, -1 the dead state.
		var dfa_state = 0

		# Snapshot the starting position so we can backtrack and build locations.
		var sp = _stream_pos
		var start_stream_pos = sp
		var start_pos = _pos
		var start_line = _line
		var string = _file.string
		var string_len = string.length

		# Last accepting state seen so far (-1 = none): implements
		# maximal munch with backtracking to the longest accepted lexeme.
		var accept_state = -1
		var accept_token = -1
		var accept_length = -1
		var accept_pos = -1
		var accept_line = -1

		loop
			if sp >= string_len then
				# End of input: force the DFA into the dead state.
				dfa_state = -1
			else
				var c = string.chars[sp].ascii
				sp += 1

				# Track line/column; a LF directly after a CR is folded into
				# the same newline (CR+LF counts once).
				var cr = _cr
				var line = _line
				var pos = _pos
				if c == 10 then
					if cr then
						# LF after CR: same newline, just update the recorded line start.
						cr = false
						_file.line_starts[line] = sp
					else
						line = line + 1
						pos = 0
						_file.line_starts[line] = sp
					end
				else if c == 13 then
					# Lone CR starts a new line; a following LF is folded above.
					line = line + 1
					pos = 0
					cr = true
					_file.line_starts[line] = sp
				else
					pos = pos + 1
					cr = false
				end

				# Advance the DFA on `c`: binary-search the current state's
				# transition ranges in the generated goto table.
				loop
					var old_state = dfa_state
					if dfa_state < -1 then
						# States below -1 are an encoded form used by the generated
						# tables; decode to the real state index. (Encoding details
						# live in the table generator — not visible here.)
						old_state = -2 - dfa_state
					end

					dfa_state = -1

					var low = 0
					var high = lexer_goto(old_state, 0) - 1

					if high >= 0 then
						while low <= high do
							var middle = (low + high) / 2
							var offset = middle * 3 + 1 # +1 because length is at 0

							if c < lexer_goto(old_state, offset) then
								high = middle - 1
							else if c > lexer_goto(old_state, offset+1) then
								low = middle + 1
							else
								dfa_state = lexer_goto(old_state, offset+2)
								break
							end
						end
					end
					# Keep decoding while the target is another encoded (< -2) state.
					if dfa_state > -2 then break
				end

				# Commit position bookkeeping for the consumed character.
				_cr = cr
				_line = line
				_pos = pos
			end

			if dfa_state >= 0 then
				# Live state: if it accepts a token, snapshot it (maximal munch).
				var tok = lexer_accept(dfa_state)
				if tok != -1 then
					accept_state = dfa_state
					accept_token = tok
					accept_length = sp - start_stream_pos
					accept_pos = _pos
					accept_line = _line
				end
			else
				# Dead state: emit the last accepted token, or report an error.
				if accept_state != -1 then
					# Backtrack to the end of the last accepted lexeme.
					var location = new Location(_file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
					_pos = accept_pos
					_line = accept_line
					_stream_pos = start_stream_pos + accept_length
					if accept_token == 0 then
						# Token id 0 is discarded: signal the caller to retry.
						return null
					end
					return make_token(accept_token, location)
				else
					_stream_pos = sp
					var location = new Location(_file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
					if sp > start_stream_pos then
						# Characters were consumed but no token matched: lexer error.
						var text = string.substring(start_stream_pos, sp-start_stream_pos)
						var token = new ALexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
						_file.last_token = token
						return token
					else
						# Nothing consumed at all: produce the EOF token.
						var token = new EOF.init_tk(location)
						_file.last_token = token
						return token
					end
				end
			end
		end
	end

	# Allocate the right Token object for a given identifier
	protected fun make_token(accept_token: Int, location: Location): Token is abstract
end