parser: remove some remaining old style accessors
[nit.git] / src / parser / lexer_work.nit
# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Internal algorithm and data structures for the Nit lexer
module lexer_work

intrude import parser_nodes
private import tables
redef class Token
	# Cached textual content of the token.
	# `null` until first requested (or explicitly assigned).
	var _text: nullable String

	redef fun text
	do
		var t = _text
		if t != null then return t
		# Not cached yet: extract the content from the source location
		# and memoize it for subsequent calls.
		t = location.text
		_text = t
		return t
	end

	redef fun text=(text)
	do
		_text = text
	end

	# Index of this token kind in the generated parser tables.
	fun parser_index: Int is abstract
end
40
redef class EOF
	# The EOF token uses the fixed slot 97 of the parser tables.
	redef fun parser_index: Int
	do
		return 97
	end

	# Build an end-of-file token located at `loc` with an empty content.
	init init_tk(loc: Location)
	do
		_location = loc
		_text = ""
	end
end
53
redef class AError
	# Human-readable description of the error.
	var message: String

	# Build an error token at `loc` carrying `message`.
	init init_error(message: String, loc: Location)
	do
		self.message = message
		init_tk(loc)
	end
end
63
redef class ALexerError
	# The raw character sequence that could not be lexed.
	var string: String

	# Build a lexer-error token at `loc` for the unrecognized `string`.
	init init_lexer_error(message: String, loc: Location, string: String)
	do
		self.string = string
		init_error(message, loc)
	end
end
73
redef class AParserError
	# The unexpected token that triggered the parse error.
	var token: Token

	# Build a parser-error token at `loc` for the unexpected `token`.
	init init_parser_error(message: String, loc: Location, token: Token)
	do
		self.token = token
		init_error(message, loc)
	end
end
83
# The lexer extracts Nit tokens from an input stream.
# It is best used through the Parser.
class Lexer
	super TablesCapable

	# Last peeked token, kept until consumed by `next`.
	var _token: nullable Token

	# Lexer current state
	var _state: Int = 0

	# The source file being tokenized.
	var file: SourceFile

	# Current character position in the stream.
	var _stream_pos: Int = 0

	# Current line number in the input stream (0-based).
	var _line: Int = 0

	# Current column in the input stream (0-based).
	var _pos: Int = 0

	# Was the last character a carriage-return? (used to fold CR+LF)
	var _cr: Bool = false

	# Constant state values.
	private fun state_initial: Int do return 0 end

	# Create a new lexer for a source file.
	init(file: SourceFile)
	do
		self.file = file
	end

	# The last peeked token, kept to chain tokens together.
	private var last_token: nullable Token = null

	# Give the next token (but do not consume it)
	fun peek: Token
	do
		var tok = _token
		if tok != null then return tok

		# Pull tokens until one is not discarded
		# (`get_token` returns null for blanks and comments).
		tok = get_token
		while tok == null do tok = get_token

		if tok._location != null then
			# Link the new token into the doubly-chained token list of the file.
			var prev = last_token
			if prev == null then
				file.first_token = tok
			else
				prev.next_token = tok
				tok.prev_token = prev
			end
			last_token = tok
		end

		_token = tok
		return tok
	end

	# Give and consume the next token
	fun next: Token
	do
		var tok = peek
		_token = null
		return tok
	end

	# Primitive method to return a token, or return null if it is discarded
	# Is used to implement `peek` and `next`
	protected fun get_token: nullable Token
	do
		var state = 0

		var i = _stream_pos
		var start_stream_pos = i
		var start_pos = _pos
		var start_line = _line
		var file = self.file
		var src = file.string
		var src_len = src.length

		# Last accepting configuration seen so far (longest-match strategy).
		var accept_state = -1
		var accept_token = -1
		var accept_length = -1
		var accept_pos = -1
		var accept_line = -1

		loop
			if i >= src_len then
				state = -1
			else
				var c = src.chars[i].ascii
				i += 1

				# Maintain line/column counters, folding CR, LF and CR+LF
				# into a single logical newline.
				var cr = _cr
				var line = _line
				var pos = _pos
				if c == 10 then
					if cr then
						# LF right after a CR: same newline, only refresh the line start.
						cr = false
						file.line_starts[line] = i
					else
						line = line + 1
						pos = 0
						file.line_starts[line] = i
					end
				else if c == 13 then
					line = line + 1
					pos = 0
					cr = true
					file.line_starts[line] = i
				else
					pos = pos + 1
					cr = false
				end

				loop
					var from_state = state
					if state < -1 then
						# Decode a marked state (see the generated goto tables).
						from_state = -2 - state
					end

					state = -1

					# Binary search for the transition on `c` among the
					# sorted [low-char, high-char, target] triples of the row.
					var low = 0
					var high = lexer_goto(from_state, 0) - 1

					if high >= 0 then
						while low <= high do
							var mid = (low + high) / 2
							var off = mid * 3 + 1 # +1 because length is at 0

							if c < lexer_goto(from_state, off) then
								high = mid - 1
							else if c > lexer_goto(from_state, off+1) then
								low = mid + 1
							else
								state = lexer_goto(from_state, off+2)
								break
							end
						end
					end
					if state > -2 then break
				end

				_cr = cr
				_line = line
				_pos = pos
			end

			if state >= 0 then
				var tok = lexer_accept(state)
				if tok != -1 then
					# Remember the longest accepted prefix seen so far.
					accept_state = state
					accept_token = tok
					accept_length = i - start_stream_pos
					accept_pos = _pos
					accept_line = _line
				end
			else
				if accept_state != -1 then
					# Dead end: roll back to the last accepting position
					# and produce the corresponding token.
					var location = new Location(file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
					_pos = accept_pos
					_line = accept_line
					_stream_pos = start_stream_pos + accept_length
					if accept_token == 0 then
						# Pseudo-token 0 (blanks/comments) is discarded.
						return null
					end
					return make_token(accept_token, location)
				else
					_stream_pos = i
					var location = new Location(file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
					if i > start_stream_pos then
						# Some characters were read but no accepting state
						# was ever reached: lexical error.
						var text = src.substring(start_stream_pos, i-start_stream_pos)
						var token = new ALexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
						file.last_token = token
						return token
					else
						# Nothing left to read: emit the final EOF token.
						var token = new EOF.init_tk(location)
						file.last_token = token
						return token
					end
				end
			end
		end
	end

	# Allocate the right Token object for a given identifier
	protected fun make_token(accept_token: Int, location: Location): Token is abstract
end