4a0820e394e7ac5b5c9a18cc4402457011bc7836
[nit.git] / src / parser / lexer_work.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Internal algorithm and data structures for the Nit lexer
16 module lexer_work
17
18 intrude import parser_nodes
19 private import tables
20
redef class Token
	# Cached textual content of the token; computed on demand from `location`.
	var _text: nullable String

	redef fun text
	do
		var cached = _text
		if cached == null then
			# First access: extract the text from the source location and memoize it.
			cached = location.text
			_text = cached
		end
		return cached
	end

	redef fun text=(text)
	do
		_text = text
	end

	# Index of this token kind in the generated parser tables.
	fun parser_index: Int is abstract
end
40
redef class EOF
	redef fun parser_index: Int
	do
		# 97 is the slot reserved for EOF — presumably fixed by the table
		# generator; TODO confirm against the generated parser tables.
		return 97
	end

	# Build an end-of-file token at the given location, with empty text.
	init init_tk(loc: Location)
	do
		_location = loc
		_text = ""
	end
end
53
redef class AError
	# Human-readable description of the error.
	readable var _message: String

	# Build an error token carrying `message` at `loc`.
	init init_error(message: String, loc: Location)
	do
		_message = message
		init_tk(loc)
	end
end
63
redef class ALexerError
	# The character sequence that could not be lexed into any token.
	readable var _string: String

	# Build a lexer-error token: `message` at `loc`, caused by `string`.
	init init_lexer_error(message: String, loc: Location, string: String)
	do
		_string = string
		init_error(message, loc)
	end
end
73
redef class AParserError
	# The unexpected token that made the parser fail.
	readable var _token: Token

	# Build a parser-error token: `message` at `loc`, caused by `token`.
	init init_parser_error(message: String, loc: Location, token: Token)
	do
		_token = token
		init_error(message, loc)
	end
end
83
84 # The lexer extracts NIT tokens from an input stream.
85 # It is better used with the Parser
86 class Lexer
87 super TablesCapable
88 # Last peeked token
89 var _token: nullable Token
90 
91 # Lexer current state
92 var _state: Int = 0
93 
94 # The source file
95 readable var _file: SourceFile
96 
97 # Current character in the stream
98 var _stream_pos: Int = 0
99 
100 # Current line number in the input stream
101 var _line: Int = 0
102 
103 # Current column in the input stream
104 var _pos: Int = 0
105 
106 # Was the last character a carriage-return?
107 var _cr: Bool = false
108 
109 # Constant state values
110 private fun state_initial: Int do return 0 end
111 
112 # Create a new lexer for a stream (and a name)
113 init(file: SourceFile)
114 do
115 _file = file
116 end
117 
118 # The last peeked token to chain them
119 private var last_token: nullable Token = null
120 
121 # Give the next token (but do not consume it)
122 fun peek: Token
123 do
124 var t = _token
125 if t != null then return t
126 
# `get_token` yields null for discarded tokens (accept_token == 0); skip those.
127 t = get_token
128 while t == null do t = get_token
129 
# Chain the new token to the previous one so all tokens of the file
# form a doubly-linked list (first element kept in `_file.first_token`).
130 var l = last_token
131 if l != null then
132 l.next_token = t
133 t.prev_token = l
134 else
135 _file.first_token = t
136 end
137 
138 last_token = t
139 _token = t
140 return t
141 end
142 
143 # Give and consume the next token
144 fun next: Token
145 do
146 var result = peek
147 _token = null
148 return result
149 end
150 
151 # Primitive method to return a token, or return null if it is discarded
152 # Is used to implement `peek` and `next`
153 protected fun get_token: nullable Token
154 do
# Longest-match DFA scan: consume one character at a time, remembering the
# last accepting state seen; when the automaton dies, roll back to it.
155 var dfa_state = 0
156 
157 var sp = _stream_pos
158 var start_stream_pos = sp
159 var start_pos = _pos
160 var start_line = _line
161 var string = _file.string
162 var string_len = string.length
163 
# Bookkeeping of the last accepting state reached (-1 = none yet).
164 var accept_state = -1
165 var accept_token = -1
166 var accept_length = -1
167 var accept_pos = -1
168 var accept_line = -1
169 
170 loop
171 if sp >= string_len then
# End of input: force the dead state so control falls into the
# accept/rollback logic below.
172 dfa_state = -1
173 else
174 var c = string.chars[sp].ascii
175 sp += 1
176 
# Maintain line/column counters; a LF right after a CR is the second
# half of a CRLF pair and must not count as a second newline.
177 var cr = _cr
178 var line = _line
179 var pos = _pos
180 if c == 10 then
181 if cr then
182 cr = false
183 _file.line_starts[line] = sp
184 else
185 line = line + 1
186 pos = 0
187 _file.line_starts[line] = sp
188 end
189 else if c == 13 then
190 line = line + 1
191 pos = 0
192 cr = true
193 _file.line_starts[line] = sp
194 else
195 pos = pos + 1
196 cr = false
197 end
198 
# DFA transition: binary-search the sorted character ranges of the
# current state in the generated goto table (`lexer_goto`).
199 loop
200 var old_state = dfa_state
201 if dfa_state < -1 then
# States below -1 encode a continuation row as `-2 - state` —
# presumably a multi-row goto table produced by the generator;
# TODO confirm against the table-generation tool.
202 old_state = -2 - dfa_state
203 end
204 
205 dfa_state = -1
206 
207 var low = 0
208 var high = lexer_goto(old_state, 0) - 1
209 
210 if high >= 0 then
211 while low <= high do
212 var middle = (low + high) / 2
213 var offset = middle * 3 + 1 # +1 because length is at 0
214 
215 if c < lexer_goto(old_state, offset) then
216 high = middle - 1
217 else if c > lexer_goto(old_state, offset+1) then
218 low = middle + 1
219 else
220 dfa_state = lexer_goto(old_state, offset+2)
221 break
222 end
223 end
224 end
225 if dfa_state > -2 then break
226 end
227 
228 _cr = cr
229 _line = line
230 _pos = pos
231 end
232 
233 if dfa_state >= 0 then
# Automaton still alive: if this state accepts, record the position so
# the longest match wins over any earlier, shorter one.
234 var tok = lexer_accept(dfa_state)
235 if tok != -1 then
236 accept_state = dfa_state
237 accept_token = tok
238 accept_length = sp - start_stream_pos
239 accept_pos = _pos
240 accept_line = _line
241 end
242 else
243 if accept_state != -1 then
# Dead state with a previous accept: roll back the stream to the last
# accepting position and emit that token.
244 var location = new Location(_file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
245 _pos = accept_pos
246 _line = accept_line
247 _stream_pos = start_stream_pos + accept_length
248 if accept_token == 0 then
# Token id 0 is discarded (blanks/comments, presumably); null tells
# `peek` to fetch the next one.
249 return null
250 end
251 return make_token(accept_token, location)
252 else
253 _stream_pos = sp
254 var location = new Location(_file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
255 if sp > start_stream_pos then
# No accepting state was ever reached: the consumed characters form
# an invalid token.
256 var text = string.substring(start_stream_pos, sp-start_stream_pos)
257 var token = new ALexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
258 _file.last_token = token
259 return token
260 else
# Nothing consumed at all: clean end of file.
261 var token = new EOF.init_tk(location)
262 _file.last_token = token
263 return token
264 end
265 end
266 end
267 end
268 end
269 
270 # Allocate the right Token object for a given identifier
271 protected fun make_token(accept_token: Int, location: Location): Token is abstract
272 end