# Commit: "lexer: do not create useless location (for whitespaces)"
# Source: src/parser/lexer_work.nit (nit.git)
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 # Internal algorithm and data structures for the Nit lexer
16 module lexer_work
17
18 intrude import parser_nodes
19 private import tables
20
21 redef class Token
22 private var cached_text: nullable String
23
24 redef fun text
25 do
26 var res = _cached_text
27 if res != null then return res
28 res = location.text
29 _cached_text = res
30 return res
31 end
32
33 redef fun text=(text)
34 do
35 _cached_text = text
36 end
37
38 fun parser_index: Int is abstract
39 end
40
41 redef class EOF
42 init init_tk(loc: Location)
43 do
44 _cached_text = ""
45 _location = loc
46 end
47 end
48
49 redef class AError
50 var message: String
51
52 init init_error(message: String, loc: Location)
53 do
54 init_tk(loc)
55 self.message = message
56 end
57 end
58
59 redef class ALexerError
60 var string: String
61
62 init init_lexer_error(message: String, loc: Location, string: String)
63 do
64 init_error(message, loc)
65 self.string = string
66 end
67 end
68
69 redef class AParserError
70 var token: Token
71
72 init init_parser_error(message: String, loc: Location, token: Token)
73 do
74 init_error(message, loc)
75 self.token = token
76 end
77 end
78
79 # The lexer extract NIT tokens from an input stream.
80 # It is better user with the Parser
81 class Lexer
82 super TablesCapable
83
84 # Last peeked token
85 var token: nullable Token = null
86
87 # Lexer current state
88 private var state: Int = 0
89
90 # The source file
91 var file: SourceFile
92
93 # Current character in the stream
94 var stream_pos: Int = 0
95
96 # Current line number in the input stream
97 var line: Int = 0
98
99 # Current column in the input stream
100 var pos: Int = 0
101
102 # Was the last character a carriage-return?
103 var cr: Bool = false
104
105 # Constante state values
106 private fun state_initial: Int do return 0 end
107
108 # The last peeked token to chain them
109 private var last_token: nullable Token = null
110
111 # Give the next token (but do not consume it)
112 fun peek: Token
113 do
114 var t = _token
115 if t != null then return t
116
117 t = get_token
118 while t == null do t = get_token
119
120 if isset t._location then
121 var l = last_token
122 if l != null then
123 l.next_token = t
124 t.prev_token = l
125 else
126 file.first_token = t
127 end
128 last_token = t
129 end
130
131 _token = t
132 return t
133 end
134
135 # Give and consume the next token
136 fun next: Token
137 do
138 var result = peek
139 _token = null
140 return result
141 end
142
143 # Primitive method to return a token, or return null if it is discarded
144 # Is used to implement `peek` and `next`
145 protected fun get_token: nullable Token
146 do
147 var dfa_state = 0
148
149 var sp = _stream_pos
150 var start_stream_pos = sp
151 var start_pos = _pos
152 var start_line = _line
153 var file = self.file
154 var string = file.string
155 var string_len = string.length
156
157 var accept_state = -1
158 var accept_token = -1
159 var accept_length = -1
160 var accept_pos = -1
161 var accept_line = -1
162
163 loop
164 if sp >= string_len then
165 dfa_state = -1
166 else
167 var c = string[sp].ascii
168 sp += 1
169
170 var cr = _cr
171 var line = _line
172 var pos = _pos
173 if c == 10 then
174 if cr then
175 cr = false
176 file.line_starts[line] = sp
177 else
178 line = line + 1
179 pos = 0
180 file.line_starts[line] = sp
181 end
182 else if c == 13 then
183 line = line + 1
184 pos = 0
185 cr = true
186 file.line_starts[line] = sp
187 else
188 pos = pos + 1
189 cr = false
190 end
191
192 loop
193 var old_state = dfa_state
194 if dfa_state < -1 then
195 old_state = -2 - dfa_state
196 end
197
198 dfa_state = -1
199
200 var low = 0
201 var high = lexer_goto(old_state, 0) - 1
202
203 if high >= 0 then
204 while low <= high do
205 var middle = (low + high) / 2
206 var offset = middle * 3 + 1 # +1 because length is at 0
207
208 if c < lexer_goto(old_state, offset) then
209 high = middle - 1
210 else if c > lexer_goto(old_state, offset+1) then
211 low = middle + 1
212 else
213 dfa_state = lexer_goto(old_state, offset+2)
214 break
215 end
216 end
217 end
218 if dfa_state > -2 then break
219 end
220
221 _cr = cr
222 _line = line
223 _pos = pos
224 end
225
226 if dfa_state >= 0 then
227 var tok = lexer_accept(dfa_state)
228 if tok != -1 then
229 accept_state = dfa_state
230 accept_token = tok
231 accept_length = sp - start_stream_pos
232 accept_pos = _pos
233 accept_line = _line
234 end
235 else
236 if accept_state != -1 then
237 _pos = accept_pos
238 _line = accept_line
239 _stream_pos = start_stream_pos + accept_length
240 if accept_token == 0 then
241 # Ignored token (whitespaces)
242 return null
243 end
244 var location = new Location(file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
245 return make_token(accept_token, location)
246 else
247 _stream_pos = sp
248 var location = new Location(file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
249 if sp > start_stream_pos then
250 var text = string.substring(start_stream_pos, sp-start_stream_pos)
251 var token = new ALexerError.init_lexer_error("Syntax error: unknown token {text}.", location, text)
252 file.last_token = token
253 return token
254 else
255 var token = new EOF.init_tk(location)
256 file.last_token = token
257 return token
258 end
259 end
260 end
261 end
262 end
263
264 # Allocate the right Token object for a given identifier
265 protected fun make_token(accept_token: Int, location: Location): Token is abstract
266 end