Merge: doc: fixed some typos and other misc. corrections
[nit.git] / src / parser / lexer_work.nit
# This file is part of NIT ( http://www.nitlanguage.org ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 # Internal algorithm and data structures for the Nit lexer
16 module lexer_work
17
18 intrude import parser_nodes
19 private import tables
20
21 redef class Token
22 private var cached_text: nullable String
23
24 redef fun text
25 do
26 var res = _cached_text
27 if res != null then return res
28 res = location.text
29 _cached_text = res
30 return res
31 end
32
33 redef fun text=(text)
34 do
35 _cached_text = text
36 end
37
38 fun parser_index: Int is abstract
39 end
40
41 redef class EOF
42 init init_tk(loc: Location)
43 do
44 _cached_text = ""
45 _location = loc
46 end
47 end
48
49 redef class AError
50 var message: String
51
52 init init_error(message: String, loc: Location)
53 do
54 init_tk(loc)
55 self.message = message
56 end
57 end
58
59 redef class ALexerError
60 var string: String
61
62 init init_lexer_error(message: String, loc: Location, string: String)
63 do
64 init_error(message, loc)
65 self.string = string
66 end
67 end
68
69 redef class AParserError
70 var token: Token
71
72 init init_parser_error(message: String, loc: Location, token: Token)
73 do
74 init_error(message, loc)
75 self.token = token
76 end
77 end
78
79 # The lexer extract NIT tokens from an input stream.
80 # It is better user with the Parser
81 class Lexer
82 super TablesCapable
83
84 # Last peeked token
85 var token: nullable Token = null
86
87 # Lexer current state
88 private var state: Int = 0
89
90 # The source file
91 var file: SourceFile
92
93 # Current character in the stream
94 var stream_pos: Int = 0
95
96 # Current line number in the input stream
97 var line: Int = 0
98
99 # Current column in the input stream
100 var pos: Int = 0
101
102 # Was the last character a carriage-return?
103 var cr: Bool = false
104
105 # Constante state values
106 private fun state_initial: Int do return 0 end
107
108 # The last peeked token to chain them
109 private var last_token: nullable Token = null
110
111 # Give the next token (but do not consume it)
112 fun peek: Token
113 do
114 var t = _token
115 if t != null then return t
116
117 t = get_token
118 while t == null do t = get_token
119
120 if isset t._location then
121 var l = last_token
122 if l != null then
123 l.next_token = t
124 t.prev_token = l
125 else
126 file.first_token = t
127 end
128 last_token = t
129 end
130
131 _token = t
132 return t
133 end
134
135 # Give and consume the next token
136 fun next: Token
137 do
138 var result = peek
139 _token = null
140 return result
141 end
142
143 # Primitive method to return a token, or return null if it is discarded
144 # Is used to implement `peek` and `next`
145 protected fun get_token: nullable Token
146 do
147 var dfa_state = 0
148
149 var sp = _stream_pos
150 var start_stream_pos = sp
151 var start_pos = _pos
152 var start_line = _line
153 var file = self.file
154 var string = file.string
155 var string_len = string.length
156
157 var accept_state = -1
158 var accept_token = -1
159 var accept_length = -1
160 var accept_pos = -1
161 var accept_line = -1
162
163 loop
164 if sp >= string_len then
165 dfa_state = -1
166 else
167 # Very ugly hack, this is because of the way SableCC generates its tables.
168 # Due to the 0xFFFF limit of a Java char, when a big Nit char is read (i.e.
169 # code point > 65535), it crashes.
170 #
171 # Hence, if a char has a code point <= 255 (ISO8859 range), it is left as is.
172 # Else, it is replaced by 255.
173 # This does not corrupt the lexer and works perfectly on any character.
174 #
175 # TL;DR: Java fucked up, need retarded solution to cope for retarded decision
176 var c = string[sp].code_point
177 if c >= 256 then c = 255
178 sp += 1
179
180 var cr = _cr
181 var line = _line
182 var pos = _pos
183 if c == 10 then
184 if cr then
185 cr = false
186 file.line_starts[line] = sp
187 else
188 line = line + 1
189 pos = 0
190 file.line_starts[line] = sp
191 end
192 else if c == 13 then
193 line = line + 1
194 pos = 0
195 cr = true
196 file.line_starts[line] = sp
197 else
198 pos = pos + 1
199 cr = false
200 end
201
202 loop
203 var old_state = dfa_state
204 if dfa_state < -1 then
205 old_state = -2 - dfa_state
206 end
207
208 dfa_state = -1
209
210 var low = 0
211 var high = lexer_goto(old_state, 0) - 1
212
213 if high >= 0 then
214 while low <= high do
215 var middle = (low + high) / 2
216 var offset = middle * 3 + 1 # +1 because length is at 0
217
218 if c < lexer_goto(old_state, offset) then
219 high = middle - 1
220 else if c > lexer_goto(old_state, offset+1) then
221 low = middle + 1
222 else
223 dfa_state = lexer_goto(old_state, offset+2)
224 break
225 end
226 end
227 end
228 if dfa_state > -2 then break
229 end
230
231 _cr = cr
232 _line = line
233 _pos = pos
234 end
235
236 if dfa_state >= 0 then
237 var tok = lexer_accept(dfa_state)
238 if tok != -1 then
239 accept_state = dfa_state
240 accept_token = tok
241 accept_length = sp - start_stream_pos
242 accept_pos = _pos
243 accept_line = _line
244 end
245 else
246 if accept_state != -1 then
247 _pos = accept_pos
248 _line = accept_line
249 _stream_pos = start_stream_pos + accept_length
250 if accept_token == 0 then
251 # Ignored token (whitespaces)
252 return null
253 end
254 var location = new Location(file, start_line + 1, accept_line + 1, start_pos + 1, accept_pos)
255 return make_token(accept_token, location)
256 else
257 _stream_pos = sp
258 var location = new Location(file, start_line + 1, start_line + 1, start_pos + 1, start_pos + 1)
259 if sp > start_stream_pos then
260 var text = string.substring(start_stream_pos, sp-start_stream_pos)
261 var token = new ALexerError.init_lexer_error("Syntax Error: unknown token `{text}`.", location, text)
262 file.last_token = token
263 return token
264 else
265 var token = new EOF.init_tk(location)
266 file.last_token = token
267 return token
268 end
269 end
270 end
271 end
272 end
273
274 # Allocate the right Token object for a given identifier
275 protected fun make_token(accept_token: Int, location: Location): Token is abstract
276 end