Merge: Loose Tokens
[nit.git] / lib / saxophonit / lexer.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # SAXophoNit’s lexer
12 module saxophonit::lexer
13
14 import reader_model
15
16 # SAXophoNit’s lexer
17 #
18 # Except when noted otherwise, `accept` and `expect` functions return `true` on
19 # success and `false` on mismatch and at the end of the file.
20 # They both forward the cursor to the next byte on success, but only `expect`
21 # functions fire a fatal error on mismatch.
22 class XophonLexer
23
# The reader model, used here for its locator and for error reporting.
var reader_model: XophonReaderModel

# The stream the bytes are read from.
var input: Reader is writable

# Shortcut to `reader_model.locator`.
private var locator: SAXLocatorImpl is noinit

# Cache the locator of `reader_model`, which must not be `null`.
init do locator = reader_model.locator.as(not null)

# The byte most recently read from `input`.
#
# Holds `-1` at the end of the file or after an error.
private var last_char: Int = -1

# Was the last byte read, before end-of-line handling, a CARRIAGE RETURN?
private var was_cr: Bool = false
44
45
# Expect a value delimiter (`"` or `'`).
#
# If the last read byte is a delimiter, read the next byte and return the
# byte value of the delimiter. Otherwise, fire a fatal error and
# return `-1`.
fun expect_delimiter: Int do
	if accept('"') then return '"'.ascii
	if accept('\'') then return '\''.ascii
	fire_unexpected_char(". Expecting `\"` or `'`")
	return -1
end
60
# Does the last read byte match the `Char` production?
#
# Accepts TAB (9), LINE FEED (10) and every byte value of at least 32.
fun is_xml_char: Bool do
	# TODO: Handle code points above 0x7F.
	if last_char == 9 or last_char == 10 then return true
	return last_char >= 32
end
68
# Append the last read byte to `buffer` and read the next byte.
#
# If the last read byte does not match `is_xml_char`, fire a fatal error
# instead (a dedicated message is used at the end of the file) and
# return `false`.
fun expect_xml_char(buffer: Buffer): Bool do
	if not is_xml_char then
		if eof then return fire_fatal_error("Unexpected end of file.")
		return fire_fatal_error("Forbidden character.")
	end
	buffer.chars.push(last_char.ascii)
	read_char
	return true
end
83
84
# Like `expect_xml_char`, but normalize white space and forbid `<`.
#
# Any byte matching `is_s` is appended to `buffer` as a single space.
#
# SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
# recommendation.
fun expect_att_value_char(buffer: Buffer): Bool do
	if last_char == '<'.ascii then
		return fire_fatal_error("`<` is forbidden in attribute values.")
	end
	if not is_s then return expect_xml_char(buffer)
	# White space: normalize to a space.
	buffer.chars.push(' ')
	read_char
	return true
end
100
# Does the last read byte match the `S` production?
#
# That is: SPACE (32), TAB (9) or LINE FEED (10).
fun is_s: Bool do
	var c = last_char
	return c == 9 or c == 10 or c == 32
end
105
# Skip a `S?` token.
#
# Read bytes until the last read byte is not white space.
# Always return `true`.
fun skip_s: Bool do
	loop
		if not is_s then return true
		read_char
	end
end
111
# Accept a `S` token.
#
# Return `false` without reading anything if the last read byte is not
# white space. Otherwise, skip all the consecutive white space.
fun accept_s: Bool do
	if not is_s then return false
	read_char
	return skip_s
end
121
# Expect `S`.
#
# Skip the white space, or fire a fatal error if there is none.
fun expect_s: Bool do
	if accept_s then return skip_s
	return fire_unexpected_char(". Expecting white space")
end
126
# Does the last read byte match the `NameStartChar` production?
#
# Accepts ASCII letters, `_`, `:` and any byte value above 127.
fun is_name_start_char: Bool do
	# TODO: Handle code points above 0x7F.
	var c = last_char
	if c > 127 then return true
	if c == '_'.ascii or c == ':'.ascii then return true
	if c >= 'A'.ascii and c <= 'Z'.ascii then return true
	return c >= 'a'.ascii and c <= 'z'.ascii
end
136
# Does the last read byte match the `NameChar` production?
#
# Accepts everything `is_name_start_char` accepts, plus decimal digits,
# `-` and `.`.
fun is_name_char: Bool do
	# TODO: Handle code points above 0x7F.
	if is_name_start_char or is_digit then return true
	return last_char == '-'.ascii or last_char == '.'.ascii
end
145
# Expect a `Name` token.
#
# Append the parsed name to `buffer`. Fire a fatal error if the last read
# byte cannot start a name.
fun expect_name(buffer: Buffer): Bool do
	if not is_name_start_char then
		return fire_unexpected_char(" at the beginning of a name")
	end
	# The first byte is already validated: consume it, then continue as
	# long as the following bytes are name characters.
	loop
		buffer.chars.push(last_char.ascii)
		read_char
		if not is_name_char then return true
	end
end
161
# Expect a `PITarget` token.
#
# Append the parsed name to `buffer`, then validate the content of
# `buffer` with `check_pi_target`.
fun expect_pi_target(buffer: Buffer): Bool do
	if not expect_name(buffer) then return false
	return check_pi_target(buffer)
end
168
# Ensure the target is not `xml` (case-insensitive).
#
# Fire a fatal error and return `false` if `target` is `xml` in any
# letter case. Otherwise, return `true`; in that case, a (non-fatal) error
# is also fired if the target contains a colon.
fun check_pi_target(target: Text): Bool do
	var chars = target.chars
	# Each of the three characters must be checked at its own position.
	var is_invalid = target.length == 3 and
		(chars[0] == 'X' or chars[0] == 'x') and
		(chars[1] == 'M' or chars[1] == 'm') and
		(chars[2] == 'L' or chars[2] == 'l')

	if is_invalid then
		return fire_fatal_error("Forbidden processing target `{target}`.")
	end
	if target.has(":") then
		reader_model.fire_error("The processing target `{target}` contains a colon.", null)
	end
	return true
end
187
# Does the last read byte match the `[0-9]` production?
fun is_digit: Bool do
	return last_char >= '0'.ascii and last_char <= '9'.ascii
end
192
# Accept a `[0-9]+` token.
#
# Append the digits to `buffer`. Return `false` without reading anything
# if the last read byte is not a decimal digit.
fun accept_digits(buffer: Buffer): Bool do
	if not is_digit then return false
	while is_digit do
		buffer.chars.push(last_char.ascii)
		read_char
	end
	return true
end
205
# Expect a `[0-9]+` token.
#
# Like `accept_digits`, but fire a fatal error on mismatch.
fun expect_digits(buffer: Buffer): Bool do
	if accept_digits(buffer) then return true
	return fire_unexpected_char(". Expecting a decimal digit")
end
210
# Does `last_char` match the `[0-9a-fA-F]` production?
fun is_hex: Bool do
	# Only `A`-`F` and `a`-`f` are hexadecimal letters.
	return ['0'.ascii .. '9'.ascii].has(last_char) or
		['A'.ascii .. 'F'.ascii].has(last_char) or
		['a'.ascii .. 'f'.ascii].has(last_char)
end
217
# Expect a `[0-9a-fA-F]+` token.
#
# Append the hexadecimal digits to `buffer`. Fire a fatal error if the
# last read byte does not match `is_hex`.
fun expect_hex(buffer: Buffer): Bool do
	if not is_hex then
		return fire_unexpected_char(". Expecting an hexadecimal digit")
	end
	while is_hex do
		buffer.chars.push(last_char.ascii)
		read_char
	end
	return true
end
230
# Expect `Eq`.
#
# That is, a `=` optionally surrounded by white space.
fun expect_eq: Bool do
	if not skip_s then return false
	if not expect('=', "") then return false
	return skip_s
end
235
236
237 ############################################################################
238 # General
239
# Read a byte and put it in `last_char`.
#
# In case of an end-of-file or an error, put -1 in `last_char`.
#
# Also update the line and column numbers of `locator`, and report any
# CARRIAGE RETURN (followed or not by a LINE FEED) as a single LINE FEED.
private fun read_char do
	# Move the locator according to the byte being left behind.
	if locator.line_number < 0 then
		# No position yet: start at the first line and column.
		locator.line_number = 1
		locator.column_number = 1
	else if last_char < 0 then
		fire_fatal_error("Internal error: Already at the end of the file.")
		return
	else if last_char == '\n'.ascii then
		locator.line_number += 1
		locator.column_number = 1
	else
		locator.column_number += 1
	end

	var b = input.read_byte
	if b == null then
		last_char = -1
		return
	end

	# XML 1.0 end-of-line handling
	# Note: Regardless of the XML version, any EOL defined by the
	# recommendation MUST be reported as a single LINE FEED.
	if was_cr and b == '\n'.ascii then
		# The EOL was already reported for the CARRIAGE RETURN.
		# => Skip this LINE FEED and take the following byte.
		b = input.read_byte
		if b == null then b = -1
	end
	was_cr = b == '\r'.ascii
	if was_cr then
		# Whatever the following byte is, '\r' always introduces an EOL.
		last_char = '\n'.ascii
	else
		last_char = b
	end
end
279
# Is it the end of the stream?
#
# Also `true` after a fatal error, since `last_char` is then negative.
fun eof: Bool do
	return last_char < 0
end
284
# Start the lexer.
#
# Read the first byte, unless a byte is already available.
fun start do
	if not eof then return
	# Clear the end-of-file marker first: `read_char` refuses to read
	# while `last_char` is negative.
	last_char = 0
	read_char
end
292
# Close the input.
#
# `last_char` is reset to `-1`, so `eof` is `true` afterwards.
fun close do
	last_char = -1
	input.close
end
298
# Is the last read byte equal to the byte value `c`?
fun is_int(c: Int): Bool do
	return c == last_char
end
301
# Is the last read byte equal to the byte value of the character `c`?
fun is_char(c: Char): Bool do
	var code = c.ascii
	return last_char == code
end
304
# Accept the specified byte.
#
# If the last read byte is equal to `expected`, read the next byte and
# return `true`. Otherwise, return `false` without reading anything.
fun accept_int(expected: Int): Bool do
	if last_char != expected then return false
	read_char
	return true
end
314
# Accept the byte corresponding to the specified character.
#
# Same as `accept_int`, but takes a `Char`.
fun accept(expected: Char): Bool do
	var code = expected.ascii
	return accept_int(code)
end
319
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect_int(expected: Int, context: String): Bool do
	# No final period here: `fire_unexpected_char` appends it.
	return accept_int(expected) or
		fire_unexpected_char("{context}. Expecting `{expected.ascii}`")
end
332
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect(expected: Char, context: String): Bool do
	# No final period here: `fire_unexpected_char` appends it.
	return accept(expected) or
		fire_unexpected_char("{context}. Expecting `{expected}`")
end
345
# Ensure the last read byte and the following bytes match `expected`.
#
# Each matching byte is consumed. On the first mismatch, fire a fatal
# error using `context`. `context` is the part of the message that gives
# the context. For example, in `Unexpected ``x`` in y. Expecting ``z``.`,
# the value of `context` is `" in y"`.
#
# Return `true` if and only if the last read byte and following bytes
# match `expected`.
fun expect_string(expected: String, context: String): Bool do
	var chars = expected.chars

	for i in [0 .. chars.length[ do
		if accept(chars[i]) then continue

		# Mismatch: report what was read so far, when it can be shown.
		if is_xml_char then
			return fire_fatal_error("Unexpected " +
				"`{expected.substring(0, i)}{last_char.ascii.to_s}`" +
				"{context}. Expecting `{expected}`.")
		end
		if eof then
			return fire_fatal_error("Unexpected end of file{context}. " +
				"Expecting `{expected}`.")
		end
		return fire_fatal_error("Forbidden character.")
	end
	return true
end
376
377
378 ############################################################################
379 # Dispatching
380
# Fire a fatal error about an unexpected character.
#
# `rest_of_message` is inserted just before the final period of the
# message. A different message is used at the end of the file, or when the
# last read byte does not match `is_xml_char`.
#
# Return `false`.
fun fire_unexpected_char(rest_of_message: String): Bool do
	var message: String
	if is_xml_char then
		message = "Unexpected character `{last_char.ascii.to_s}`{rest_of_message}."
	else if eof then
		message = "Unexpected end of file{rest_of_message}."
	else
		message = "Forbidden character."
	end
	return fire_fatal_error(message)
end
393
# Report a fatal error with the specified message to `reader_model`.
#
# `last_char` is then set to `-1`, so `eof` is `true` afterwards.
#
# Return `false`.
private fun fire_fatal_error(message: String): Bool do
	reader_model.fire_fatal_error(message, null)
	last_char = -1
	return false
end
402 end