*: remove newly superfluous static types on attributes
[nit.git] / lib / saxophonit / lexer.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # SAXophoNit’s lexer
12 module saxophonit::lexer
13
14 import reader_model
15
16 # SAXophoNit’s lexer
17 #
18 # Except when noted otherwise, `accept` and `expect` functions return `true` on
19 # success and `false` on mismatch and at the end of the file.
20 # They both forward the cursor to the next byte on success, but only `expect`
21 # functions fire a fatal error on mismatch.
22 class XophonLexer
23
# The reader model that this lexer reports its errors to.
var reader_model: XophonReaderModel

# The stream the bytes are read from.
var input: Reader is writable

# Shortcut to `reader_model.locator`.
#
# Set by `init`, which requires the model's locator to be non-null.
private var locator: SAXLocatorImpl is noinit

init do
	locator = reader_model.locator.as(not null)
end

# The byte that was read most recently.
#
# A negative value (`-1`) denotes the end of the file or an error.
private var last_char = -1

# Was the last byte a CARRIAGE RETURN, before end-of-line normalization?
private var was_cr = false
44
45
# Expect a value delimiter (`"` or `'`).
#
# When the last read byte is a delimiter, consume it and return its code
# point. Otherwise, fire a fatal error and return `-1`.
fun expect_delimiter: Int do
	if accept('"') then return '"'.code_point
	if accept('\'') then return '\''.code_point
	fire_unexpected_char(". Expecting `\"` or `'`")
	return -1
end
60
# Does the last read byte match the `Char` production?
#
# Accepts TAB (9), LINE FEED (10) and every value from 32 upward.
fun is_xml_char: Bool do
	# TODO: Handle code points above 0x7F.
	if last_char == 9 or last_char == 10 then return true
	return last_char >= 32
end
68
# Append the last read byte to `buffer`, then read the next byte.
#
# Return `true` on success. If the last read byte does not match the `Char`
# production (see `is_xml_char`), fire a fatal error and return `false`:
# the message distinguishes the end of the file from a forbidden character.
fun expect_xml_char(buffer: Buffer): Bool do
	if not is_xml_char then
		if eof then return fire_fatal_error("Unexpected end of file.")
		return fire_fatal_error("Forbidden character.")
	end
	buffer.chars.push(last_char.code_point)
	read_char
	return true
end
83
84
# Like `expect_xml_char`, but normalize white space and forbid `<`.
#
# Any byte matching `S` is pushed to `buffer` as a single space. A `<`
# fires a fatal error. Everything else is delegated to `expect_xml_char`.
#
# SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
# recommendation.
fun expect_att_value_char(buffer: Buffer): Bool do
	if not is_s then
		if last_char == '<'.code_point then
			return fire_fatal_error("`<` is forbidden in attribute values.")
		end
		return expect_xml_char(buffer)
	end
	# White space: replace by a plain space.
	buffer.chars.push(' ')
	read_char
	return true
end
100
# Does the last read byte match the `S` production?
#
# Only TAB (9), LINE FEED (10) and SPACE (32) are tested here.
fun is_s: Bool do
	return [9, 10, 32].has(last_char)
end
105
# Skip a `S?` token (any amount of white space, possibly none).
#
# Always return `true`.
fun skip_s: Bool do
	loop
		if not is_s then return true
		read_char
	end
end
111
# Accept a `S` token (at least one white space byte).
#
# Return `false`, without consuming anything, if the last read byte is not
# a white space. Otherwise, consume the whole run of white space.
fun accept_s: Bool do
	if not is_s then return false
	read_char
	return skip_s
end
121
# Expect `S`.
#
# Consume a run of white space, or fire a fatal error if there is none.
fun expect_s: Bool do
	if accept_s and skip_s then return true
	return fire_unexpected_char(". Expecting white space")
end
126
# Does the last read byte match the `NameStartChar` production?
#
# ASCII letters, `_` and `:` are accepted, as well as any value above 127.
fun is_name_start_char: Bool do
	# TODO: Handle code points above 0x7F.
	var c = last_char
	if c >= 'A'.code_point and c <= 'Z'.code_point then return true
	if c >= 'a'.code_point and c <= 'z'.code_point then return true
	return c == '_'.code_point or c == ':'.code_point or c > 127
end
136
# Does the last read byte match the `NameChar` production?
#
# That is: a `NameStartChar`, a decimal digit, `-` or `.`.
fun is_name_char: Bool do
	# TODO: Handle code points above 0x7F.
	if is_name_start_char or is_digit then return true
	return last_char == '-'.code_point or last_char == '.'.code_point
end
145
# Expect a `Name` token.
#
# Append the parsed name to `buffer`. Fire a fatal error if the last read
# byte cannot start a name.
fun expect_name(buffer: Buffer): Bool do
	if not is_name_start_char then
		return fire_unexpected_char(" at the beginning of a name")
	end
	# The first byte is known to be valid: push it, then continue as long as
	# the following bytes are name characters.
	loop
		buffer.chars.push(last_char.code_point)
		read_char
		if not is_name_char then return true
	end
end
161
# Expect a `PITarget` token.
#
# Append the parsed name to `buffer`, then validate the content of `buffer`
# with `check_pi_target`.
fun expect_pi_target(buffer: Buffer): Bool do
	if not expect_name(buffer) then return false
	return check_pi_target(buffer)
end
168
# Ensure the target is not `xml` (case-insensitive).
#
# Fire a fatal error and return `false` if `target` is `xml` in any
# combination of upper and lower case. Otherwise, return `true`.
#
# Also, fire a (non-fatal) error if the target contains a colon.
fun check_pi_target(target: Text): Bool do
	# Each letter must be compared with its own position in `target`.
	var is_invalid = target.length == 3 and
		(target.chars[0] == 'X' or target.chars[0] == 'x') and
		(target.chars[1] == 'M' or target.chars[1] == 'm') and
		(target.chars[2] == 'L' or target.chars[2] == 'l')

	if is_invalid then
		return fire_fatal_error("Forbidden processing target `{target}`.")
	else
		if target.has(":") then
			reader_model.fire_error("The processing target `{target}` contains a colon.", null)
		end
		return true
	end
end
187
# Does the last read byte match the `[0-9]` production?
fun is_digit: Bool do
	return last_char >= '0'.code_point and last_char <= '9'.code_point
end
192
# Accept a `[0-9]+` token.
#
# Append the digits to `buffer`. Return `false`, without consuming
# anything, if the last read byte is not a decimal digit.
fun accept_digits(buffer: Buffer): Bool do
	if not is_digit then return false
	loop
		buffer.chars.push(last_char.code_point)
		read_char
		if not is_digit then return true
	end
end
205
# Expect a `[0-9]+` token.
#
# Append the digits to `buffer`, or fire a fatal error if there is none.
fun expect_digits(buffer: Buffer): Bool do
	if accept_digits(buffer) then return true
	return fire_unexpected_char(". Expecting a decimal digit")
end
210
# Does `last_char` match the `[0-9a-fA-F]` production?
#
# Only the letters `A` to `F`, in either case, count as hexadecimal digits.
fun is_hex: Bool do
	return ['0'.code_point .. '9'.code_point].has(last_char) or
		['A'.code_point .. 'F'.code_point].has(last_char) or
		['a'.code_point .. 'f'.code_point].has(last_char)
end
217
# Expect a `[0-9a-fA-F]+` token.
#
# Append the digits to `buffer`, or fire a fatal error if the last read
# byte is not accepted by `is_hex`.
fun expect_hex(buffer: Buffer): Bool do
	if not is_hex then
		return fire_unexpected_char(". Expecting an hexadecimal digit")
	end
	loop
		buffer.chars.push(last_char.code_point)
		read_char
		if not is_hex then return true
	end
end
230
# Expect `Eq`.
#
# That is: a `=`, optionally surrounded by white space.
fun expect_eq: Bool do
	if not skip_s then return false
	if not expect('=', "") then return false
	return skip_s
end
235
236
237 ############################################################################
238 # General
239
# Read a byte and put it in `last_char`.
#
# The locator is updated first, according to the byte being left behind.
# In case of an end-of-file or an error, put -1 in `last_char`.
# Calling this method while `last_char` is already negative fires a fatal
# error, except for the very first read (negative line number).
private fun read_char do
	var lf = '\n'.code_point

	if locator.line_number < 0 then
		# First read: initialize the position.
		locator.line_number = 1
		locator.column_number = 1
	else if last_char < 0 then
		fire_fatal_error("Internal error: Already at the end of the file.")
		return
	else if last_char == lf then
		locator.line_number += 1
		locator.column_number = 1
	else
		locator.column_number += 1
	end

	var byte = input.read_byte
	if byte == null then
		last_char = -1
		return
	end
	last_char = byte.to_i

	# XML 1.0 end-of-line handling
	# Note: Whatever the XML version, any EOL defined by the
	# recommendation MUST be reported as a single LINE FEED.
	if was_cr and last_char == lf then
		# The EOL was already reported for the CARRIAGE RETURN.
		# => Skip this byte and take the following one.
		byte = input.read_byte
		if byte == null then
			last_char = -1
		else
			last_char = byte.to_i
		end
	end
	was_cr = last_char == '\r'.code_point
	# Whatever the following byte is, '\r' always introduces an EOL.
	if was_cr then last_char = lf
end
282
# Is it the end of the stream?
#
# Also return `true` after a fatal error, since firing one resets
# `last_char` to `-1`.
fun eof: Bool do
	return last_char < 0
end
287
# Start the lexer.
#
# Read the first byte, unless a byte is already available.
fun start do
	if not eof then return
	# `read_char` may refuse to read while `last_char` is negative:
	# put a neutral value in it first.
	last_char = 0
	read_char
end
295
# Close the input.
#
# The lexer is put in the end-of-file state before `input` is closed.
fun close do
	last_char = -1
	input.close
end
301
# Is the last read byte equal to the integer value `c`?
fun is_int(c: Int): Bool do
	return c == last_char
end
304
# Is the last read byte equal to the code point of `c`?
fun is_char(c: Char): Bool do
	return c.code_point == last_char
end
307
# Accept the specified byte.
#
# If the last read byte is equal to `expected`, read the next byte and
# return `true`. Otherwise, return `false` without consuming anything.
fun accept_int(expected: Int): Bool do
	if last_char != expected then return false
	read_char
	return true
end
317
# Accept the specified character.
#
# Same as `accept_int`, applied to the code point of `expected`.
fun accept(expected: Char): Bool do
	var code = expected.code_point
	return accept_int(code)
end
322
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect_int(expected: Int, context: String): Bool do
	# No trailing period here: `fire_unexpected_char` appends the final one.
	return accept_int(expected) or
		fire_unexpected_char("{context}. Expecting `{expected.code_point}`")
end
335
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect(expected: Char, context: String): Bool do
	# No trailing period here: `fire_unexpected_char` appends the final one.
	return accept(expected) or
		fire_unexpected_char("{context}. Expecting `{expected}`")
end
348
# Ensure the last read byte and the following bytes match `expected`.
#
# Each matching byte is consumed. On the first mismatch, fire a fatal error
# using `context`. `context` is the part of the message that gives the
# context. For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the
# value of `context` is `" in y"`.
#
# Return `true` if and only if the last read byte and following bytes
# match `expected`.
fun expect_string(expected: String, context: String): Bool do
	var chars = expected.chars

	for i in [0 .. chars.length[ do
		if accept(chars[i]) then continue

		# Mismatch: report what was matched so far plus the offending byte.
		if is_xml_char then
			return fire_fatal_error("Unexpected " +
				"`{expected.substring(0, i)}{last_char.code_point.to_s}`" +
				"{context}. Expecting `{expected}`.")
		end
		if eof then
			return fire_fatal_error("Unexpected end of file{context}. " +
				"Expecting `{expected}`.")
		end
		return fire_fatal_error("Forbidden character.")
	end
	return true
end
379
380
381 ############################################################################
382 # Dispatching
383
# Fire a fatal error about an unexpected character.
#
# `rest_of_message` is appended after the description of the offending
# byte, and a final period is added. The end of the file and forbidden
# characters get their own messages.
#
# Return `false`.
fun fire_unexpected_char(rest_of_message: String): Bool do
	if is_xml_char then
		return fire_fatal_error("Unexpected character `{last_char.code_point.to_s}`{rest_of_message}.")
	end
	if eof then
		return fire_fatal_error("Unexpected end of file{rest_of_message}.")
	end
	return fire_fatal_error("Forbidden character.")
end
396
# Fire a fatal error with the specified message.
#
# The error is forwarded to `reader_model`, then `last_char` is set to `-1`
# so that `eof` answers `true` from now on.
#
# Return `false`.
private fun fire_fatal_error(message: String): Bool do
	reader_model.fire_fatal_error(message, null)
	last_char = -1
	return false
end
405 end