4f77ab8148577d171b3f844c5718f0916d29402d
[nit.git] / lib / saxophonit / lexer.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # SAXophoNit’s lexer
12 module saxophonit::lexer
13
14 import reader_model
15
16 # SAXophoNit’s lexer
17 #
18 # Except when noted otherwise, `accept` and `expect` functions return `true` on
19 # success and `false` on mismatch and at the end of the file.
20 # They both forward the cursor to the next byte on success, but only `expect`
21 # functions fire a fatal error on mismatch.
22 class XophonLexer
# Reader model that receives the errors fired by this lexer.
var reader_model: XophonReaderModel

# Stream from which the bytes are read.
var input: IStream is writable

# Locator of `reader_model`, updated each time a byte is read.
private var locator: SAXLocatorImpl is noinit

# Fetch the locator of `reader_model`, which must not be `null`.
init do locator = reader_model.locator.as(not null)

# Last read byte.
#
# A negative value (`-1`) means end of file or error.
private var last_char: Int = -1

# Was the last byte read from `input` a CARRIAGE RETURN?
#
# The flag reflects the raw byte, before end-of-line normalization.
private var was_cr: Bool = false
38
39
# Expect a value delimiter (`"` or `'`).
#
# If the last read byte is a delimiter, read the next byte and return the
# delimiter. Else, fire a fatal error and return `-1`.
fun expect_delimiter: Int do
	if accept('"') then return '"'.ascii
	if accept('\'') then return '\''.ascii
	fire_unexpected_char(". Expecting `\"` or `'`")
	return -1
end
54
# Does the last read byte match the `Char` production?
#
# Accepted values are TAB (9), LINE FEED (10) and anything from SPACE (32) up.
fun is_xml_char: Bool do
	# TODO: Handle code points above 0x7F.
	if last_char == 9 or last_char == 10 then return true
	return last_char >= 32
end
62
# Append the last read byte to `buffer`, then read the next byte.
#
# If the last read byte does not match `Char` (forbidden byte or end of
# file), fire a fatal error and return `false` instead.
fun expect_xml_char(buffer: Buffer): Bool do
	if not is_xml_char then
		if eof then return fire_fatal_error("Unexpected end of file.")
		return fire_fatal_error("Forbidden character.")
	end
	buffer.chars.push(last_char.ascii)
	read_char
	return true
end
77
78
# Like `expect_xml_char`, but normalize white space and forbid `<`.
#
# SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
# recommendation.
fun expect_att_value_char(buffer: Buffer): Bool do
	if is_s then
		# Each white space byte is replaced by a SPACE.
		buffer.chars.push(' ')
		read_char
		return true
	end
	if last_char == '<'.ascii then
		return fire_fatal_error("`<` is forbidden in attribute values.")
	end
	return expect_xml_char(buffer)
end
94
# Does the last read byte match the `S` production?
#
# That is: SPACE (32), TAB (9) or LINE FEED (10).
fun is_s: Bool do
	return last_char == ' '.ascii or
			last_char == '\t'.ascii or
			last_char == '\n'.ascii
end
99
# Skip a `S?` token (any run of white space, possibly empty).
#
# Always return `true`.
fun skip_s: Bool do
	loop
		if not is_s then return true
		read_char
	end
end
105
# Accept a `S` token (at least one white space byte).
#
# Return `false` without reading anything if the last read byte is not
# white space.
fun accept_s: Bool do
	if not is_s then return false
	read_char
	return skip_s
end
115
# Expect `S`.
#
# Fire a fatal error if the last read byte is not white space.
fun expect_s: Bool do
	if accept_s and skip_s then return true
	return fire_unexpected_char(". Expecting white space")
end
120
# Does the last read byte match the `NameStartChar` production?
#
# ASCII letters, `_`, `:` and any byte above 127 are accepted.
fun is_name_start_char: Bool do
	# TODO: Handle code points above 0x7F.
	if last_char > 127 then return true
	if last_char == '_'.ascii or last_char == ':'.ascii then return true
	return ['a'.ascii .. 'z'.ascii].has(last_char) or
			['A'.ascii .. 'Z'.ascii].has(last_char)
end
130
# Does the last read byte match the `NameChar` production?
#
# That is: anything accepted by `is_name_start_char`, a decimal digit,
# `-` or `.`.
fun is_name_char: Bool do
	# TODO: Handle code points above 0x7F.
	if is_name_start_char or is_digit then return true
	return last_char == '-'.ascii or last_char == '.'.ascii
end
139
# Expect a `Name` token.
#
# Append the parsed name to `buffer`. Fire a fatal error if the last read
# byte cannot start a name.
fun expect_name(buffer: Buffer): Bool do
	if not is_name_start_char then
		return fire_unexpected_char(" at the beginning of a name")
	end
	# The first byte is known to be valid; keep going while bytes fit a name.
	loop
		buffer.chars.push(last_char.ascii)
		read_char
		if not is_name_char then return true
	end
end
155
# Expect a `PITarget` token.
#
# Append the parsed name to `buffer`, then validate the content of `buffer`
# with `check_pi_target`.
fun expect_pi_target(buffer: Buffer): Bool do
	if not expect_name(buffer) then return false
	return check_pi_target(buffer)
end
162
# Ensure the target is not `xml` (case-insensitive).
#
# Fire a fatal error and return `false` if `target` is `xml` in any
# combination of upper and lower case. Otherwise, return `true`; in that
# case, a (non-fatal) error is also fired if the target contains a colon.
fun check_pi_target(target: Text): Bool do
	var chars = target.chars
	# Each letter is checked at its own position. The length test comes
	# first so the indexed accesses are only evaluated on 3-character targets.
	var is_reserved = target.length == 3 and
			(chars[0] == 'X' or chars[0] == 'x') and
			(chars[1] == 'M' or chars[1] == 'm') and
			(chars[2] == 'L' or chars[2] == 'l')

	if is_reserved then
		return fire_fatal_error("Forbidden processing target `{target}`.")
	end
	if target.has(":") then
		reader_model.fire_error("The processing target `{target}` contains a colon.", null)
	end
	return true
end
181
# Does the last read byte match the `[0-9]` production?
fun is_digit: Bool do
	return last_char >= '0'.ascii and last_char <= '9'.ascii
end
186
# Accept a `[0-9]+` token.
#
# Append the digits to `buffer`. Return `false` without reading anything if
# the last read byte is not a decimal digit.
fun accept_digits(buffer: Buffer): Bool do
	if not is_digit then return false
	while is_digit do
		buffer.chars.push(last_char.ascii)
		read_char
	end
	return true
end
199
# Expect a `[0-9]+` token.
#
# Append the digits to `buffer`. Fire a fatal error if the last read byte is
# not a decimal digit.
fun expect_digits(buffer: Buffer): Bool do
	if accept_digits(buffer) then return true
	return fire_unexpected_char(". Expecting a decimal digit")
end
204
# Does `last_char` match the `[0-9a-fA-F]` production?
#
# Only the decimal digits and the letters `A` to `F`, in either case, are
# hexadecimal digits.
fun is_hex: Bool do
	return ['0'.ascii .. '9'.ascii].has(last_char) or
			['A'.ascii .. 'F'.ascii].has(last_char) or
			['a'.ascii .. 'f'.ascii].has(last_char)
end
211
# Expect a `[0-9a-fA-F]+` token.
#
# Append the digits to `buffer`. Fire a fatal error if the last read byte is
# not accepted by `is_hex`.
fun expect_hex(buffer: Buffer): Bool do
	if not is_hex then
		return fire_unexpected_char(". Expecting an hexadecimal digit")
	end
	while is_hex do
		buffer.chars.push(last_char.ascii)
		read_char
	end
	return true
end
224
# Expect `Eq`: an equals sign, optionally surrounded by white space.
fun expect_eq: Bool do
	if not skip_s then return false
	if not expect('=', "") then return false
	return skip_s
end
229
230
231 ############################################################################
232 # General
233
# Read a byte and put it in `last_char`.
#
# The locator is advanced according to the byte that was in `last_char`
# before the call. In case of an end-of-file or an error, a negative value
# is put in `last_char`.
private fun read_char do
	if locator.line_number < 0 then
		# Very first read: initialize the position.
		locator.line_number = 1
		locator.column_number = 1
	else if last_char < 0 then
		fire_fatal_error("Internal error: Already at the end of the file.")
		return
	else if last_char == '\n'.ascii then
		locator.line_number += 1
		locator.column_number = 1
	else
		locator.column_number += 1
	end

	var byte = input.read_char
	if byte < 0 then
		last_char = byte
		return
	end

	# XML 1.0 end-of-line handling
	# Note: Regardless of the XML version, any EOL defined by the
	# recommendation MUST be reported as a single LINE FEED.
	if was_cr and byte == '\n'.ascii then
		# The EOL was already reported for the '\r'. => Skip this byte.
		byte = input.read_char
	end
	if byte == '\r'.ascii then
		# Regardless of the following byte, '\r' always introduces an EOL.
		was_cr = true
		last_char = '\n'.ascii
	else
		was_cr = false
		last_char = byte
	end
end
269
# Is it the end of the stream?
#
# Also return `true` after a fatal error, since `last_char` is then negative.
fun eof: Bool do
	return last_char < 0
end
274
# Start the lexer by reading the first byte.
#
# Do nothing if `last_char` already holds a non-negative value.
fun start do
	if not eof then return
	# Leave the "end of file" state before calling `read_char`.
	last_char = 0
	read_char
end
282
# Close the input.
#
# `last_char` is reset to `-1`, so `eof` returns `true` afterwards.
fun close do
	last_char = -1
	input.close
end
288
# Does the last read byte equal `c`?
#
# `c` is given as a byte value.
fun is_int(c: Int): Bool do
	return c == last_char
end
291
# Does the last read byte equal `c`?
#
# `c` is given as a character; it is compared through its `ascii` value.
fun is_char(c: Char): Bool do
	var code = c.ascii
	return last_char == code
end
294
# Accept the specified byte.
#
# If the last read byte equals `expected`, read the next byte and return
# `true`. Else, return `false` and leave the cursor untouched.
fun accept_int(expected: Int): Bool do
	if last_char != expected then return false
	read_char
	return true
end
304
# Accept the specified character.
#
# Same as `accept_int`, with the expected byte given as a `Char`.
fun accept(expected: Char): Bool do
	var code = expected.ascii
	return accept_int(code)
end
309
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect_int(expected: Int, context: String): Bool do
	if accept_int(expected) then return true
	# No trailing period here: `fire_unexpected_char` appends one itself.
	return fire_unexpected_char("{context}. Expecting `{expected.ascii}`")
end
322
# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect(expected: Char, context: String): Bool do
	if accept(expected) then return true
	# No trailing period here: `fire_unexpected_char` appends one itself.
	return fire_unexpected_char("{context}. Expecting `{expected}`")
end
335
# Ensure the last read byte and following bytes match `expected`.
#
# If they do, the cursor ends up on the byte following the match. If not,
# fire a fatal error using `context`. `context` is the part of the message
# that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte and following bytes
# match `expected`.
fun expect_string(expected: String, context: String): Bool do
	var chars = expected.chars

	for i in [0 .. chars.length[ do
		if accept(chars[i]) then continue

		# Mismatch at position `i`: report what was seen so far.
		if is_xml_char then
			return fire_fatal_error("Unexpected " +
					"`{expected.substring(0, i)}{last_char.ascii.to_s}`" +
					"{context}. Expecting `{expected}`.")
		end
		if eof then
			return fire_fatal_error("Unexpected end of file{context}. " +
					"Expecting `{expected}`.")
		end
		return fire_fatal_error("Forbidden character.")
	end
	return true
end
366
367
368 ############################################################################
369 # Dispatching
370
# Fire a fatal error about an unexpected character.
#
# `rest_of_message` is inserted just before the final period of the message.
# The message depends on the last read byte: end of file, forbidden byte or
# regular character.
#
# Return `false`.
fun fire_unexpected_char(rest_of_message: String): Bool do
	if eof then
		return fire_fatal_error("Unexpected end of file{rest_of_message}.")
	end
	if not is_xml_char then
		return fire_fatal_error("Forbidden character.")
	end
	return fire_fatal_error("Unexpected character `{last_char.ascii.to_s}`{rest_of_message}.")
end
383
# Fire a fatal error with the specified message.
#
# The error is forwarded to `reader_model`, then `last_char` is set to `-1`
# so that `eof` is `true` from now on.
#
# Return `false`.
private fun fire_fatal_error(message: String): Bool do
	reader_model.fire_fatal_error(message, null)
	last_char = -1
	return false
end
392 end