a2254133508305537d8853f5ce07465e5f3d9d33
[nit.git] / lib / saxophonit / saxophonit.nit
1 # This file is part of NIT ( http://www.nitlanguage.org ).
2 #
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
9 # another product.
10
11 # A SAX 2 parser in Nit.
12 module saxophonit
13
14 import sax
15 intrude import standard::file
16 private import reader_model
17 private import lexer
18
19 # Implementation of the `XMLReader` interface.
20 #
21 # For the moment, only XML 1.0 is (partially) supported.
22 #
23 # The following mandatory features of XML 1.0 are not yet supported:
24 #
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
30 #
31 # Also note that this XML processor is unable to retrieve a file from an URL
32 # (only local paths are supported).
33 class XophonReader
34 super XMLReader
35
36 private var model = new XophonReaderModel
37 private var lexer: XophonLexer is noinit
38
# Entity resolver currently registered in the reader model.
redef fun entity_resolver: nullable EntityResolver do
	return model.entity_resolver
end

# Register `entity_resolver` in the reader model.
redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do model.entity_resolver = entity_resolver

# DTD handler currently registered in the reader model.
redef fun dtd_handler: nullable DTDHandler do
	return model.dtd_handler
end

# Register `dtd_handler` in the reader model.
redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do model.dtd_handler = dtd_handler

# Content handler currently registered in the reader model.
redef fun content_handler: nullable ContentHandler do
	return model.content_handler
end

# Register `content_handler` in the reader model.
redef fun content_handler=(content_handler: nullable ContentHandler) do model.content_handler = content_handler

# Error handler currently registered in the reader model.
redef fun error_handler: nullable ErrorHandler do
	return model.error_handler
end

# Register `error_handler` in the reader model.
redef fun error_handler=(error_handler: nullable ErrorHandler) do model.error_handler = error_handler
58
59
# Ask the reader model whether the feature `name` is recognized.
redef fun feature_recognized(name: String): Bool do return model.feature_recognized(name)

# Ask the reader model whether the feature `name` is readable.
redef fun feature_readable(name: String): Bool do return model.feature_readable(name)
67
# Ask the reader model whether the feature `name` is writable.
#
# This used to forward to `feature_readable`, which answers a different
# question. It now mirrors `property_writable`, which forwards to the
# matching `*_writable` query of the model.
redef fun feature_writable(name: String): Bool do
	return model.feature_writable(name)
end
71
# Current value of the feature `name`, as stored in the reader model.
redef fun feature(name: String): Bool do
	return model.feature(name)
end

# Set the feature `name` to `value` in the reader model.
redef fun feature=(name: String, value: Bool) do
	model.feature(name) = value
end
74
# Ask the reader model whether the property `name` is recognized.
redef fun property_recognized(name: String): Bool do return model.property_recognized(name)

# Ask the reader model whether the property `name` is readable.
redef fun property_readable(name: String): Bool do return model.property_readable(name)

# Ask the reader model whether the property `name` is writable.
redef fun property_writable(name: String): Bool do return model.property_writable(name)

# Current value of the property `name`, as stored in the reader model.
redef fun property(name: String): nullable Object do return model.property(name)

# Set the property `name` to `value` in the reader model.
redef fun property=(name: String, value: nullable Object) do model.property(name) = value
94
# Parse the document described by `input`.
#
# A fresh locator is installed in the model. If `input` has a system
# identifier, it is resolved with `resolve_system_id`: on failure a warning
# is fired, on success the resolved value is stored in the locator.
#
# The document is then read from `input.stream` when it is set. Otherwise
# the file designated by the resolved system identifier is opened, parsed,
# and the lexer is closed afterwards. A fatal error is fired when the system
# identifier could not be resolved, or when neither a stream nor a system
# identifier is given.
redef fun parse(input: InputSource) do
	var system_id: nullable MaybeError[String, Error] = null
	var raw_system_id = input.system_id

	model.locator = new SAXLocatorImpl
	if raw_system_id != null then
		system_id = resolve_system_id(raw_system_id)
		if system_id.is_error then
			model.fire_warning(system_id.error.message, system_id.error)
		else
			model.locator.system_id = system_id.value
		end
	end
	model.locator.public_id = input.public_id
	# TODO: encoding

	if input.stream != null then
		# The stream is supplied by the caller: it is not closed here.
		lexer = new XophonLexer(model, input.stream.as(not null))
		parse_main
	else if system_id != null then
		if system_id.is_error then
			model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
		else
			# The file is opened here, so the lexer is closed here too.
			lexer = new XophonLexer(model,
					new IFStream.open(system_id.value))
			parse_main
			lexer.close
		end
	else
		model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
				null)
	end
end
128
# Parse the document designated by `system_id`.
#
# Shortcut for `parse` with an `InputSource` built from the system
# identifier only.
redef fun parse_file(system_id: String) do
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
132
133
134 ############################################################################
135 # Parsing
136
137 # Note: Except for `parse_main`, the parsing functions below do not call
138 # `read_char` for their first byte and leave the byte just after their
139 # production in `last_char` (except in case of fatal error). They return
140 # `false` on fatal error and at the end of the file.
141
# Parse the main entity.
#
# The lexer must have been set before calling this method.
private fun parse_main do
	# Events announcing the beginning of the document.
	model.fire_document_locator
	model.fire_start_document

	# Actual parsing. The result of `expect_document` is not used here:
	# the end of the document is notified in every case.
	lexer.start
	expect_document

	model.fire_end_document
end
150
# Expect a `document` production.
#
# The prolog is read item by item until the root element is parsed, then
# the trailing `Misc*` production is read by `expect_miscs`.
#
# Return `false` as soon as one of the sub-productions fails.
private fun expect_document: Bool do
	var got_doctype = false

	# If the document starts with `<?`, it may be the XML declaration.
	# Anywhere else, `<?` can only introduce a processing instruction.
	var at_start = true

	# The prolog may contain processing instructions, comments and the
	# `DOCTYPE` declaration, possibly separated by white space. It ends with
	# the root element.
	loop
		if lexer.accept('<') then
			if lexer.accept('?') then
				if at_start then
					if not expect_pi_or_xml_decl then return false
				else if not expect_pi then
					return false
				end
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not (lexer.expect('-',
							" at the beginning of a comment") and
							expect_comment) then
						return false
					end
				else if got_doctype then
					# Only one `DOCTYPE` declaration is allowed.
					return lexer.fire_unexpected_char(". Expecting `-`")
				else if expect_doctype_decl then
					got_doctype = true
				else
					return false
				end
			else
				# Root element: the prolog is over.
				if not expect_root then return false
				break
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
		at_start = false
	end
	return expect_miscs
end
222
# Expect a `DOCTYPE` declaration.
#
# Not implemented yet: fire a fatal error and return what
# `fire_fatal_error` returns.
private fun expect_doctype_decl: Bool do
	# TODO
	var outcome = model.fire_fatal_error("DTD not supported yet.\n", null)
	return outcome
end
226
# Expect the root `element` production, without the first `<` token.
#
# Read the start tag, then content chunks until a failure, the end of the
# file or the closing of the root element. Pending character data is
# flushed before returning, whatever the outcome.
private fun expect_root: Bool do
	var pending_text = new FlatBuffer
	var ok = expect_stag

	while ok and not lexer.eof and not model.root_closed do
		ok = expect_content_chunk(pending_text)
	end
	# Only checked when everything went well so far.
	ok = ok and model.expect_root_closed
	flush(pending_text)
	return ok
end
242
# Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
#
# Markup and references flush `char_data` as a `characters` event before
# being handled. Any other char is handed to `lexer.expect_xml_char` along
# with `char_data`.
private fun expect_content_chunk(char_data: Buffer): Bool do
	if lexer.accept('<') then
		flush(char_data)
		if lexer.accept('!') then
			if lexer.accept('-') then
				return lexer.expect('-', " at the beginning of a comment") and
						expect_comment
			end
			if lexer.accept('[') then return expect_cd_sect
			return lexer.fire_unexpected_char(
					". Expecting `--` or `[CDATA[`")
		end
		if lexer.accept('?') then return expect_pi
		if lexer.accept('/') then return expect_etag
		return expect_stag
	end

	if lexer.accept('&') then
		# The replacement text is sent in its own `characters` event.
		flush(char_data)
		var resolved = expect_reference(char_data)
		flush(char_data)
		return resolved
	end

	return lexer.expect_xml_char(char_data)
end
277
# Expect a `EmptyElemTag | STag` production, without the initial `<`.
#
# Fire `start_attributes` once the name is read, then one `attribute` per
# attribute found. `>` fires `start_element`; `/>` fires `start_element`
# immediately followed by `end_element`.
private fun expect_stag: Bool do
	var name_buffer = new FlatBuffer

	if not lexer.expect_name(name_buffer) then return false
	var name = name_buffer.to_s

	model.fire_start_attributes
	loop
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		if lexer.accept('/') then return expect_empty_elem_end(name)
		if not lexer.expect_s then
			return lexer.fire_unexpected_char(" in tag. " +
					"Expecting an attribute, `/`, `>` or white space")
		end

		# After a white space: the end of the tag or one more attribute.
		if lexer.accept('/') then return expect_empty_elem_end(name)
		if lexer.accept('>') then
			model.fire_start_element(name)
			return true
		end
		if not expect_attribute then return false
	end
end

# Expect the `>` that ends an `EmptyElemTag`, just after its `/`.
#
# On success, fire `start_element` then `end_element` for `name`.
private fun expect_empty_elem_end(name: String): Bool do
	if not lexer.expect('>', "") then return false
	model.fire_start_element(name)
	model.fire_end_element(name)
	return true
end
321
# Expect a `ETag` production, without the initial `</`.
#
# On success, return the result of the `end_element` notification.
private fun expect_etag: Bool do
	var tag_name = new FlatBuffer

	if not lexer.expect_name(tag_name) then return false
	if not lexer.skip_s then return false
	if not lexer.expect('>', "") then return false
	return model.fire_end_element(tag_name.to_s)
end
334
# Expect an `Attribute` production.
#
# On success, notify the model of the attribute name and value.
private fun expect_attribute: Bool do
	var attr_name = new FlatBuffer
	var attr_value = new FlatBuffer

	if not lexer.expect_name(attr_name) then return false
	if not lexer.expect_eq then return false
	if not expect_att_value(attr_value) then return false
	model.fire_attribute(attr_name.to_s, attr_value.to_s)
	return true
end
349
# Expect the `Misc*` production at the end of a document.
#
# Only processing instructions, comments and white space are accepted
# until the end of the file.
private fun expect_miscs: Bool do
	while not lexer.eof do
		if lexer.accept('<') then
			if lexer.accept('?') then
				if not expect_pi then return false
			else if lexer.accept('!') then
				var is_comment = lexer.expect_string("--",
						" at the beginning of a comment") and
						expect_comment
				if not is_comment then return false
			else
				return lexer.fire_unexpected_char(". Expecting `?` or `!`")
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
	end
	return true
end
372
# Expect a `AttValue` production.
#
# Append the parsed value to `buffer`. References are expanded by
# `expect_reference`.
private fun expect_att_value(buffer: Buffer): Bool do
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	# Read until the matching delimiter.
	while not lexer.accept_int(quote) do
		if lexer.accept('&') then
			# TODO: [WFC: No < in Attribute Values]
			if not expect_reference(buffer) then return false
		else if not lexer.expect_att_value_char(buffer) then
			return false
		end
	end
	return true
end
391
# Expect a `SystemLiteral` production.
#
# Also used to parse productions that do not have references.
# Append the parsed value to `buffer`.
private fun expect_literal(buffer: Buffer): Bool do
	var quote = lexer.expect_delimiter
	if quote < 0 then return false

	# Read until the matching delimiter.
	while not lexer.accept_int(quote) do
		if not lexer.expect_xml_char(buffer) then return false
	end
	return true
end
408
409
# Expect a `Comment` production, without the beginning.
#
# Assume `last_char` is the fifth byte of the production that is, the
# next byte after the `'<!--'` token.
#
# On success, the text of the comment is sent to the model.
private fun expect_comment: Bool do
	var text: Buffer = new FlatBuffer

	loop
		if lexer.accept('-') then
			if lexer.accept('-') then
				# A double-hyphen is only allowed in the closing `-->`.
				if lexer.expect('>',
						" after a double-hyphen (`--`) in a comment") then
					break
				end
				return false
			end
			# Lone hyphen: part of the text, like the char that follows.
			text.chars.push('-')
		end
		if not lexer.expect_xml_char(text) then return false
	end
	model.fire_comment(text.to_s)
	return true
end
437
# Expect a `PI` production, without the beginning.
#
# Assume `last_char` is the third byte of the production that is, the
# next byte after the `'<?'` token.
private fun expect_pi: Bool do
	var target = new FlatBuffer

	if not lexer.expect_pi_target(target) then return false
	return expect_pi_data(target.to_s)
end
448
# Expect the data part and the `'?>'` token of a `PI` production.
#
# `target` is the already parsed target of the processing instruction.
# When the target is directly followed by `?>`, the instruction is notified
# with `null` as data. Otherwise a white space is required and everything up
# to the first `?>` is notified as data.
private fun expect_pi_data(target: String): Bool do
	if lexer.accept('?') then
		if lexer.expect('>', " at the end of a processing instruction") then
			model.fire_processing_instruction(target, null)
			return true
		else
			return false
		end
	else if lexer.accept_s then
		var data: Buffer = new FlatBuffer

		loop
			if lexer.accept('?') then
				if lexer.accept('>') then break
				# Lone `?`: it belongs to the data. The char that follows
				# is deliberately left unread, because it may be the `?`
				# of the closing `?>` (as in `??>`).
				data.chars.push('?')
			else if not lexer.expect_xml_char(data) then
				return false
			end
		end
		model.fire_processing_instruction(target, data.to_s)
		return true
	else
		return lexer.fire_unexpected_char(" after a processing " +
				"instruction target. Expecting a white space or `?>`")
	end
end
480
# Expect a `PI | XMLDecl` production, without the beginning.
#
# Assume `last_char` is the third byte of the production that is, the
# next byte after the `'<?'` token.
#
# The name `xml` introduces the XML declaration. Any other name is checked
# with `lexer.check_pi_target` and handled as a processing instruction.
private fun expect_pi_or_xml_decl: Bool do
	var name_buffer: Buffer = new FlatBuffer

	if not lexer.expect_name(name_buffer) then return false
	var target = name_buffer.to_s

	if target == "xml" then return expect_xml_decl
	if not lexer.check_pi_target(target) then return false
	return expect_pi_data(target)
end
502
# Expect a `XMLDecl` production, without the initial `<?xml` token.
#
# The version information is mandatory. It may be followed by an encoding
# declaration and by a standalone declaration, in that order.
private fun expect_xml_decl: Bool do
	if not expect_version_info then return false

	# Without white space, only `'?>'` may follow.
	if not lexer.accept_s then return lexer.expect_string("?>", "")

	if lexer.is_char('e') then
		if not expect_encoding_decl then return false
		# At this point, we can only accept `S` or `'?>'`.
		if not lexer.accept_s then return lexer.expect_string("?>", "")
	end
	if lexer.is_char('s') and not expect_sd_decl then return false
	return lexer.skip_s and lexer.expect_string("?>", "")
end
520
# Expect a `EncodingDecl` token, without the initial `S` token.
#
# Fire a fatal error if the declared name does not match the pattern
# checked below.
private fun expect_encoding_decl: Bool do
	var encoding = new FlatBuffer

	var parsed = lexer.expect_string("encoding", "") and lexer.expect_eq and
			expect_literal(encoding)
	if not parsed then return false

	# TODO: Do something with the value.
	if encoding.has("^[A-Za-z][A-Za-z0-9._-]*$".to_re) then return true
	return model.fire_fatal_error("`{encoding.to_s}` is not a valid " +
			"encoding name.", null)
end
536
# Expect a `SDDecl` token, without the initial `S` token.
#
# Fire a fatal error if the value is neither `yes` nor `no`.
private fun expect_sd_decl: Bool do
	var literal = new FlatBuffer

	var parsed = lexer.expect_string("standalone", "") and lexer.expect_eq and
			expect_literal(literal)
	if not parsed then return false

	var value = literal.to_s
	# TODO: Do something with the value.
	if value == "yes" or value == "no" then return true
	return model.fire_fatal_error("`{value}` is not a valid value for " +
			"the `standalone` declaration. Expecting `yes` or `no`.",
			null)
end
555
# Expect a `CDSect` production, without the beginning.
#
# Assume `last_char` is the fourth byte of the production that is, the
# next byte after the `'<!['` token.
#
# The content of the section is sent as a `characters` event, between the
# `start_cdata` and `end_cdata` notifications.
private fun expect_cd_sect: Bool do
	var buffer: Buffer = new FlatBuffer

	# Number of consecutive closing brackets read and not yet put in
	# `buffer`.
	var closing: Int = 0

	if lexer.expect_string("CDATA[",
			" at the beginning of a CDATA section.") then
		model.fire_start_cdata
		loop
			if lexer.accept(']') then
				closing += 1
			else if closing >= 2 and lexer.accept('>') then
				# End of the section. The last two brackets belong to the
				# `]]>` token; the ones before them are content.
				for i in [2..closing[ do
					buffer.chars.push(']')
				end
				break
			else
				# The pending brackets did not close the section: they are
				# regular content.
				for i in [0..closing[ do
					buffer.chars.push(']')
				end
				closing = 0
				if not lexer.expect_xml_char(buffer) then return false
			end
		end
		flush(buffer)
		model.fire_end_cdata
		return true
	else
		return false
	end
end
588
# Expect a `VersionInfo` production.
#
# Only the major version `1` is accepted. A minor version other than `0`
# is reported as a warning.
private fun expect_version_info: Bool do
	var prefix_ok = lexer.expect_s and
			lexer.expect_string("version",
			" in the first attribute name of the XML declaration") and
			lexer.expect_eq
	if not prefix_ok then return false

	var minor: Buffer = new FlatBuffer
	var delimiter = lexer.expect_delimiter
	if delimiter < 0 then return false

	var number_ok = lexer.expect_string("1.", " as XML major version") and
			lexer.expect_digits(minor) and
			lexer.expect_int(delimiter, "")
	if not number_ok then return false

	if minor.to_s != "0" then
		model.fire_warning("Only XML 1.0 is supported. " +
				"Got a XML 1.{minor.to_s} document.", null)
	end
	return true
end
613
# Expect a `Reference`, without the initial `&`.
#
# Append the value to the buffer.
#
# Character references (`#` followed by digits, or `#x` followed by
# hexadecimal digits) are converted with `ascii`. Entity references are
# resolved with `resolve_reference`; an unknown entity is a fatal error.
private fun expect_reference(buffer: Buffer): Bool do
	# TODO: [WFC: Entity Declared]
	# TODO: [VC: Entity Declared]
	# TODO: [WFC: Parsed Entity]
	# TODO: [WFC: No Recursion]
	# TODO: Unicode

	var token = new FlatBuffer

	if lexer.accept('#') then
		if lexer.accept('x') then
			if not lexer.expect_hex(token) then
				return lexer.fire_unexpected_char(
						". Expecting an hexadecimal digit")
			end
			buffer.chars.add(token.to_hex.ascii)
			return lexer.expect(';', "")
		end
		if not lexer.accept_digits(token) then
			return lexer.fire_unexpected_char(" in a character reference. " +
					"Expecting `x` or a decimal digit")
		end
		buffer.chars.add(token.to_i.ascii)
		return lexer.expect(';', "")
	end

	if not lexer.expect_name(token) then
		return lexer.fire_unexpected_char(
				" in a reference. Expecting `#` or a name")
	end

	var name = token.to_s
	# A colon in the name is reported, but does not stop the parsing.
	if name.has(":") then
		model.fire_error("The entity name `{name}` contains a colon.", null)
	end

	var value = resolve_reference(name)
	if value == null then
		model.fire_fatal_error("Unknown entity `{name}`.", null)
		return false
	end
	buffer.append(value)
	return lexer.expect(';', "")
end
661
# Resolve the entity reference or return `null`.
#
# Only the five built-in entities are known.
private fun resolve_reference(name: String): nullable String do
	if name == "lt" then return "<"
	if name == "gt" then return ">"
	if name == "amp" then return "&"
	if name == "quot" then return "\""
	if name == "apos" then return "'"
	# TODO: Support non-builtin entities
	return null
end
679
# Flush the specified buffer as a `characters` event.
#
# Do nothing if `buffer` is empty. Else, `buffer` is cleared once its
# content is sent.
private fun flush(buffer: Buffer) do
	if buffer.length <= 0 then return
	model.fire_characters(buffer.to_s)
	buffer.clear
end
689
690
691 ############################################################################
692 # Paths
693
# Resolve the specified system id.
#
# For now, the system id is always handled as a local path.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
	# TODO: handle URIs
	var resolved = realpath(system_id)
	return resolved
end
699
# Resolve the specified POSIX path.
#
# Like `String.realpath`, but with error handling: when the native call
# gives back a null address, the result holds an error instead of a path.
private fun realpath(path: String): MaybeError[String, Error] do
	var resolved = path.to_cstring.file_realpath

	if not resolved.address_is_null then
		return new MaybeError[String, Error](resolved.to_s, null)
	end
	return new MaybeError[String, Error](null,
			new Error("File <{path}> not found."))
end
713 end