1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
11 # A SAX 2 parser in Nit.
15 intrude import standard
::file
16 private import reader_model
19 # Implementation of the `XMLReader` interface.
21 # For the moment, only XML 1.0 is (partially) supported.
23 # The following mandatory features of XML 1.0 are not yet supported:
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
# Also note that this XML processor is unable to retrieve a file from a URL
# (only local paths are supported).
36 # # Retrieve all text nodes.
38 # super ContentHandler
40 # private var buf: Buffer = new FlatBuffer
41 # private var sp: Bool = false
43 # redef fun characters(str: String) do
45 # if buf.length > 0 then buf.append(" ")
51 # redef fun ignorable_whitespace(str: String) do
55 # # Return the concatenation of all text nodes.
56 # redef fun to_s do return buf.to_s
59 # var text = new TextListener
60 # var reader = new XophonReader
62 # reader.content_handler = text
63 # reader.parse(new InputSource.with_stream(new StringIStream("<foo>bar baz <n>42</n>.</foo>")))
64 # assert text.to_s == "bar baz 42."
# State object this reader delegates to: handlers, features, properties,
# locator and event firing all go through it.
private var model: XophonReaderModel = new XophonReaderModel

# Lexer for the entity being parsed. Not initialized at construction;
# `parse` assigns it once an input stream is available.
private var lexer: XophonLexer is noinit
# Return the entity resolver currently held by `model` (may be `null`).
redef fun entity_resolver: nullable EntityResolver do
	return model.entity_resolver
end
# Store `entity_resolver` in `model`.
redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do model.entity_resolver = entity_resolver
# Return the DTD handler currently held by `model` (may be `null`).
redef fun dtd_handler: nullable DTDHandler do
	return model.dtd_handler
end
# Store `dtd_handler` in `model`.
redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do model.dtd_handler = dtd_handler
# Return the content handler currently held by `model` (may be `null`).
redef fun content_handler: nullable ContentHandler do
	return model.content_handler
end
# Store `content_handler` in `model`.
redef fun content_handler=(content_handler: nullable ContentHandler) do model.content_handler = content_handler
# Return the error handler currently held by `model` (may be `null`).
redef fun error_handler: nullable ErrorHandler do
	return model.error_handler
end
# Store `error_handler` in `model`.
redef fun error_handler=(error_handler: nullable ErrorHandler) do model.error_handler = error_handler
# Ask `model` whether the feature called `name` is recognized.
redef fun feature_recognized(name: String): Bool do return model.feature_recognized(name)
# Ask `model` whether the feature called `name` can be read.
redef fun feature_readable(name: String): Bool do return model.feature_readable(name)
# Ask `model` whether the feature called `name` can be written.
#
# Delegates to `model.feature_writable`, in the same way `property_writable`
# delegates to `model.property_writable`. The previous body queried
# `model.feature_readable`, which answers a different question.
redef fun feature_writable(name: String): Bool do
	return model.feature_writable(name)
end
# Return the current value of the feature called `name`, as held by `model`.
redef fun feature(name: String): Bool do
	return model.feature(name)
end
# Set the feature called `name` to `value` in `model`.
redef fun feature=(name: String, value: Bool) do
	model.feature(name) = value
end
# Ask `model` whether the property called `name` is recognized.
redef fun property_recognized(name: String): Bool do return model.property_recognized(name)
# Ask `model` whether the property called `name` can be read.
redef fun property_readable(name: String): Bool do return model.property_readable(name)
# Ask `model` whether the property called `name` can be written.
redef fun property_writable(name: String): Bool do return model.property_writable(name)
# Return the current value of the property called `name`, as held by `model`.
redef fun property(name: String): nullable Object do return model.property(name)
# Set the property called `name` to `value` in `model`.
redef fun property=(name: String, value: nullable Object) do model.property(name) = value
127 redef fun parse
(input
: InputSource) do
129 var system_id
: nullable MaybeError[String, Error] = null
130 model
.locator
= new SAXLocatorImpl
132 if input
.system_id
!= null then
133 system_id
= resolve_system_id
(input
.system_id
.as(not null))
134 if system_id
.is_error
then
135 model
.fire_warning
(system_id
.error
.message
, system_id
.error
)
137 model
.locator
.system_id
= system_id
.value
140 model
.locator
.public_id
= input
.public_id
143 if input
.stream
!= null then
144 lexer
= new XophonLexer(model
, input
.stream
.as(not null))
146 else if system_id
!= null then
147 if system_id
.is_error
then
148 model
.fire_fatal_error
("File <{input.system_id.as(not null)}> not found.", null)
150 lexer
= new XophonLexer(model
,
151 new IFStream.open
(system_id
.value
))
156 model
.fire_fatal_error
("At least a stream or a system identifier must be specified. None given.",
# Parse the document designated by `system_id`.
#
# Wraps `system_id` in an `InputSource` and hands it over to `parse`.
redef fun parse_file(system_id: String) do
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
166 ############################################################################
# Note: No `expect_*` function (`parse_main` aside) calls `read_char` for the
# first byte, and each one leaves the byte just after its production in
# `last_char` (except in case of fatal error). They return `false` on
# fatal error and at the end of the file.
174 # Parse the main entity.
175 private fun parse_main
do
176 model
.fire_document_locator
177 model
.fire_start_document
180 model
.fire_end_document
183 # Expect a `document` production.
184 private fun expect_document
: Bool do
186 var got_doctype
= false
187 var got_element
= false
# If the document starts with `<`, it may start with an XML declaration,
# a processing instruction, a comment, a `DOCTYPE` declaration, the
# root element or a white space.
192 if lexer
.accept
('<') then
193 if lexer
.accept
('?') then
194 if not expect_pi_or_xml_decl
then return false
195 else if lexer
.accept
('!') then
196 if lexer
.accept
('-') then
197 if not lexer
.expect
('-',
198 " at the beginning of a comment") or
199 not expect_comment
then
203 if not expect_doctype_decl
then return false
207 if not expect_root
then return false
208 # The `DOCTYPE` declaration *must* come before the root
213 else if not lexer
.accept_s
then
214 return lexer
.fire_unexpected_char
(
215 ". Expecting a white space or `<`")
218 # After the XML declaration (if there is one), the document may contain
219 # processing instructions, comments, the `DOCTYPE` declaration and
221 # These productions may be separated by white space.
222 while not got_element
do
223 if lexer
.accept
('<') then
224 if lexer
.accept
('?') then
225 if not expect_pi
then return false
226 else if lexer
.accept
('!') then
227 if lexer
.accept
('-') then
228 if not lexer
.expect
('-',
229 " at the beginning of a comment") or
230 not expect_comment
then
233 else if got_doctype
then
234 return lexer
.fire_unexpected_char
(". Expecting `-`")
235 else if expect_doctype_decl
then
241 if not expect_root
then return false
242 # The `DOCTYPE` declaration *must* come before the root
247 else if not lexer
.accept_s
then
248 return lexer
.fire_unexpected_char
(
249 ". Expecting a white space or `<`")
# Expect a `DOCTYPE` declaration.
#
# DTDs are not handled yet: a fatal error is fired through `model` and the
# value returned by `fire_fatal_error` is passed back to the caller.
private fun expect_doctype_decl: Bool do
	# TODO
	var outcome = model.fire_fatal_error("DTD not supported yet.\n", null)
	return outcome
end
259 # Expect the root `element` production, without the first `<` token.
260 private fun expect_root
: Bool do
262 var char_data
= new FlatBuffer
264 success
= expect_stag
265 while success
and not lexer
.eof
and not model
.root_closed
do
266 success
= expect_content_chunk
(char_data
)
269 success
= model
.expect_root_closed
275 # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
277 # If the last read byte matches the `CharData` production, push the char in
278 # `char_data`. Else, flush `CharData` as a `characters` event.
279 private fun expect_content_chunk
(char_data
: Buffer): Bool do
280 if lexer
.accept
('<') then
282 if lexer
.accept
('!') then
283 if lexer
.accept
('-') then
284 return lexer
.expect
('-',
285 " at the beginning of a comment") and
287 else if lexer
.accept
('[') then
288 return expect_cd_sect
290 return lexer
.fire_unexpected_char
(
291 ". Expecting `--` or `[CDATA[`")
293 else if lexer
.accept
('?') then
295 else if lexer
.accept
('/') then
300 else if lexer
.accept
('&') then
302 var success
= expect_reference
(char_data
)
306 return lexer
.expect_xml_char
(char_data
)
310 # Expect a `EmptyElemTag | STag` production, without the initial `<`.
311 private fun expect_stag
: Bool do
312 var name_buffer
= new FlatBuffer
314 if lexer
.expect_name
(name_buffer
) then
315 var name
= name_buffer
.to_s
317 model
.fire_start_attributes
319 if lexer
.accept
('>') then
320 model
.fire_start_element
(name
)
322 else if lexer
.accept
('/') then
323 if lexer
.expect
('>', "") then
324 model
.fire_start_element
(name
)
325 model
.fire_end_element
(name
)
330 else if lexer
.expect_s
then
331 if lexer
.accept
('/') then
332 if lexer
.expect
('>', "") then
333 model
.fire_start_element
(name
)
334 model
.fire_end_element
(name
)
339 else if lexer
.accept
('>') then
340 model
.fire_start_element
(name
)
342 else if not expect_attribute
then
346 return lexer
.fire_unexpected_char
(" in tag. " +
347 "Expecting an attribute, `/`, `>` or white space")
354 # Expect a `ETag` production, without the initial `</`.
355 private fun expect_etag
: Bool do
356 var name_buf
= new FlatBuffer
358 if lexer
.expect_name
(name_buf
) and
360 lexer
.expect
('>', "") then
361 return model
.fire_end_element
(name_buf
.to_s
)
367 # Expect an `Attributes` production.
368 private fun expect_attribute
: Bool do
369 var name
= new FlatBuffer
370 var value
= new FlatBuffer
372 if lexer
.expect_name
(name
) and
374 expect_att_value
(value
) then
375 model
.fire_attribute
(name
.to_s
, value
.to_s
)
382 # Expect the `Misc*` production at the end of a document.
383 private fun expect_miscs
: Bool do
384 while not lexer
.eof
do
385 if lexer
.accept
('<') then
386 if lexer
.accept
('?') then
387 if not expect_pi
then return false
388 else if lexer
.accept
('!') then
389 if not lexer
.expect_string
("--",
390 " at the beginning of a comment") or
391 not expect_comment
then
395 return lexer
.fire_unexpected_char
(". Expecting `?` or `!`")
397 else if not lexer
.accept_s
then
398 return lexer
.fire_unexpected_char
(
399 ". Expecting a white space or `<`")
405 # Expect a `AttValue` production.
407 # Append the parsed value to `buffer`.
408 private fun expect_att_value
(buffer
: Buffer): Bool do
409 var delimiter
= lexer
.expect_delimiter
411 if delimiter
< 0 then return false
413 if lexer
.accept_int
(delimiter
) then
415 else if lexer
.accept
('&') then
416 # TODO: [WFC: No < in Attribute Values]
417 if not expect_reference
(buffer
) then return false
418 else if not lexer
.expect_att_value_char
(buffer
) then
424 # Expect a `SystemLiteral` production.
426 # Also used to parse productions that do not have references.
427 # Append the parsed value to `buffer`.
428 private fun expect_literal
(buffer
: Buffer): Bool do
429 var delimiter
= lexer
.expect_delimiter
431 if delimiter
< 0 then return false
433 if lexer
.accept_int
(delimiter
) then
435 else if not lexer
.expect_xml_char
(buffer
) then
442 # Expect a `Comment` production, without the beginning.
444 # Assume `last_char` is the fifth byte of the production that is, the
445 # next byte after the `'<!--'` token.
446 private fun expect_comment
: Bool do
447 var buffer
: Buffer = new FlatBuffer
450 if lexer
.accept
('-') then
451 if lexer
.accept
('-') then
452 if not lexer
.expect
('>',
453 " after a double-hyphen (`--`) in a comment") then
459 buffer
.chars
.push
('-')
460 if not lexer
.expect_xml_char
(buffer
) then return false
462 else if not lexer
.expect_xml_char
(buffer
) then
466 model
.fire_comment
(buffer
.to_s
)
# Expect a `PI` production, without the beginning.
#
# Assume `last_char` is the third byte of the production, that is, the
# next byte after the `'<?'` token.
#
# Reads the target with `lexer.expect_pi_target`, then lets `expect_pi_data`
# handle the remainder. Return `false` as soon as one of the two steps fails.
private fun expect_pi: Bool do
	var pi_target = new FlatBuffer

	if not lexer.expect_pi_target(pi_target) then return false
	return expect_pi_data(pi_target.to_s)
end
481 # Expect the data part and the `'?>'` token of a `PI` production.
482 private fun expect_pi_data
(target
: String): Bool do
483 if lexer
.accept
('?') then
484 if lexer
.expect
('>', " at the end of a processing instruction") then
485 model
.fire_processing_instruction
(target
, null)
490 else if lexer
.accept_s
then
491 var data
: Buffer = new FlatBuffer
494 if lexer
.accept
('?') then
495 if lexer
.accept
('>') then
499 if not lexer
.expect_xml_char
(data
) then return false
501 else if not lexer
.expect_xml_char
(data
) then
505 model
.fire_processing_instruction
(target
, data
.to_s
)
508 return lexer
.fire_unexpected_char
(" after a processing " +
509 "instruction target. Expecting a white space or `?>`")
513 # Expect a `PI | XMLDecl` production, without the beginning.
515 # Assume `last_char` is the third byte of the production that is, the
516 # next byte after the `'<?'` token.
517 private fun expect_pi_or_xml_decl
: Bool do
518 var buffer
: Buffer = new FlatBuffer
520 if lexer
.expect_name
(buffer
) then
521 var target
= buffer
.to_s
523 if target
== "xml" then
524 return expect_xml_decl
525 else if lexer
.check_pi_target
(target
) then
526 return expect_pi_data
(target
)
535 # Expect a `XMLDecl` production, without the initial `<?xml` token.
536 private fun expect_xml_decl
: Bool do
537 if not expect_version_info
then return false
538 if lexer
.accept_s
then
539 if lexer
.is_char
('e') then
540 if not expect_encoding_decl
then return false
541 # At this point, we can only accept `S` or `'?>'`.
542 if not lexer
.accept_s
then
543 return lexer
.expect_string
("?>", "")
546 if lexer
.is_char
('s') and not expect_sd_decl
then return false
547 return lexer
.skip_s
and lexer
.expect_string
("?>", "")
549 return lexer
.expect_string
("?>", "")
553 # Expect a `EncodingDecl` token, without the initial `S` token.
554 private fun expect_encoding_decl
: Bool do
555 var encoding
= new FlatBuffer
557 if not lexer
.expect_string
("encoding", "") or not lexer
.expect_eq
or
558 not expect_literal
(encoding
) then
561 if not encoding
.has
("^[A-Za-z][A-Za-z0-9._-]*$".to_re
) then
562 return model
.fire_fatal_error
("`{encoding.to_s}` is not a valid " +
563 "encoding name.", null)
565 # TODO: Do something with the value.
569 # Expect a `SDDecl` token, without the initial `S` token.
570 private fun expect_sd_decl
: Bool do
571 var buf
= new FlatBuffer
574 if not lexer
.expect_string
("standalone", "") or not lexer
.expect_eq
or
575 not expect_literal
(buf
) then
579 if not value
== "yes" and not value
== "no" then
580 return model
.fire_fatal_error
("`{value}` is not a valid value for " +
581 "the `standalone` declaration. Expecting `yes` or `no`.",
584 # TODO: Do something with the value.
588 # Expect a `CDSect` production, without the beginning.
590 # Assume `last_char` is the fourth byte of the production that is, the
591 # next byte after the `'<!['` token.
592 private fun expect_cd_sect
: Bool do
593 var buffer
: Buffer = new FlatBuffer
595 # Number of consecutive closing brackets.
598 if lexer
.expect_string
("CDATA[",
599 " at the beginning of a CDATA section.") then
600 model
.fire_start_cdata
602 if lexer
.accept
(']') then
605 for i
in [0..closing
[ do
606 buffer
.chars
.push
(']')
609 if closing
>= 2 and lexer
.accept
('>') then break
610 if not lexer
.expect_xml_char
(buffer
) then return false
621 # Expect a `VersionInfo` production.
622 private fun expect_version_info
: Bool do
623 if not lexer
.expect_s
or
624 not lexer
.expect_string
("version",
625 " in the first attribute name of the XML declaration") or
626 not lexer
.expect_eq
then
629 var minor
: Buffer = new FlatBuffer
630 var delimiter
= lexer
.expect_delimiter
632 if delimiter
< 0 then return false
633 if not lexer
.expect_string
("1.", " as XML major version") or
634 not lexer
.expect_digits
(minor
) or
635 not lexer
.expect_int
(delimiter
, "") then
638 if minor
.to_s
!= "0" then
639 model
.fire_warning
("Only XML 1.0 is supported. " +
640 "Got a XML 1.{minor.to_s} document.", null)
646 # Expect a `Reference`, without the initial `&`.
648 # Append the value to the buffer.
649 private fun expect_reference
(buffer
: Buffer): Bool do
650 # TODO: [WFC: Entity Declared]
651 # TODO: [VC: Entity Declared]
652 # TODO: [WFC: Parsed Entity]
653 # TODO: [WFC: No Recursion]
656 var ref
= new FlatBuffer
658 if lexer
.accept
('#') then
659 if lexer
.accept
('x') then
660 if lexer
.expect_hex
(ref
) then
661 buffer
.chars
.add
(ref
.to_hex
.ascii
)
662 return lexer
.expect
(';', "")
664 return lexer
.fire_unexpected_char
(
665 ". Expecting an hexadecimal digit")
667 else if lexer
.accept_digits
(ref
) then
668 buffer
.chars
.add
(ref
.to_i
.ascii
)
669 return lexer
.expect
(';', "")
671 return lexer
.fire_unexpected_char
(" in a character reference. " +
672 "Expecting `x` or a decimal digit")
674 else if lexer
.expect_name
(ref
) then
676 if name
.has
(":") then
677 model
.fire_error
("The entity name `{name}` contains a colon.", null)
679 var value
= resolve_reference
(name
)
681 if value
!= null then
683 return lexer
.expect
(';', "")
685 model
.fire_fatal_error
("Unknown entity `{name}`.", null)
689 return lexer
.fire_unexpected_char
(
690 " in a reference. Expecting `#` or a name")
694 # Resolve the entity reference or return `null`.
695 private fun resolve_reference
(name
: String): nullable String do
698 else if name
== "gt" then
700 else if name
== "amp" then
702 else if name
== "quot" then
704 else if name == "apos
" then
709 # TODO: Support non-builtin entities
712 # Flush the specified buffer as a `characters` event.
714 # Do nothing if `buffer` is empty.
715 private fun flush(buffer: Buffer) do
716 if buffer.length > 0 then
717 model.fire_characters(buffer.to_s)
723 ############################################################################
# Resolve the specified system id.
#
# The identifier is treated as a path and resolved with `realpath`; the
# result carries either the resolved path or the error.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
	var resolved = realpath(system_id)
	return resolved
end
732 # Resolve the specified POSIX path.
734 # Like `String.realpath`, but with error handling.
735 private fun realpath(path: String): MaybeError[String, Error] do
736 var cs = path.to_cstring.file_realpath
738 if cs.address_is_null then
739 return new MaybeError[String, Error](null,
740 new Error("File <{path}> not found."))
742 return new MaybeError[String, Error](cs.to_s, null)