1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
11 # A SAX 2 parser in Nit.
15 intrude import core
::file
16 private import reader_model
19 # Implementation of the `XMLReader` interface.
21 # For the moment, only XML 1.0 is (partially) supported.
23 # The following mandatory features of XML 1.0 are not yet supported:
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
# Also note that this XML processor is unable to retrieve a file from a URL
# (only local paths are supported).
36 # # Retrieve all text nodes.
38 # super ContentHandler
40 # private var buf: Buffer = new FlatBuffer
41 # private var sp: Bool = false
43 # redef fun characters(str: String) do
45 # if buf.length > 0 then buf.append(" ")
51 # redef fun ignorable_whitespace(str: String) do
55 # # Return the concatenation of all text nodes.
56 # redef fun to_s do return buf.to_s
59 # var text = new TextListener
60 # var reader = new XophonReader
62 # reader.content_handler = text
63 # reader.parse(new InputSource.with_stream(new StringReader("<foo>bar baz <n>42</n>.</foo>")))
64 # assert text.to_s == "bar baz 42."
# State shared with the lexer: the registered handlers, the features, the
# properties and the locator. Every SAX event is fired through it.
private var model: XophonReaderModel = new XophonReaderModel

# Lexer of the entity being parsed.
#
# Not set at construction (`noinit`); `parse` assigns it.
private var lexer: XophonLexer is noinit
# Entity resolver currently held by `model`.
redef fun entity_resolver do
	return model.entity_resolver
end
# Store `entity_resolver` in `model`.
redef fun entity_resolver=(entity_resolver) do model.entity_resolver = entity_resolver
# DTD handler currently held by `model`.
redef fun dtd_handler do
	return model.dtd_handler
end
# Store `dtd_handler` in `model`.
redef fun dtd_handler=(dtd_handler) do model.dtd_handler = dtd_handler
# Content handler currently held by `model`.
redef fun content_handler do
	return model.content_handler
end
# Store `content_handler` in `model`.
redef fun content_handler=(content_handler) do model.content_handler = content_handler
# Error handler currently held by `model`.
redef fun error_handler do
	return model.error_handler
end
# Store `error_handler` in `model`.
redef fun error_handler=(error_handler) do model.error_handler = error_handler
# Ask `model` whether the feature `name` is recognized.
redef fun feature_recognized(name) do return model.feature_recognized(name)
# Ask `model` whether the feature `name` is readable.
redef fun feature_readable(name) do return model.feature_readable(name)
# Ask `model` whether the feature `name` is writable.
#
# Queries writability specifically (not `feature_readable`), mirroring how
# `property_writable` delegates to the same-named method of `model`.
redef fun feature_writable(name) do return model.feature_writable(name)
# Value of the feature `name`, as reported by `model`.
redef fun feature(name) do
	return model.feature(name)
end
# Set the feature `name` to `value` in `model`.
redef fun feature=(name, value) do
	model.feature(name) = value
end
# Ask `model` whether the property `name` is recognized.
redef fun property_recognized(name) do return model.property_recognized(name)
# Ask `model` whether the property `name` is readable.
redef fun property_readable(name) do return model.property_readable(name)
# Ask `model` whether the property `name` is writable.
redef fun property_writable(name) do return model.property_writable(name)
# Value of the property `name`, as reported by `model`.
redef fun property(name) do return model.property(name)
# Set the property `name` to `value` in `model`.
redef fun property=(name, value) do model.property(name) = value
127 redef fun parse
(input
) do
128 var system_id
: nullable MaybeError[String, Error] = null
129 model
.locator
= new SAXLocatorImpl
131 if input
.system_id
!= null then
132 system_id
= resolve_system_id
(input
.system_id
.as(not null))
133 if system_id
.is_error
then
134 model
.fire_warning
(system_id
.error
.message
, system_id
.error
)
136 model
.locator
.system_id
= system_id
.value
139 model
.locator
.public_id
= input
.public_id
142 if input
.stream
!= null then
143 lexer
= new XophonLexer(model
, input
.stream
.as(not null))
145 else if system_id
!= null then
146 if system_id
.is_error
then
147 model
.fire_fatal_error
("File <{input.system_id.as(not null)}> not found.", null)
149 lexer
= new XophonLexer(model
,
150 new FileReader.open
(system_id
.value
))
155 model
.fire_fatal_error
("At least a stream or a system identifier must be specified. None given.",
# Parse the document designated by `system_id`.
#
# Wraps the identifier in an `InputSource` and hands it to `parse`.
redef fun parse_file(system_id) do
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
165 ############################################################################
# Note: Each `expect_*` function (`parse_main` excepted) does not call
# `read_char` for the first byte and leaves the byte just after its production
# in `last_char` (except in case of fatal error). They return `false` on
# fatal error and at the end of the file.
173 # Parse the main entity.
174 private fun parse_main
do
175 model
.fire_document_locator
176 model
.fire_start_document
179 model
.fire_end_document
182 # Expect a `document` production.
183 private fun expect_document
: Bool do
184 var got_doctype
= false
185 var got_element
= false
187 # If the document start with `<`, it may start with a XML declaration,
188 # a processing instruction, a comment, a `DOCTYPE` declaration, the
189 # root element or a white space.
190 if lexer
.accept
('<') then
191 if lexer
.accept
('?') then
192 if not expect_pi_or_xml_decl
then return false
193 else if lexer
.accept
('!') then
194 if lexer
.accept
('-') then
195 if not lexer
.expect
('-',
196 " at the beginning of a comment") or
197 not expect_comment
then
201 if not expect_doctype_decl
then return false
205 if not expect_root
then return false
206 # The `DOCTYPE` declaration *must* come before the root
211 else if not lexer
.accept_s
then
212 return lexer
.fire_unexpected_char
(
213 ". Expecting a white space or `<`")
216 # After the XML declaration (if there is one), the document may contain
217 # processing instructions, comments, the `DOCTYPE` declaration and
219 # These productions may be separated by white space.
220 while not got_element
do
221 if lexer
.accept
('<') then
222 if lexer
.accept
('?') then
223 if not expect_pi
then return false
224 else if lexer
.accept
('!') then
225 if lexer
.accept
('-') then
226 if not lexer
.expect
('-',
227 " at the beginning of a comment") or
228 not expect_comment
then
231 else if got_doctype
then
232 return lexer
.fire_unexpected_char
(". Expecting `-`")
233 else if expect_doctype_decl
then
239 if not expect_root
then return false
240 # The `DOCTYPE` declaration *must* come before the root
245 else if not lexer
.accept_s
then
246 return lexer
.fire_unexpected_char
(
247 ". Expecting a white space or `<`")
# Handle a `DOCTYPE` declaration.
#
# TODO: DTDs are not supported yet, so this only reports the fatal error
# "DTD not supported yet." through `model` and returns the result of
# `fire_fatal_error`.
private fun expect_doctype_decl: Bool do return model.fire_fatal_error("DTD not supported yet.\n", null)
257 # Expect the root `element` production, without the first `<` token.
258 private fun expect_root
: Bool do
260 var char_data
= new FlatBuffer
262 success
= expect_stag
263 while success
and not lexer
.eof
and not model
.root_closed
do
264 success
= expect_content_chunk
(char_data
)
267 success
= model
.expect_root_closed
273 # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
275 # If the last read byte matches the `CharData` production, push the char in
276 # `char_data`. Else, flush `CharData` as a `characters` event.
277 private fun expect_content_chunk
(char_data
: Buffer): Bool do
278 if lexer
.accept
('<') then
280 if lexer
.accept
('!') then
281 if lexer
.accept
('-') then
282 return lexer
.expect
('-',
283 " at the beginning of a comment") and
285 else if lexer
.accept
('[') then
286 return expect_cd_sect
288 return lexer
.fire_unexpected_char
(
289 ". Expecting `--` or `[CDATA[`")
291 else if lexer
.accept
('?') then
293 else if lexer
.accept
('/') then
298 else if lexer
.accept
('&') then
300 var success
= expect_reference
(char_data
)
304 return lexer
.expect_xml_char
(char_data
)
308 # Expect a `EmptyElemTag | STag` production, without the initial `<`.
309 private fun expect_stag
: Bool do
310 var name_buffer
= new FlatBuffer
312 if lexer
.expect_name
(name_buffer
) then
313 var name
= name_buffer
.to_s
315 model
.fire_start_attributes
317 if lexer
.accept
('>') then
318 model
.fire_start_element
(name
)
320 else if lexer
.accept
('/') then
321 if lexer
.expect
('>', "") then
322 model
.fire_start_element
(name
)
323 model
.fire_end_element
(name
)
328 else if lexer
.expect_s
then
329 if lexer
.accept
('/') then
330 if lexer
.expect
('>', "") then
331 model
.fire_start_element
(name
)
332 model
.fire_end_element
(name
)
337 else if lexer
.accept
('>') then
338 model
.fire_start_element
(name
)
340 else if not expect_attribute
then
344 return lexer
.fire_unexpected_char
(" in tag. " +
345 "Expecting an attribute, `/`, `>` or white space")
352 # Expect a `ETag` production, without the initial `</`.
353 private fun expect_etag
: Bool do
354 var name_buf
= new FlatBuffer
356 if lexer
.expect_name
(name_buf
) and
358 lexer
.expect
('>', "") then
359 return model
.fire_end_element
(name_buf
.to_s
)
365 # Expect an `Attributes` production.
366 private fun expect_attribute
: Bool do
367 var name
= new FlatBuffer
368 var value
= new FlatBuffer
370 if lexer
.expect_name
(name
) and
372 expect_att_value
(value
) then
373 model
.fire_attribute
(name
.to_s
, value
.to_s
)
380 # Expect the `Misc*` production at the end of a document.
381 private fun expect_miscs
: Bool do
382 while not lexer
.eof
do
383 if lexer
.accept
('<') then
384 if lexer
.accept
('?') then
385 if not expect_pi
then return false
386 else if lexer
.accept
('!') then
387 if not lexer
.expect_string
("--",
388 " at the beginning of a comment") or
389 not expect_comment
then
393 return lexer
.fire_unexpected_char
(". Expecting `?` or `!`")
395 else if not lexer
.accept_s
then
396 return lexer
.fire_unexpected_char
(
397 ". Expecting a white space or `<`")
403 # Expect a `AttValue` production.
405 # Append the parsed value to `buffer`.
406 private fun expect_att_value
(buffer
: Buffer): Bool do
407 var delimiter
= lexer
.expect_delimiter
409 if delimiter
< 0 then return false
411 if lexer
.accept_int
(delimiter
) then
413 else if lexer
.accept
('&') then
414 # TODO: [WFC: No < in Attribute Values]
415 if not expect_reference
(buffer
) then return false
416 else if not lexer
.expect_att_value_char
(buffer
) then
422 # Expect a `SystemLiteral` production.
424 # Also used to parse productions that do not have references.
425 # Append the parsed value to `buffer`.
426 private fun expect_literal
(buffer
: Buffer): Bool do
427 var delimiter
= lexer
.expect_delimiter
429 if delimiter
< 0 then return false
431 if lexer
.accept_int
(delimiter
) then
433 else if not lexer
.expect_xml_char
(buffer
) then
440 # Expect a `Comment` production, without the beginning.
442 # Assume `last_char` is the fifth byte of the production that is, the
443 # next byte after the `'<!--'` token.
444 private fun expect_comment
: Bool do
445 var buffer
: Buffer = new FlatBuffer
448 if lexer
.accept
('-') then
449 if lexer
.accept
('-') then
450 if not lexer
.expect
('>',
451 " after a double-hyphen (`--`) in a comment") then
457 buffer
.chars
.push
('-')
458 if not lexer
.expect_xml_char
(buffer
) then return false
460 else if not lexer
.expect_xml_char
(buffer
) then
464 model
.fire_comment
(buffer
.to_s
)
# Expect the remainder of a `PI` production, once `<?` has been consumed.
#
# On entry, `last_char` is assumed to be the third byte of the production,
# that is, the byte right after the `'<?'` token.
# Reads the target, then hands it to `expect_pi_data`. Returns `false` as
# soon as one of the two steps fails.
private fun expect_pi: Bool do
	var pi_target = new FlatBuffer

	if not lexer.expect_pi_target(pi_target) then return false
	return expect_pi_data(pi_target.to_s)
end
479 # Expect the data part and the `'?>'` token of a `PI` production.
480 private fun expect_pi_data
(target
: String): Bool do
481 if lexer
.accept
('?') then
482 if lexer
.expect
('>', " at the end of a processing instruction") then
483 model
.fire_processing_instruction
(target
, null)
488 else if lexer
.accept_s
then
489 var data
: Buffer = new FlatBuffer
492 if lexer
.accept
('?') then
493 if lexer
.accept
('>') then
497 if not lexer
.expect_xml_char
(data
) then return false
499 else if not lexer
.expect_xml_char
(data
) then
503 model
.fire_processing_instruction
(target
, data
.to_s
)
506 return lexer
.fire_unexpected_char
(" after a processing " +
507 "instruction target. Expecting a white space or `?>`")
511 # Expect a `PI | XMLDecl` production, without the beginning.
513 # Assume `last_char` is the third byte of the production that is, the
514 # next byte after the `'<?'` token.
515 private fun expect_pi_or_xml_decl
: Bool do
516 var buffer
: Buffer = new FlatBuffer
518 if lexer
.expect_name
(buffer
) then
519 var target
= buffer
.to_s
521 if target
== "xml" then
522 return expect_xml_decl
523 else if lexer
.check_pi_target
(target
) then
524 return expect_pi_data
(target
)
533 # Expect a `XMLDecl` production, without the initial `<?xml` token.
534 private fun expect_xml_decl
: Bool do
535 if not expect_version_info
then return false
536 if lexer
.accept_s
then
537 if lexer
.is_char
('e') then
538 if not expect_encoding_decl
then return false
539 # At this point, we can only accept `S` or `'?>'`.
540 if not lexer
.accept_s
then
541 return lexer
.expect_string
("?>", "")
544 if lexer
.is_char
('s') and not expect_sd_decl
then return false
545 return lexer
.skip_s
and lexer
.expect_string
("?>", "")
547 return lexer
.expect_string
("?>", "")
551 # Expect a `EncodingDecl` token, without the initial `S` token.
552 private fun expect_encoding_decl
: Bool do
553 var encoding
= new FlatBuffer
555 if not lexer
.expect_string
("encoding", "") or not lexer
.expect_eq
or
556 not expect_literal
(encoding
) then
559 if not encoding
.has
("^[A-Za-z][A-Za-z0-9._-]*$".to_re
) then
560 return model
.fire_fatal_error
("`{encoding.to_s}` is not a valid " +
561 "encoding name.", null)
563 # TODO: Do something with the value.
567 # Expect a `SDDecl` token, without the initial `S` token.
568 private fun expect_sd_decl
: Bool do
569 var buf
= new FlatBuffer
572 if not lexer
.expect_string
("standalone", "") or not lexer
.expect_eq
or
573 not expect_literal
(buf
) then
577 if not value
== "yes" and not value
== "no" then
578 return model
.fire_fatal_error
("`{value}` is not a valid value for " +
579 "the `standalone` declaration. Expecting `yes` or `no`.",
582 # TODO: Do something with the value.
586 # Expect a `CDSect` production, without the beginning.
588 # Assume `last_char` is the fourth byte of the production that is, the
589 # next byte after the `'<!['` token.
590 private fun expect_cd_sect
: Bool do
591 var buffer
: Buffer = new FlatBuffer
593 # Number of consecutive closing brackets.
596 if lexer
.expect_string
("CDATA[",
597 " at the beginning of a CDATA section.") then
598 model
.fire_start_cdata
600 if lexer
.accept
(']') then
603 for i
in [0..closing
[ do
604 buffer
.chars
.push
(']')
607 if closing
>= 2 and lexer
.accept
('>') then break
608 if not lexer
.expect_xml_char
(buffer
) then return false
619 # Expect a `VersionInfo` production.
620 private fun expect_version_info
: Bool do
621 if not lexer
.expect_s
or
622 not lexer
.expect_string
("version",
623 " in the first attribute name of the XML declaration") or
624 not lexer
.expect_eq
then
627 var minor
: Buffer = new FlatBuffer
628 var delimiter
= lexer
.expect_delimiter
630 if delimiter
< 0 then return false
631 if not lexer
.expect_string
("1.", " as XML major version") or
632 not lexer
.expect_digits
(minor
) or
633 not lexer
.expect_int
(delimiter
, "") then
636 if minor
.to_s
!= "0" then
637 model
.fire_warning
("Only XML 1.0 is supported. " +
638 "Got a XML 1.{minor.to_s} document.", null)
644 # Expect a `Reference`, without the initial `&`.
646 # Append the value to the buffer.
647 private fun expect_reference
(buffer
: Buffer): Bool do
648 # TODO: [WFC: Entity Declared]
649 # TODO: [VC: Entity Declared]
650 # TODO: [WFC: Parsed Entity]
651 # TODO: [WFC: No Recursion]
654 var ref
= new FlatBuffer
656 if lexer
.accept
('#') then
657 if lexer
.accept
('x') then
658 if lexer
.expect_hex
(ref
) then
659 buffer
.chars
.add
(ref
.to_hex
.code_point
)
660 return lexer
.expect
(';', "")
662 return lexer
.fire_unexpected_char
(
663 ". Expecting an hexadecimal digit")
665 else if lexer
.accept_digits
(ref
) then
666 buffer
.chars
.add
(ref
.to_i
.code_point
)
667 return lexer
.expect
(';', "")
669 return lexer
.fire_unexpected_char
(" in a character reference. " +
670 "Expecting `x` or a decimal digit")
672 else if lexer
.expect_name
(ref
) then
674 if name
.has
(":") then
675 model
.fire_error
("The entity name `{name}` contains a colon.", null)
677 var value
= resolve_reference
(name
)
679 if value
!= null then
681 return lexer
.expect
(';', "")
683 model
.fire_fatal_error
("Unknown entity `{name}`.", null)
687 return lexer
.fire_unexpected_char
(
688 " in a reference. Expecting `#` or a name")
692 # Resolve the entity reference or return `null`.
693 private fun resolve_reference
(name
: String): nullable String do
696 else if name
== "gt" then
698 else if name
== "amp" then
700 else if name
== "quot" then
702 else if name == "apos
" then
707 # TODO: Support non-builtin entities
710 # Flush the specified buffer as a `characters` event.
712 # Do nothing if `buffer` is empty.
713 private fun flush(buffer: Buffer) do
714 if buffer.length > 0 then
715 model.fire_characters(buffer.to_s)
721 ############################################################################
# Resolve the specified system id.
#
# Currently the same as `realpath`: the identifier is treated as a path.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do return realpath(system_id)
# Resolve the specified POSIX path.
#
# Like `String.realpath`, but with error handling: when `file_realpath`
# yields a null address, the result carries the error
# "File <path> not found." instead of a value.
private fun realpath(path: String): MaybeError[String, Error] do
	var resolved = path.to_cstring.file_realpath

	if not resolved.address_is_null then
		return new MaybeError[String, Error](resolved.to_s, null)
	end
	var failure = new Error("File <{path}> not found.")
	return new MaybeError[String, Error](null, failure)
end