1 # This file is part of NIT ( http://www.nitlanguage.org ).
3 # This file is free software, which comes along with NIT. This software is
4 # distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
5 # without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
6 # PARTICULAR PURPOSE. You can modify it is you want, provided this header
7 # is kept unaltered, and a notification of the changes is added.
8 # You are allowed to redistribute it and sell it, alone or is a part of
11 # A SAX 2 parser in Nit.
15 intrude import standard
::file
16 private import reader_model
19 # Implementation of the `XMLReader` interface.
21 # For the moment, only XML 1.0 is (partially) supported.
23 # The following mandatory features of XML 1.0 are not yet supported:
25 # * Parsing of entities (files) encoded in UTF-16.
26 # * Encoding handling.
27 # * Entity references resolving (except for built-in references).
28 # * Handling of the options specified in the XML declaration.
29 # * Parsing of a `DOCTYPE` declaration.
31 # Also note that this XML processor is unable to retrieve a file from a URL
32 # (only local paths are supported).
# State shared with the lexer: the registered handlers, the features and
# the properties that the accessors of this reader delegate to.
private var model: XophonReaderModel = new XophonReaderModel

# Lexer reading the entity being parsed.
#
# Assigned by `parse` once the input is known, hence `noinit`.
private var lexer: XophonLexer is noinit
# Entity resolver currently registered on the underlying model.
redef fun entity_resolver: nullable EntityResolver do
	return model.entity_resolver
end
# Register `entity_resolver` on the underlying model.
redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do model.entity_resolver = entity_resolver
# DTD handler currently registered on the underlying model.
redef fun dtd_handler: nullable DTDHandler do
	return model.dtd_handler
end
# Register `dtd_handler` on the underlying model.
redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do model.dtd_handler = dtd_handler
# Content handler currently registered on the underlying model.
redef fun content_handler: nullable ContentHandler do
	return model.content_handler
end
# Register `content_handler` on the underlying model.
redef fun content_handler=(content_handler: nullable ContentHandler) do model.content_handler = content_handler
# Error handler currently registered on the underlying model.
redef fun error_handler: nullable ErrorHandler do
	return model.error_handler
end
# Register `error_handler` on the underlying model.
redef fun error_handler=(error_handler: nullable ErrorHandler) do model.error_handler = error_handler
# Ask the underlying model whether the feature `name` is recognized.
redef fun feature_recognized(name: String): Bool do return model.feature_recognized(name)
# Ask the underlying model whether the feature `name` is readable.
redef fun feature_readable(name: String): Bool do return model.feature_readable(name)
# Ask the underlying model whether the feature `name` is writable.
#
# Delegates to the model's `feature_writable`, mirroring how
# `property_writable` delegates to the model's `property_writable`.
# The previous implementation queried `feature_readable` by mistake, so the
# answer reflected readability instead of writability.
redef fun feature_writable(name: String): Bool do return model.feature_writable(name)
# Current value of the feature `name`, as stored by the underlying model.
redef fun feature(name: String): Bool do
	return model.feature(name)
end
# Set the feature `name` to `value` on the underlying model.
redef fun feature=(name: String, value: Bool) do
	model.feature(name) = value
end
# Ask the underlying model whether the property `name` is recognized.
redef fun property_recognized(name: String): Bool do return model.property_recognized(name)
# Ask the underlying model whether the property `name` is readable.
redef fun property_readable(name: String): Bool do return model.property_readable(name)
# Ask the underlying model whether the property `name` is writable.
redef fun property_writable(name: String): Bool do return model.property_writable(name)
# Current value of the property `name`, as stored by the underlying model.
redef fun property(name: String): nullable Object do return model.property(name)
# Set the property `name` to `value` on the underlying model.
redef fun property=(name: String, value: nullable Object) do model.property(name) = value
95 redef fun parse
(input
: InputSource) do
97 var system_id
: nullable MaybeError[String, Error] = null
98 model
.locator
= new SAXLocatorImpl
100 if input
.system_id
!= null then
101 system_id
= resolve_system_id
(input
.system_id
.as(not null))
102 if system_id
.is_error
then
103 model
.fire_warning
(system_id
.error
.message
, system_id
.error
)
105 model
.locator
.system_id
= system_id
.value
108 model
.locator
.public_id
= input
.public_id
111 if input
.stream
!= null then
112 lexer
= new XophonLexer(model
, input
.stream
.as(not null))
114 else if system_id
!= null then
115 if system_id
.is_error
then
116 model
.fire_fatal_error
("File <{input.system_id.as(not null)}> not found.", null)
118 lexer
= new XophonLexer(model
,
119 new IFStream.open
(system_id
.value
))
124 model
.fire_fatal_error
("At least a stream or a system identifier must be specified. None given.",
# Parse the document designated by `system_id`.
#
# Wraps `system_id` in an `InputSource` and hands it over to `parse`.
redef fun parse_file(system_id: String) do
	var source = new InputSource.with_system_id(system_id)
	parse(source)
end
134 ############################################################################
137 # Note: Every `expect_*` function (except `parse_main`) does not call
138 # `read_char` for the first byte and leaves the byte just after its production
139 # in `last_char` (except in case of a fatal error). They return `false` on
140 # a fatal error and at the end of the file.
142 # Parse the main entity.
143 private fun parse_main
do
144 model
.fire_document_locator
145 model
.fire_start_document
148 model
.fire_end_document
151 # Expect a `document` production.
152 private fun expect_document
: Bool do
154 var got_doctype
= false
155 var got_element
= false
157 # If the document starts with `<`, it may start with an XML declaration,
158 # a processing instruction, a comment, a `DOCTYPE` declaration, the
159 # root element or a white space.
160 if lexer
.accept
('<') then
161 if lexer
.accept
('?') then
162 if not expect_pi_or_xml_decl
then return false
163 else if lexer
.accept
('!') then
164 if lexer
.accept
('-') then
165 if not lexer
.expect
('-',
166 " at the beginning of a comment") or
167 not expect_comment
then
171 if not expect_doctype_decl
then return false
175 if not expect_root
then return false
176 # The `DOCTYPE` declaration *must* come before the root
181 else if not lexer
.accept_s
then
182 return lexer
.fire_unexpected_char
(
183 ". Expecting a white space or `<`")
186 # After the XML declaration (if there is one), the document may contain
187 # processing instructions, comments, the `DOCTYPE` declaration and
189 # These productions may be separated by white space.
190 while not got_element
do
191 if lexer
.accept
('<') then
192 if lexer
.accept
('?') then
193 if not expect_pi
then return false
194 else if lexer
.accept
('!') then
195 if lexer
.accept
('-') then
196 if not lexer
.expect
('-',
197 " at the beginning of a comment") or
198 not expect_comment
then
201 else if got_doctype
then
202 return lexer
.fire_unexpected_char
(". Expecting `-`")
203 else if expect_doctype_decl
then
209 if not expect_root
then return false
210 # The `DOCTYPE` declaration *must* come before the root
215 else if not lexer
.accept_s
then
216 return lexer
.fire_unexpected_char
(
217 ". Expecting a white space or `<`")
# Expect a `DOCTYPE` declaration.
#
# TODO: DTDs are not supported yet. For now, this only reports a fatal
# error through the model and returns whatever `fire_fatal_error` returns.
private fun expect_doctype_decl: Bool do
	var message = "DTD not supported yet.\n"
	return model.fire_fatal_error(message, null)
end
227 # Expect the root `element` production, without the first `<` token.
228 private fun expect_root
: Bool do
230 var char_data
= new FlatBuffer
232 success
= expect_stag
233 while success
and not lexer
.eof
and not model
.root_closed
do
234 success
= expect_content_chunk
(char_data
)
237 success
= model
.expect_root_closed
243 # Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
245 # If the last read byte matches the `CharData` production, push the char in
246 # `char_data`. Else, flush `CharData` as a `characters` event.
247 private fun expect_content_chunk
(char_data
: Buffer): Bool do
248 if lexer
.accept
('<') then
250 if lexer
.accept
('!') then
251 if lexer
.accept
('-') then
252 return lexer
.expect
('-',
253 " at the beginning of a comment") and
255 else if lexer
.accept
('[') then
256 return expect_cd_sect
258 return lexer
.fire_unexpected_char
(
259 ". Expecting `--` or `[CDATA[`")
261 else if lexer
.accept
('?') then
263 else if lexer
.accept
('/') then
268 else if lexer
.accept
('&') then
270 var success
= expect_reference
(char_data
)
274 return lexer
.expect_xml_char
(char_data
)
278 # Expect a `EmptyElemTag | STag` production, without the initial `<`.
279 private fun expect_stag
: Bool do
280 var name_buffer
= new FlatBuffer
282 if lexer
.expect_name
(name_buffer
) then
283 var name
= name_buffer
.to_s
285 model
.fire_start_attributes
287 if lexer
.accept
('>') then
288 model
.fire_start_element
(name
)
290 else if lexer
.accept
('/') then
291 if lexer
.expect
('>', "") then
292 model
.fire_start_element
(name
)
293 model
.fire_end_element
(name
)
298 else if lexer
.expect_s
then
299 if lexer
.accept
('/') then
300 if lexer
.expect
('>', "") then
301 model
.fire_start_element
(name
)
302 model
.fire_end_element
(name
)
307 else if lexer
.accept
('>') then
308 model
.fire_start_element
(name
)
310 else if not expect_attribute
then
314 return lexer
.fire_unexpected_char
(" in tag. " +
315 "Expecting an attribute, `/`, `>` or white space")
322 # Expect a `ETag` production, without the initial `</`.
323 private fun expect_etag
: Bool do
324 var name_buf
= new FlatBuffer
326 if lexer
.expect_name
(name_buf
) and
328 lexer
.expect
('>', "") then
329 return model
.fire_end_element
(name_buf
.to_s
)
335 # Expect an `Attributes` production.
336 private fun expect_attribute
: Bool do
337 var name
= new FlatBuffer
338 var value
= new FlatBuffer
340 if lexer
.expect_name
(name
) and
342 expect_att_value
(value
) then
343 model
.fire_attribute
(name
.to_s
, value
.to_s
)
350 # Expect the `Misc*` production at the end of a document.
351 private fun expect_miscs
: Bool do
352 while not lexer
.eof
do
353 if lexer
.accept
('<') then
354 if lexer
.accept
('?') then
355 if not expect_pi
then return false
356 else if lexer
.accept
('!') then
357 if not lexer
.expect_string
("--",
358 " at the beginning of a comment") or
359 not expect_comment
then
363 return lexer
.fire_unexpected_char
(". Expecting `?` or `!`")
365 else if not lexer
.accept_s
then
366 return lexer
.fire_unexpected_char
(
367 ". Expecting a white space or `<`")
373 # Expect a `AttValue` production.
375 # Append the parsed value to `buffer`.
376 private fun expect_att_value
(buffer
: Buffer): Bool do
377 var delimiter
= lexer
.expect_delimiter
379 if delimiter
< 0 then return false
381 if lexer
.accept_int
(delimiter
) then
383 else if lexer
.accept
('&') then
384 # TODO: [WFC: No < in Attribute Values]
385 if not expect_reference
(buffer
) then return false
386 else if not lexer
.expect_att_value_char
(buffer
) then
392 # Expect a `SystemLiteral` production.
394 # Also used to parse productions that do not have references.
395 # Append the parsed value to `buffer`.
396 private fun expect_literal
(buffer
: Buffer): Bool do
397 var delimiter
= lexer
.expect_delimiter
399 if delimiter
< 0 then return false
401 if lexer
.accept_int
(delimiter
) then
403 else if not lexer
.expect_xml_char
(buffer
) then
410 # Expect a `Comment` production, without the beginning.
412 # Assume `last_char` is the fifth byte of the production that is, the
413 # next byte after the `'<!--'` token.
414 private fun expect_comment
: Bool do
415 var buffer
: Buffer = new FlatBuffer
418 if lexer
.accept
('-') then
419 if lexer
.accept
('-') then
420 if not lexer
.expect
('>',
421 " after a double-hyphen (`--`) in a comment") then
427 buffer
.chars
.push
('-')
428 if not lexer
.expect_xml_char
(buffer
) then return false
430 else if not lexer
.expect_xml_char
(buffer
) then
434 model
.fire_comment
(buffer
.to_s
)
# Expect a `PI` production, without the beginning.
#
# Assume `last_char` is the third byte of the production, that is, the
# next byte after the `'<?'` token.
#
# The target is read first; the data part is only parsed when the target
# was accepted by the lexer.
private fun expect_pi: Bool do
	var target = new FlatBuffer

	if not lexer.expect_pi_target(target) then return false
	return expect_pi_data(target.to_s)
end
449 # Expect the data part and the `'?>'` token of a `PI` production.
450 private fun expect_pi_data
(target
: String): Bool do
451 if lexer
.accept
('?') then
452 if lexer
.expect
('>', " at the end of a processing instruction") then
453 model
.fire_processing_instruction
(target
, null)
458 else if lexer
.accept_s
then
459 var data
: Buffer = new FlatBuffer
462 if lexer
.accept
('?') then
463 if lexer
.accept
('>') then
467 if not lexer
.expect_xml_char
(data
) then return false
469 else if not lexer
.expect_xml_char
(data
) then
473 model
.fire_processing_instruction
(target
, data
.to_s
)
476 return lexer
.fire_unexpected_char
(" after a processing " +
477 "instruction target. Expecting a white space or `?>`")
481 # Expect a `PI | XMLDecl` production, without the beginning.
483 # Assume `last_char` is the third byte of the production that is, the
484 # next byte after the `'<?'` token.
485 private fun expect_pi_or_xml_decl
: Bool do
486 var buffer
: Buffer = new FlatBuffer
488 if lexer
.expect_name
(buffer
) then
489 var target
= buffer
.to_s
491 if target
== "xml" then
492 return expect_xml_decl
493 else if lexer
.check_pi_target
(target
) then
494 return expect_pi_data
(target
)
503 # Expect a `XMLDecl` production, without the initial `<?xml` token.
504 private fun expect_xml_decl
: Bool do
505 if not expect_version_info
then return false
506 if lexer
.accept_s
then
507 if lexer
.is_char
('e') then
508 if not expect_encoding_decl
then return false
509 # At this point, we can only accept `S` or `'?>'`.
510 if not lexer
.accept_s
then
511 return lexer
.expect_string
("?>", "")
514 if lexer
.is_char
('s') and not expect_sd_decl
then return false
515 return lexer
.skip_s
and lexer
.expect_string
("?>", "")
517 return lexer
.expect_string
("?>", "")
521 # Expect a `EncodingDecl` token, without the initial `S` token.
522 private fun expect_encoding_decl
: Bool do
523 var encoding
= new FlatBuffer
525 if not lexer
.expect_string
("encoding", "") or not lexer
.expect_eq
or
526 not expect_literal
(encoding
) then
529 if not encoding
.has
("^[A-Za-z][A-Za-z0-9._-]*$".to_re
) then
530 return model
.fire_fatal_error
("`{encoding.to_s}` is not a valid " +
531 "encoding name.", null)
533 # TODO: Do something with the value.
537 # Expect a `SDDecl` token, without the initial `S` token.
538 private fun expect_sd_decl
: Bool do
539 var buf
= new FlatBuffer
542 if not lexer
.expect_string
("standalone", "") or not lexer
.expect_eq
or
543 not expect_literal
(buf
) then
547 if not value
== "yes" and not value
== "no" then
548 return model
.fire_fatal_error
("`{value}` is not a valid value for " +
549 "the `standalone` declaration. Expecting `yes` or `no`.",
552 # TODO: Do something with the value.
556 # Expect a `CDSect` production, without the beginning.
558 # Assume `last_char` is the fourth byte of the production that is, the
559 # next byte after the `'<!['` token.
560 private fun expect_cd_sect
: Bool do
561 var buffer
: Buffer = new FlatBuffer
563 # Number of consecutive closing brackets.
566 if lexer
.expect_string
("CDATA[",
567 " at the beginning of a CDATA section.") then
568 model
.fire_start_cdata
570 if lexer
.accept
(']') then
573 for i
in [0..closing
[ do
574 buffer
.chars
.push
(']')
577 if closing
>= 2 and lexer
.accept
('>') then break
578 if not lexer
.expect_xml_char
(buffer
) then return false
589 # Expect a `VersionInfo` production.
590 private fun expect_version_info
: Bool do
591 if not lexer
.expect_s
or
592 not lexer
.expect_string
("version",
593 " in the first attribute name of the XML declaration") or
594 not lexer
.expect_eq
then
597 var minor
: Buffer = new FlatBuffer
598 var delimiter
= lexer
.expect_delimiter
600 if delimiter
< 0 then return false
601 if not lexer
.expect_string
("1.", " as XML major version") or
602 not lexer
.expect_digits
(minor
) or
603 not lexer
.expect_int
(delimiter
, "") then
606 if minor
.to_s
!= "0" then
607 model
.fire_warning
("Only XML 1.0 is supported. " +
608 "Got a XML 1.{minor.to_s} document.", null)
614 # Expect a `Reference`, without the initial `&`.
616 # Append the value to the buffer.
617 private fun expect_reference
(buffer
: Buffer): Bool do
618 # TODO: [WFC: Entity Declared]
619 # TODO: [VC: Entity Declared]
620 # TODO: [WFC: Parsed Entity]
621 # TODO: [WFC: No Recursion]
624 var ref
= new FlatBuffer
626 if lexer
.accept
('#') then
627 if lexer
.accept
('x') then
628 if lexer
.expect_hex
(ref
) then
629 buffer
.chars
.add
(ref
.to_hex
.ascii
)
630 return lexer
.expect
(';', "")
632 return lexer
.fire_unexpected_char
(
633 ". Expecting an hexadecimal digit")
635 else if lexer
.accept_digits
(ref
) then
636 buffer
.chars
.add
(ref
.to_i
.ascii
)
637 return lexer
.expect
(';', "")
639 return lexer
.fire_unexpected_char
(" in a character reference. " +
640 "Expecting `x` or a decimal digit")
642 else if lexer
.expect_name
(ref
) then
644 if name
.has
(":") then
645 model
.fire_error
("The entity name `{name}` contains a colon.", null)
647 var value
= resolve_reference
(name
)
649 if value
!= null then
651 return lexer
.expect
(';', "")
653 model
.fire_fatal_error
("Unknown entity `{name}`.", null)
657 return lexer
.fire_unexpected_char
(
658 " in a reference. Expecting `#` or a name")
662 # Resolve the entity reference or return `null`.
663 private fun resolve_reference
(name
: String): nullable String do
666 else if name
== "gt" then
668 else if name
== "amp" then
670 else if name
== "quot" then
672 else if name == "apos
" then
677 # TODO: Support non-builtin entities
680 # Flush the specified buffer as a `characters` event.
682 # Do nothing if `buffer` is empty.
683 private fun flush(buffer: Buffer) do
684 if buffer.length > 0 then
685 model.fire_characters(buffer.to_s)
691 ############################################################################
# Resolve the specified system id.
#
# Currently a plain delegation to `realpath`: the result carries either the
# resolved path or the error reported by `realpath`.
private fun resolve_system_id(system_id: String): MaybeError[String, Error] do
	var resolved = realpath(system_id)
	return resolved
end
700 # Resolve the specified POSIX path.
702 # Like `String.realpath`, but with error handling.
703 private fun realpath(path: String): MaybeError[String, Error] do
704 var cs = path.to_cstring.file_realpath
706 if cs.address_is_null then
707 return new MaybeError[String, Error](null,
708 new Error("File <{path}> not found."))
710 return new MaybeError[String, Error](cs.to_s, null)