From: Jean-Christophe Beaupré Date: Thu, 9 Oct 2014 14:53:46 +0000 (-0400) Subject: xml: Introduce SAXophoNit, a SAX processor in Nit. X-Git-Tag: v0.6.10~14^2~2 X-Git-Url: http://nitlanguage.org xml: Introduce SAXophoNit, a SAX processor in Nit. For the moment, this implementation is mostly non-compliant, but it works with most common XML documents. Signed-off-by: Jean-Christophe Beaupré --- diff --git a/lib/saxophonit/lexer.nit b/lib/saxophonit/lexer.nit new file mode 100644 index 0000000..4f77ab8 --- /dev/null +++ b/lib/saxophonit/lexer.nit @@ -0,0 +1,392 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# This file is free software, which comes along with NIT. This software is +# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. You can modify it is you want, provided this header +# is kept unaltered, and a notification of the changes is added. +# You are allowed to redistribute it and sell it, alone or is a part of +# another product. + +# SAXophoNit’s lexer +module saxophonit::lexer + +import reader_model + +# SAXophoNit’s lexer +# +# Except when noted otherwise, `accept` and `expect` functions return `true` on +# success and `false` on mismatch and at the end of the file. +# They both forward the cursor to the next byte on success, but only `expect` +# functions fire a fatal error on mismatch. +class XophonLexer + var reader_model: XophonReaderModel + var input: IStream is writable + private var locator: SAXLocatorImpl is noinit + + init do + locator = reader_model.locator.as(not null) + end + + # Last read byte. + # + # Equals `-1` on end of file or error. + private var last_char: Int = -1 + + # Before end-of-line handling, was the last read byte a CARRIAGE RETURN? + private var was_cr: Bool = false + + + # Expect a value delimiter (`"` or `'`). 
#
# When the last read byte is `"` or `'`, consume it and return its code.
# Otherwise fire a fatal error and return `-1`.
fun expect_delimiter: Int do
	if accept('"') then return '"'.ascii
	if accept('\'') then return '\''.ascii
	fire_unexpected_char(". Expecting `\"` or `'`")
	return -1
end

# Does the last read byte match the `Char` production?
#
# Accepts TAB (9), LINE FEED (10) and every byte from 32 upwards.
fun is_xml_char: Bool do
	# TODO: Handle code points above 0x7F.
	if last_char == 9 or last_char == 10 then return true
	return last_char >= 32
end

# Append the last read byte to `buffer`, then read the next byte.
#
# If the last read byte is not an XML character (or the end of the file
# was reached), fire a fatal error instead and return `false`.
fun expect_xml_char(buffer: Buffer): Bool do
	if not is_xml_char then
		if eof then return fire_fatal_error("Unexpected end of file.")
		return fire_fatal_error("Forbidden character.")
	end
	buffer.chars.push(last_char.ascii)
	read_char
	return true
end


# Like `expect_xml_char`, but normalize white space and forbid `<`.
#
# Every white space byte is pushed as a single SPACE.
#
# SEE: The “3.3.3 Attribute-Value Normalization” section of any XML
# recommendation.
fun expect_att_value_char(buffer: Buffer): Bool do
	if last_char == '<'.ascii then
		return fire_fatal_error("`<` is forbidden in attribute values.")
	end
	if not is_s then return expect_xml_char(buffer)
	buffer.chars.push(' ')
	read_char
	return true
end

# Does the last read byte match the `S` production?
#
# That is: TAB (9), LINE FEED (10) or SPACE (32).
fun is_s: Bool do
	return last_char == 9 or last_char == 10 or last_char == 32
end

# Skip a `S?` token and return `true`.
fun skip_s: Bool do
	loop
		if not is_s then return true
		read_char
	end
end

# Accept a `S` token.
#
# Return `false` without consuming anything when the last read byte is
# not a white space.
fun accept_s: Bool do
	if not is_s then return false
	read_char
	return skip_s
end

# Expect `S`.
#
# Fire a fatal error when there is no white space to consume.
fun expect_s: Bool do
	if accept_s and skip_s then return true
	return fire_unexpected_char(". Expecting white space")
end

# Does the last read byte match the `NameStartChar` production?
#
# ASCII letters, `_` and `:` are accepted. Any byte above 127 is accepted
# without further checks (see the TODO below).
fun is_name_start_char: Bool do
	# TODO: Handle code points above 0x7F.
	return ['A'.ascii .. 'Z'.ascii].has(last_char) or
			['a'.ascii .. 'z'.ascii].has(last_char) or
			last_char == '_'.ascii or
			last_char == ':'.ascii or
			last_char > 127
end

# Does the last read byte match the `NameChar` production?
#
# That is: any `NameStartChar`, `-`, `.` or a decimal digit.
fun is_name_char: Bool do
	# TODO: Handle code points above 0x7F.
	return is_name_start_char or
			last_char == '-'.ascii or
			last_char == '.'.ascii or
			is_digit
end

# Expect a `Name` token.
#
# Append the parsed name to `buffer`.
# Fire a fatal error and return `false` if the last read byte cannot start
# a name.
fun expect_name(buffer: Buffer): Bool do
	if not is_name_start_char then
		return fire_unexpected_char(" at the beginning of a name")
	end
	buffer.chars.push(last_char.ascii)
	read_char
	while is_name_char do
		buffer.chars.push(last_char.ascii)
		read_char
	end
	return true
end

# Expect a `PITarget` token.
#
# Append the parsed name to `buffer`.
fun expect_pi_target(buffer: Buffer): Bool do
	return expect_name(buffer) and check_pi_target(buffer)
end

# Ensure the target is not `xml` (case-insensitive).
#
# Fire a fatal error and return `false` if it is.
# Also, fire a (non-fatal) error if the target contains a colon.
fun check_pi_target(target: Text): Bool do
	var chars = target.chars
	# Each position is compared with its own letter. Testing index 0 three
	# times could never match, which let the reserved target `xml` through.
	var is_invalid = target.length == 3 and
			(chars[0] == 'X' or chars[0] == 'x') and
			(chars[1] == 'M' or chars[1] == 'm') and
			(chars[2] == 'L' or chars[2] == 'l')

	if is_invalid then
		return fire_fatal_error("Forbidden processing target `{target}`.")
	else
		if target.has(":") then
			reader_model.fire_error("The processing target `{target}` contains a colon.", null)
		end
		return true
	end
end

# Does the last read byte match the `[0-9]` production?
fun is_digit: Bool do
	return ['0'.ascii .. '9'.ascii].has(last_char)
end

# Accept a `[0-9]+` token.
#
# Append the digits to `buffer`. Return `false` without consuming anything
# when the last read byte is not a digit.
fun accept_digits(buffer: Buffer): Bool do
	if is_digit then
		loop
			buffer.chars.push(last_char.ascii)
			read_char
			if not is_digit then return true
		end
	else
		return false
	end
end

# Expect a `[0-9]+` token.
#
# Like `accept_digits`, but fire a fatal error on mismatch.
fun expect_digits(buffer: Buffer): Bool do
	return accept_digits(buffer) or fire_unexpected_char(". Expecting a decimal digit")
end

# Does `last_char` match the `[0-9a-fA-F]` production?
fun is_hex: Bool do
	# The letter ranges stop at `F`/`f`. The former `A..Z` range accepted
	# non-hexadecimal letters and the former `a..Z` range was empty, so
	# lower-case digits were rejected.
	return ['0'.ascii .. '9'.ascii].has(last_char) or
			['A'.ascii .. 'F'.ascii].has(last_char) or
			['a'.ascii .. 'f'.ascii].has(last_char)
end

# Expect a `[0-9a-fA-F]+` token.
#
# Append the digits to `buffer`. Fire a fatal error on mismatch.
fun expect_hex(buffer: Buffer): Bool do
	if is_hex then
		loop
			buffer.chars.push(last_char.ascii)
			read_char
			if not is_hex then return true
		end
	else
		return fire_unexpected_char(". Expecting an hexadecimal digit")
	end
end

# Expect `Eq`.
#
# That is: `=` optionally surrounded by white space.
fun expect_eq: Bool do
	return skip_s and expect('=', "") and skip_s
end


############################################################################
# General

# Read a byte and put it in `last_char`.
#
# In case of an end-of-file or an error, put -1 in `last_char`.
# Also keep the line and column numbers of `locator` up to date.
private fun read_char do
	if locator.line_number < 0 then
		# First read: initialize the position.
		locator.line_number = 1
		locator.column_number = 1
	else if last_char < 0 then
		fire_fatal_error("Internal error: Already at the end of the file.")
		return
	else if last_char == '\n'.ascii then
		locator.line_number += 1
		locator.column_number = 1
	else
		locator.column_number += 1
	end

	last_char = input.read_char
	if last_char < 0 then
		return
	end

	# XML 1.0 end-of-line handling
	# Note: Regardless the XML version, any EOL defined by the
	# recommendation MUST be reported as a single LINE FEED.
	if was_cr and last_char == '\n'.ascii then
		# EOL already reported. => Skip this byte.
		last_char = input.read_char
	end
	was_cr = last_char == '\r'.ascii
	if was_cr then
		# Regardless the following byte, '\r' always introduce an EOL.
		last_char = '\n'.ascii
	end
end

# Is it the end of the stream?
#
# Also return `true` after a fatal error.
fun eof: Bool do return last_char < 0

# Start the lexer.
#
# Read the first byte, unless a byte is already available.
fun start do
	if eof then
		# `read_char` refuses to read when `last_char` is negative.
		last_char = 0
		read_char
	end
end

# Close the input.
fun close do
	last_char = -1
	input.close
end

# Does the last read byte equal `c`?
fun is_int(c: Int): Bool do return last_char == c

# Does the last read byte equal `c`?
fun is_char(c: Char): Bool do return last_char == c.ascii

# Accept the specified byte.
#
# If the last read byte equals `expected`, read the next byte and return
# `true`. Else, return `false` without firing any error.
fun accept_int(expected: Int): Bool do
	if last_char == expected then
		read_char
		return true
	else
		return false
	end
end

# Accept the specified character.
#
# Same as `accept_int`, but takes a `Char`.
fun accept(expected: Char): Bool do
	return accept_int(expected.ascii)
end

# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect_int(expected: Int, context: String): Bool do
	# No trailing period: `fire_unexpected_char` appends the final one.
	return accept_int(expected) or
			fire_unexpected_char("{context}. Expecting `{expected.ascii}`")
end

# Ensure the last read byte is equal to `expected`.
#
# If it is, read the next byte. If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte has the expected value.
fun expect(expected: Char, context: String): Bool do
	# No trailing period: `fire_unexpected_char` appends the final one.
	return accept(expected) or
			fire_unexpected_char("{context}. Expecting `{expected}`")
end

# Ensure the last read byte and following bytes match `expected`.
#
# If it is, read one more byte.
# If not, fire a fatal error using
# `context`. `context` is the part of the message that gives the context.
# For example, in `Unexpected ``x`` in y. Expecting ``z``.`, the value of
# `context` is `" in y"`.
#
# Return `true` if and only if the last read byte and following bytes
# match `expected`.
fun expect_string(expected: String, context: String): Bool do
	var wanted = expected.chars

	for pos in [0..wanted.length[ do
		if accept(wanted[pos]) then continue
		# Mismatch: report the part matched so far plus the offending byte.
		if is_xml_char then
			return fire_fatal_error("Unexpected " +
					"`{expected.substring(0, pos)}{last_char.ascii.to_s}`" +
					"{context}. Expecting `{expected}`.")
		end
		if eof then
			return fire_fatal_error("Unexpected end of file{context}. " +
					"Expecting `{expected}`.")
		end
		return fire_fatal_error("Forbidden character.")
	end
	return true
end


############################################################################
# Dispatching

# Report the last read byte as unexpected through a fatal error.
#
# `rest_of_message` is appended to the message, before the final period.
# The message differs for an end of file and for a forbidden byte.
#
# Return `false`.
fun fire_unexpected_char(rest_of_message: String): Bool do
	if eof then
		return fire_fatal_error("Unexpected end of file{rest_of_message}.")
	end
	if not is_xml_char then return fire_fatal_error("Forbidden character.")
	return fire_fatal_error("Unexpected character `{last_char.ascii.to_s}`{rest_of_message}.")
end

# Forward a fatal error to the reader model and put the lexer at `eof`.
#
# Return `false`.
private fun fire_fatal_error(message: String): Bool do
	reader_model.fire_fatal_error(message, null)
	# From now on, `eof` answers `true`.
	last_char = -1
	return false
end
end
diff --git a/lib/saxophonit/reader_model.nit b/lib/saxophonit/reader_model.nit
new file mode 100644
index 0000000..ff2299a
--- /dev/null
+++ b/lib/saxophonit/reader_model.nit
@@ -0,0 +1,357 @@
# This file is part of NIT ( http://www.nitlanguage.org ).
#
# This file is free software, which comes along with NIT.
This software is +# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. You can modify it is you want, provided this header +# is kept unaltered, and a notification of the changes is added. +# You are allowed to redistribute it and sell it, alone or is a part of +# another product. + +# Reader’s model. +module saxophonit::reader_model + +import sax +import sax::helpers::sax_locator_impl +import sax::helpers::attributes_impl +import sax::helpers::namespace_support + +# Reader’s model. +# +# Handle event dispatching, settings and internal state. +class XophonReaderModel + + # Stack of the current element type qname and of the qnames of the ancestors. + # + # Current element is at the top, the root element is at the bottom. + # Used to check if end tags match start tags. + private var element_path = new Array[XmlName] + + # Current element’s attributes + private var atts = new AttributesImpl + + private var ns = new NamespaceSupport + + # Regular expression to match a `Name` against the `QName` production of + # “Namespaces in XML”. + private var qname_re: Regex = "^[^:]+(:[^:]+)?$".to_re + + var locator: nullable SAXLocatorImpl = null is writable + + + # TODO: Handle these features. 
private var features: Map[String, Bool] = new HashMap[String, Bool]

# Identifier of the `namespaces` feature.
#
# SEE: `sax`
var feature_namespaces_uri = "http://xml.org/sax/features/namespaces"

# Identifier of the `namespace-prefixes` feature.
#
# SEE: `sax`
var feature_namespace_prefixes_uri = "http://xml.org/sax/features/namespace-prefixes"


# SEE: `sax::XMLReader.entity_resolver`
var entity_resolver: nullable EntityResolver = null is writable

# SEE: `sax::XMLReader.dtd_handler`
var dtd_handler: nullable DTDHandler = null is writable

# SEE: `sax::XMLReader.content_handler`
var content_handler: nullable ContentHandler = null is writable

# SEE: `sax::XMLReader.error_handler`
var error_handler: nullable ErrorHandler = null is writable


# Register the two known features with their default values.
init do
	qname_re.optimize_is_in = true
	features[feature_namespaces_uri] = true
	features[feature_namespace_prefixes_uri] = false
end

# Is `name` one of the registered features?
#
# SEE: `sax::XMLReader.feature_recognized`
fun feature_recognized(name: String): Bool do
	return features.keys.has(name)
end

# Every registered feature is readable.
#
# SEE: `sax::XMLReader.feature_readable`
fun feature_readable(name: String): Bool do
	return features.keys.has(name)
end

# Every registered feature is writable.
#
# SEE: `sax::XMLReader.feature_writable`
fun feature_writable(name: String): Bool do
	return features.keys.has(name)
end

# Current value of the feature `name`, which must be recognized.
#
# SEE: `sax::XMLReader.feature`
fun feature(name: String): Bool do
	assert feature_recognized: feature_recognized(name)
	return features[name]
end

# Set the feature `name`, which must be recognized.
#
# The two namespace-related features cannot both be `false`.
#
# SEE: `sax::XMLReader.feature=`
fun feature=(name: String, value: Bool) do
	assert feature_recognized: feature_recognized(name)

	# The feature that must stay enabled if `name` gets disabled.
	var counterpart: nullable String = null
	if name == feature_namespaces_uri then
		counterpart = feature_namespace_prefixes_uri
	else if name == feature_namespace_prefixes_uri then
		counterpart = feature_namespaces_uri
	end

	if counterpart != null then
		assert legal_state: value or features[counterpart] else
			sys.stderr.write("At least one of <{feature_namespaces_uri}> or <{feature_namespace_prefixes_uri}> must be true.\n")
		end
	end
	features[name] = value
end

# No property is supported.
#
# SEE: `sax::XMLReader.property_recognized`
fun property_recognized(name: String): Bool do return false

# No property is supported.
#
# SEE: `sax::XMLReader.property_readable`
fun property_readable(name: String): Bool do return false

# No property is supported.
#
# SEE: `sax::XMLReader.property_writable`
fun property_writable(name: String): Bool do return false

# Always fails: no property is recognized.
#
# SEE: `sax::XMLReader.property`
fun property(name: String): nullable Object do
	assert property_recognized: false
	return null
end

# Always fails: no property is recognized.
#
# SEE: `sax::XMLReader.property=`
fun property=(name: String, value: nullable Object) do
	assert property_recognized: false
end

# Is the root element closed?
#
# `true` when no element is open.
fun root_closed: Bool do return element_path.is_empty

# Expect the root element is closed.
#
# Fire a fatal error and return `false` if elements are still open.
fun expect_root_closed: Bool do
	if root_closed then return true

	var open_count = element_path.length
	if open_count > 1 then
		return fire_fatal_error("Reached the end of the file with " +
				"{open_count.to_s} open elements.", null)
	end
	return fire_fatal_error("Reached the end of the file with an " +
			"open element.", null)
end


###########################################################################
# Dispatching

# Set the document locator of the content handler, if needed.
fun fire_document_locator do
	var handler = content_handler
	if handler == null then return
	handler.document_locator = locator.as(not null)
end

# Fire the start of the document, then reset the namespace support.
fun fire_start_document do
	var handler = content_handler
	if handler != null then handler.start_document
	ns.reset
end

# Fire the end of the document.
fun fire_end_document do
	var handler = content_handler
	if handler != null then handler.end_document
end

# Fire the start of an attribute list.
#
# Forget the attributes of the previous element and open a new namespace
# context.
fun fire_start_attributes do
	atts.clear
	ns.push_context
end

# Fire the appearance of an attribute.
#
# A namespace declaration (`xmlns` or `xmlns:prefix`) starts a prefix
# mapping. It is only kept as an attribute when the `namespace-prefixes`
# feature is enabled. Any other attribute is recorded in `atts` with an
# empty namespace IRI and local name, which `set_attribute_ns` fills later.
fun fire_attribute(qname: String, value: String) do
	if "xmlns" == qname or qname.has_prefix("xmlns:") then
		# For a plain `xmlns`, the start index is past the end of the name.
		# NOTE(review): presumably this yields `""` (default namespace) —
		# confirm against `substring_from`.
		var prefix = qname.substring_from("xmlns:".length)

		if not prefix.has(":") then
			fire_start_prefix_mapping(prefix, value)
			if not feature(feature_namespace_prefixes_uri) then return
		end
	end
	# TODO: Types.
	atts.add("", "", qname, "CDATA", value)
end

# Fire the start of an element.
#
# Expand the names of the collected attributes and of the element, push
# the element on `element_path` and notify the content handler.
fun fire_start_element(name: String) do
	var parts = ["", "", ""]

	for i in [0..atts.length[ do
		set_attribute_ns(i)
	end
	process_name(name, parts, false)
	element_path.push(new XmlName(parts[0], parts[1], parts[2]))
	if content_handler != null then
		content_handler.start_element(parts[0], parts[1], parts[2], atts)
	end
end

# Now prefixes are mapped, set the expanded name of the attribute at `index`.
private fun set_attribute_ns(index: Int) do
	var name = ["", "", ""]

	process_name(atts.qname(index).as(not null), name, true)
	atts.uri(index) = name[0]
	atts.local_name(index) = name[1]
end

# Like `ns.process_name`, but with error handling.
#
# `parts` receives the namespace IRI, the local name and the qualified
# name, in that order. When `qname` cannot be expanded, a (non-fatal)
# error is fired and `parts` is set to `["", qname, qname]`.
private fun process_name(qname: String, parts: Array[String],
		is_attribute: Bool) do
	if qname.has(qname_re) then
		if ns.process_name(qname, parts, is_attribute) == null then
			fire_error("The namespace IRI of `{qname}` was not found in " +
					"this scope. Passing the original name as the local " +
					"name.", null)
			set_unqualified_name(qname, parts)
		end
	else
		fire_error("The name `{qname}` contains more than one colon. " +
				"Passing the original name as the local name.", null)
		set_unqualified_name(qname, parts)
	end
end

# Fill `parts` with `qname` as both the local and the qualified name.
#
# The caller's array is written in place. Assigning a new array to the
# parameter would only rebind the local variable and the caller would
# never see the fallback.
private fun set_unqualified_name(qname: String, parts: Array[String]) do
	parts[0] = ""
	parts[1] = qname
	parts[2] = qname
end

# Fire the end of an element.
#
# Return `true` on success.
#
# `name` must be the qualified name of the innermost open element.
# If not, fire a fatal error and return `false`.
fun fire_end_element(name: String):Bool do
	# Without this guard, `element_path.last` is called on an empty stack
	# when no element is open.
	if element_path.is_empty then
		return fire_fatal_error("The closing tag (`{name}`) does not " +
				"match any open element.", null)
	end

	var peek_name = element_path.last

	if peek_name.qname == name then
		element_path.pop
		# NOTE(review): `fire_start_attributes` pushes a namespace context
		# for every element, but nothing pops it here and
		# `end_prefix_mapping` is never called — confirm whether mappings
		# are meant to outlive their element.
		if content_handler != null then
			content_handler.end_element(peek_name.uri,
					peek_name.local_name, peek_name.qname)
		end
		return true
	else
		fire_fatal_error("The type in the closing tag (`{name}`) does " +
				"not match the type in the opening tag " +
				"(`{peek_name.qname}`).", null)
		return false
	end
end

# Fire the start of a mapping between `prefix` and `uri`.
#
# Fire a (non-fatal) error when the namespace support refuses the
# declaration.
private fun fire_start_prefix_mapping(prefix: String, uri: String) do
	if not ns.declare_prefix(prefix, uri) then
		fire_error("The mapping between the prefix `{prefix}` and " +
				"the namespace IRI `{uri}` breaks a built-in " +
				"mapping. Ignoring the declaration.", null)
	end
	# NOTE(review): the handler is notified even when the declaration was
	# refused above — confirm this is intended.
	if content_handler != null then
		content_handler.start_prefix_mapping(prefix, uri)
	end
end

# Fire the end of the current mapping of `prefix`.
private fun end_prefix_mapping(prefix: String) do
	if content_handler != null then
		content_handler.end_prefix_mapping(prefix)
	end
end

# Fire the appearance of a comment.
#
# Currently does nothing.
fun fire_comment(content: String) do
	# TODO
end

# Fire the appearance of a processing instruction.
fun fire_processing_instruction(target: String, data: nullable String) do
	if content_handler != null then
		content_handler.processing_instruction(target, data)
	end
end

# Fire the start of a `CDATA` section.
#
# Currently does nothing.
fun fire_start_cdata do
	# TODO
end

# Fire the end of a `CDATA` section.
#
# Currently does nothing.
fun fire_end_cdata do
	# TODO
end

# Fire the appearance of a text node.
fun fire_characters(str: String) do
	var handler = content_handler
	if handler != null then handler.characters(str)
end

# Build the exception describing `message`.
#
# The current `locator` is attached when there is one.
private fun exception(message: String, cause: nullable Error):
		SAXParseException do
	var loc = locator
	var failure: SAXParseException

	if loc != null then
		failure = new SAXParseException.with_locator(message, loc)
	else
		failure = new SAXParseException(message)
	end
	failure.cause = cause
	return failure
end

# Report a fatal error built from `message` and `cause`.
#
# Without an error handler, the exception is thrown instead.
#
# Return `false`.
fun fire_fatal_error(message: String, cause: nullable Error):Bool do
	var failure = exception(message, cause)
	var handler = error_handler

	if handler != null then
		handler.fatal_error(failure)
	else
		failure.throw
	end
	return false
end

# Report an error built from `message` and `cause`.
#
# Ignored when there is no error handler.
fun fire_error(message: String, cause: nullable Error) do
	var failure = exception(message, cause)
	var handler = error_handler

	if handler != null then handler.error(failure)
end

# Report a warning built from `message` and `cause`.
#
# Ignored when there is no error handler.
fun fire_warning(message: String, cause: nullable Error) do
	var failure = exception(message, cause)
	var handler = error_handler

	if handler != null then handler.warning(failure)
end
end

# An XML expanded name.
private class XmlName
	# Namespace IRI or `""`.
	var uri: String

	# Local name or `""`.
	var local_name: String

	# Original qualified name.
	var qname: String
end
diff --git a/lib/saxophonit/saxophonit.nit b/lib/saxophonit/saxophonit.nit
index 22d4314..a225413 100644
--- a/lib/saxophonit/saxophonit.nit
+++ b/lib/saxophonit/saxophonit.nit
@@ -10,3 +10,704 @@

# A SAX 2 parser in Nit.
module saxophonit

import sax
intrude import standard::file
private import reader_model
private import lexer

# Implementation of the `XMLReader` interface.
#
# For the moment, only XML 1.0 is (partially) supported.
#
# The following mandatory features of XML 1.0 are not yet supported:
#
# * Parsing of entities (files) encoded in UTF-16.
# * Encoding handling.
# * Entity references resolving (except for built-in references).
# * Handling of the options specified in the XML declaration.
# * Parsing of a `DOCTYPE` declaration.
#
# Also note that this XML processor is unable to retrieve a file from an URL
# (only local paths are supported).
class XophonReader
	super XMLReader

	# Holds the handlers, the features and the state of the parsing.
	private var model = new XophonReaderModel

	# Lexer of the document being parsed. Set by `parse`.
	private var lexer: XophonLexer is noinit

	redef fun entity_resolver: nullable EntityResolver do return model.entity_resolver
	redef fun entity_resolver=(entity_resolver: nullable EntityResolver) do
		model.entity_resolver = entity_resolver
	end

	redef fun dtd_handler: nullable DTDHandler do return model.dtd_handler
	redef fun dtd_handler=(dtd_handler: nullable DTDHandler) do
		model.dtd_handler = dtd_handler
	end

	redef fun content_handler: nullable ContentHandler do return model.content_handler
	redef fun content_handler=(content_handler: nullable ContentHandler) do
		model.content_handler = content_handler
	end

	redef fun error_handler: nullable ErrorHandler do return model.error_handler
	redef fun error_handler=(error_handler: nullable ErrorHandler) do
		model.error_handler = error_handler
	end


	redef fun feature_recognized(name: String): Bool do
		return model.feature_recognized(name)
	end

	redef fun feature_readable(name: String): Bool do
		return model.feature_readable(name)
	end

	redef fun feature_writable(name: String): Bool do
		# Ask the model about writability, not readability.
		return model.feature_writable(name)
	end

	redef fun feature(name: String): Bool do return model.feature(name)
	redef fun feature=(name: String, value: Bool) do model.feature(name) = value

	redef fun property_recognized(name: String): Bool do
		return model.property_recognized(name)
	end

	redef fun property_readable(name: String): Bool do
		return model.property_readable(name)
	end

	redef fun property_writable(name: String): Bool do
		return model.property_writable(name)
	end

	redef fun property(name: String): nullable Object do
		return model.property(name)
	end

	redef fun property=(name: String, value: nullable Object) do
		model.property(name) = value
	end

	# Parse the document designated by `input`.
	#
	# The stream of `input` is used when there is one. Otherwise, the file
	# designated by its system identifier is opened, parsed and closed.
	# A fatal error is fired when `input` provides neither.
	redef fun parse(input: InputSource) do
		var stream: IStream
		var system_id: nullable MaybeError[String, Error] = null
		model.locator = new SAXLocatorImpl

		if input.system_id != null then
			system_id = resolve_system_id(input.system_id.as(not null))
			if system_id.is_error then
				# Only a warning here: a stream may still be available.
				model.fire_warning(system_id.error.message, system_id.error)
			else
				model.locator.system_id = system_id.value
			end
		end
		model.locator.public_id = input.public_id
		# TODO: encoding

		if input.stream != null then
			# The stream was supplied by the caller: it is not closed here.
			lexer = new XophonLexer(model, input.stream.as(not null))
			parse_main
		else if system_id != null then
			if system_id.is_error then
				model.fire_fatal_error("File <{input.system_id.as(not null)}> not found.", null)
			else
				lexer = new XophonLexer(model,
						new IFStream.open(system_id.value))
				parse_main
				lexer.close
			end
		else
			model.fire_fatal_error("At least a stream or a system identifier must be specified. None given.",
					null)
		end
	end

	# Parse the document designated by the system identifier `system_id`.
	redef fun parse_file(system_id: String) do
		parse(new InputSource.with_system_id(system_id))
	end


	############################################################################
	# Parsing

	# Note: Every `expect_*` function (except `parse_main`) does not call
	# `read_char` for the first byte and let the byte just after its production
	# in `last_char` (except in case of fatal error). They return `false` on
	# fatal error and at the end of the file.

	# Parse the main entity.
	#
	# The end of the document is fired even when the parsing fails.
	private fun parse_main do
		model.fire_document_locator
		model.fire_start_document
		lexer.start
		expect_document
		model.fire_end_document
	end

	# Expect a `document` production.
#
# Parse everything up to the root element included, then the trailing
# `Misc*`. Return `false` as soon as a fatal error is fired.
private fun expect_document: Bool do
	var got_doctype = false
	var got_element = false
	# `true` only for the very first token of the document, the only place
	# where an XML declaration is allowed.
	var at_start = true

	# Before the root element, the document may contain an XML declaration
	# (first token only), processing instructions, comments and the
	# `DOCTYPE` declaration.
	# These productions may be separated by white space.
	while not got_element do
		if lexer.accept('<') then
			if lexer.accept('?') then
				if at_start then
					if not expect_pi_or_xml_decl then return false
				else if not expect_pi then
					return false
				end
			else if lexer.accept('!') then
				if lexer.accept('-') then
					if not lexer.expect('-',
							" at the beginning of a comment") or
							not expect_comment then
						return false
					end
				else if got_doctype then
					# Only one `DOCTYPE` declaration is allowed.
					return lexer.fire_unexpected_char(". Expecting `-`")
				else if expect_doctype_decl then
					got_doctype = true
				else
					return false
				end
			else
				if not expect_root then return false
				# The `DOCTYPE` declaration *must* come before the root
				# element.
				got_doctype = true
				got_element = true
			end
		else if not lexer.accept_s then
			return lexer.fire_unexpected_char(
					". Expecting a white space or `<`")
		end
		at_start = false
	end
	return expect_miscs
end

# Expect a `doctypedecl` production.
#
# Not supported yet: always fire a fatal error and return `false`.
private fun expect_doctype_decl: Bool do
	return model.fire_fatal_error("DTD not supported yet.\n", null) # TODO
end

# Expect the root `element` production, without the first `<` token.
#
# Parse the start tag, then chunks of content until the root element is
# closed, the end of the file is reached or a fatal error is fired.
private fun expect_root: Bool do
	var char_data = new FlatBuffer
	var success = expect_stag

	while success and not lexer.eof and not model.root_closed do
		success = expect_content_chunk(char_data)
	end
	if success then
		success = model.expect_root_closed
	end
	flush(char_data)
	return success
end

# Parse a `EmptyElemTag | STag | ETag | Reference | CDSect | PI | Comment | CharData` production.
#
# If the last read byte matches the `CharData` production, push the char in
# `char_data`. Else, flush `CharData` as a `characters` event.
private fun expect_content_chunk(char_data: Buffer): Bool do
	if lexer.accept('<') then
		flush(char_data)
		if lexer.accept('!') then
			if lexer.accept('-') then
				return lexer.expect('-',
						" at the beginning of a comment") and
						expect_comment
			else if lexer.accept('[') then
				return expect_cd_sect
			else
				return lexer.fire_unexpected_char(
						". Expecting `--` or `[CDATA[`")
			end
		else if lexer.accept('?') then
			return expect_pi
		else if lexer.accept('/') then
			return expect_etag
		else
			return expect_stag
		end
	else if lexer.accept('&') then
		# The text of the reference is flushed separately from the
		# character data around it.
		flush(char_data)
		var success = expect_reference(char_data)
		flush(char_data)
		return success
	else
		return lexer.expect_xml_char(char_data)
	end
end

# Expect a `EmptyElemTag | STag` production, without the initial `<`.
+ private fun expect_stag: Bool do + var name_buffer = new FlatBuffer + + if lexer.expect_name(name_buffer) then + var name = name_buffer.to_s + + model.fire_start_attributes + loop + if lexer.accept('>') then + model.fire_start_element(name) + return true + else if lexer.accept('/') then + if lexer.expect('>', "") then + model.fire_start_element(name) + model.fire_end_element(name) + return true + else + return false + end + else if lexer.expect_s then + if lexer.accept('/') then + if lexer.expect('>', "") then + model.fire_start_element(name) + model.fire_end_element(name) + return true + else + return false + end + else if lexer.accept('>') then + model.fire_start_element(name) + return true + else if not expect_attribute then + return false + end + else + return lexer.fire_unexpected_char(" in tag. " + + "Expecting an attribute, `/`, `>` or white space") + end + end + end + return false + end + + # Expect a `ETag` production, without the initial `', "") then + return model.fire_end_element(name_buf.to_s) + else + return false + end + end + + # Expect an `Attributes` production. + private fun expect_attribute: Bool do + var name = new FlatBuffer + var value = new FlatBuffer + + if lexer.expect_name(name) and + lexer.expect_eq and + expect_att_value(value) then + model.fire_attribute(name.to_s, value.to_s) + return true + else + return false + end + end + + # Expect the `Misc*` production at the end of a document. + private fun expect_miscs: Bool do + while not lexer.eof do + if lexer.accept('<') then + if lexer.accept('?') then + if not expect_pi then return false + else if lexer.accept('!') then + if not lexer.expect_string("--", + " at the beginning of a comment") or + not expect_comment then + return false + end + else + return lexer.fire_unexpected_char(". Expecting `?` or `!`") + end + else if not lexer.accept_s then + return lexer.fire_unexpected_char( + ". 
Expecting a white space or `<`") + end + end + return true + end + + # Expect a `AttValue` production. + # + # Append the parsed value to `buffer`. + private fun expect_att_value(buffer: Buffer): Bool do + var delimiter = lexer.expect_delimiter + + if delimiter < 0 then return false + loop + if lexer.accept_int(delimiter) then + return true + else if lexer.accept('&') then + # TODO: [WFC: No < in Attribute Values] + if not expect_reference(buffer) then return false + else if not lexer.expect_att_value_char(buffer) then + return false + end + end + end + + # Expect a `SystemLiteral` production. + # + # Also used to parse productions that do not have references. + # Append the parsed value to `buffer`. + private fun expect_literal(buffer: Buffer): Bool do + var delimiter = lexer.expect_delimiter + + if delimiter < 0 then return false + loop + if lexer.accept_int(delimiter) then + return true + else if not lexer.expect_xml_char(buffer) then + return false + end + end + end + + + # Expect a `Comment` production, without the beginning. + # + # Assume `last_char` is the fifth byte of the production that is, the + # next byte after the `'